File size: 6,116 Bytes
9f4fc64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import os
import time
import fitz  # PyMuPDF
from docx import Document
from docx.shared import Pt, RGBColor, Inches
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from PIL import Image
import io
import gradio as gr
import subprocess

# Install required system dependencies
def install_dependencies():
    """Best-effort install of the system packages requested by this app.

    Runs ``apt-get update`` and then
    ``apt-get install -y poppler-utils libreoffice`` with all command output
    discarded.

    The function never raises for an installation problem: if ``apt-get``
    cannot be started at all (for example it is not on ``PATH``) or a command
    exits with a non-zero status, a message is printed and the function
    returns normally, so a failed install does not prevent the module from
    loading.
    """
    commands = (
        ["apt-get", "update"],
        ["apt-get", "install", "-y", "poppler-utils", "libreoffice"],
    )
    for command in commands:
        try:
            result = subprocess.run(
                command,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=False,
            )
        except OSError as exc:
            # subprocess.run raises OSError (e.g. FileNotFoundError) when the
            # executable cannot be launched; the next command would fail the
            # same way, so stop here instead of crashing the caller.
            print(f"Skipping system dependency installation: {exc}")
            return
        if result.returncode != 0:
            # Keep going: the install step is still attempted after a failed
            # update, exactly as before, but the failure is now visible.
            print(
                f"Warning: {' '.join(command)} exited with status "
                f"{result.returncode}"
            )

# Executed at module import time, i.e. before any of the definitions and the
# Gradio interface below are created.
install_dependencies()

# Bit masks of the ``flags`` field in a PyMuPDF text span.  Only italic and
# bold are used; the span flags carry no underline information (bit 2 means
# "serifed font"), so underline is deliberately not derived from them.
_SPAN_FLAG_ITALIC = 1 << 1
_SPAN_FLAG_BOLD = 1 << 4


def _docx_output_path(filename):
    """Return ``<filename without extension>_converted.docx``."""
    # splitext only touches the final extension and ignores its case, so
    # "report.PDF" no longer maps back onto the input file itself.
    stem, _extension = os.path.splitext(filename)
    return f"{stem}_converted.docx"


def _convert_with_libreoffice(pdf_path, docx_path):
    """Convert *pdf_path* with headless LibreOffice and move the result to *docx_path*."""
    import shutil
    import tempfile

    # A private output directory keeps concurrent conversions of files with
    # the same base name from overwriting each other's intermediate result.
    with tempfile.TemporaryDirectory() as out_dir:
        subprocess.run(
            [
                "libreoffice",
                "--headless",
                # Import the PDF through the Writer PDF filter so that a
                # text-document export (docx) is available.
                "--infilter=writer_pdf_import",
                "--convert-to",
                "docx",
                "--outdir",
                out_dir,
                pdf_path,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        produced = os.path.join(out_dir, stem + ".docx")
        if not os.path.exists(produced):
            raise Exception("LibreOffice conversion failed")
        # shutil.move also works when the temp dir and the target live on
        # different filesystems, where os.rename would fail.
        shutil.move(produced, docx_path)


def _apply_span_styles(doc, pdf_doc):
    """Copy font name, size, colour, bold and italic from PDF spans onto matching runs.

    A run is considered a match when the stripped span text occurs in the
    text of the run (and therefore of its paragraph).
    """
    # Text is never modified below, so read paragraph/run text once instead
    # of rebuilding it for every span of every page.
    paragraphs = [
        (paragraph.text, [(run.text, run) for run in paragraph.runs])
        for paragraph in doc.paragraphs
    ]

    for page in pdf_doc:
        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" entry are skipped.
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue

                    flags = span["flags"]
                    is_bold = bool(flags & _SPAN_FLAG_BOLD)
                    is_italic = bool(flags & _SPAN_FLAG_ITALIC)

                    for paragraph_text, runs in paragraphs:
                        if text not in paragraph_text:
                            continue
                        for run_text, run in runs:
                            if text not in run_text:
                                continue
                            run.font.name = span["font"]
                            run.font.size = Pt(span["size"])
                            if "color" in span:
                                # Colour is a packed 0xRRGGBB integer.
                                color = span["color"]
                                run.font.color.rgb = RGBColor(
                                    (color >> 16) & 0xFF,
                                    (color >> 8) & 0xFF,
                                    color & 0xFF,
                                )
                            run.font.bold = is_bold
                            run.font.italic = is_italic


def _append_images(doc, pdf_doc):
    """Append every image referenced by the PDF pages to the end of *doc*."""
    for page in pdf_doc:
        for image_info in page.get_images(full=True):
            xref = image_info[0]
            image_bytes = pdf_doc.extract_image(xref)["image"]
            image = Image.open(io.BytesIO(image_bytes))
            if image.mode == "CMYK":
                # PNG cannot store CMYK data; convert so one such image does
                # not abort the whole enhancement step.
                image = image.convert("RGB")

            # Re-encode in memory instead of through a fixed /tmp file name,
            # which could collide between simultaneous conversions.
            buffer = io.BytesIO()
            image.save(buffer, format="PNG")
            buffer.seek(0)

            # Same sizing rule as before: one pixel is treated as 1/72 inch.
            doc.add_picture(
                buffer,
                width=Inches(image.width / 72),
                height=Inches(image.height / 72),
            )


def convert_pdf_to_word(pdf_file, filename):
    """Convert a PDF to a Word document and return the path of the .docx file.

    Steps:
      1. Convert with ``pdf2docx``; if that raises (including the package not
         being importable) fall back to headless LibreOffice.
      2. Best-effort enhancement: re-apply font properties read from the PDF
         with PyMuPDF and append the PDF's images to the document.  If this
         step fails the unenhanced conversion from step 1 is returned.

    Args:
        pdf_file: Object whose ``name`` attribute is the path of the PDF to
            read.
        filename: Path used to derive the output name; its extension is
            replaced by ``_converted.docx``.

    Returns:
        Path of the generated ``.docx`` file.

    Raises:
        Exception: With the message ``"Conversion failed: <reason>"`` when
            both conversion methods fail; the original error is attached as
            ``__cause__``.
    """
    try:
        docx_path = _docx_output_path(filename)

        # First try with pdf2docx
        try:
            from pdf2docx import Converter

            cv = Converter(pdf_file.name)
            try:
                cv.convert(docx_path, start=0, end=None, keep_layout=True)
            finally:
                # Release the converter even when convert() raises.
                cv.close()
        except Exception as e:
            print(f"Primary conversion method failed: {e}, trying fallback...")
            # Fallback to libreoffice
            _convert_with_libreoffice(pdf_file.name, docx_path)

        # Enhance the output with PyMuPDF for better formatting
        try:
            doc = Document(docx_path)
            pdf_doc = fitz.open(pdf_file.name)
            try:
                _apply_span_styles(doc, pdf_doc)
                _append_images(doc, pdf_doc)
            finally:
                # Close the PDF handle on both the success and error path.
                pdf_doc.close()
            doc.save(docx_path)
        except Exception as e:
            print(f"Formatting enhancement failed: {e}, using basic conversion")

        return docx_path

    except Exception as e:
        raise Exception(f"Conversion failed: {str(e)}") from e

def process_pdf(file):
    """Gradio callback: validate the upload, convert it and report the timing.

    Args:
        file: The uploaded file object; its ``name`` attribute is used as the
            path of the PDF.  A falsy value means nothing was uploaded.

    Returns:
        A ``(output_path, status_message)`` tuple: the path of the generated
        Word file and a message containing the elapsed conversion time.

    Raises:
        gr.Error: If no file was uploaded, the file name does not end in
            ``.pdf`` (case-insensitive), or the conversion fails.
    """
    if not file:
        raise gr.Error("Please upload a PDF file first")

    if not file.name.lower().endswith('.pdf'):
        raise gr.Error("Please upload a PDF file")

    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments.
    start_time = time.perf_counter()
    try:
        output_path = convert_pdf_to_word(file, file.name)
    except Exception as e:
        message = str(e)
        # convert_pdf_to_word already prefixes its errors with
        # "Conversion failed: "; avoid showing the prefix twice.
        if not message.startswith("Conversion failed"):
            message = f"Conversion failed: {message}"
        raise gr.Error(message) from e
    conversion_time = time.perf_counter() - start_time

    return output_path, f"✅ Conversion completed in {conversion_time:.1f} seconds"

# Create Gradio interface
# `demo` is the Blocks app object that is launched at the bottom of the file.
with gr.Blocks(title="PDF to Word Converter") as demo:
    # Page heading and a one-line description of what the app does.
    gr.Markdown("# PDF to Word Converter")
    gr.Markdown("Upload a PDF file and convert it to an editable Word document while preserving formatting.")
    
    # One row with two columns: inputs in the first, results in the second.
    with gr.Row():
        with gr.Column():
            # Upload widget restricted to .pdf files, plus the trigger button.
            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            convert_btn = gr.Button("Convert to Word", variant="primary")
        with gr.Column():
            # Status text and the download slot for the converted document.
            status = gr.Textbox(label="Status")
            file_output = gr.File(label="Download Word File")
    
    # process_pdf returns (output_path, status_message); the outputs list is
    # in the same order: file first, status text second.
    convert_btn.click(
        fn=process_pdf,
        inputs=file_input,
        outputs=[file_output, status]
    )

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()