Spaces:

itsVilen
/

Pdf-To-Word

Running

App Files Files Community

itsVilen committed on Apr 19

Commit

9f4fc64

verified ·

1 Parent(s): 468810d

Create app.py

Browse files

Files changed (1) hide show

app.py +146 -0

app.py ADDED Viewed

	@@ -0,0 +1,146 @@

+import os
+import time
+import fitz  # PyMuPDF
+from docx import Document
+from docx.shared import Pt, RGBColor, Inches
+from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
+from PIL import Image
+import io
+import gradio as gr
+import subprocess
+# Install required system dependencies
+def install_dependencies():
+    subprocess.run(["apt-get", "update"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+    subprocess.run(["apt-get", "install", "-y", "poppler-utils", "libreoffice"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+install_dependencies()
+def convert_pdf_to_word(pdf_file, filename):
+    """Convert PDF to Word with maximum fidelity"""
+    try:
+        # Prepare output path
+        docx_path = filename.replace('.pdf', '_converted.docx')
+        # First try with pdf2docx
+        try:
+            from pdf2docx import Converter
+            cv = Converter(pdf_file.name)
+            cv.convert(docx_path,
+                      start=0,
+                      end=None,
+                      keep_layout=True)
+            cv.close()
+        except Exception as e:
+            print(f"Primary conversion method failed: {e}, trying fallback...")
+            # Fallback to libreoffice
+            subprocess.run(["libreoffice", "--headless", "--convert-to", "docx", "--outdir", "/tmp", pdf_file.name],
+                          stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            temp_docx = os.path.join("/tmp", os.path.basename(pdf_file.name).replace(".pdf", ".docx"))
+            if os.path.exists(temp_docx):
+                os.rename(temp_docx, docx_path)
+            else:
+                raise Exception("LibreOffice conversion failed")
+        # Enhance the output with PyMuPDF for better formatting
+        try:
+            doc = Document(docx_path)
+            pdf_doc = fitz.open(pdf_file.name)
+            # Process each page for precise formatting
+            for page_num in range(len(pdf_doc)):
+                page = pdf_doc.load_page(page_num)
+                blocks = page.get_text("dict")["blocks"]
+                for b in blocks:
+                    if "lines" in b:
+                        for line in b["lines"]:
+                            for span in line["spans"]:
+                                # Match text style in Word doc
+                                for paragraph in doc.paragraphs:
+                                    if span["text"].strip() and span["text"].strip() in paragraph.text:
+                                        for run in paragraph.runs:
+                                            if span["text"].strip() in run.text:
+                                                # Set font properties
+                                                run.font.name = span["font"]
+                                                run.font.size = Pt(span["size"])
+                                                # Set color
+                                                if "color" in span:
+                                                    color = span["color"]
+                                                    r = (color >> 16) & 0xff
+                                                    g = (color >> 8) & 0xff
+                                                    b = color & 0xff
+                                                    run.font.color.rgb = RGBColor(r, g, b)
+                                                # Set styles
+                                                run.font.bold = bool(span["flags"] & 2 ** 4)
+                                                run.font.italic = bool(span["flags"] & 2 ** 1)
+                                                run.font.underline = bool(span["flags"] & 2 ** 2)
+            # Handle images with precise positioning
+            for page_num in range(len(pdf_doc)):
+                page = pdf_doc.load_page(page_num)
+                image_list = page.get_images(full=True)
+                for img_index, img in enumerate(image_list):
+                    xref = img[0]
+                    base_image = pdf_doc.extract_image(xref)
+                    image_bytes = base_image["image"]
+                    # Convert to PIL Image
+                    image = Image.open(io.BytesIO(image_bytes))
+                    image_path = f"/tmp/img_{page_num}_{img_index}.png"
+                    image.save(image_path)
+                    # Add to document with original dimensions
+                    doc.add_picture(image_path, width=Inches(image.width/72), height=Inches(image.height/72))
+                    os.remove(image_path)
+            pdf_doc.close()
+            doc.save(docx_path)
+        except Exception as e:
+            print(f"Formatting enhancement failed: {e}, using basic conversion")
+        return docx_path
+    except Exception as e:
+        raise Exception(f"Conversion failed: {str(e)}")
+def process_pdf(file):
+    if not file:
+        raise gr.Error("Please upload a PDF file first")
+    if not file.name.lower().endswith('.pdf'):
+        raise gr.Error("Please upload a PDF file")
+    try:
+        start_time = time.time()
+        output_path = convert_pdf_to_word(file, file.name)
+        conversion_time = time.time() - start_time
+        return output_path, f"✅ Conversion completed in {conversion_time:.1f} seconds"
+    except Exception as e:
+        raise gr.Error(f"Conversion failed: {str(e)}")
+# Create Gradio interface
+with gr.Blocks(title="PDF to Word Converter") as demo:
+    gr.Markdown("# PDF to Word Converter")
+    gr.Markdown("Upload a PDF file and convert it to an editable Word document while preserving formatting.")
+    with gr.Row():
+        with gr.Column():
+            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
+            convert_btn = gr.Button("Convert to Word", variant="primary")
+        with gr.Column():
+            status = gr.Textbox(label="Status")
+            file_output = gr.File(label="Download Word File")
+    convert_btn.click(
+        fn=process_pdf,
+        inputs=file_input,
+        outputs=[file_output, status]
+    )
+if __name__ == "__main__":
+    demo.launch()