# Source: Hugging Face Space itsVilen/Pdf-To-Word (status: Running)
# File size: 6,116 bytes, revision 9f4fc64

import io
import os
import shutil
import subprocess
import tempfile
import time

import fitz  # PyMuPDF
import gradio as gr
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt, RGBColor, Inches
from PIL import Image

# Install required system dependencies
def install_dependencies():
    """Best-effort install of the system packages used by the converter.

    Runs ``apt-get update`` and then
    ``apt-get install -y poppler-utils libreoffice`` with their output
    discarded.  Returns ``None`` and never raises: if ``apt-get`` cannot be
    started, or a command exits with a non-zero status, a message is printed
    instead so that importing this module does not crash.
    """
    commands = (
        ["apt-get", "update"],
        ["apt-get", "install", "-y", "poppler-utils", "libreoffice"],
    )
    for command in commands:
        try:
            result = subprocess.run(
                command,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except OSError as exc:
            # apt-get is missing or not executable; the next command would
            # fail the same way, so stop here.
            print(f"Could not run {' '.join(command)}: {exc}")
            return
        if result.returncode != 0:
            # Keep going (as before), but make the failure visible.
            print(f"{' '.join(command)} exited with status {result.returncode}")

# Executed at import time, i.e. before any of the definitions below are used.
install_dependencies()

# Bit masks of the ``flags`` field in a PyMuPDF text span.  Bit 2 means
# "serifed", not "underlined"; spans carry no underline information.
_SPAN_FLAG_ITALIC = 1 << 1
_SPAN_FLAG_BOLD = 1 << 4


def _derive_docx_path(filename):
    """Return *filename* with a trailing ``.pdf`` (any case) replaced by ``_converted.docx``."""
    root, ext = os.path.splitext(filename)
    if ext.lower() != ".pdf":
        # Not a .pdf name: append the suffix so the result can never equal
        # the input path.
        root = filename
    return f"{root}_converted.docx"


def _convert_with_libreoffice(pdf_path, docx_path):
    """Convert *pdf_path* with headless LibreOffice and move the result to *docx_path*."""
    # A private output directory keeps concurrent conversions of same-named
    # uploads from overwriting each other's intermediate file.
    # NOTE(review): LibreOffice may need ``--infilter=writer_pdf_import`` to
    # open PDFs in Writer -- confirm against the installed version.
    with tempfile.TemporaryDirectory() as out_dir:
        subprocess.run(
            ["libreoffice", "--headless", "--convert-to", "docx", "--outdir", out_dir, pdf_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        produced = os.path.join(out_dir, f"{stem}.docx")
        if not os.path.exists(produced):
            raise Exception("LibreOffice conversion failed")
        shutil.move(produced, docx_path)


def _style_run(run, span):
    """Copy font name, size, colour, bold and italic from a PDF *span* onto *run*."""
    font = run.font
    font.name = span["font"]
    font.size = Pt(span["size"])

    if "color" in span:
        color = span["color"]  # packed 0xRRGGBB integer
        font.color.rgb = RGBColor((color >> 16) & 0xFF, (color >> 8) & 0xFF, color & 0xFF)

    flags = span["flags"]
    font.bold = bool(flags & _SPAN_FLAG_BOLD)
    font.italic = bool(flags & _SPAN_FLAG_ITALIC)
    # Underline is deliberately left untouched: see the flag constants above.


def _apply_span_formatting(doc, pdf_doc):
    """Style every Word run whose text contains the text of a PDF span."""
    # Snapshot paragraph/run text once.  Only font properties are modified
    # below, so the text used for matching cannot change during the scan.
    paragraphs = [
        (paragraph.text, [(run, run.text) for run in paragraph.runs])
        for paragraph in doc.paragraphs
    ]

    for page in pdf_doc:
        for block in page.get_text("dict")["blocks"]:
            # Image blocks have no "lines" entry.
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue
                    for paragraph_text, runs in paragraphs:
                        if text not in paragraph_text:
                            continue
                        for run, run_text in runs:
                            if text in run_text:
                                _style_run(run, span)


def _append_images(doc, pdf_doc):
    """Append every image referenced by the PDF pages to the end of *doc*."""
    for page in pdf_doc:
        for img in page.get_images(full=True):
            xref = img[0]
            image_bytes = pdf_doc.extract_image(xref)["image"]
            image = Image.open(io.BytesIO(image_bytes))
            if image.mode == "CMYK":
                # PNG cannot store CMYK; without this one such image would
                # abort the whole enhancement step.
                image = image.convert("RGB")

            # Re-encode in memory instead of through a fixed /tmp file name,
            # which could collide between concurrent conversions.
            buffer = io.BytesIO()
            image.save(buffer, format="PNG")
            buffer.seek(0)

            # Same sizing rule as before: one pixel is treated as 1/72 inch.
            doc.add_picture(
                buffer,
                width=Inches(image.width / 72),
                height=Inches(image.height / 72),
            )


def convert_pdf_to_word(pdf_file, filename):
    """Convert a PDF to a Word document and return the path of the .docx file.

    The conversion is attempted with pdf2docx first and falls back to headless
    LibreOffice if that fails.  Afterwards a best-effort pass copies span
    formatting from the PDF onto matching runs and appends the PDF's images;
    if that pass fails, the unenhanced conversion is kept.

    Args:
        pdf_file: Path to the PDF (``str`` / ``os.PathLike``) or an object
            whose ``.name`` attribute is that path.
        filename: Name used to derive the output path: a trailing ``.pdf``
            (any case) is replaced by ``_converted.docx``.

    Returns:
        The path of the written .docx file.

    Raises:
        Exception: with a ``"Conversion failed: ..."`` message if the input
            does not exist or both conversion methods fail.  The original
            error is chained as ``__cause__``.
    """
    try:
        if isinstance(pdf_file, (str, os.PathLike)):
            pdf_path = os.fspath(pdf_file)
        else:
            pdf_path = pdf_file.name
        if not os.path.isfile(pdf_path):
            raise FileNotFoundError(f"input PDF not found: {pdf_path}")

        docx_path = _derive_docx_path(filename)

        # First try with pdf2docx (imported lazily: it is optional).
        try:
            from pdf2docx import Converter
            cv = Converter(pdf_path)
            try:
                cv.convert(docx_path, start=0, end=None, keep_layout=True)
            finally:
                cv.close()
        except Exception as e:
            print(f"Primary conversion method failed: {e}, trying fallback...")
            _convert_with_libreoffice(pdf_path, docx_path)

        # Enhance the output with PyMuPDF for better formatting.
        try:
            doc = Document(docx_path)
            pdf_doc = fitz.open(pdf_path)
            try:
                _apply_span_formatting(doc, pdf_doc)
                _append_images(doc, pdf_doc)
            finally:
                pdf_doc.close()
            doc.save(docx_path)
        except Exception as e:
            print(f"Formatting enhancement failed: {e}, using basic conversion")

        return docx_path

    except Exception as e:
        raise Exception(f"Conversion failed: {str(e)}") from e

def process_pdf(file):
    """Validate the uploaded file, convert it, and report how long it took.

    Args:
        file: The uploaded file; its ``.name`` attribute is used as the path.

    Returns:
        A ``(output_path, status_message)`` tuple: the path returned by
        ``convert_pdf_to_word`` and a message with the elapsed seconds.

    Raises:
        gr.Error: if nothing was uploaded, the name does not end in ``.pdf``
            (case-insensitive), or the conversion raised.
    """
    if not file:
        raise gr.Error("Please upload a PDF file first")

    if not file.name.lower().endswith('.pdf'):
        raise gr.Error("Please upload a PDF file")

    # perf_counter is monotonic, so the duration cannot be skewed by
    # wall-clock adjustments.
    started = time.perf_counter()
    try:
        output_path = convert_pdf_to_word(file, file.name)
    except Exception as e:
        message = str(e)
        # The converter already prefixes its errors; avoid showing
        # "Conversion failed: Conversion failed: ..." to the user.
        if not message.startswith("Conversion failed"):
            message = f"Conversion failed: {message}"
        raise gr.Error(message) from e
    conversion_time = time.perf_counter() - started

    return output_path, f"✅ Conversion completed in {conversion_time:.1f} seconds"

# Create Gradio interface
with gr.Blocks(title="PDF to Word Converter") as demo:
    gr.Markdown("# PDF to Word Converter")
    gr.Markdown("Upload a PDF file and convert it to an editable Word document while preserving formatting.")
    
    # One row, two columns: upload + button, then status + download.
    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            convert_btn = gr.Button("Convert to Word", variant="primary")
        with gr.Column():
            status = gr.Textbox(label="Status")
            file_output = gr.File(label="Download Word File")
    
    # process_pdf returns (output path, status message), matching the order
    # of the two output components.
    convert_btn.click(
        fn=process_pdf,
        inputs=file_input,
        outputs=[file_output, status]
    )

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()