"""Gradio app that converts an uploaded PDF into an editable Word document."""

import io
import os
import shutil
import subprocess
import time

import fitz  # PyMuPDF
import gradio as gr
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Inches, Pt, RGBColor
from PIL import Image

# Bits of the PyMuPDF span "flags" field that this script reads.
# Bit 2 means "serifed" (not underline), so no underline is derived from it.
_FLAG_ITALIC = 1 << 1
_FLAG_BOLD = 1 << 4


# Install required system dependencies
def install_dependencies():
    """Install the system packages used by the converter (best effort).

    Runs ``apt-get update`` followed by ``apt-get install`` for
    ``poppler-utils`` and ``libreoffice`` with all output discarded. Exit
    codes are ignored; if ``apt-get`` cannot be started at all the problem is
    printed and the remaining commands are skipped instead of crashing the
    import of this module.
    """
    commands = (
        ["apt-get", "update"],
        ["apt-get", "install", "-y", "poppler-utils", "libreoffice"],
    )
    for command in commands:
        try:
            subprocess.run(
                command,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                check=False,
            )
        except OSError as exc:
            print(f"Could not run {command[0]}: {exc}, skipping dependency install")
            return


install_dependencies()


def _source_path(file):
    """Return the filesystem path of an upload.

    Accepts either a plain path (``str`` / ``os.PathLike``) or an object that
    exposes the path through a ``name`` attribute.
    """
    if isinstance(file, (str, os.PathLike)):
        return os.fspath(file)
    return file.name


def _convert_with_pdf2docx(pdf_path, docx_path):
    """Convert ``pdf_path`` to ``docx_path`` using pdf2docx."""
    # Imported lazily so that a missing pdf2docx triggers the fallback path.
    from pdf2docx import Converter

    cv = Converter(pdf_path)
    try:
        cv.convert(docx_path, start=0, end=None, keep_layout=True)
    finally:
        # Release the converter even when the conversion itself fails.
        cv.close()


def _convert_with_libreoffice(pdf_path, docx_path):
    """Convert ``pdf_path`` with headless LibreOffice and move the result.

    Raises:
        Exception: if LibreOffice did not produce the expected file in /tmp.
    """
    subprocess.run(
        ["libreoffice", "--headless", "--convert-to", "docx",
         "--outdir", "/tmp", pdf_path],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    # Derive the output name from the stem so ".PDF" uploads are found too.
    stem = os.path.splitext(os.path.basename(pdf_path))[0]
    temp_docx = os.path.join("/tmp", stem + ".docx")
    if not os.path.exists(temp_docx):
        raise Exception("LibreOffice conversion failed")
    # shutil.move also works when /tmp and the target are on different
    # filesystems, where os.rename would fail.
    shutil.move(temp_docx, docx_path)


def _apply_span_style(paragraphs, span):
    """Copy font name, size, colour, bold and italic from a PDF span.

    ``paragraphs`` is a list of ``(paragraph, paragraph_text)`` pairs. Every
    run whose text contains the stripped span text receives the span's style.
    """
    text = span["text"].strip()
    if not text:
        return

    flags = span["flags"]
    color = span.get("color")
    for paragraph, paragraph_text in paragraphs:
        if text not in paragraph_text:
            continue
        for run in paragraph.runs:
            if text not in run.text:
                continue
            # Set font properties
            run.font.name = span["font"]
            run.font.size = Pt(span["size"])
            # Set color: the span colour is a packed 0xRRGGBB integer.
            if color is not None:
                red = (color >> 16) & 0xFF
                green = (color >> 8) & 0xFF
                blue = color & 0xFF
                run.font.color.rgb = RGBColor(red, green, blue)
            # Set styles
            run.font.bold = bool(flags & _FLAG_BOLD)
            run.font.italic = bool(flags & _FLAG_ITALIC)


def _append_image(doc, pdf_doc, xref):
    """Extract image ``xref`` from the PDF and append it to ``doc`` as PNG."""
    base_image = pdf_doc.extract_image(xref)
    image = Image.open(io.BytesIO(base_image["image"]))

    # Re-encode in memory: no temp file to clean up and no name collisions
    # between conversions running at the same time.
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    buffer.seek(0)

    # Sized as pixel dimensions divided by 72, as inches.
    doc.add_picture(
        buffer,
        width=Inches(image.width / 72),
        height=Inches(image.height / 72),
    )


def _enhance_formatting(pdf_path, docx_path):
    """Re-apply span styles and append the PDF's images to the saved docx."""
    doc = Document(docx_path)
    pdf_doc = fitz.open(pdf_path)
    try:
        # Paragraph text is not changed by font edits, so read it only once.
        paragraphs = [(paragraph, paragraph.text) for paragraph in doc.paragraphs]

        # Process each page for precise formatting
        for page in pdf_doc:
            for block in page.get_text("dict")["blocks"]:
                # Blocks without a "lines" entry carry no spans to match.
                for line in block.get("lines", ()):
                    for span in line["spans"]:
                        _apply_span_style(paragraphs, span)

        # Append every image of every page to the end of the document.
        for page in pdf_doc:
            for img in page.get_images(full=True):
                _append_image(doc, pdf_doc, img[0])
    finally:
        pdf_doc.close()

    doc.save(docx_path)


def convert_pdf_to_word(pdf_file, filename):
    """Convert a PDF to a Word document and return the output path.

    pdf2docx is tried first; if it fails for any reason, headless LibreOffice
    is used instead. Afterwards a best-effort pass copies font styles from the
    PDF spans and appends the PDF's images; a failure of that pass is printed
    and the basic conversion is kept.

    Args:
        pdf_file: The upload, either a path or an object with a ``name``
            attribute holding the path.
        filename: Name used to derive the output path
            (``<stem>_converted.docx``).

    Returns:
        The path of the written ``.docx`` file.

    Raises:
        Exception: if both conversion methods fail; the cause is chained.
    """
    try:
        pdf_path = _source_path(pdf_file)

        # Prepare output path. splitext handles any extension casing, so the
        # output can never be the same path as the input.
        docx_path = os.path.splitext(filename)[0] + "_converted.docx"

        # First try with pdf2docx
        try:
            _convert_with_pdf2docx(pdf_path, docx_path)
        except Exception as e:
            print(f"Primary conversion method failed: {e}, trying fallback...")
            # Fallback to libreoffice
            _convert_with_libreoffice(pdf_path, docx_path)

        # Enhance the output with PyMuPDF for better formatting
        try:
            _enhance_formatting(pdf_path, docx_path)
        except Exception as e:
            print(f"Formatting enhancement failed: {e}, using basic conversion")

        return docx_path
    except Exception as e:
        raise Exception(f"Conversion failed: {str(e)}") from e


def process_pdf(file):
    """Gradio callback: validate the upload, convert it and report timing.

    Args:
        file: The value supplied by the ``gr.File`` input.

    Returns:
        A ``(output_path, status_message)`` tuple.

    Raises:
        gr.Error: if no file was given, the file name does not end in
            ``.pdf`` (case-insensitive), or the conversion fails.
    """
    if not file:
        raise gr.Error("Please upload a PDF file first")

    source_path = _source_path(file)
    if not source_path.lower().endswith(".pdf"):
        raise gr.Error("Please upload a PDF file")

    try:
        start_time = time.perf_counter()
        output_path = convert_pdf_to_word(file, source_path)
        conversion_time = time.perf_counter() - start_time
        return output_path, f"✅ Conversion completed in {conversion_time:.1f} seconds"
    except Exception as e:
        raise gr.Error(f"Conversion failed: {str(e)}") from e


# Create Gradio interface
with gr.Blocks(title="PDF to Word Converter") as demo:
    gr.Markdown("# PDF to Word Converter")
    gr.Markdown("Upload a PDF file and convert it to an editable Word document while preserving formatting.")

    with gr.Row():
        with gr.Column():
            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            convert_btn = gr.Button("Convert to Word", variant="primary")
        with gr.Column():
            status = gr.Textbox(label="Status")
            file_output = gr.File(label="Download Word File")

    convert_btn.click(
        fn=process_pdf,
        inputs=file_input,
        outputs=[file_output, status],
    )

if __name__ == "__main__":
    demo.launch()