# Pdf-To-Word / app.py
# Last commit by itsVilen: "Create app.py" (9f4fc64, verified)
import io
import os
import shutil
import subprocess
import tempfile
import time

import fitz # PyMuPDF
import gradio as gr
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from docx.shared import Pt, RGBColor, Inches
from PIL import Image
# Install required system dependencies
def install_dependencies():
    """Best-effort install of the system packages used by the converter.

    Runs ``apt-get update`` and then ``apt-get install -y poppler-utils
    libreoffice`` with their output suppressed.

    The install is skipped when ``apt-get`` is not on ``PATH``, and failures
    are reported on stdout rather than raised, so a host without apt (or
    without permission to use it) can still import and start the app.

    Returns:
        None.
    """
    apt_get = shutil.which("apt-get")
    if apt_get is None:
        print("apt-get not found, skipping system dependency installation")
        return

    commands = (
        [apt_get, "update"],
        [apt_get, "install", "-y", "poppler-utils", "libreoffice"],
    )
    for command in commands:
        try:
            result = subprocess.run(
                command,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except OSError as e:
            # The binary vanished or is not executable; nothing more to try.
            print(f"Could not run {' '.join(command)}: {e}")
            return
        if result.returncode != 0:
            # Keep going: a failed "update" does not always prevent "install"
            # from succeeding against an existing package index.
            print(f"{' '.join(command)} exited with status {result.returncode}")
# Executed at import time: the system packages are requested before the
# conversion functions below are defined and before the UI is built.
install_dependencies()
# Bits of the ``flags`` field of a PyMuPDF text span. Bit 2 means "serifed";
# there is no underline bit, so underline is never derived from these flags.
_SPAN_FLAG_ITALIC = 1 << 1
_SPAN_FLAG_BOLD = 1 << 4

# Pillow image modes that the PNG writer accepts without conversion.
_PNG_SAFE_MODES = frozenset({"1", "L", "LA", "I", "P", "RGB", "RGBA"})


def _docx_output_path(filename):
    """Return *filename* with its extension replaced by ``_converted.docx``."""
    stem, _extension = os.path.splitext(filename)
    return f"{stem}_converted.docx"


def _convert_with_pdf2docx(pdf_path, docx_path):
    """Convert with pdf2docx, always releasing the converter."""
    from pdf2docx import Converter

    cv = Converter(pdf_path)
    try:
        cv.convert(docx_path, start=0, end=None, keep_layout=True)
    finally:
        cv.close()


def _convert_with_libreoffice(pdf_path, docx_path):
    """Convert with headless LibreOffice and move the result to *docx_path*."""
    # A private output directory keeps concurrent requests apart and ensures
    # a stale file from an earlier run is never mistaken for a fresh result.
    with tempfile.TemporaryDirectory() as out_dir:
        subprocess.run(
            [
                "libreoffice",
                "--headless",
                # Without the Writer import filter LibreOffice opens PDFs in
                # Draw, which has no docx export.
                "--infilter=writer_pdf_import",
                "--convert-to",
                "docx",
                "--outdir",
                out_dir,
                pdf_path,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        temp_docx = os.path.join(out_dir, stem + ".docx")
        if not os.path.exists(temp_docx):
            raise Exception("LibreOffice conversion failed")
        # shutil.move also works when the target is on another filesystem.
        shutil.move(temp_docx, docx_path)


def _style_run(run, span):
    """Copy font name, size, colour, bold and italic from a PDF span to a run."""
    run.font.name = span["font"]
    run.font.size = Pt(span["size"])
    if "color" in span:
        color = span["color"]  # packed 0xRRGGBB integer
        run.font.color.rgb = RGBColor(
            (color >> 16) & 0xFF, (color >> 8) & 0xFF, color & 0xFF
        )
    flags = span["flags"]
    run.font.bold = bool(flags & _SPAN_FLAG_BOLD)
    run.font.italic = bool(flags & _SPAN_FLAG_ITALIC)


def _apply_span_styles(doc, pdf_doc):
    """Restyle Word runs whose text contains the text of a PDF span."""
    # NOTE(review): substring matching means a short span can restyle
    # unrelated runs and the last matching span wins; kept as in the original
    # design, consider matching by position instead.
    paragraphs = doc.paragraphs  # hoisted: the list is loop-invariant
    for page in pdf_doc:
        for block in page.get_text("dict")["blocks"]:
            for line in block.get("lines", ()):  # image blocks have no lines
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue
                    for paragraph in paragraphs:
                        if text not in paragraph.text:
                            continue
                        for run in paragraph.runs:
                            if text in run.text:
                                _style_run(run, span)


def _append_images(doc, pdf_doc):
    """Append every image referenced by the PDF pages to the end of *doc*."""
    # NOTE(review): images are appended after the converted content, so they
    # may duplicate pictures the primary converter already placed - confirm
    # this is wanted.
    for page in pdf_doc:
        for img in page.get_images(full=True):
            base_image = pdf_doc.extract_image(img[0])
            if not base_image:
                continue
            with Image.open(io.BytesIO(base_image["image"])) as source:
                width, height = source.size
                # PNG cannot store e.g. CMYK; convert so one such image does
                # not abort the whole enhancement pass.
                image = source if source.mode in _PNG_SAFE_MODES else source.convert("RGB")
                png = io.BytesIO()
                image.save(png, format="PNG")
            png.seek(0)
            # In-memory stream instead of a shared /tmp file name.
            doc.add_picture(png, width=Inches(width / 72), height=Inches(height / 72))


def _enhance_formatting(pdf_path, docx_path):
    """Re-open the converted document, apply span styles and images, save it."""
    doc = Document(docx_path)
    pdf_doc = fitz.open(pdf_path)
    try:
        _apply_span_styles(doc, pdf_doc)
        _append_images(doc, pdf_doc)
    finally:
        pdf_doc.close()
    doc.save(docx_path)


def convert_pdf_to_word(pdf_file, filename):
    """Convert a PDF to a Word document and return the path of the .docx.

    The conversion is attempted with pdf2docx first and falls back to
    headless LibreOffice if that raises. Afterwards a best-effort pass copies
    font properties from the PDF text spans onto matching Word runs and
    appends the PDF's images; if that pass fails the plain conversion result
    is kept.

    Args:
        pdf_file: Uploaded file object exposing the PDF path as ``.name``;
            a plain path string is accepted as well.
        filename: Name used to derive the output path; its extension is
            replaced by ``_converted.docx`` regardless of letter case.

    Returns:
        Path of the written ``.docx`` file.

    Raises:
        Exception: With a ``"Conversion failed: ..."`` message when both
            conversion methods fail; the original error is chained.
    """
    try:
        pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
        docx_path = _docx_output_path(filename)

        # First try with pdf2docx, then fall back to LibreOffice.
        try:
            _convert_with_pdf2docx(pdf_path, docx_path)
        except Exception as e:
            print(f"Primary conversion method failed: {e}, trying fallback...")
            _convert_with_libreoffice(pdf_path, docx_path)

        # Enhancement is optional: on failure the file on disk is untouched.
        try:
            _enhance_formatting(pdf_path, docx_path)
        except Exception as e:
            print(f"Formatting enhancement failed: {e}, using basic conversion")

        return docx_path
    except Exception as e:
        raise Exception(f"Conversion failed: {str(e)}") from e
def process_pdf(file):
    """Gradio click handler: validate the upload, convert it, report timing.

    Args:
        file: The value of the upload component. Handled as an object with a
            ``.name`` path; a plain path string is tolerated too
            (NOTE(review): newer Gradio releases appear to pass a string by
            default - confirm against the installed version).

    Returns:
        Tuple ``(output_path, status_message)`` for the download component
        and the status textbox.

    Raises:
        gr.Error: If nothing was uploaded, the name does not end in ``.pdf``,
            or the conversion fails.
    """
    if not file:
        raise gr.Error("Please upload a PDF file first")
    upload_name = file if isinstance(file, str) else file.name
    if not upload_name.lower().endswith('.pdf'):
        raise gr.Error("Please upload a PDF file")

    # perf_counter is monotonic, so the elapsed time cannot go negative or
    # jump if the wall clock is adjusted mid-conversion.
    started = time.perf_counter()
    try:
        output_path = convert_pdf_to_word(file, upload_name)
    except Exception as e:
        message = str(e)
        # convert_pdf_to_word already prefixes its errors; avoid showing
        # "Conversion failed: Conversion failed: ..." to the user.
        if not message.startswith("Conversion failed"):
            message = f"Conversion failed: {message}"
        raise gr.Error(message) from e
    conversion_time = time.perf_counter() - started
    return output_path, f"✅ Conversion completed in {conversion_time:.1f} seconds"
# Create Gradio interface
# NOTE(review): the nesting below is reconstructed from statement order (the
# source carried no indentation) - confirm the intended layout.
demo = gr.Blocks(title="PDF to Word Converter")

with demo:
    gr.Markdown("# PDF to Word Converter")
    gr.Markdown("Upload a PDF file and convert it to an editable Word document while preserving formatting.")

    with gr.Row():
        # First column: the upload widget and the button that starts the conversion.
        with gr.Column():
            file_input = gr.File(label="Upload PDF", file_types=[".pdf"])
            convert_btn = gr.Button("Convert to Word", variant="primary")

        # Second column: status text and the converted file for download.
        with gr.Column():
            status = gr.Textbox(label="Status")
            file_output = gr.File(label="Download Word File")

    # process_pdf returns (path, message), matching the order of `outputs`.
    convert_btn.click(fn=process_pdf, inputs=file_input, outputs=[file_output, status])

if __name__ == "__main__":
    demo.launch()