# Scraped page header (not part of the program): Spaces status "Running"; file size 6,116 bytes; revision 9f4fc64.
import os
import time
import fitz # PyMuPDF
from docx import Document
from docx.shared import Pt, RGBColor, Inches
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
from PIL import Image
import io
import gradio as gr
import subprocess
# Install required system dependencies
def install_dependencies():
    """Best-effort installation of the required system packages.

    Runs ``apt-get update`` followed by
    ``apt-get install -y poppler-utils libreoffice`` with their output
    discarded. Problems are reported on stdout instead of being raised, so
    the module can still be imported on hosts that have no ``apt-get`` or
    that do not allow package installation.
    """
    commands = (
        ["apt-get", "update"],
        ["apt-get", "install", "-y", "poppler-utils", "libreoffice"],
    )
    for command in commands:
        try:
            result = subprocess.run(
                command,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except OSError as e:
            # apt-get is missing or not executable; every remaining command
            # would fail the same way, so stop here.
            print(f"Skipping system dependency installation: {e}")
            return
        if result.returncode != 0:
            # A failed step does not prevent the next one from being tried.
            print(
                f"Command {' '.join(command)!r} exited with status "
                f"{result.returncode}"
            )
# Module-level call: the system packages are requested as soon as this file is imported.
install_dependencies()
def _converted_docx_path(filename):
    """Return the output path for *filename*: ``<name>_converted.docx``."""
    root, ext = os.path.splitext(filename)
    if ext.lower() != ".pdf":
        # No PDF suffix to strip; append instead so the output path can
        # never be the same as the input path.
        root = filename
    return f"{root}_converted.docx"


def _convert_with_pdf2docx(pdf_path, docx_path):
    """Convert *pdf_path* to *docx_path* with pdf2docx (primary method)."""
    # Imported lazily so a missing pdf2docx package triggers the fallback
    # instead of breaking the whole module.
    from pdf2docx import Converter

    cv = Converter(pdf_path)
    try:
        cv.convert(docx_path, start=0, end=None, keep_layout=True)
    finally:
        # Release the converter even when the conversion raises.
        cv.close()


def _convert_with_libreoffice(pdf_path, docx_path):
    """Convert *pdf_path* to *docx_path* with headless LibreOffice (fallback)."""
    import shutil
    import tempfile

    # A private output directory keeps concurrent conversions of files with
    # the same base name from overwriting each other's result.
    with tempfile.TemporaryDirectory() as out_dir:
        subprocess.run(
            ["libreoffice", "--headless", "--convert-to", "docx",
             "--outdir", out_dir, pdf_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        produced = os.path.join(out_dir, stem + ".docx")
        if not os.path.exists(produced):
            raise Exception("LibreOffice conversion failed")
        # shutil.move also works when the temp dir and the destination are
        # on different filesystems, where os.rename would fail.
        shutil.move(produced, docx_path)


def _style_run(run, span):
    """Copy font name, size, colour, bold and italic from a PDF span to a run."""
    run.font.name = span["font"]
    run.font.size = Pt(span["size"])
    if "color" in span:
        # The colour is a packed 0xRRGGBB integer.
        color = span["color"]
        run.font.color.rgb = RGBColor(
            (color >> 16) & 0xFF, (color >> 8) & 0xFF, color & 0xFF
        )
    # PyMuPDF font flags: bit 1 = italic, bit 4 = bold. Bit 2 means
    # "serifed", not underlined, and spans carry no underline information,
    # so the run's underline setting is left untouched.
    flags = span["flags"]
    run.font.bold = bool(flags & 2 ** 4)
    run.font.italic = bool(flags & 2 ** 1)


def _apply_span_styles(doc, pdf_doc):
    """Style every Word run whose text contains the text of a PDF span."""
    # Snapshot paragraph and run text once; styling does not change text,
    # so the snapshot stays valid for the whole pass.
    paragraphs = [
        (paragraph.text, [(run, run.text) for run in paragraph.runs])
        for paragraph in doc.paragraphs
    ]
    for page in pdf_doc:
        for block in page.get_text("dict")["blocks"]:
            # Blocks without a "lines" key contribute no spans.
            for line in block.get("lines", ()):
                for span in line["spans"]:
                    text = span["text"].strip()
                    if not text:
                        continue
                    for paragraph_text, runs in paragraphs:
                        if text not in paragraph_text:
                            continue
                        for run, run_text in runs:
                            if text in run_text:
                                _style_run(run, span)


def _append_page_images(doc, pdf_doc):
    """Append every image referenced by the PDF pages to the end of *doc*."""
    for page in pdf_doc:
        for img in page.get_images(full=True):
            xref = img[0]
            image_bytes = pdf_doc.extract_image(xref)["image"]
            image = Image.open(io.BytesIO(image_bytes))
            if image.mode == "CMYK":
                # PNG cannot store CMYK data; convert before saving.
                image = image.convert("RGB")
            # Re-encode in memory: no fixed-name temp files to collide
            # between requests or to leak when add_picture fails.
            png_stream = io.BytesIO()
            image.save(png_stream, format="PNG")
            png_stream.seek(0)
            # One pixel is treated as one point (1/72 inch).
            doc.add_picture(
                png_stream,
                width=Inches(image.width / 72),
                height=Inches(image.height / 72),
            )


def _enhance_formatting(pdf_path, docx_path):
    """Re-open the converted document, restyle runs, append images and save."""
    doc = Document(docx_path)
    pdf_doc = fitz.open(pdf_path)
    try:
        _apply_span_styles(doc, pdf_doc)
        _append_page_images(doc, pdf_doc)
    finally:
        # Close the PDF even when the enhancement pass fails.
        pdf_doc.close()
    doc.save(docx_path)


def convert_pdf_to_word(pdf_file, filename):
    """Convert a PDF to a Word document and return the path of the .docx.

    The conversion is attempted with pdf2docx first and falls back to a
    headless LibreOffice run if that fails. Afterwards a best-effort pass
    copies font properties from the PDF spans onto matching Word runs and
    appends the PDF's images; if that pass fails, the plain conversion
    result is kept.

    Args:
        pdf_file: Object whose ``name`` attribute is the path of the PDF.
        filename: Path used to derive the output path; a trailing ``.pdf``
            (any letter case) is replaced by ``_converted.docx``.

    Returns:
        Path of the generated Word document.

    Raises:
        Exception: ``"Conversion failed: ..."`` when both conversion methods
            fail; the underlying error is attached as ``__cause__``.
    """
    try:
        docx_path = _converted_docx_path(filename)
        try:
            _convert_with_pdf2docx(pdf_file.name, docx_path)
        except Exception as e:
            print(f"Primary conversion method failed: {e}, trying fallback...")
            _convert_with_libreoffice(pdf_file.name, docx_path)
        try:
            _enhance_formatting(pdf_file.name, docx_path)
        except Exception as e:
            # Deliberately best-effort: the basic conversion is still usable.
            print(f"Formatting enhancement failed: {e}, using basic conversion")
        return docx_path
    except Exception as e:
        raise Exception(f"Conversion failed: {str(e)}") from e
def process_pdf(file):
    """Validate the uploaded file, convert it, and report the elapsed time.

    Args:
        file: Uploaded file object; its ``name`` attribute is used as the
            path of the PDF.

    Returns:
        A ``(docx_path, status_message)`` tuple.

    Raises:
        gr.Error: If nothing was uploaded, the name does not end in
            ``.pdf`` (case-insensitive), or the conversion raises.
    """
    # Guard clauses: reject missing uploads and non-PDF names up front.
    if not file:
        raise gr.Error("Please upload a PDF file first")
    has_pdf_suffix = file.name.lower().endswith('.pdf')
    if not has_pdf_suffix:
        raise gr.Error("Please upload a PDF file")

    try:
        started_at = time.time()
        docx_file = convert_pdf_to_word(file, file.name)
        elapsed = time.time() - started_at
        message = f"✅ Conversion completed in {elapsed:.1f} seconds"
        return docx_file, message
    except Exception as err:
        # Surface any failure to the UI as a Gradio error.
        raise gr.Error(f"Conversion failed: {str(err)}")
# Build the Gradio UI: upload and button on the left, status and download on the right.
with gr.Blocks(title="PDF to Word Converter") as demo:
    gr.Markdown("# PDF to Word Converter")
    gr.Markdown("Upload a PDF file and convert it to an editable Word document while preserving formatting.")

    with gr.Row():
        # Input column.
        with gr.Column():
            file_input = gr.File(file_types=[".pdf"], label="Upload PDF")
            convert_btn = gr.Button(value="Convert to Word", variant="primary")
        # Output column.
        with gr.Column():
            status = gr.Textbox(label="Status")
            file_output = gr.File(label="Download Word File")

    # process_pdf returns (docx path, status text), matching the outputs order.
    convert_btn.click(process_pdf, inputs=file_input, outputs=[file_output, status])

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()