itsVilen committed on
Commit
9f4fc64
·
verified ·
1 Parent(s): 468810d

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -0
app.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import fitz # PyMuPDF
4
+ from docx import Document
5
+ from docx.shared import Pt, RGBColor, Inches
6
+ from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
7
+ from PIL import Image
8
+ import io
9
+ import gradio as gr
10
+ import subprocess
11
+
12
# Install required system dependencies
def install_dependencies():
    """Best-effort install of the system packages used by the converter.

    Runs ``apt-get update`` and then
    ``apt-get install -y poppler-utils libreoffice`` with all command output
    discarded. A failure never raises: if ``apt-get`` cannot be started
    (missing binary, not executable) or exits with a non-zero status, a
    one-line message is printed and the application keeps starting, so the
    pdf2docx conversion path remains usable even without LibreOffice.

    Returns:
        None
    """
    commands = (
        ["apt-get", "update"],
        ["apt-get", "install", "-y", "poppler-utils", "libreoffice"],
    )
    for command in commands:
        printable = " ".join(command)
        try:
            completed = subprocess.run(
                command,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except OSError as e:
            # apt-get is not available on this system; the second command
            # would fail the same way, so stop here instead of crashing the
            # import of this module.
            print(f"Could not run '{printable}': {e}")
            return
        if completed.returncode != 0:
            # Output is discarded, so surface at least the exit status.
            print(f"'{printable}' exited with status {completed.returncode}")


install_dependencies()
18
+
19
def convert_pdf_to_word(pdf_file, filename):
    """Convert a PDF to a Word document and return the path of the .docx.

    The conversion is attempted with pdf2docx first. If that raises for any
    reason, LibreOffice is run headless as a fallback. Afterwards a
    best-effort pass copies font name, size, colour, bold and italic from the
    PDF spans (read with PyMuPDF) onto matching runs of the Word document and
    appends the PDF's embedded images. If that pass fails, the failure is
    printed and the un-enhanced document is kept.

    Args:
        pdf_file: The uploaded file. Either an object whose ``name``
            attribute is the path of the PDF on disk, or that path itself.
        filename: Name the output path is derived from: its extension is
            replaced by ``_converted.docx``.

    Returns:
        The path of the generated .docx file.

    Raises:
        Exception: ``"Conversion failed: ..."`` when both conversion methods
            fail; the original error is chained as ``__cause__``.
    """
    try:
        # Accept both a file-like upload object and a plain path string.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        docx_path = _docx_output_path(filename)

        # First try with pdf2docx, then fall back to LibreOffice.
        try:
            _convert_with_pdf2docx(pdf_path, docx_path)
        except Exception as e:
            print(f"Primary conversion method failed: {e}, trying fallback...")
            _convert_with_libreoffice(pdf_path, docx_path)

        # Enhance the output with PyMuPDF; purely optional.
        try:
            _enhance_formatting(pdf_path, docx_path)
        except Exception as e:
            print(f"Formatting enhancement failed: {e}, using basic conversion")

        return docx_path

    except Exception as e:
        raise Exception(f"Conversion failed: {str(e)}") from e


def _docx_output_path(filename):
    """Return *filename* with its extension replaced by ``_converted.docx``."""
    # splitext only touches the real extension and is case-agnostic, so an
    # upload named "X.PDF" can no longer map onto (and overwrite) itself.
    stem, _extension = os.path.splitext(filename)
    return f"{stem}_converted.docx"


def _convert_with_pdf2docx(pdf_path, docx_path):
    """Convert *pdf_path* to *docx_path* with pdf2docx, always closing it."""
    from pdf2docx import Converter

    cv = Converter(pdf_path)
    try:
        cv.convert(docx_path, start=0, end=None, keep_layout=True)
    finally:
        cv.close()


def _convert_with_libreoffice(pdf_path, docx_path):
    """Convert *pdf_path* to *docx_path* by running LibreOffice headless."""
    import shutil
    import tempfile

    # A private output directory keeps a stale or concurrent
    # "<name>.docx" in a shared /tmp from being mistaken for this result.
    with tempfile.TemporaryDirectory() as out_dir:
        # NOTE(review): some LibreOffice builds may need
        # --infilter="writer_pdf_import" to open a PDF in Writer - confirm.
        subprocess.run(
            ["libreoffice", "--headless", "--convert-to", "docx",
             "--outdir", out_dir, pdf_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        temp_docx = os.path.join(out_dir, stem + ".docx")
        if not os.path.exists(temp_docx):
            raise Exception("LibreOffice conversion failed")
        # shutil.move also works when the destination is on another device.
        shutil.move(temp_docx, docx_path)


def _enhance_formatting(pdf_path, docx_path):
    """Copy span styling and embedded images from the PDF into the .docx."""
    doc = Document(docx_path)
    pdf_doc = fitz.open(pdf_path)
    try:
        # Paragraph text is not modified by the styling pass, so it is
        # computed once instead of once per PDF span.
        paragraphs = [(paragraph, paragraph.text) for paragraph in doc.paragraphs]
        for page in pdf_doc:
            _apply_span_styles(paragraphs, page)
        for page in pdf_doc:
            _append_page_images(doc, pdf_doc, page)
    finally:
        pdf_doc.close()
    doc.save(docx_path)


def _apply_span_styles(paragraphs, page):
    """Style every run whose text contains the text of a span on *page*."""
    for block in page.get_text("dict")["blocks"]:
        # Blocks without a "lines" key hold no text and are skipped.
        for line in block.get("lines", ()):
            for span in line["spans"]:
                span_text = span["text"].strip()
                if not span_text:
                    continue
                for paragraph, paragraph_text in paragraphs:
                    if span_text not in paragraph_text:
                        continue
                    for run in paragraph.runs:
                        if span_text in run.text:
                            _style_run(run, span)


def _style_run(run, span):
    """Apply font name, size, colour, bold and italic of *span* to *run*."""
    run.font.name = span["font"]
    run.font.size = Pt(span["size"])

    if "color" in span:
        # The colour is a packed 0xRRGGBB integer.
        color = span["color"]
        red = (color >> 16) & 0xff
        green = (color >> 8) & 0xff
        blue = color & 0xff
        run.font.color.rgb = RGBColor(red, green, blue)

    flags = span["flags"]
    run.font.bold = bool(flags & 2 ** 4)    # bit 4: bold
    run.font.italic = bool(flags & 2 ** 1)  # bit 1: italic
    # Bit 2 of the PyMuPDF span flags means "serifed", not underlined, so the
    # underline setting produced by the converter is left untouched.


def _append_page_images(doc, pdf_doc, page):
    """Append every image referenced by *page* to the end of *doc*."""
    for img in page.get_images(full=True):
        xref = img[0]
        image_bytes = pdf_doc.extract_image(xref)["image"]
        image = Image.open(io.BytesIO(image_bytes))

        # PNG cannot store e.g. CMYK; convert instead of aborting the pass.
        if image.mode not in ("1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA"):
            image = image.convert("RGB")

        # Re-encode in memory: no fixed /tmp file name shared between
        # concurrent conversions and nothing left behind on failure.
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        buffer.seek(0)

        # One image pixel is mapped to one point (1/72 inch).
        doc.add_picture(
            buffer,
            width=Inches(image.width / 72),
            height=Inches(image.height / 72),
        )
109
+
110
def process_pdf(file):
    """Gradio click handler: validate the upload and convert it to Word.

    Args:
        file: The value of the upload component; must be truthy and expose a
            ``name`` attribute ending in ``.pdf`` (case-insensitive).

    Returns:
        A ``(docx_path, status_message)`` tuple, where the message reports
        the wall-clock duration of the conversion.

    Raises:
        gr.Error: if nothing was uploaded, the file is not a PDF, or the
            conversion itself raised.
    """
    # Guard clauses: reject a missing or non-PDF upload before doing any work.
    if not file:
        raise gr.Error("Please upload a PDF file first")

    looks_like_pdf = file.name.lower().endswith('.pdf')
    if not looks_like_pdf:
        raise gr.Error("Please upload a PDF file")

    try:
        started_at = time.time()
        docx_file = convert_pdf_to_word(file, file.name)
        conversion_time = time.time() - started_at

        status_message = f"✅ Conversion completed in {conversion_time:.1f} seconds"
        return docx_file, status_message
    except Exception as e:
        # Surface any failure to the user through the Gradio error popup.
        raise gr.Error(f"Conversion failed: {str(e)}")
125
+
126
# Build the Gradio UI: upload and button on the left, status and download
# on the right, with the button wired to process_pdf.
with gr.Blocks(title="PDF to Word Converter") as demo:
    gr.Markdown("# PDF to Word Converter")
    gr.Markdown("Upload a PDF file and convert it to an editable Word document while preserving formatting.")

    with gr.Row():
        # Input side.
        with gr.Column():
            file_input = gr.File(file_types=[".pdf"], label="Upload PDF")
            convert_btn = gr.Button(value="Convert to Word", variant="primary")
        # Output side.
        with gr.Column():
            status = gr.Textbox(label="Status")
            file_output = gr.File(label="Download Word File")

    # process_pdf returns (docx path, status text), in that order.
    convert_btn.click(fn=process_pdf, inputs=file_input, outputs=[file_output, status])

# Start the web server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()