Files changed (3) hide show
  1. app.py +297 -0
  2. pre-requirements.txt +1 -0
  3. requirements.txt +32 -0
app.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+ import json
3
+ import math
4
+ import os
5
+ import traceback
6
+ from io import BytesIO
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+ import re
9
+ import time
10
+ from threading import Thread
11
+ from io import BytesIO
12
+ import uuid
13
+ import tempfile
14
+
15
+ import gradio as gr
16
+ import requests
17
+ import torch
18
+ from PIL import Image
19
+ import fitz
20
+ import numpy as np
21
+
22
+ from transformers import AutoModelForCausalLM, AutoTokenizer, Qwen2VLImageProcessor
23
+
24
+ from reportlab.lib.pagesizes import A4
25
+ from reportlab.lib.styles import getSampleStyleSheet
26
+ from reportlab.platypus import SimpleDocTemplate, Image as RLImage, Paragraph, Spacer
27
+ from reportlab.lib.units import inch
28
+
29
# --- Constants and Model Setup ---
MAX_INPUT_TOKEN_LENGTH = 4096
# Prefer the GPU when one is visible to torch; otherwise fall back to CPU.
device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")

# Startup diagnostics: log the torch / CUDA environment before loading weights.
print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.__version__ =", torch.__version__)
print("torch.version.cuda =", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
print("cuda device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    _gpu_index = torch.cuda.current_device()
    print("current device:", _gpu_index)
    print("device name:", torch.cuda.get_device_name(_gpu_index))

print("Using device:", device)


# --- Model Loading: tencent/POINTS-Reader ---
MODEL_PATH = 'tencent/POINTS-Reader'

print(f"Loading model: {MODEL_PATH}")
# Weights are loaded in float16 with device_map='auto'; the checkpoint ships
# custom modelling code, hence trust_remote_code=True.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map='auto',
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
image_processor = Qwen2VLImageProcessor.from_pretrained(MODEL_PATH)
print("Model loaded successfully.")
58
+
59
+
60
+ # --- PDF Generation and Preview Utility Function ---
61
def generate_and_preview_pdf(image: Image.Image, text_content: str, font_size: int, line_spacing: float, alignment: str, image_size: str):
    """
    Generate a PDF (image followed by the text) and PNG previews of its pages.

    Args:
        image: The uploaded picture placed at the top of the document.
        text_content: Text to lay out below the image. Markdown heading
            markers (``#``) and asterisks are stripped before layout.
        font_size: Point size for the body text; anything ``int()`` accepts
            (the UI dropdown supplies strings such as ``"12"``).
        line_spacing: Multiplier applied to ``font_size`` to get the leading.
        alignment: One of ``"Left"``, ``"Center"``, ``"Right"``, ``"Justified"``.
        image_size: One of ``"Small"``, ``"Medium"``, ``"Large"``.

    Returns:
        A ``(pdf_path, preview_paths)`` tuple: the path of the written PDF and
        a list of PNG paths, one per rendered page. The list is empty (or
        partial) if preview rendering failed; the PDF path is still returned.

    Raises:
        gr.Error: If ``image`` is ``None`` or ``text_content`` is blank.
        KeyError: If ``alignment`` or ``image_size`` is not a known option.
    """
    # Imported locally so this function does not depend on an extra
    # file-level import being present.
    from xml.sax.saxutils import escape

    if image is None or not text_content or not text_content.strip():
        raise gr.Error("Cannot generate PDF. Image or text content is missing.")

    # --- 1. Generate the PDF ---
    temp_dir = tempfile.gettempdir()
    pdf_filename = os.path.join(temp_dir, f"output_{uuid.uuid4()}.pdf")
    doc = SimpleDocTemplate(
        pdf_filename,
        pagesize=A4,
        rightMargin=inch, leftMargin=inch,
        topMargin=inch, bottomMargin=inch
    )
    styles = getSampleStyleSheet()
    style_normal = styles["Normal"]
    point_size = int(font_size)
    style_normal.fontSize = point_size
    style_normal.leading = point_size * float(line_spacing)
    style_normal.alignment = {"Left": 0, "Center": 1, "Right": 2, "Justified": 4}[alignment]

    story = []

    img_buffer = BytesIO()
    image.save(img_buffer, format='PNG')
    img_buffer.seek(0)

    page_width, page_height = A4
    available_width = page_width - 2 * inch
    image_widths = {
        "Small": available_width * 0.3,
        "Medium": available_width * 0.6,
        "Large": available_width * 0.9,
    }
    img_width = image_widths[image_size]
    img_height = image.height * (img_width / image.width)

    # A tall image scaled only by width can end up taller than the page body,
    # which the layout engine cannot place. Cap the height and shrink the
    # width by the same factor so the aspect ratio is kept. The half-inch of
    # slack is a conservative allowance for frame padding.
    max_img_height = page_height - 2 * inch - 0.5 * inch
    if img_height > max_img_height:
        img_width *= max_img_height / img_height
        img_height = max_img_height

    story.append(RLImage(img_buffer, width=img_width, height=img_height))
    story.append(Spacer(1, 12))

    cleaned_text = re.sub(r'#+\s*', '', text_content).replace("*", "")

    for para in cleaned_text.split('\n'):
        if para.strip():
            # Paragraph interprets its text as XML-style markup, so literal
            # '<', '>' and '&' coming from OCR output must be escaped or they
            # are treated as (possibly invalid) tags.
            story.append(Paragraph(escape(para), style_normal))

    doc.build(story)

    # --- 2. Render PDF pages as images for preview ---
    preview_images = []
    try:
        # The context manager closes the document even if rendering fails.
        with fitz.open(pdf_filename) as pdf_doc:
            for page_num, page in enumerate(pdf_doc):
                pix = page.get_pixmap(dpi=150)
                preview_img_path = os.path.join(temp_dir, f"preview_{uuid.uuid4()}_p{page_num}.png")
                pix.save(preview_img_path)
                preview_images.append(preview_img_path)
    except Exception as e:
        # Previews are best-effort: the PDF itself was already written, so
        # log the problem and still hand the file back to the caller.
        print(f"Error generating PDF preview: {e}")

    return pdf_filename, preview_images
126
+
127
+
128
+ # --- Core Application Logic ---
129
@spaces.GPU
def process_document_stream(
    image: Image.Image,
    prompt_input: str,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repetition_penalty: float
):
    """
    Run tencent/POINTS-Reader on one image and yield the result for the UI.

    This is a generator that yields exactly one ``(raw_text, markdown_text)``
    pair: the model response twice on success, or a message and an empty
    string when input is missing or inference fails.

    Args:
        image: The picture to read; ``None`` yields a prompt to upload one.
        prompt_input: Instruction sent to the model alongside the image.
        max_new_tokens: Generation length limit (coerced to ``int``).
        temperature: Sampling temperature; sampling is enabled only when > 0.
        top_p: Nucleus-sampling threshold.
        top_k: Top-k sampling cutoff (coerced to ``int``).
        repetition_penalty: Repetition penalty passed to generation.

    Yields:
        A 2-tuple of strings for the raw-output textbox and the markdown view.
    """
    if image is None:
        yield "Please upload an image.", ""
        return
    if not prompt_input or not prompt_input.strip():
        yield "Please enter a prompt.", ""
        return

    temp_image_path = None
    try:
        # The image is handed to the model as a file path rather than a PIL
        # object, so write it to a uniquely named temporary PNG first.
        temp_image_path = os.path.join(
            tempfile.gettempdir(), f"temp_image_{uuid.uuid4()}.png"
        )
        image.save(temp_image_path)

        # Single-turn conversation: the image followed by the text prompt.
        messages = [
            {
                'role': 'user',
                'content': [
                    dict(type='image', image=temp_image_path),
                    dict(type='text', text=prompt_input),
                ],
            }
        ]

        # Slider widgets may deliver whole numbers as floats; token counts
        # and top-k are integer quantities, so normalise them here.
        generation_config = {
            'max_new_tokens': int(max_new_tokens),
            'repetition_penalty': repetition_penalty,
            'temperature': temperature,
            'top_p': top_p,
            'top_k': int(top_k),
            'do_sample': temperature > 0,
        }

        # Run inference and emit the full response at once.
        response = model.chat(
            messages,
            tokenizer,
            image_processor,
            generation_config
        )
        yield response, response

    except Exception as e:
        # UI boundary: log the traceback and show the message to the user
        # instead of letting the request crash.
        traceback.print_exc()
        yield f"An error occurred during processing: {str(e)}", ""
    finally:
        # Remove the temporary image; it may be missing if saving failed.
        if temp_image_path is not None:
            try:
                os.remove(temp_image_path)
            except FileNotFoundError:
                pass
196
+
197
+
198
+ # --- Gradio UI Definition ---
199
def create_gradio_interface():
    """
    Build and return the Gradio Blocks app (not yet launched).

    Layout: a left column with the prompt, image upload, generation and PDF
    export settings plus the action buttons; a right column with three tabs
    (extracted text, rendered markdown, PDF preview). Three click handlers
    are wired: run the model, generate the PDF with previews, and clear all.

    Returns:
        The constructed ``gr.Blocks`` instance.
    """
    css = """
    .main-container { max-width: 1400px; margin: 0 auto; }
    .process-button { border: none !important; color: white !important; font-weight: bold !important; background-color: blue !important;}
    .process-button:hover { background-color: darkblue !important; transform: translateY(-2px) !important; box-shadow: 0 4px 8px rgba(0,0,0,0.2) !important; }
    #gallery { min-height: 400px; }
    """
    with gr.Blocks(theme="bethecloud/storj_theme", css=css) as demo:
        gr.HTML("""
        <div class="title" style="text-align: center">
            <h1>Document Conversion with POINTS Reader 📖</h1>
            <p style="font-size: 1.1em; color: #6b7280; margin-bottom: 0.6em;">
                Using tencent/POINTS-Reader Multimodal for Image Content Extraction
            </p>
        </div>
        """)

        with gr.Row():
            # Left Column (Inputs)
            with gr.Column(scale=1):
                gr.Textbox(
                    label="Model in Use ⚡",
                    value="tencent/POINTS-Reader",
                    interactive=False
                )
                prompt_input = gr.Textbox(
                    label="Query Input",
                    placeholder="✦︎ Enter the prompt",
                    value="Perform OCR on the image precisely.",
                )
                image_input = gr.Image(label="Upload Image", type="pil", sources=['upload'])

                with gr.Accordion("Advanced Settings", open=False):
                    max_new_tokens = gr.Slider(minimum=512, maximum=8192, value=2048, step=256, label="Max New Tokens")
                    temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, step=0.05, value=0.7)
                    top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.8)
                    top_k = gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=20)
                    repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.05)

                    gr.Markdown("### PDF Export Settings")
                    # Font sizes are offered as strings; the PDF generator
                    # converts the selection with int().
                    font_size = gr.Dropdown(choices=["8", "10", "12", "14", "16", "18"], value="12", label="Font Size")
                    line_spacing = gr.Dropdown(choices=[1.0, 1.15, 1.5, 2.0], value=1.15, label="Line Spacing")
                    alignment = gr.Dropdown(choices=["Left", "Center", "Right", "Justified"], value="Justified", label="Text Alignment")
                    image_size = gr.Dropdown(choices=["Small", "Medium", "Large"], value="Medium", label="Image Size in PDF")

                process_btn = gr.Button("🚀 Process Image", variant="primary", elem_classes=["process-button"], size="lg")
                clear_btn = gr.Button("🗑️ Clear All", variant="secondary")

            # Right Column (Outputs)
            with gr.Column(scale=2):
                with gr.Tabs():
                    with gr.Tab("📝 Extracted Content"):
                        # The hint is a placeholder, not a value, so it can
                        # never be mistaken for model output by the PDF
                        # generator or the copy button.
                        raw_output_stream = gr.Textbox(
                            label="Raw Model Output (max T ≤ 120s)",
                            placeholder="Raw output will appear here.",
                            interactive=False,
                            lines=15,
                            show_copy_button=True,
                        )
                        with gr.Row():
                            gr.Examples(
                                examples=["examples/1.png",
                                          "examples/2.png",
                                          "examples/3.png",
                                          "examples/4.png",
                                          "examples/5.png"],
                                inputs=image_input, label="Examples"
                            )
                        gr.Markdown("[Report-Bug💻](https://huggingface.co/spaces/prithivMLmods/POINTS-Reader-OCR/discussions) | [prithivMLmods🤗](https://huggingface.co/prithivMLmods)")

                    with gr.Tab("📰 README.md"):
                        with gr.Accordion("(Result.md)", open=True):
                            markdown_output = gr.Markdown()

                    with gr.Tab("📋 PDF Preview"):
                        generate_pdf_btn = gr.Button("📄 Generate PDF & Render", variant="primary")
                        pdf_output_file = gr.File(label="Download Generated PDF", interactive=False)
                        pdf_preview_gallery = gr.Gallery(label="PDF Page Preview", show_label=True, elem_id="gallery", columns=2, object_fit="contain", height="auto")

        # Event Handlers
        def clear_all_outputs():
            """Reset image, prompt, raw text, markdown, PDF file and gallery."""
            # Order matches the `outputs` list of clear_btn.click below. The
            # raw-output box is reset to empty; its placeholder shows the hint.
            return None, "", "", "", None, None

        process_btn.click(
            fn=process_document_stream,
            inputs=[image_input, prompt_input, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
            outputs=[raw_output_stream, markdown_output]
        )

        generate_pdf_btn.click(
            fn=generate_and_preview_pdf,
            inputs=[image_input, raw_output_stream, font_size, line_spacing, alignment, image_size],
            outputs=[pdf_output_file, pdf_preview_gallery]
        )

        clear_btn.click(
            clear_all_outputs,
            outputs=[image_input, prompt_input, raw_output_stream, markdown_output, pdf_output_file, pdf_preview_gallery]
        )
    return demo
294
+
295
if __name__ == "__main__":
    # Script entry point: build the UI, enable the request queue
    # (max_size=50), then start the server with a share link and with
    # errors surfaced in the UI.
    demo = create_gradio_interface()
    queued_app = demo.queue(max_size=50)
    queued_app.launch(share=True, show_error=True)
pre-requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ pip>=23.0.0
requirements.txt ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/Dao-AILab/flash-attention.git
2
+ git+https://github.com/huggingface/accelerate.git
3
+ git+https://github.com/WePOINTS/WePOINTS.git
4
+ git+https://github.com/huggingface/peft.git
5
+ transformers-stream-generator
6
+ transformers==4.55.2
7
+ huggingface_hub
8
+ albumentations
9
+ qwen-vl-utils
10
+ pyvips-binary
11
+ sentencepiece
12
+ opencv-python
13
+ docling-core
14
+ python-docx
15
+ torchvision
16
+ safetensors
17
+ matplotlib
18
+ num2words
19
+ reportlab
20
+ xformers
21
+ requests
22
+ pymupdf
23
+ hf_xet
24
+ spaces
25
+ pyvips
26
+ pillow
27
+ gradio
28
+ einops
29
+ torch
30
+ fpdf
31
+ timm
32
+ av