Files changed (2)
  1. app.py +315 -0
  2. requirements.txt +14 -0
app.py ADDED
@@ -0,0 +1,315 @@
+ import os
+ import time
+ import threading
+ import gradio as gr
+ import spaces
+ import torch
+ import numpy as np
+ from PIL import Image
+ import cv2
+ from transformers import (
+     Qwen2_5_VLForConditionalGeneration,
+     Qwen2VLForConditionalGeneration,
+     Glm4vForConditionalGeneration,
+     AutoProcessor,
+     TextIteratorStreamer,
+ )
+ from qwen_vl_utils import process_vision_info
+
+ # Constants for text generation
+ MAX_MAX_NEW_TOKENS = 4096
+ DEFAULT_MAX_NEW_TOKENS = 3584
+ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
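+ # Note: all five vision-language models below are loaded eagerly at startup in
+ # float16 and kept on the selected device; the model radio button in the UI
+ # only chooses which (processor, model) pair a request is routed to.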
+ # Load Camel-Doc-OCR-062825
+ MODEL_ID_M = "prithivMLmods/Camel-Doc-OCR-062825"
+ processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_M,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ # Load Qwen2.5-VL-3B-Instruct-abliterated
+ MODEL_ID_X = "huihui-ai/Qwen2.5-VL-3B-Instruct-abliterated"
+ processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
+ model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_X,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ # Load Megalodon-OCR-Sync-0713
+ MODEL_ID_T = "prithivMLmods/Megalodon-OCR-Sync-0713"
+ processor_t = AutoProcessor.from_pretrained(MODEL_ID_T, trust_remote_code=True)
+ model_t = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_T,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ # Load GLM-4.1V-9B-Thinking
+ MODEL_ID_S = "zai-org/GLM-4.1V-9B-Thinking"
+ processor_s = AutoProcessor.from_pretrained(MODEL_ID_S, trust_remote_code=True)
+ model_s = Glm4vForConditionalGeneration.from_pretrained(
+     MODEL_ID_S,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ # Load DeepEyes-7B
+ MODEL_ID_Y = "ChenShawn/DeepEyes-7B"
+ processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
+ model_y = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_Y,
+     trust_remote_code=True,
+     torch_dtype=torch.float16
+ ).to(device).eval()
+
+ def downsample_video(video_path):
+     """
+     Downsample a video to evenly spaced frames, returning each as a PIL image with its timestamp.
+     """
+     vidcap = cv2.VideoCapture(video_path)
+     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
+     fps = vidcap.get(cv2.CAP_PROP_FPS)
+     frames = []
+     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
+     for i in frame_indices:
+         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
+         success, image = vidcap.read()
+         if success:
+             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+             pil_image = Image.fromarray(image)
+             timestamp = round(i / fps, 2)
+             frames.append((pil_image, timestamp))
+     vidcap.release()
+     return frames
+
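+ # Both generate_* functions stream their output: model.generate runs in a
+ # background thread while a TextIteratorStreamer yields decoded tokens as they
+ # arrive, so the Gradio textbox and Markdown panel update incrementally.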
+ @spaces.GPU
+ def generate_image(model_name: str, text: str, image: Image.Image,
+                    max_new_tokens: int = 1024,
+                    temperature: float = 0.6,
+                    top_p: float = 0.9,
+                    top_k: int = 50,
+                    repetition_penalty: float = 1.2):
+     """
+     Generate responses using the selected model for image input.
+     """
+     if model_name == "Camel-Doc-OCR-062825":
+         processor = processor_m
+         model = model_m
+     elif model_name == "Megalodon-OCR-Sync-0713":
+         processor = processor_t
+         model = model_t
+     elif model_name == "GLM-4.1V-9B-Thinking":
+         processor = processor_s
+         model = model_s
+     elif model_name == "DeepEyes-7B-Thinking":
+         processor = processor_y
+         model = model_y
+     elif model_name == "Qwen2.5-VL-3B-Instruct-abliterated":
+         processor = processor_x
+         model = model_x
+     else:
+         yield "Invalid model selected.", "Invalid model selected."
+         return
+
+     if image is None:
+         yield "Please upload an image.", "Please upload an image."
+         return
+
+     messages = [{
+         "role": "user",
+         "content": [
+             {"type": "image", "image": image},
+             {"type": "text", "text": text},
+         ]
+     }]
+     prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     inputs = processor(
+         text=[prompt_full],
+         images=[image],
+         return_tensors="pt",
+         padding=True,
+         truncation=False,
+         max_length=MAX_INPUT_TOKEN_LENGTH
+     ).to(device)
+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     # Forward the sampling parameters so the advanced-options sliders take effect
+     generation_kwargs = {
+         **inputs,
+         "streamer": streamer,
+         "max_new_tokens": max_new_tokens,
+         "do_sample": True,
+         "temperature": temperature,
+         "top_p": top_p,
+         "top_k": top_k,
+         "repetition_penalty": repetition_penalty,
+     }
+     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         time.sleep(0.01)
+         yield buffer, buffer
+
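+ # For video, the sampled frames are interleaved into the user turn together
+ # with their timestamps, so the model sees an ordered sequence of
+ # "Frame <t>:" labels followed by the corresponding images.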
+ @spaces.GPU
+ def generate_video(model_name: str, text: str, video_path: str,
+                    max_new_tokens: int = 1024,
+                    temperature: float = 0.6,
+                    top_p: float = 0.9,
+                    top_k: int = 50,
+                    repetition_penalty: float = 1.2):
+     """
+     Generate responses using the selected model for video input.
+     """
+     if model_name == "Camel-Doc-OCR-062825":
+         processor = processor_m
+         model = model_m
+     elif model_name == "Megalodon-OCR-Sync-0713":
+         processor = processor_t
+         model = model_t
+     elif model_name == "GLM-4.1V-9B-Thinking":
+         processor = processor_s
+         model = model_s
+     elif model_name == "DeepEyes-7B-Thinking":
+         processor = processor_y
+         model = model_y
+     elif model_name == "Qwen2.5-VL-3B-Instruct-abliterated":
+         processor = processor_x
+         model = model_x
+     else:
+         yield "Invalid model selected.", "Invalid model selected."
+         return
+
+     if video_path is None:
+         yield "Please upload a video.", "Please upload a video."
+         return
+
+     frames = downsample_video(video_path)
+     messages = [
+         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
+         {"role": "user", "content": [{"type": "text", "text": text}]}
+     ]
+     for frame in frames:
+         image, timestamp = frame
+         messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
+         messages[1]["content"].append({"type": "image", "image": image})
+     inputs = processor.apply_chat_template(
+         messages,
+         tokenize=True,
+         add_generation_prompt=True,
+         return_dict=True,
+         return_tensors="pt",
+         truncation=False,
+         max_length=MAX_INPUT_TOKEN_LENGTH
+     ).to(device)
+     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+     generation_kwargs = {
+         **inputs,
+         "streamer": streamer,
+         "max_new_tokens": max_new_tokens,
+         "do_sample": True,
+         "temperature": temperature,
+         "top_p": top_p,
+         "top_k": top_k,
+         "repetition_penalty": repetition_penalty,
+     }
+     thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+     buffer = ""
+     for new_text in streamer:
+         buffer += new_text
+         buffer = buffer.replace("<|im_end|>", "")
+         time.sleep(0.01)
+         yield buffer, buffer
+
+ # Define examples for image and video inference
+ image_examples = [
+     ["explain the movie shot in detail.", "images/5.jpg"],
+     ["convert this page to doc [text] precisely for markdown.", "images/1.png"],
+     ["convert this page to doc [table] precisely for markdown.", "images/2.png"],
+     ["explain the movie shot in detail.", "images/3.png"],
+     ["fill the correct numbers.", "images/4.png"]
+ ]
+
+ video_examples = [
+     ["explain the video in detail.", "videos/b.mp4"],
+     ["explain the ad video in detail.", "videos/a.mp4"]
+ ]
+
+ # CSS for the submit button and the output canvas
+ css = """
+ .submit-btn {
+     background-color: #2980b9 !important;
+     color: white !important;
+ }
+ .submit-btn:hover {
+     background-color: #3498db !important;
+ }
+ .canvas-output {
+     border: 2px solid #4682B4;
+     border-radius: 10px;
+     padding: 20px;
+ }
+ """
+
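+ # UI layout: the left column holds the image/video input tabs plus the advanced
+ # sampling sliders; the right column shows the raw streamed output, a rendered
+ # Markdown view, and the model selector.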
+ # Create the Gradio Interface
+ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
+     gr.Markdown("# **[Multimodal VLM OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+     with gr.Row():
+         with gr.Column():
+             with gr.Tabs():
+                 with gr.TabItem("Image Inference"):
+                     image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                     image_upload = gr.Image(type="pil", label="Image")
+                     image_submit = gr.Button("Submit", elem_classes="submit-btn")
+                     gr.Examples(
+                         examples=image_examples,
+                         inputs=[image_query, image_upload]
+                     )
+                 with gr.TabItem("Video Inference"):
+                     video_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
+                     video_upload = gr.Video(label="Video")
+                     video_submit = gr.Button("Submit", elem_classes="submit-btn")
+                     gr.Examples(
+                         examples=video_examples,
+                         inputs=[video_query, video_upload]
+                     )
+
+             with gr.Accordion("Advanced options", open=False):
+                 max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
+                 temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
+                 top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
+                 top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
+                 repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
+
+         with gr.Column():
+             with gr.Column(elem_classes="canvas-output"):
+                 gr.Markdown("## Output")
+                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2)
+                 with gr.Accordion("(Result.md)", open=False):
+                     markdown_output = gr.Markdown(label="(Result.md)")
+
+             model_choice = gr.Radio(
+                 choices=["Camel-Doc-OCR-062825", "GLM-4.1V-9B-Thinking", "Megalodon-OCR-Sync-0713", "DeepEyes-7B-Thinking", "Qwen2.5-VL-3B-Instruct-abliterated"],
+                 label="Select Model",
+                 value="Camel-Doc-OCR-062825"
+             )
+             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR-Comparator/discussions)")
+             gr.Markdown("> Camel-Doc-OCR-062825 and Megalodon-OCR-Sync-0713 are fine-tuned members of the Qwen2.5-VL series focused on document retrieval, content extraction, and analysis recognition, and they excel at OCR and visual document analysis for both structured and unstructured content. Camel-Doc-OCR-062825 uses Qwen2.5-VL-7B-Instruct as its base, while Megalodon-OCR-Sync-0713 builds on Qwen2.5-VL-3B-Instruct and is additionally trained on diverse captioning datasets.")
+             gr.Markdown("> GLM-4.1V-9B-Thinking is a vision-language model (VLM) built on the GLM-4-9B-0414 foundation, with a strong emphasis on advanced reasoning, chain-of-thought inference, and robust bilingual (Chinese/English) performance on complex multimodal benchmarks.")
+             gr.Markdown("> DeepEyes-7B stands out for its agentic reinforcement learning approach, which emphasizes thinking with images for better visual reasoning, math problem-solving, and reduced hallucination, and it uses Qwen2.5-VL-7B-Instruct as its foundation. Finally, Qwen2.5-VL-3B-Instruct-abliterated is part of the Qwen2.5-VL family, known for versatile vision-language understanding and generation, which also serves as the base architecture for several of the fine-tuned models above.")
+
+     # Define the submit button actions
+     image_submit.click(fn=generate_image,
+                        inputs=[
+                            model_choice, image_query, image_upload,
+                            max_new_tokens, temperature, top_p, top_k,
+                            repetition_penalty
+                        ],
+                        outputs=[output, markdown_output])
+     video_submit.click(fn=generate_video,
+                        inputs=[
+                            model_choice, video_query, video_upload,
+                            max_new_tokens, temperature, top_p, top_k,
+                            repetition_penalty
+                        ],
+                        outputs=[output, markdown_output])
+
+ if __name__ == "__main__":
+     demo.queue(max_size=30).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ gradio
+ numpy
+ transformers
+ transformers-stream-generator
+ qwen-vl-utils
+ torchvision
+ torch
+ requests
+ huggingface-hub
+ spaces
+ accelerate
+ pillow
+ opencv-python
+ av