Files changed (6)
  1. .gitattributes +0 -2
  2. README.md +2 -2
  3. app.py +37 -86
  4. images/0.png +0 -0
  5. images/3.jpg +0 -3
  6. images/4.png +0 -3
.gitattributes CHANGED
@@ -43,5 +43,3 @@ rolm/2.jpeg filter=lfs diff=lfs merge=lfs -text
  images/1.jpg filter=lfs diff=lfs merge=lfs -text
  videos/1.mp4 filter=lfs diff=lfs merge=lfs -text
  videos/2.mp4 filter=lfs diff=lfs merge=lfs -text
- images/4.png filter=lfs diff=lfs merge=lfs -text
- images/3.jpg filter=lfs diff=lfs merge=lfs -text

README.md CHANGED
@@ -4,11 +4,11 @@ emoji: 🍍
  colorFrom: indigo
  colorTo: gray
  sdk: gradio
- sdk_version: 5.36.2
+ sdk_version: 5.34.0
  app_file: app.py
  pinned: true
  license: apache-2.0
- short_description: nanonets / qwen2vl ocr / rolmocr / aya vision / lh41
+ short_description: image and video understanding
  ---
  
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -22,22 +22,6 @@ from transformers import (
  )
  from transformers.image_utils import load_image
  
- #theme:custom
- #custom_theme = gr.themes.Base(
- #    primary_hue="indigo",
- #    secondary_hue="violet",
- #    neutral_hue="gray"
- #).set(
- #    body_background_fill="#f7f5fa",
- #    body_text_color="#1f1f1f",
- #    input_background_fill="#ffffff",
- #    button_primary_background_fill="#8b5cf6",
- #    button_primary_text_color="#ffffff",
- #    button_secondary_background_fill="#e0d7f5",
- #    button_secondary_text_color="#1f1f1f",
- #    shadow_spread="sm"
- #)
-
  # Constants for text generation
  MAX_MAX_NEW_TOKENS = 2048
  DEFAULT_MAX_NEW_TOKENS = 1024
@@ -45,11 +29,11 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
  
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
  
- # Load Nanonets-OCR-s
- MODEL_ID_V = "nanonets/Nanonets-OCR-s"
- processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
- model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_V,
+ # Load RolmOCR
+ MODEL_ID_M = "reducto/RolmOCR"
+ processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+ model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_M,
      trust_remote_code=True,
      torch_dtype=torch.float16
  ).to(device).eval()
@@ -63,29 +47,20 @@ model_x = Qwen2VLForConditionalGeneration.from_pretrained(
      torch_dtype=torch.float16
  ).to(device).eval()
  
- # Load Aya-Vision-8b
- MODEL_ID_A = "CohereForAI/aya-vision-8b"
- processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
- model_a = AutoModelForImageTextToText.from_pretrained(
-     MODEL_ID_A,
-     trust_remote_code=True,
-     torch_dtype=torch.float16
- ).to(device).eval()
-
- # Load Lh41-1042-Magellanic-7B-0711
- MODEL_ID_W = "prithivMLmods/Lh41-1042-Magellanic-7B-0711"
- processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
- model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_W,
+ # Load Nanonets-OCR-s
+ MODEL_ID_V = "nanonets/Nanonets-OCR-s"
+ processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
+ model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+     MODEL_ID_V,
      trust_remote_code=True,
      torch_dtype=torch.float16
  ).to(device).eval()
  
- # Load RolmOCR
- MODEL_ID_M = "reducto/RolmOCR"
- processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
- model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID_M,
+ # Load aya-vision-8b
+ MODEL_ID_A = "CohereForAI/aya-vision-8b"
+ processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
+ model_a = AutoModelForImageTextToText.from_pretrained(
+     MODEL_ID_A,
      trust_remote_code=True,
      torch_dtype=torch.float16
  ).to(device).eval()
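Every checkpoint on both sides of this hunk is loaded with the same processor-plus-fp16-eval pattern, so the repetition could be collapsed into one helper. A minimal sketch, assuming the imports already present in app.py (the helper name load_vlm is hypothetical, not in the app):

import torch
from transformers import AutoProcessor

def load_vlm(model_id, model_cls, device):
    # Shared loader: paired processor and half-precision model in eval mode.
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    model = model_cls.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16,
    ).to(device).eval()
    return processor, model

# e.g. processor_m, model_m = load_vlm("reducto/RolmOCR", Qwen2_5_VLForConditionalGeneration, device)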
@@ -120,29 +95,25 @@ def generate_image(model_name: str, text: str, image: Image.Image,
                     repetition_penalty: float = 1.2):
      """
      Generates responses using the selected model for image input.
-     Yields raw text and Markdown-formatted text.
      """
-     if model_name == "RolmOCR-7B":
+     if model_name == "RolmOCR":
          processor = processor_m
          model = model_m
-     elif model_name == "Qwen2-VL-OCR-2B":
+     elif model_name == "Qwen2-VL-OCR-2B-Instruct":
          processor = processor_x
          model = model_x
      elif model_name == "Nanonets-OCR-s":
          processor = processor_v
          model = model_v
-     elif model_name == "Aya-Vision-8B":
+     elif model_name == "Aya-Vision":
          processor = processor_a
          model = model_a
-     elif model_name == "Lh41-1042-Magellanic-7B-0711":
-         processor = processor_w
-         model = model_w
      else:
-         yield "Invalid model selected.", "Invalid model selected."
+         yield "Invalid model selected."
          return
  
      if image is None:
-         yield "Please upload an image.", "Please upload an image."
+         yield "Please upload an image."
          return
  
      messages = [{
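The commit renames three of the four radio labels, and each rename has to be repeated in both generate_image and generate_video because both carry the same if/elif ladder. A dict-based dispatch would keep the label-to-model wiring in one place; a sketch using the processor/model globals defined above (MODELS and resolve_model are hypothetical names, not in the app):

MODELS = {
    "RolmOCR": (processor_m, model_m),
    "Qwen2-VL-OCR-2B-Instruct": (processor_x, model_x),
    "Nanonets-OCR-s": (processor_v, model_v),
    "Aya-Vision": (processor_a, model_a),
}

def resolve_model(model_name):
    # Returns a (processor, model) pair, or None for an unknown radio choice.
    return MODELS.get(model_name)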
@@ -170,7 +141,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
          buffer += new_text
          buffer = buffer.replace("<|im_end|>", "")
          time.sleep(0.01)
-         yield buffer, buffer
+         yield buffer
  
  @spaces.GPU
  def generate_video(model_name: str, text: str, video_path: str,
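The hunk shows only the tail of the streaming loop; the elided body is, in apps of this family, typically the threaded TextIteratorStreamer pattern, which is what lets the now-single yield buffer stream partial text into one output component. A sketch of that assumed pattern (stream_reply is a hypothetical name; the app inlines this logic):

from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply(processor, model, messages, image, max_new_tokens, device):
    # Render the chat template, bind the image, and run generate() on a
    # background thread so this generator can yield partial text as it arrives.
    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = processor(text=[prompt], images=[image], return_tensors="pt", padding=True).to(device)
    streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
    Thread(target=model.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}).start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer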
@@ -181,29 +152,25 @@ def generate_video(model_name: str, text: str, video_path: str,
                     repetition_penalty: float = 1.2):
      """
      Generates responses using the selected model for video input.
-     Yields raw text and Markdown-formatted text.
      """
-     if model_name == "RolmOCR-7B":
+     if model_name == "RolmOCR":
          processor = processor_m
          model = model_m
-     elif model_name == "Qwen2-VL-OCR-2B":
+     elif model_name == "Qwen2-VL-OCR-2B-Instruct":
          processor = processor_x
          model = model_x
      elif model_name == "Nanonets-OCR-s":
          processor = processor_v
          model = model_v
-     elif model_name == "Aya-Vision-8B":
+     elif model_name == "Aya-Vision":
          processor = processor_a
          model = model_a
-     elif model_name == "Lh41-1042-Magellanic-7B-0711":
-         processor = processor_w
-         model = model_w
      else:
-         yield "Invalid model selected.", "Invalid model selected."
+         yield "Invalid model selected."
          return
  
      if video_path is None:
-         yield "Please upload a video.", "Please upload a video."
+         yield "Please upload a video."
          return
  
      frames = downsample_video(video_path)
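downsample_video is defined elsewhere in app.py and untouched by this commit; for context, a representative implementation might look like the sketch below (this assumes OpenCV and ten evenly spaced frames, a common convention in these Spaces; the actual body may differ):

import cv2
import numpy as np
from PIL import Image

def downsample_video(video_path, num_frames=10):
    # Sample evenly spaced frames; return (PIL image, timestamp-in-seconds) pairs.
    vidcap = cv2.VideoCapture(video_path)
    total = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
    fps = vidcap.get(cv2.CAP_PROP_FPS) or 30.0
    frames = []
    for idx in np.linspace(0, total - 1, num_frames, dtype=int):
        vidcap.set(cv2.CAP_PROP_POS_FRAMES, int(idx))
        ok, frame = vidcap.read()
        if ok:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append((Image.fromarray(frame), round(idx / fps, 2)))
    vidcap.release()
    return frames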
@@ -242,13 +209,10 @@ def generate_video(model_name: str, text: str, video_path: str,
          buffer += new_text
          buffer = buffer.replace("<|im_end|>", "")
          time.sleep(0.01)
-         yield buffer, buffer
+         yield buffer
  
  # Define examples for image and video inference
  image_examples = [
-     ["Extract the content", "images/4.png"],
-     ["Explain the scene", "images/3.jpg"],
-     ["Convert this page to doc [table] precisely for markdown.", "images/0.png"],
      ["Perform OCR on the Image.", "images/1.jpg"],
      ["Extract the table content", "images/2.png"]
  ]
@@ -266,16 +230,11 @@ css = """
  .submit-btn:hover {
      background-color: #3498db !important;
  }
- .canvas-output {
-     border: 2px solid #4682B4;
-     border-radius: 10px;
-     padding: 20px;
- }
  """
  
  # Create the Gradio Interface
  with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
-     gr.Markdown("# **[Multimodal OCR](https://huggingface.co/collections/prithivMLmods/multimodal-implementations-67c9982ea04b39f0608badb0)**")
+     gr.Markdown("# **Multimodal OCR**")
      with gr.Row():
          with gr.Column():
              with gr.Tabs():
@@ -301,37 +260,29 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                  top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                  top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                  repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
-
          with gr.Column():
-             with gr.Column(elem_classes="canvas-output"):
-                 gr.Markdown("## Output")
-                 output = gr.Textbox(label="Raw Output Stream", interactive=False, lines=2, show_copy_button=True)
-                 #format[ft.md]
-                 with gr.Accordion("(Result.md)", open=False):
-                     markdown_output = gr.Markdown(label="Formatted Result (Result.Md)")
+             output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
              model_choice = gr.Radio(
-                 choices=["Nanonets-OCR-s", "Qwen2-VL-OCR-2B", "RolmOCR-7B",
-                          "Lh41-1042-Magellanic-7B-0711", "Aya-Vision-8B"],
+                 choices=["Nanonets-OCR-s", "Qwen2-VL-OCR-2B-Instruct", "RolmOCR", "Aya-Vision"],
                  label="Select Model",
                  value="Nanonets-OCR-s"
              )
-             gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR/discussions)")
+
+             gr.Markdown("**Model Info**")
+             gr.Markdown("> [Qwen2-VL-OCR-2B-Instruct](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve messy optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
              gr.Markdown("> [Nanonets-OCR-s](https://huggingface.co/nanonets/Nanonets-OCR-s): nanonets-ocr-s is a powerful, state-of-the-art image-to-markdown ocr model that goes far beyond traditional text extraction. it transforms documents into structured markdown with intelligent content recognition and semantic tagging.")
-             gr.Markdown("> [Lh41-1042-Magellanic-7B-0711](https://huggingface.co/prithivMLmods/Lh41-1042-Magellanic-7B-0711): lh41-1042-magellanic-7b-0711 is a fine-tuned version of qwen2.5-vl-7b-instruct, optimized for image captioning, visual analysis, and image reasoning. built on qwen2.5-vl, this experimental model enhances visual comprehension through focused training on 3,000k image pairs for superior image understanding.")
-             gr.Markdown("> [Qwen2-VL-OCR-2B](https://huggingface.co/prithivMLmods/Qwen2-VL-OCR-2B-Instruct): qwen2-vl-ocr-2b-instruct is a fine-tuned version of qwen2-vl-2b-instruct, tailored for tasks that involve messy optical character recognition (ocr), image-to-text conversion, and math problem solving with latex formatting.")
-             gr.Markdown("> [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr is a high-quality, openly available approach to optical character recognition for parsing pdfs and other complex documents. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
+             gr.Markdown("> [RolmOCR](https://huggingface.co/reducto/RolmOCR): rolmocr is a high-quality, openly available approach to optical character recognition for parsing pdfs and other complex documents. it is designed to handle a wide range of document types, including scanned documents, handwritten text, and complex layouts.")
              gr.Markdown("> [Aya-Vision](https://huggingface.co/CohereLabs/aya-vision-8b): cohere labs aya vision 8b is an open weights research release of an 8-billion parameter model with advanced capabilities optimized for a variety of vision-language use cases, including ocr, captioning, visual reasoning, summarization, question answering, code, and more.")
-             gr.Markdown("> ⚠️ note: the models in this space are not guaranteed to perform well on video inference use cases.")
-
+
          image_submit.click(
              fn=generate_image,
              inputs=[model_choice, image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-             outputs=[output, markdown_output]
+             outputs=output
          )
          video_submit.click(
              fn=generate_video,
              inputs=[model_choice, video_query, video_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
-             outputs=[output, markdown_output]
+             outputs=output
          )
  
  if __name__ == "__main__":
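After this commit, both .click() handlers target the single output Textbox. Because the handlers are generator functions, Gradio re-renders the component on every yield, which is what produces the streaming effect. A minimal self-contained illustration of that wiring (toy handler, not from the app):

import time
import gradio as gr

def slow_echo(text):
    # A generator handler: each yield replaces the Textbox contents,
    # so the reply appears to stream in.
    buffer = ""
    for ch in text:
        buffer += ch
        time.sleep(0.05)
        yield buffer

with gr.Blocks() as demo:
    inp = gr.Textbox(label="Query")
    out = gr.Textbox(label="Output", interactive=False)
    btn = gr.Button("Submit")
    btn.click(fn=slow_echo, inputs=inp, outputs=out)

if __name__ == "__main__":
    demo.launch()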
 
images/0.png DELETED
Binary file (86.1 kB)
 
images/3.jpg DELETED

Git LFS Details

  • SHA256: 510714fb3ee4eaddbd24f4b1f36e75bf13611326c39046674db27095c26132cc
  • Pointer size: 131 Bytes
  • Size of remote file: 224 kB
images/4.png DELETED

Git LFS Details

  • SHA256: 8a5736439eea1647b192e13473f9cde9c3c619dc066297e38dee2cf11fe5779d
  • Pointer size: 131 Bytes
  • Size of remote file: 152 kB