prithivMLmods committed on
Commit
6858bb7
·
verified ·
1 Parent(s): d0f3c0b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -23
app.py CHANGED
@@ -19,6 +19,8 @@ from transformers import (
19
  Qwen2VLForConditionalGeneration,
20
  AutoProcessor,
21
  AutoTokenizer,
 
 
22
  TextIteratorStreamer,
23
  )
24
 
@@ -31,10 +33,11 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
31
 
32
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
33
 
34
- # Load SkyCaptioner-V1
35
- MODEL_ID_M = "Skywork/SkyCaptioner-V1"
36
- processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
37
- model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 
38
  MODEL_ID_M,
39
  trust_remote_code=True,
40
  torch_dtype=torch.float16
@@ -58,16 +61,6 @@ model_k = Qwen2VLForConditionalGeneration.from_pretrained(
58
  torch_dtype=torch.float16
59
  ).to(device).eval()
60
 
61
- # Load Imgscope-OCR-2B-0527
62
- MODEL_ID_Y = "prithivMLmods/Imgscope-OCR-2B-0527"
63
- processor_y = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
64
- model_y = Qwen2VLForConditionalGeneration.from_pretrained(
65
- MODEL_ID_Y,
66
- trust_remote_code=True,
67
- torch_dtype=torch.float16
68
- ).to(device).eval()
69
-
70
-
71
  def downsample_video(video_path):
72
  """
73
  Downsamples the video to evenly spaced frames.
@@ -99,7 +92,7 @@ def generate_image(model_name: str, text: str, image: Image.Image,
99
  """
100
  Generates responses using the selected model for image input.
101
  """
102
- if model_name == "SkyCaptioner-V1":
103
  processor = processor_m
104
  model = model_m
105
  elif model_name == "SpaceThinker-3B":
@@ -108,9 +101,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
108
  elif model_name == "coreOCR-7B-050325-preview":
109
  processor = processor_k
110
  model = model_k
111
- elif model_name == "Imgscope-OCR-2B-0527":
112
- processor = processor_y
113
- model = model_y
114
  else:
115
  yield "Invalid model selected."
116
  return
@@ -156,7 +146,7 @@ def generate_video(model_name: str, text: str, video_path: str,
156
  """
157
  Generates responses using the selected model for video input.
158
  """
159
- if model_name == "SkyCaptioner-V1":
160
  processor = processor_m
161
  model = model_m
162
  elif model_name == "SpaceThinker-3B":
@@ -165,9 +155,6 @@ def generate_video(model_name: str, text: str, video_path: str,
165
  elif model_name == "coreOCR-7B-050325-preview":
166
  processor = processor_k
167
  model = model_k
168
- elif model_name == "Imgscope-OCR-2B-0527":
169
- processor = processor_y
170
- model = model_y
171
  else:
172
  yield "Invalid model selected."
173
  return
@@ -269,7 +256,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
269
  with gr.Column():
270
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
271
  model_choice = gr.Radio(
272
- choices=["SkyCaptioner-V1", "SpaceThinker-3B", "coreOCR-7B-050325-preview", "Imgscope-OCR-2B-0527"],
273
  label="Select Model",
274
  value="SkyCaptioner-V1"
275
  )
 
19
  Qwen2VLForConditionalGeneration,
20
  AutoProcessor,
21
  AutoTokenizer,
22
+ AutoModel,
23
+ AutoImageProcessor,
24
  TextIteratorStreamer,
25
  )
26
 
 
33
 
34
  device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
35
 
36
+ # Load Llama-3.1-Nemotron-Nano-VL-8B-V1
37
+ MODEL_ID_M = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
38
+ processor_m = AutoImageProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
39
+ tokenizer_m = AutoTokenizer.from_pretrained(MODEL_ID_M)
40
+ model_m = AutoModel.from_pretrained(
41
  MODEL_ID_M,
42
  trust_remote_code=True,
43
  torch_dtype=torch.float16
 
61
  torch_dtype=torch.float16
62
  ).to(device).eval()
63
 
 
 
 
 
 
 
 
 
 
 
64
  def downsample_video(video_path):
65
  """
66
  Downsamples the video to evenly spaced frames.
 
92
  """
93
  Generates responses using the selected model for image input.
94
  """
95
+ if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
96
  processor = processor_m
97
  model = model_m
98
  elif model_name == "SpaceThinker-3B":
 
101
  elif model_name == "coreOCR-7B-050325-preview":
102
  processor = processor_k
103
  model = model_k
 
 
 
104
  else:
105
  yield "Invalid model selected."
106
  return
 
146
  """
147
  Generates responses using the selected model for video input.
148
  """
149
+ if model_name == "Llama-3.1-Nemotron-Nano-VL-8B-V1":
150
  processor = processor_m
151
  model = model_m
152
  elif model_name == "SpaceThinker-3B":
 
155
  elif model_name == "coreOCR-7B-050325-preview":
156
  processor = processor_k
157
  model = model_k
 
 
 
158
  else:
159
  yield "Invalid model selected."
160
  return
 
256
  with gr.Column():
257
  output = gr.Textbox(label="Output", interactive=False, lines=2, scale=2)
258
  model_choice = gr.Radio(
259
+ choices=["Llama-3.1-Nemotron-Nano-VL-8B-V1", "SpaceThinker-3B", "coreOCR-7B-050325-preview"],
260
  label="Select Model",
261
  value="SkyCaptioner-V1"
262
  )