vidhanm committed
Commit fbe5121 · 1 Parent(s): 055abc9

updated app.py for loading local files from repo

Files changed (1): app.py (+31 -55)
app.py CHANGED
@@ -1,39 +1,36 @@
 import gradio as gr
 from PIL import Image
 import torch
-from transformers import AutoProcessor, AutoModelForVision2Seq
+from transformers import AutoProcessor, AutoModelForVision2Seq  # Keep these for now
 import os
 
 # Determine the device to use
-# Using os.environ.get to allow device override from Space hardware config if needed
-# Defaults to CUDA if available, else CPU.
 device_choice = os.environ.get("DEVICE", "auto")
 if device_choice == "auto":
     device = "cuda" if torch.cuda.is_available() else "cpu"
 else:
     device = device_choice
-
 print(f"Using device: {device}")
 
 # Load the model and processor
 model_id = "lusxvr/nanoVLM-222M"
+processor = None
+model = None
+
 try:
-    processor = AutoProcessor.from_pretrained(model_id)
-    model = AutoModelForVision2Seq.from_pretrained(model_id).to(device)
+    print(f"Attempting to load processor for {model_id} with trust_remote_code=True")
+    # For custom models like nanoVLM, trust_remote_code=True is often needed.
+    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+    print(f"Processor loaded. Attempting to load model for {model_id} with trust_remote_code=True")
+    model = AutoModelForVision2Seq.from_pretrained(model_id, trust_remote_code=True).to(device)
     print("Model and processor loaded successfully.")
 except Exception as e:
     print(f"Error loading model/processor: {e}")
-    # If loading fails, we'll have the Gradio app display an error.
-    # This helps in debugging if the Space doesn't start correctly.
-    processor = None
-    model = None
+    # More detailed error logging or fallback could be added here.
 
 def generate_text_for_image(image_input, prompt_input):
-    """
-    Generates text based on an image and a text prompt.
-    """
     if model is None or processor is None:
-        return "Error: Model or processor not loaded. Check the Space logs for details."
+        return "Error: Model or processor not loaded. Check the Space logs. This might be due to missing 'trust_remote_code=True' or model compatibility issues."
 
     if image_input is None:
         return "Please upload an image."
@@ -41,7 +38,6 @@ def generate_text_for_image(image_input, prompt_input):
         return "Please provide a prompt (e.g., 'Describe this image' or 'What color is the car?')."
 
     try:
-        # Ensure the image is in PIL format and RGB
         if not isinstance(image_input, Image.Image):
             pil_image = Image.fromarray(image_input)
         else:
@@ -50,26 +46,20 @@ def generate_text_for_image(image_input, prompt_input):
         if pil_image.mode != "RGB":
             pil_image = pil_image.convert("RGB")
 
-        # Prepare inputs for the model
-        # The prompt for nanoVLM is typically a question or an instruction.
         inputs = processor(text=[prompt_input], images=[pil_image], return_tensors="pt").to(device)
-
-        # Generate text
-        # You can adjust max_new_tokens, temperature, top_k, etc.
+
         generated_ids = model.generate(
             **inputs,
-            max_new_tokens=150, # Increased for potentially longer descriptions
-            num_beams=3, # Example of adding beam search
+            max_new_tokens=150,
+            num_beams=3,
             no_repeat_ngram_size=2,
             early_stopping=True
         )
 
-        # Decode the generated tokens
         generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        # The output might sometimes include the prompt itself, depending on the model.
-        # Simple heuristic to remove prompt if it appears at the beginning:
-        if generated_text.startswith(prompt_input):
+        # Basic cleaning of the prompt if the model includes it in the output
+        if prompt_input and generated_text.startswith(prompt_input):
             cleaned_text = generated_text[len(prompt_input):].lstrip(" ,.:")
         else:
             cleaned_text = generated_text
@@ -78,35 +68,20 @@ def generate_text_for_image(image_input, prompt_input):
 
     except Exception as e:
         print(f"Error during generation: {e}")
-        return f"An error occurred: {str(e)}"
+        # Provide a more user-friendly error if possible
+        return f"An error occurred during text generation: {str(e)}"
 
-# Create the Gradio interface
 description = """
 Upload an image and provide a text prompt (e.g., "What is in this image?", "Describe the animal in detail.").
 The model will generate a textual response based on the visual content and your query.
 This Space uses the `lusxvr/nanoVLM-222M` model.
 """
-
-# Example image from COCO dataset
 example_image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # A cat and a remote
 
-# iface = gr.Interface(
-#     fn=generate_text_for_image,
-#     inputs=[
-#         gr.Image(type="pil", label="Upload Image"),
-#         gr.Textbox(label="Your Prompt/Question", info="e.g., 'What is this a picture of?', 'Describe the main subject.', 'How many animals are there?'")
-#     ],
-#     outputs=gr.Textbox(label="Generated Text", show_copy_button=True),
-#     title="Interactive nanoVLM-222M Demo",
-#     description=description,
-#     examples=[
-#         [example_image_url, "a photo of a"],
-#         [example_image_url, "Describe the image in detail."],
-#         [example_image_url, "What objects are on the sofa?"],
-#     ],
-#     cache_examples=True  # Cache results for examples to load faster
-# )
-# ... (other parts of your app.py)
+# Get the pre-defined writable directory for Gradio's temporary files/cache
+# This environment variable is set in your Dockerfile.
+gradio_cache_dir = os.environ.get("GRADIO_TEMP_DIR", "/tmp/gradio_tmp")
+
 
 iface = gr.Interface(
     fn=generate_text_for_image,
@@ -123,14 +98,15 @@ iface = gr.Interface(
         [example_image_url, "What objects are on the sofa?"],
     ],
     cache_examples=True,
-    allow_flagging="never"  # Add this line to disable flagging
-    # OR, if you want flagging, configure its directory (ensure GRADIO_FLAGGING_DIR is set in Dockerfile):
-    # import os
-    # flagging_dir=os.environ.get("GRADIO_FLAGGING_DIR"),
+    # Use the writable directory for caching examples
+    examples_cache_folder=gradio_cache_dir,
+    allow_flagging="never"
 )
 
-
 if __name__ == "__main__":
-    # For Hugging Face Spaces, it's common to launch with server_name="0.0.0.0"
-    # The Space infrastructure handles the public URL and port mapping.
-    iface.launch(server_name="0.0.0.0", server_port=7860)
+    if model is None or processor is None:
+        print("CRITICAL: Model or processor failed to load. Gradio interface will not start.")
+        # You could raise an error here or sys.exit(1) to make the Space fail clearly if loading is essential.
+    else:
+        print("Launching Gradio interface...")
+        iface.launch(server_name="0.0.0.0", server_port=7860)
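
A quick local smoke test for the loading path this commit switches to might look like the sketch below. It is an illustration, not part of the commit: the model id, the Auto classes with trust_remote_code=True, and the COCO example image all come from the diff, while the requests-based image download is an added assumption. Whether lusxvr/nanoVLM-222M actually resolves through these Auto classes is exactly what this revision is probing.

# smoke_test.py - minimal sketch of the load-and-generate path in app.py,
# run outside the Space. Assumes network access; "requests" is not used by
# the committed code.
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq

model_id = "lusxvr/nanoVLM-222M"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Same calls as app.py: custom models often need trust_remote_code=True.
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForVision2Seq.from_pretrained(model_id, trust_remote_code=True).to(device)

# Same COCO example image the Space uses (a cat and a remote).
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

inputs = processor(text=["Describe the image in detail."], images=[image], return_tensors="pt").to(device)
generated_ids = model.generate(**inputs, max_new_tokens=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])

If either from_pretrained call fails here, the Space will hit the same except branch and the Gradio UI will only ever return the "Model or processor not loaded" message.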
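The other change, the GRADIO_TEMP_DIR fallback, can be probed the same way. The write check below is a sketch under the same assumptions as the committed code (the env var is expected to come from the Space's Dockerfile):

# cache_dir_check.py - verify the cache directory app.py will hand to
# Gradio is actually writable; mirrors the GRADIO_TEMP_DIR fallback.
import os

cache_dir = os.environ.get("GRADIO_TEMP_DIR", "/tmp/gradio_tmp")
os.makedirs(cache_dir, exist_ok=True)
probe_path = os.path.join(cache_dir, ".write_probe")
with open(probe_path, "w") as f:
    f.write("ok")
os.remove(probe_path)
print(f"{cache_dir} is writable")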