ariG23498 HF Staff committed on
Commit
59ec2ed
·
1 Parent(s): 2395e7b

adding logic

Browse files
Files changed (3) hide show
  1. README.md +2 -2
  2. app.py +44 -4
  3. requirements.txt +3 -0
README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
- title: Gemma3n Image Audio
3
  emoji: 😻
4
- colorFrom: purple
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.35.0
 
1
  ---
2
+ title: Gemma3n Visual (Audio) Question Answering
3
  emoji: 😻
4
+ colorFrom: blue
5
  colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.35.0
app.py CHANGED
@@ -1,7 +1,47 @@
 
1
  import gradio as gr
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
  import gradio as gr
3
+ import torch
4
+ from transformers import AutoModelForImageTextToText, AutoProcessor
5
 
6
# Checkpoint identifier shared by the processor and the model weights.
MODEL_PATH = "google/gemma-3n-E2B-it"

# The processor prepares the image/audio inputs and decodes generated tokens.
processor = AutoProcessor.from_pretrained(MODEL_PATH)

# dtype selection and device placement are both delegated to the library.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_PATH,
    torch_dtype="auto",
    device_map="auto",
)
10
 
11
@spaces.GPU
def process_inputs(image, audio):
    """Generate a text answer from an image and an audio question.

    Args:
        image: Value delivered by the ``gr.Image`` input component.
        audio: Value delivered by the ``gr.Audio`` input component.

    Returns:
        The decoded text of the newly generated tokens only (the prompt
        tokens are stripped before decoding).

    Raises:
        gr.Error: If neither an image nor an audio clip was supplied.
    """
    # Surface a readable message in the UI instead of an opaque failure
    # from deep inside the processor when the form is submitted empty.
    if image is None and audio is None:
        raise gr.Error("Please provide an image and an audio question.")

    # NOTE(review): with default settings gr.Audio hands over a
    # (sample_rate, samples) pair — confirm the processor accepts that
    # form, or convert it here before the call.
    inputs = processor(
        images=image,
        audio=audio,
        return_tensors="pt",
    ).to(model.device, dtype=model.dtype)

    # generate() returns prompt + continuation; remember where the prompt
    # ends so that only the continuation is decoded.
    prompt_length = inputs["input_ids"].shape[-1]

    # No gradients are needed for inference.
    with torch.inference_mode():
        generated = model.generate(
            **inputs,
            max_new_tokens=256,
        )

    new_tokens = generated[:, prompt_length:]
    decoded = processor.batch_decode(
        new_tokens,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return decoded[0]
33
+
34
# Gradio interface: an image and an audio question in, a text answer out.
iface = gr.Interface(
    fn=process_inputs,
    inputs=[
        gr.Image(label="Upload Image"),
        gr.Audio(label="Ask Question about the Image"),
    ],
    outputs=gr.Textbox(label="Answer"),
    title="Image and Audio Question Answering",
    description=(
        "Upload an image as context and ask a question about the image. "
        "The model will generate a text response."
    ),
)

# Start the app only when this file is executed directly.
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ spaces
2
+ gradio
3
+ transformers==4.53.0