Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -1,30 +1,35 @@
|
|
1 |
import gradio as gr
|
|
|
|
|
2 |
import torch
|
3 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
4 |
-
import os
|
5 |
|
6 |
-
#
|
7 |
model_name = "Qwen/Qwen2.5-Omni-3B"
|
8 |
-
|
9 |
-
|
|
|
10 |
|
11 |
# Function to process inputs and generate response
|
12 |
-
def process_input(text_input, image_input=None, audio_input=None):
|
13 |
-
|
|
|
|
|
14 |
if image_input:
|
15 |
-
|
16 |
if audio_input:
|
17 |
-
|
|
|
|
|
18 |
|
19 |
-
#
|
20 |
-
|
21 |
|
22 |
# Generate response
|
23 |
-
outputs = model.generate(
|
24 |
-
response_text =
|
25 |
|
26 |
-
#
|
27 |
-
response_audio = None
|
28 |
|
29 |
return response_text, response_audio
|
30 |
|
@@ -33,15 +38,16 @@ with gr.Blocks() as demo:
|
|
33 |
gr.Markdown("# Qwen2.5-Omni-3B Demo")
|
34 |
with gr.Row():
|
35 |
text_input = gr.Textbox(label="Text Input")
|
36 |
-
image_input = gr.Image(label="Upload Image")
|
37 |
-
audio_input = gr.Audio(label="Upload Audio")
|
|
|
38 |
submit_button = gr.Button("Submit")
|
39 |
text_output = gr.Textbox(label="Text Response")
|
40 |
audio_output = gr.Audio(label="Audio Response")
|
41 |
|
42 |
submit_button.click(
|
43 |
fn=process_input,
|
44 |
-
inputs=[text_input, image_input, audio_input],
|
45 |
outputs=[text_output, audio_output]
|
46 |
)
|
47 |
|
|
|
1 |
import gradio as gr
|
2 |
+
from transformers import Qwen2_5OmniModel, AutoProcessor
|
3 |
+
from qwen_omni_utils import process_mm_info
|
4 |
import torch
|
|
|
|
|
5 |
|
6 |
+
# Checkpoint shared by the model and its processor.
model_name = "Qwen/Qwen2.5-Omni-3B"

# Weights are loaded in bfloat16; device placement is delegated to
# device_map="auto".
model = Qwen2_5OmniModel.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
processor = AutoProcessor.from_pretrained(model_name)

# Device the request handler moves its input tensors to before generating.
device = model.device
|
11 |
|
12 |
# Function to process inputs and generate response
|
13 |
+
def process_input(text_input, image_input=None, audio_input=None, video_input=None):
    """Answer one multimodal prompt with the module-level model and processor.

    Args:
        text_input: Prompt text; may be empty or None.
        image_input: File path of an uploaded image, or None.
        audio_input: File path of an uploaded audio clip, or None.
        video_input: File path of an uploaded video, or None.

    Returns:
        A ``(response_text, response_audio)`` tuple. ``response_audio`` is
        always None: speech output is not implemented here.
    """
    # Each part is tagged with an explicit "type", following the conversation
    # format shown in the Qwen2.5-Omni model card.
    content = []
    if text_input:
        content.append({"type": "text", "text": text_input})
    if image_input:
        content.append({"type": "image", "image": image_input})
    if audio_input:
        content.append({"type": "audio", "audio": audio_input})
    if video_input:
        content.append({"type": "video", "video": video_input})

    # Nothing was submitted: skip the model call instead of sending an empty turn.
    if not content:
        return "", None

    conversation = [{"role": "user", "content": content}]

    # Render the prompt text, load the media referenced by the conversation,
    # then let the processor build the tensors.
    prompt = processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False
    )
    # The audio track of uploaded videos is deliberately not used.
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
    # NOTE(review): `audios=` matches the transformers build that exports
    # Qwen2_5OmniModel; later releases name this keyword `audio` -- confirm
    # against the installed version.
    model_inputs = processor(
        text=prompt,
        audios=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True,
    )
    model_inputs = model_inputs.to(device).to(model.dtype)

    # Generate text only, and bound the number of *new* tokens so a long
    # multimodal prompt cannot use up the whole budget.
    text_ids = model.generate(
        **model_inputs,
        return_audio=False,
        thinker_max_new_tokens=200,
    )

    # The returned sequence starts with the prompt; decode only what follows.
    prompt_length = model_inputs["input_ids"].shape[1]
    response_text = processor.batch_decode(
        text_ids[:, prompt_length:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    # Audio output is not implemented for simplicity.
    response_audio = None

    return response_text, response_audio
|
35 |
|
|
|
38 |
gr.Markdown("# Qwen2.5-Omni-3B Demo")
|
39 |
with gr.Row():
|
40 |
text_input = gr.Textbox(label="Text Input")
|
41 |
+
image_input = gr.Image(label="Upload Image", type="filepath")
|
42 |
+
audio_input = gr.Audio(label="Upload Audio", type="filepath")
|
43 |
+
# gr.Video accepts no `type` argument (passing one raises TypeError); the
# component already hands the uploaded video to the callback as a file path.
video_input = gr.Video(label="Upload Video")
|
44 |
submit_button = gr.Button("Submit")
|
45 |
text_output = gr.Textbox(label="Text Response")
|
46 |
audio_output = gr.Audio(label="Audio Response")
|
47 |
|
48 |
submit_button.click(
|
49 |
fn=process_input,
|
50 |
+
inputs=[text_input, image_input, audio_input, video_input],
|
51 |
outputs=[text_output, audio_output]
|
52 |
)
|
53 |
|