kevalfst committed · Commit 507486b · verified · 1 Parent(s): 7d42a94

Update app.py
Files changed (1): app.py +24 -18
app.py CHANGED
@@ -1,30 +1,35 @@
 import gradio as gr
+from transformers import Qwen2_5OmniModel, AutoProcessor
+from qwen_omni_utils import process_mm_info
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-import os

-# Set model and tokenizer
+# Load model and processor
 model_name = "Qwen/Qwen2.5-Omni-3B"
-tokenizer = AutoTokenizer.from_pretrained(model_name)
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
+model = Qwen2_5OmniModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
+processor = AutoProcessor.from_pretrained(model_name)
+device = model.device

 # Function to process inputs and generate response
-def process_input(text_input, image_input=None, audio_input=None):
-    inputs = {"text": text_input}
+def process_input(text_input, image_input=None, audio_input=None, video_input=None):
+    conversation = [
+        {"role": "user", "content": [{"text": text_input}]}
+    ]
     if image_input:
-        inputs["image"] = image_input
+        conversation[0]["content"].append({"image": image_input})
     if audio_input:
-        inputs["audio"] = audio_input
+        conversation[0]["content"].append({"audio": audio_input})
+    if video_input:
+        conversation[0]["content"].append({"video": video_input})

-    # Tokenize inputs (simplified for demo)
-    input_ids = tokenizer.encode(inputs["text"], return_tensors="pt").to(model.device)
+    # Process conversation
+    model_inputs = processor.process_chat_conversation(conversation, return_tensors="pt").to(device)

     # Generate response
-    outputs = model.generate(input_ids, max_length=200)
-    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    outputs = model.generate(**model_inputs, max_length=200)
+    response_text = processor.decode(outputs[0], skip_special_tokens=True)

-    # Placeholder for speech generation (requires additional setup)
-    response_audio = None # Implement speech generation if needed
+    # Audio output is not implemented for simplicity
+    response_audio = None

     return response_text, response_audio

@@ -33,15 +38,16 @@ with gr.Blocks() as demo:
     gr.Markdown("# Qwen2.5-Omni-3B Demo")
     with gr.Row():
         text_input = gr.Textbox(label="Text Input")
-        image_input = gr.Image(label="Upload Image")
-        audio_input = gr.Audio(label="Upload Audio")
+        image_input = gr.Image(label="Upload Image", type="filepath")
+        audio_input = gr.Audio(label="Upload Audio", type="filepath")
+        video_input = gr.Video(label="Upload Video", type="filepath")
     submit_button = gr.Button("Submit")
     text_output = gr.Textbox(label="Text Response")
     audio_output = gr.Audio(label="Audio Response")

     submit_button.click(
         fn=process_input,
-        inputs=[text_input, image_input, audio_input],
+        inputs=[text_input, image_input, audio_input, video_input],
         outputs=[text_output, audio_output]
     )
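
Note on the new preprocessing path: process_mm_info is imported from qwen_omni_utils but never called, and processor.process_chat_conversation may not be available in every transformers release. Below is a minimal sketch of the path shown on the Qwen2.5-Omni model card (apply_chat_template plus process_mm_info), under the assumption that the card's interface is available in the installed versions; build_model_inputs is a hypothetical helper name introduced here, and the card's conversation entries also carry an explicit "type" key (e.g. {"type": "text", "text": text_input}).

# Hedged sketch, assuming the interface documented on the Qwen2.5-Omni model card.
# build_model_inputs is a hypothetical helper, not part of this repo.
from qwen_omni_utils import process_mm_info

def build_model_inputs(conversation, processor, device):
    # Render the chat template into a single prompt string
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    # Collect any audio / image / video referenced in the conversation
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=False)
    inputs = processor(text=text, audio=audios, images=images, videos=videos,
                       return_tensors="pt", padding=True)
    return inputs.to(device)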
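Note on audio output: the diff leaves response_audio as None. The model card shows generate returning both token ids and a 24 kHz waveform when speech output is enabled; the sketch below is a rough, unverified wiring of that into the existing gr.Audio output, which accepts a (sample_rate, numpy_array) tuple. The return_audio and speaker arguments and the 24000 Hz rate are taken from the model card and may differ across versions.

# Hedged sketch: text plus speech generation per the Qwen2.5-Omni model card.
# generate_with_speech is a hypothetical helper; argument names may vary by version.
import numpy as np

def generate_with_speech(model, processor, model_inputs):
    text_ids, audio = model.generate(**model_inputs, return_audio=True, speaker="Chelsie")
    response_text = processor.batch_decode(text_ids, skip_special_tokens=True)[0]
    # gr.Audio (type="numpy", the default) can play a (sample_rate, waveform) tuple
    waveform = audio.reshape(-1).detach().cpu().numpy().astype(np.float32)
    return response_text, (24000, waveform)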