# Qwen3-ASR-Demo / app.py
import gradio as gr
import os
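# Install/upgrade the DashScope SDK at startup so the hosted Space always has a recent client.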
os.system('pip install dashscope -U')
import dashscope
from dashscope import MultiModalConversation
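# DashScope credentials and endpoint; the API key must be provided via the API_KEY environment variable.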
API_KEY = os.environ['API_KEY']
dashscope.api_key = API_KEY
dashscope.base_http_api_url = "https://dashscope.aliyuncs.com/api/v1"
def asr_inference(audio_file, context, language, enable_itn):
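    """Transcribe an uploaded audio file with the qwen3-asr-flash model.

    Returns a (transcript, detected_language) tuple; detected_language is None
    unless the language setting is 'auto'.
    """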
if not audio_file:
        return "Please upload an audio file", None
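    # Build the multimodal chat payload: the optional context string goes in the
    # system turn, the audio file in the user turn.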
messages = [
{
"role": "system",
"content": [
{"text": context},
]
},
{
"role": "user",
"content": [
{"audio": audio_file},
]
}
]
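    # Assemble the ASR options; language identification (LID) stays enabled in
    # both modes so the detected language can be reported under auto detection.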
    asr_options = {
        "enable_lid": True,
        "enable_itn": enable_itn
    }
    if language != 'auto':
        asr_options["language"] = language
    response = MultiModalConversation.call(
        model="qwen3-asr-flash",
        messages=messages,
        result_format="message",
        asr_options=asr_options
    )
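    # Defensively unpack the response: choices -> message -> content holds the
    # transcript, and message.annotations carries the detected language in auto mode.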
try:
if hasattr(response, 'status_code') and response.status_code == 200:
if (hasattr(response, 'output') and
hasattr(response.output, 'choices') and
len(response.output.choices) > 0):
choice = response.output.choices[0]
if (hasattr(choice, 'message') and
hasattr(choice.message, 'content') and
len(choice.message.content) > 0):
content = choice.message.content[0]
if 'text' in content:
result_text = content['text']
if language == 'auto' and hasattr(choice.message, "annotations"):
result_lang = choice.message.annotations[0]['language']
else:
result_lang = None
else:
result_text = "No text content found"
result_lang = None
else:
result_text = "Incomplete response structure"
result_lang = None
else:
result_text = "No recognition result found in response"
result_lang = None
else:
status_code = getattr(response, 'status_code', 'Unknown')
error_msg = getattr(response, 'message', 'Unknown error')
result_text = f"Request failed (Status: {status_code}): {error_msg}"
result_lang = None
except Exception as e:
result_text = f"Processing error: {str(e)}"
result_lang = None
# Map result_lang to display name
lang_display = {
"auto": "Auto Detect",
"zh": "Chinese",
"en": "English",
"ja": "Japanese",
"ko": "Korean",
"es": "Spanish",
"fr": "French",
"de": "German",
"ar": "Arabic",
"it": "Italian",
"ru": "Russian",
"pt": "Portuguese"
}
if result_lang in lang_display:
result_lang = lang_display[result_lang]
elif result_lang is not None:
result_lang = f"Unknown Language ({result_lang})"
return result_text, result_lang
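# ========== Gradio UI ==========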
with gr.Blocks(theme=gr.themes.Soft(), title="Speech Recognition Tool") as demo:
# ========== LOGO Area (Centered + Enlarged) ==========
gr.Markdown("""
<div style="width: 100%; display: flex; justify-content: center; margin: 30px 0;">
<img src="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/00EE8C99-9C05-4236-A6D0-B58FF172D31B.png"
alt="Qwen-ASR Logo"
width="300"
style="border-radius: 12px; box-shadow: 0 6px 12px rgba(0,0,0,0.15);"/>
</div>
""", sanitize_html=False)
# ========== API Documentation Link ==========
gr.Markdown("""
<div style="text-align: center; margin: 10px 0; font-size: 14px; color: #555;">
🌐 <a href="https://help.aliyun.com/zh/dashscope/developer-reference/"
target="_blank"
style="color: #0066cc; text-decoration: none;">
View DashScope API Documentation
</a>
</div>
""", sanitize_html=False)
    gr.Markdown("Upload an audio file to get speech-to-text results.\nOptionally provide context to tailor recognition; language detection and inverse text normalization are also supported.")
with gr.Row():
with gr.Column():
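            # Inputs: audio upload, optional biasing context, language selector, and ITN toggle.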
            audio_input = gr.Audio(label="🎤 Upload Audio", type="filepath")
            context_input = gr.Textbox(label="📝 Context (Optional)", value="", interactive=True)
language = gr.Dropdown(
label="🌍 Language Setting",
choices=[
("Auto Detect", "auto"),
("Chinese", "zh"),
("English", "en"),
("Japanese", "ja"),
("Korean", "ko"),
("Spanish", "es"),
("French", "fr"),
("German", "de"),
("Arabic", "ar"),
("Italian", "it"),
("Russian", "ru"),
("Portuguese", "pt")
],
value="auto"
)
            enable_itn = gr.Checkbox(label="🔄 Enable Inverse Text Normalization (ITN)", value=False)
            submit_btn = gr.Button("🚀 Start Recognition", variant="primary")
with gr.Column():
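            # Outputs: transcript text plus the detected language (populated only in auto mode).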
            text_output = gr.Textbox(label="📝 Recognition Result", interactive=False, lines=6, max_lines=12)
            lang_output = gr.Textbox(label="📝 Detected Language (only in auto mode)", interactive=False, lines=1, max_lines=12)
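    # Wire the recognition button to the inference function.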
submit_btn.click(
fn=asr_inference,
inputs=[audio_input, context_input, language, enable_itn],
outputs=[text_output, lang_output]
)
# Example Section
    gr.Markdown("### 💡 Examples")
examples_data = {
"Example 1 - CSGO Match": {
"audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/csgo.wav",
"context": "A csgo match between NAVI and FazeClan in Major Paris 2023. S1mple and B1t are in NAVI. Ropz, Rain, Karrigan and Twistzz are in Faze.",
"description": "Game commentary (Pro Terms & Names)"
},
"Example 2 - Noisy Environment": {
"audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise3.wav",
"context": "",
"description": "English Recognition in Noise"
},
"Example 3 - Complex Audio": {
"audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise1.wav",
"context": "",
"description": "Dialect Recognition in Heavy Noise"
}
}
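    # Render one button per example; clicking it fills the audio and context inputs.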
with gr.Row():
for title, data in examples_data.items():
with gr.Column():
                example_btn = gr.Button(f"📎 {title}", variant="secondary", size="sm")
gr.Markdown(f"*{data['description']}*", elem_classes=["example-desc"])
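                # Bind this example's values via default arguments so the lambda
                # does not capture the loop variable by reference.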
example_btn.click(
fn=lambda audio=data['audio'], context=data['context']: (audio, context),
outputs=[audio_input, context_input]
)
if __name__ == "__main__":
demo.launch()