import gradio as gr
import os

# Make sure an up-to-date DashScope SDK is available before importing it.
os.system('pip install dashscope -U')

import dashscope
from dashscope import MultiModalConversation

# DashScope credentials and endpoint.
API_KEY = os.environ['API_KEY']
dashscope.api_key = API_KEY
dashscope.base_http_api_url = "https://dashscope.aliyuncs.com/api/v1"

def asr_inference(audio_file, context, language, enable_itn):
    if not audio_file:
        # Return a (text, language) pair so both output components receive a value.
        return "Please upload an audio file", None

    messages = [
        {
            "role": "system",
            "content": [
                {"text": context},
            ]
        },
        {
            "role": "user",
            "content": [
                {"audio": audio_file},
            ]
        }
    ]
    # enable_lid turns on language identification; enable_itn applies
    # inverse text normalization to the transcript.
    if language == 'auto':
        response = MultiModalConversation.call(
            model="qwen3-asr-flash",
            messages=messages,
            result_format="message",
            asr_options={
                "enable_lid": True,
                "enable_itn": enable_itn
            }
        )
    else:
        response = MultiModalConversation.call(
            model="qwen3-asr-flash",
            messages=messages,
            result_format="message",
            asr_options={
                "language": language,
                "enable_lid": True,
                "enable_itn": enable_itn
            }
        )
    # Defensively unpack the response object.
    try:
        if hasattr(response, 'status_code') and response.status_code == 200:
            if (hasattr(response, 'output') and
                    hasattr(response.output, 'choices') and
                    len(response.output.choices) > 0):
                choice = response.output.choices[0]
                if (hasattr(choice, 'message') and
                        hasattr(choice.message, 'content') and
                        len(choice.message.content) > 0):
                    content = choice.message.content[0]
                    if 'text' in content:
                        result_text = content['text']
                        if language == 'auto' and hasattr(choice.message, "annotations"):
                            result_lang = choice.message.annotations[0]['language']
                        else:
                            result_lang = None
                    else:
                        result_text = "No text content found"
                        result_lang = None
                else:
                    result_text = "Incomplete response structure"
                    result_lang = None
            else:
                result_text = "No recognition result found in response"
                result_lang = None
        else:
            status_code = getattr(response, 'status_code', 'Unknown')
            error_msg = getattr(response, 'message', 'Unknown error')
            result_text = f"Request failed (Status: {status_code}): {error_msg}"
            result_lang = None
    except Exception as e:
        result_text = f"Processing error: {str(e)}"
        result_lang = None
    # Map result_lang to display name
    lang_display = {
        "auto": "Auto Detect",
        "zh": "Chinese",
        "en": "English",
        "ja": "Japanese",
        "ko": "Korean",
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "ar": "Arabic",
        "it": "Italian",
        "ru": "Russian",
        "pt": "Portuguese"
    }
    if result_lang in lang_display:
        result_lang = lang_display[result_lang]
    elif result_lang is not None:
        result_lang = f"Unknown Language ({result_lang})"

    return result_text, result_lang
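
# A minimal sketch of calling asr_inference directly, kept as comments so it does
# not affect the app. The file name "sample.wav" is a hypothetical placeholder and
# API_KEY is assumed to be set in the environment, as above:
#
#   text, detected = asr_inference(
#       audio_file="sample.wav",  # hypothetical local audio file
#       context="",               # optional biasing context
#       language="auto",          # or a fixed code such as "en"
#       enable_itn=True,
#   )
#   print(detected, text)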

with gr.Blocks(theme=gr.themes.Soft(), title="Speech Recognition Tool") as demo:
    # ========== LOGO Area (Centered + Enlarged) ==========
    gr.Markdown("""
    <div style="width: 100%; display: flex; justify-content: center; margin: 30px 0;">
        <img src="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/00EE8C99-9C05-4236-A6D0-B58FF172D31B.png"
             alt="Qwen-ASR Logo"
             width="300"
             style="border-radius: 12px; box-shadow: 0 6px 12px rgba(0,0,0,0.15);"/>
    </div>
    """, sanitize_html=False)

    # ========== API Documentation Link ==========
    gr.Markdown("""
    <div style="text-align: center; margin: 10px 0; font-size: 14px; color: #555;">
        <a href="https://help.aliyun.com/zh/dashscope/developer-reference/"
           target="_blank"
           style="color: #0066cc; text-decoration: none;">
            View DashScope API Documentation
        </a>
    </div>
    """, sanitize_html=False)
gr.Markdown("Upload an audio file to get speech-to-text results.\nSupports custom context for tailored recognition. Supports language detection and inverse text normalization.") | |
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Upload Audio", type="filepath")
            context_input = gr.Textbox(label="Context (Optional)", value="", interactive=True)
            language = gr.Dropdown(
                label="Language Setting",
                choices=[
                    ("Auto Detect", "auto"),
                    ("Chinese", "zh"),
                    ("English", "en"),
                    ("Japanese", "ja"),
                    ("Korean", "ko"),
                    ("Spanish", "es"),
                    ("French", "fr"),
                    ("German", "de"),
                    ("Arabic", "ar"),
                    ("Italian", "it"),
                    ("Russian", "ru"),
                    ("Portuguese", "pt")
                ],
                value="auto"
            )
            enable_itn = gr.Checkbox(label="Enable Inverse Text Normalization (ITN)", value=False)
            submit_btn = gr.Button("Start Recognition", variant="primary")
        with gr.Column():
            text_output = gr.Textbox(label="Recognition Result", interactive=False, lines=6, max_lines=12)
            lang_output = gr.Textbox(label="Detected Language (only in auto mode)", interactive=False, lines=1, max_lines=12)
    submit_btn.click(
        fn=asr_inference,
        inputs=[audio_input, context_input, language, enable_itn],
        outputs=[text_output, lang_output]
    )
    # Example Section
    gr.Markdown("### Examples")
    examples_data = {
        "Example 1 - CSGO Match": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/csgo.wav",
            "context": "A csgo match between NAVI and FazeClan in Major Paris 2023. S1mple and B1t are in NAVI. Ropz, Rain, Karrigan and Twistzz are in Faze.",
            "description": "Game commentary (Pro Terms & Names)"
        },
        "Example 2 - Noisy Environment": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise3.wav",
            "context": "",
            "description": "English Recognition in Noise"
        },
        "Example 3 - Complex Audio": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise1.wav",
            "context": "",
            "description": "Dialect Recognition in Heavy Noise"
        }
    }
    with gr.Row():
        for title, data in examples_data.items():
            with gr.Column():
                example_btn = gr.Button(title, variant="secondary", size="sm")
                gr.Markdown(f"*{data['description']}*", elem_classes=["example-desc"])
                # Bind this iteration's values via default arguments so each button
                # loads its own example rather than the last one in the loop.
                example_btn.click(
                    fn=lambda audio=data['audio'], context=data['context']: (audio, context),
                    outputs=[audio_input, context_input]
                )

if __name__ == "__main__":
    demo.launch()
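
# To run this outside the Space (assumption: a local environment with the DashScope
# SDK installed), set the same API_KEY environment variable the script reads above
# and start the app, e.g.:
#
#   API_KEY=<your-dashscope-key> python app.py   # "app.py" assumes the usual entry-point name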