import gradio as gr
import os

# Install/upgrade the DashScope SDK at startup (a common pattern in hosted demos).
os.system('pip install dashscope -U')

import dashscope
from dashscope import MultiModalConversation

# DashScope credentials and endpoint.
API_KEY = os.environ['API_KEY']
dashscope.api_key = API_KEY
dashscope.base_http_api_url = "https://dashscope.aliyuncs.com/api/v1"


def asr_inference(audio_file, context, language, enable_itn):
    """Transcribe an audio file with qwen3-asr-flash; returns (text, detected_language)."""
    if not audio_file:
        # Return a (text, language) tuple so both Gradio outputs are populated.
        return "Please upload an audio file", None

    # The optional context string is sent as a system message to bias recognition
    # toward domain terms and names.
    messages = [
        {
            "role": "system",
            "content": [
                {"text": context},
            ],
        },
        {
            "role": "user",
            "content": [
                {"audio": audio_file},
            ],
        },
    ]

    # Pass an explicit "language" only when the user did not pick auto-detect;
    # language identification (LID) stays enabled in both cases.
    asr_options = {"enable_lid": True, "enable_itn": enable_itn}
    if language != 'auto':
        asr_options["language"] = language

    response = MultiModalConversation.call(
        model="qwen3-asr-flash",
        messages=messages,
        result_format="message",
        asr_options=asr_options,
    )

    try:
        if hasattr(response, 'status_code') and response.status_code == 200:
            if (hasattr(response, 'output')
                    and hasattr(response.output, 'choices')
                    and len(response.output.choices) > 0):
                choice = response.output.choices[0]
                if (hasattr(choice, 'message')
                        and hasattr(choice.message, 'content')
                        and len(choice.message.content) > 0):
                    content = choice.message.content[0]
                    if 'text' in content:
                        result_text = content['text']
                        # The detected language is only reported in auto mode.
                        if language == 'auto' and hasattr(choice.message, "annotations"):
                            result_lang = choice.message.annotations[0]['language']
                        else:
                            result_lang = None
                    else:
                        result_text = "No text content found"
                        result_lang = None
                else:
                    result_text = "Incomplete response structure"
                    result_lang = None
            else:
                result_text = "No recognition result found in response"
                result_lang = None
        else:
            status_code = getattr(response, 'status_code', 'Unknown')
            error_msg = getattr(response, 'message', 'Unknown error')
            result_text = f"Request failed (Status: {status_code}): {error_msg}"
            result_lang = None
    except Exception as e:
        result_text = f"Processing error: {str(e)}"
        result_lang = None

    # Map the detected language code to a human-readable display name.
    lang_display = {
        "auto": "Auto Detect",
        "zh": "Chinese",
        "en": "English",
        "ja": "Japanese",
        "ko": "Korean",
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "ar": "Arabic",
        "it": "Italian",
        "ru": "Russian",
        "pt": "Portuguese",
    }
    if result_lang in lang_display:
        result_lang = lang_display[result_lang]
    elif result_lang is not None:
        result_lang = f"Unknown Language ({result_lang})"

    return result_text, result_lang
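
# For reference, a sketch of the successful-response shape the parsing above
# expects, inferred from the attributes it accesses (values are illustrative):
#
#   response.status_code                                           -> 200
#   response.output.choices[0].message.content[0]['text']          -> "recognized text"
#   response.output.choices[0].message.annotations[0]['language']  -> "en"  (auto mode only)
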
with gr.Blocks(theme=gr.themes.Soft(), title="Speech Recognition Tool") as demo:
    # ========== LOGO Area (Centered + Enlarged) ==========
    # (the logo's <img> markup is unavailable; its alt text is kept as a placeholder)
    gr.Markdown("""
<div style="text-align: center;">Qwen-ASR Logo</div>
""", sanitize_html=False) # ========== API Documentation Link ========== gr.Markdown("""
🌐 View DashScope API Documentation
""", sanitize_html=False) gr.Markdown("Upload an audio file to get speech-to-text results.\nSupports custom context for tailored recognition. Supports language detection and inverse text normalization.") with gr.Row(): with gr.Column(): audio_input = gr.Audio(label="🎤 Upload Audio", type="filepath") context_input = gr.Textbox(label="📝 Context (Optional)", value="", interactive=True) language = gr.Dropdown( label="🌍 Language Setting", choices=[ ("Auto Detect", "auto"), ("Chinese", "zh"), ("English", "en"), ("Japanese", "ja"), ("Korean", "ko"), ("Spanish", "es"), ("French", "fr"), ("German", "de"), ("Arabic", "ar"), ("Italian", "it"), ("Russian", "ru"), ("Portuguese", "pt") ], value="auto" ) enable_itn = gr.Checkbox(label="🔄 Enable Inverse Text Normalization (ITN)", value=False) submit_btn = gr.Button("🚀 Start Recognition", variant="primary") with gr.Column(): text_output = gr.Textbox(label="📝 Recognition Result", interactive=False, lines=6, max_lines=12) lang_output = gr.Textbox(label="📝 Detected Language (only in auto mode)", interactive=False, lines=1, max_lines=12) submit_btn.click( fn=asr_inference, inputs=[audio_input, context_input, language, enable_itn], outputs=[text_output, lang_output] ) # Example Section gr.Markdown("### 💡 Examples") examples_data = { "Example 1 - CSGO Match": { "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/csgo.wav", "context": "A csgo match between NAVI and FazeClan in Major Paris 2023. S1mple and B1t are in NAVI. Ropz, Rain, Karrigan and Twistzz are in Faze.", "description": "Game commentary (Pro Terms & Names)" }, "Example 2 - Noisy Environment": { "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise3.wav", "context": "", "description": "English Recognition in Noise" }, "Example 3 - Complex Audio": { "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise1.wav", "context": "", "description": "Dialect Recognition in Heavy Noise" } } with gr.Row(): for title, data in examples_data.items(): with gr.Column(): example_btn = gr.Button(f"📎 {title}", variant="secondary", size="sm") gr.Markdown(f"*{data['description']}*", elem_classes=["example-desc"]) example_btn.click( fn=lambda audio=data['audio'], context=data['context']: (audio, context), outputs=[audio_input, context_input] ) if __name__ == "__main__": demo.launch()