import gradio as gr import os os.system('pip install dashscope -U') import dashscope from dashscope import MultiModalConversation API_KEY = os.environ['API_KEY'] dashscope.api_key = API_KEY dashscope.base_http_api_url = "https://dashscope.aliyuncs.com/api/v1" def asr_inference(audio_file, context, language, enable_itn): if not audio_file: return "Please upload an audio file" messages = [ { "role": "system", "content": [ {"text": context}, ] }, { "role": "user", "content": [ {"audio": audio_file}, ] } ] if language == 'auto': response = MultiModalConversation.call( model="qwen3-asr-flash", messages=messages, result_format="message", asr_options={ "enable_lid": True, "enable_itn": enable_itn } ) else: response = MultiModalConversation.call( model="qwen3-asr-flash", messages=messages, result_format="message", asr_options={ "language": language, "enable_lid": True, "enable_itn": enable_itn } ) try: if hasattr(response, 'status_code') and response.status_code == 200: if (hasattr(response, 'output') and hasattr(response.output, 'choices') and len(response.output.choices) > 0): choice = response.output.choices[0] if (hasattr(choice, 'message') and hasattr(choice.message, 'content') and len(choice.message.content) > 0): content = choice.message.content[0] if 'text' in content: result_text = content['text'] if language == 'auto' and hasattr(choice.message, "annotations"): result_lang = choice.message.annotations[0]['language'] else: result_lang = None else: result_text = "No text content found" result_lang = None else: result_text = "Incomplete response structure" result_lang = None else: result_text = "No recognition result found in response" result_lang = None else: status_code = getattr(response, 'status_code', 'Unknown') error_msg = getattr(response, 'message', 'Unknown error') result_text = f"Request failed (Status: {status_code}): {error_msg}" result_lang = None except Exception as e: result_text = f"Processing error: {str(e)}" result_lang = None # Map result_lang to display name lang_display = { "auto": "Auto Detect", "zh": "Chinese", "en": "English", "ja": "Japanese", "ko": "Korean", "es": "Spanish", "fr": "French", "de": "German", "ar": "Arabic", "it": "Italian", "ru": "Russian", "pt": "Portuguese" } if result_lang in lang_display: result_lang = lang_display[result_lang] elif result_lang is not None: result_lang = f"Unknown Language ({result_lang})" return result_text, result_lang with gr.Blocks(theme=gr.themes.Soft(), title="Speech Recognition Tool") as demo: # ========== LOGO Area (Centered + Enlarged) ========== gr.Markdown("""