import gradio as gr
import os
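# Install/refresh the DashScope SDK at startup (a common pattern for hosted demo Spaces).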
os.system('pip install dashscope -U')
import dashscope
from dashscope import MultiModalConversation

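# Read the DashScope API key from the environment; never hard-code credentials.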
API_KEY = os.environ['API_KEY']

dashscope.api_key = API_KEY
dashscope.base_http_api_url = "https://dashscope.aliyuncs.com/api/v1"

def asr_inference(audio_file, context, language, enable_itn):
    if not audio_file:
        # Return a tuple to match the two Gradio outputs (text, detected language).
        return "Please upload an audio file", None

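    # The system message carries the optional biasing context (names, jargon);
    # the user message carries the audio to transcribe.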
    messages = [
        {
            "role": "system",
            "content": [
                {"text": context},
            ]
        },
        {
            "role": "user",
            "content": [
                {"audio": audio_file},
            ]
        }
    ]

    # Build asr_options once; a language is pinned only when the user did not
    # choose auto, otherwise the model detects it (enable_lid).
    asr_options = {
        "enable_lid": True,
        "enable_itn": enable_itn
    }
    if language != 'auto':
        asr_options["language"] = language

    response = MultiModalConversation.call(
        model="qwen3-asr-flash",
        messages=messages,
        result_format="message",
        asr_options=asr_options
    )

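    # The SDK returns a response object whose fields may be absent on failure,
    # so walk the structure defensively.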
    try:
        if hasattr(response, 'status_code') and response.status_code == 200:
            if (hasattr(response, 'output') and 
                hasattr(response.output, 'choices') and 
                len(response.output.choices) > 0):
                
                choice = response.output.choices[0]
                if (hasattr(choice, 'message') and 
                    hasattr(choice.message, 'content') and 
                    len(choice.message.content) > 0):
                    
                    content = choice.message.content[0]
                    if 'text' in content:
                        result_text = content['text']
                        if language == 'auto' and hasattr(choice.message, "annotations"):
                            result_lang = choice.message.annotations[0]['language']
                        else:
                            result_lang = None
                    else:
                        result_text = "No text content found"
                        result_lang = None
                else:
                    result_text = "Incomplete response structure"
                    result_lang = None
            else:
                result_text = "No recognition result found in response"
                result_lang = None
        else:
            status_code = getattr(response, 'status_code', 'Unknown')
            error_msg = getattr(response, 'message', 'Unknown error')
            result_text = f"Request failed (Status: {status_code}): {error_msg}"
            result_lang = None
            
    except Exception as e:
        result_text = f"Processing error: {str(e)}"
        result_lang = None

    # Map result_lang to display name
    lang_display = {
        "auto": "Auto Detect",
        "zh": "Chinese",
        "en": "English",
        "ja": "Japanese",
        "ko": "Korean",
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "ar": "Arabic",
        "it": "Italian",
        "ru": "Russian",
        "pt": "Portuguese"
    }
    if result_lang in lang_display:
        result_lang = lang_display[result_lang]
    elif result_lang is not None:
        result_lang = f"Unknown Language ({result_lang})"

    return result_text, result_lang


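# ========== UI Definition ==========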
with gr.Blocks(theme=gr.themes.Soft(), title="Speech Recognition Tool") as demo:
    # ========== LOGO Area (Centered + Enlarged) ==========
    gr.Markdown("""
    <div style="width: 100%; display: flex; justify-content: center; margin: 30px 0;">
        <img src="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/00EE8C99-9C05-4236-A6D0-B58FF172D31B.png" 
             alt="Qwen-ASR Logo" 
             width="300"
             style="border-radius: 12px; box-shadow: 0 6px 12px rgba(0,0,0,0.15);"/>
    </div>
    """, sanitize_html=False)

    # ========== API Documentation Link ==========
    gr.Markdown("""
    <div style="text-align: center; margin: 10px 0; font-size: 14px; color: #555;">
        🌐 <a href="https://help.aliyun.com/zh/dashscope/developer-reference/" 
              target="_blank" 
              style="color: #0066cc; text-decoration: none;">
              View DashScope API Documentation
        </a>
    </div>
    """, sanitize_html=False)

    gr.Markdown("Upload an audio file to get speech-to-text results.\nSupports custom context for tailored recognition. Supports language detection and inverse text normalization.")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="🎀 Upload Audio", type="filepath")
            context_input = gr.Textbox(label="πŸ“ Context (Optional)", value="", interactive=True)
            language = gr.Dropdown(
                label="🌍 Language Setting",
                choices=[
                    ("Auto Detect", "auto"),
                    ("Chinese", "zh"),
                    ("English", "en"),
                    ("Japanese", "ja"),
                    ("Korean", "ko"),
                    ("Spanish", "es"),
                    ("French", "fr"),
                    ("German", "de"),
                    ("Arabic", "ar"),
                    ("Italian", "it"),
                    ("Russian", "ru"),
                    ("Portuguese", "pt")
                ],
                value="auto"
            )
            enable_itn = gr.Checkbox(label="πŸ”„ Enable Inverse Text Normalization (ITN)", value=False)
            submit_btn = gr.Button("πŸš€ Start Recognition", variant="primary")

        with gr.Column():
            text_output = gr.Textbox(label="πŸ“ Recognition Result", interactive=False, lines=6, max_lines=12)
            lang_output = gr.Textbox(label="πŸ“ Detected Language (only in auto mode)", interactive=False, lines=1, max_lines=1)

    submit_btn.click(
        fn=asr_inference,
        inputs=[audio_input, context_input, language, enable_itn],
        outputs=[text_output, lang_output]
    )
    

    # Example Section
    gr.Markdown("### πŸ’‘ Examples")
    
    examples_data = {
        "Example 1 - CSGO Match": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/csgo.wav",
            "context": "A csgo match between NAVI and FazeClan in Major Paris 2023. S1mple and B1t are in NAVI. Ropz, Rain, Karrigan and Twistzz are in Faze.",
            "description": "Game commentary (Pro Terms & Names)"
        },
        "Example 2 - Noisy Environment": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise3.wav",
            "context": "",
            "description": "English Recognition in Noise"
        },
        "Example 3 - Complex Audio": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise1.wav",
            "context": "",
            "description": "Dialect Recognition in Heavy Noise"
        }
    }
    
    with gr.Row():
        for title, data in examples_data.items():
            with gr.Column():
                example_btn = gr.Button(f"πŸ“Ž {title}", variant="secondary", size="sm")
                gr.Markdown(f"*{data['description']}*", elem_classes=["example-desc"])
                
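                # Bind the loop values as lambda defaults so each button loads
                # its own example (avoids Python's late-binding closure pitfall).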
                example_btn.click(
                    fn=lambda audio=data['audio'], context=data['context']: (audio, context),
                    outputs=[audio_input, context_input]
                )

if __name__ == "__main__":
    demo.launch()
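
# A minimal sketch (hypothetical, not part of the demo): transcribe one file
# from a plain script without the UI, assuming the same API_KEY is configured
# and a local "sample.wav" exists.
#
#   text, lang = asr_inference("sample.wav", context="", language="auto", enable_itn=False)
#   print(f"[{lang}] {text}")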