import gradio as gr
import os
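# Install/refresh the DashScope SDK at startup (a common pattern for hosted demo Spaces).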
os.system('pip install dashscope -U')
import dashscope
from dashscope import MultiModalConversation

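# Read the DashScope API key from the environment; never hard-code credentials.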
API_KEY = os.environ['API_KEY']

dashscope.api_key = API_KEY
dashscope.base_http_api_url = "https://dashscope.aliyuncs.com/api/v1"

def asr_inference(audio_file, context, language, enable_itn):
    if not audio_file:
        # Return a tuple to match the two Gradio outputs (text, detected language).
        return "Please upload an audio file", None

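    # The system message carries the optional biasing context (names, jargon);
    # the user message carries the audio to transcribe.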
    messages = [
        {
            "role": "system",
            "content": [
                {"text": context},
            ]
        },
        {
            "role": "user",
            "content": [
                {"audio": audio_file},
            ]
        }
    ]

    # Build asr_options once; a language is pinned only when the user did not
    # choose auto, otherwise the model detects it (enable_lid).
    asr_options = {
        "enable_lid": True,
        "enable_itn": enable_itn
    }
    if language != 'auto':
        asr_options["language"] = language

    response = MultiModalConversation.call(
        model="qwen3-asr-flash",
        messages=messages,
        result_format="message",
        asr_options=asr_options
    )

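    # The SDK returns a response object whose fields may be absent on failure,
    # so walk the structure defensively.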
    try:
        if hasattr(response, 'status_code') and response.status_code == 200:
            if (hasattr(response, 'output') and 
                hasattr(response.output, 'choices') and 
                len(response.output.choices) > 0):
                
                choice = response.output.choices[0]
                if (hasattr(choice, 'message') and 
                    hasattr(choice.message, 'content') and 
                    len(choice.message.content) > 0):
                    
                    content = choice.message.content[0]
                    if 'text' in content:
                        result_text = content['text']
                        if language == 'auto' and hasattr(choice.message, "annotations"):
                            result_lang = choice.message.annotations[0]['language']
                        else:
                            result_lang = None
                    else:
                        result_text = "No text content found"
                        result_lang = None
                else:
                    result_text = "Incomplete response structure"
                    result_lang = None
            else:
                result_text = "No recognition result found in response"
                result_lang = None
        else:
            status_code = getattr(response, 'status_code', 'Unknown')
            error_msg = getattr(response, 'message', 'Unknown error')
            result_text = f"Request failed (Status: {status_code}): {error_msg}"
            result_lang = None
            
    except Exception as e:
        result_text = f"Processing error: {str(e)}"
        result_lang = None

    # Map result_lang to display name
    lang_display = {
        "auto": "Auto Detect",
        "zh": "Chinese",
        "en": "English",
        "ja": "Japanese",
        "ko": "Korean",
        "es": "Spanish",
        "fr": "French",
        "de": "German",
        "ar": "Arabic",
        "it": "Italian",
        "ru": "Russian",
        "pt": "Portuguese"
    }
    if result_lang in lang_display:
        result_lang = lang_display[result_lang]
    elif result_lang is not None:
        result_lang = f"Unknown Language ({result_lang})"

    return result_text, result_lang


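# ========== UI Definition ==========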
with gr.Blocks(theme=gr.themes.Soft(), title="Speech Recognition Tool") as demo:
    # ========== LOGO Area (Centered + Enlarged) ==========
    gr.Markdown("""
    <div style="width: 100%; display: flex; justify-content: center; margin: 30px 0;">
        <img src="https://modelscope.oss-cn-beijing.aliyuncs.com/resource/00EE8C99-9C05-4236-A6D0-B58FF172D31B.png" 
             alt="Qwen-ASR Logo" 
             width="300"
             style="border-radius: 12px; box-shadow: 0 6px 12px rgba(0,0,0,0.15);"/>
    </div>
    """, sanitize_html=False)

    # ========== API Documentation Link ==========
    gr.Markdown("""
    <div style="text-align: center; margin: 10px 0; font-size: 14px; color: #555;">
        🌐 <a href="https://help.aliyun.com/zh/dashscope/developer-reference/" 
              target="_blank" 
              style="color: #0066cc; text-decoration: none;">
              View DashScope API Documentation
        </a>
    </div>
    """, sanitize_html=False)

    gr.Markdown("Upload an audio file to get speech-to-text results.\nSupports custom context for tailored recognition. Supports language detection and inverse text normalization.")

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="🎀 Upload Audio", type="filepath")
            context_input = gr.Textbox(label="πŸ“ Context (Optional)", value="", interactive=True)
            language = gr.Dropdown(
                label="🌍 Language Setting",
                choices=[
                    ("Auto Detect", "auto"),
                    ("Chinese", "zh"),
                    ("English", "en"),
                    ("Japanese", "ja"),
                    ("Korean", "ko"),
                    ("Spanish", "es"),
                    ("French", "fr"),
                    ("German", "de"),
                    ("Arabic", "ar"),
                    ("Italian", "it"),
                    ("Russian", "ru"),
                    ("Portuguese", "pt")
                ],
                value="auto"
            )
            enable_itn = gr.Checkbox(label="πŸ”„ Enable Inverse Text Normalization (ITN)", value=False)
            submit_btn = gr.Button("πŸš€ Start Recognition", variant="primary")

        with gr.Column():
            text_output = gr.Textbox(label="πŸ“ Recognition Result", interactive=False, lines=6, max_lines=12)
            lang_output = gr.Textbox(label="πŸ“ Detected Language (only in auto mode)", interactive=False, lines=1, max_lines=1)

    submit_btn.click(
        fn=asr_inference,
        inputs=[audio_input, context_input, language, enable_itn],
        outputs=[text_output, lang_output]
    )
    

    # Example Section
    gr.Markdown("### πŸ’‘ Examples")
    
    examples_data = {
        "Example 1 - CSGO Match": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/csgo.wav",
            "context": "A csgo match between NAVI and FazeClan in Major Paris 2023. S1mple and B1t are in NAVI. Ropz, Rain, Karrigan and Twistzz are in Faze.",
            "description": "Game commentary (Pro Terms & Names)"
        },
        "Example 2 - Noisy Environment": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise3.wav",
            "context": "",
            "description": "English Recognition in Noise"
        },
        "Example 3 - Complex Audio": {
            "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-ASR/noise1.wav",
            "context": "",
            "description": "Dialect Recognition in Heavy Noise"
        }
    }
    
    with gr.Row():
        for title, data in examples_data.items():
            with gr.Column():
                example_btn = gr.Button(f"πŸ“Ž {title}", variant="secondary", size="sm")
                gr.Markdown(f"*{data['description']}*", elem_classes=["example-desc"])
                
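                # Bind the loop values as lambda defaults so each button loads
                # its own example (avoids Python's late-binding closure pitfall).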
                example_btn.click(
                    fn=lambda audio=data['audio'], context=data['context']: (audio, context),
                    outputs=[audio_input, context_input]
                )

if __name__ == "__main__":
    demo.launch()
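
# A minimal sketch (hypothetical, not part of the demo): transcribe one file
# from a plain script without the UI, assuming the same API_KEY is configured
# and a local "sample.wav" exists.
#
#   text, lang = asr_inference("sample.wav", context="", language="auto", enable_itn=False)
#   print(f"[{lang}] {text}")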