import os
from time import sleep
from datetime import datetime

import azure.cognitiveservices.speech as speechsdk
import gradio as gr

# Get environment settings
speech_key, service_region = os.environ['SPEECH__SERVICE__KEY'], os.environ['SPEECH__SERVICE__REGION']

# Define languages: source is Chinese (zh-CN), target is Finnish (fi-FI)
from_language, to_languages = 'zh-CN', ['fi-FI']


def synthesize_audio(text, voice="fi-FI-NooraNeural", output_audio="translation_audio.wav"):
    """Synthesize the given text to a WAV file using the specified neural voice."""
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    speech_config.speech_synthesis_voice_name = voice
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_audio)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

    result = synthesizer.speak_text_async(text).get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f"Audio synthesized and saved to: {output_audio}")
    elif result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = result.cancellation_details
        print("Synthesis canceled:", cancellation_details.reason)
        if cancellation_details.error_details:
            print("Error details:", cancellation_details.error_details)
    else:
        print("Synthesis failed with reason:", result.reason)


def get_result_text(reason, translation_recognition_result, to_language):
    """Format a single recognition result according to its result reason."""
    reason_format = {
        speechsdk.ResultReason.TranslatedSpeech:
            f'RECOGNIZED "{from_language}": {translation_recognition_result.text}\n'
            f'TRANSLATED into "{to_language}": {translation_recognition_result.translations[to_language]}',
        speechsdk.ResultReason.RecognizedSpeech:
            f'Recognized: "{translation_recognition_result.text}"',
        speechsdk.ResultReason.NoMatch:
            f'No speech could be recognized: {translation_recognition_result.no_match_details}',
        speechsdk.ResultReason.Canceled:
            f'Speech Recognition canceled: {translation_recognition_result.cancellation_details}'
    }
    return reason_format.get(reason, 'Unable to recognize speech')


def process_audio(input_audio_path):
    """
    Process the uploaded audio file:
    1. Translate the audio using continuous recognition.
    2. Synthesize the translated text into audio.
    3. Return the translated text (for preview) and the synthesized audio file.
""" # Set up translation configuration translation_config = speechsdk.translation.SpeechTranslationConfig( subscription=speech_key, region=service_region) translation_config.speech_recognition_language = from_language for lang in to_languages: translation_config.add_target_language(lang) # Use the uploaded file's path for translation audio_config = speechsdk.audio.AudioConfig(filename=input_audio_path) translation_recognizer = speechsdk.translation.TranslationRecognizer( translation_config=translation_config, audio_config=audio_config) done = False translations = [] stop_called = False # flag to avoid duplicate stop processing def handle_result(evt): # Print and store translated segments if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech: text_seg = evt.result.translations[to_languages[0]] print("Segment: ", text_seg) translations.append(text_seg) def stop_cb(evt): nonlocal done, stop_called if stop_called: return stop_called = True done = True # Connect events translation_recognizer.recognized.connect(handle_result) translation_recognizer.session_stopped.connect(stop_cb) translation_recognizer.canceled.connect(stop_cb) # Start recognition translation_recognizer.start_continuous_recognition() while not done: sleep(0.5) translation_recognizer.stop_continuous_recognition() # Join all translated segments into one final text final_text = '\n'.join(translations) print("Final Translated Text:\n", final_text) # Synthesize the translated text into audio timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") synthesized_audio_file = f"translation_audio_{from_language}_to_{to_languages[0]}_{timestamp}.wav" synthesize_audio(final_text, voice="fi-FI-NooraNeural", output_audio=synthesized_audio_file) # Return both the translated text and the path to the synthesized audio file. return final_text, synthesized_audio_file # Create a Gradio Interface iface = gr.Interface( fn=process_audio, inputs=gr.Audio(sources=["upload"], type="filepath", label="Upload Audio"), outputs=[gr.Textbox(label="Translated Text Preview"), gr.Audio(type="filepath", label="Translated Audio")], title="Speech Translation and Synthesis", description="Upload an audio file containing Chinese speech. The app translates the speech to US English and synthesizes the translated text into audio for playback and download." ) if __name__ == "__main__": iface.launch()