import os
import azure.cognitiveservices.speech as speechsdk
from time import sleep
from datetime import datetime
import gradio as gr
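# Requires the azure-cognitiveservices-speech and gradio packages
# (e.g. pip install azure-cognitiveservices-speech gradio).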

# Get Speech service credentials from environment settings
speech_key, service_region = os.environ['SPEECH__SERVICE__KEY'], os.environ['SPEECH__SERVICE__REGION']

# Define languages: the source is Chinese, the target is Finnish
from_language, to_languages = 'zh-CN', ['fi-FI']

def synthesize_audio(text, voice="fi-FI-NooraNeural", output_audio="translation_audio.wav"):
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    speech_config.speech_synthesis_voice_name = voice
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_audio)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    result = synthesizer.speak_text_async(text).get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f"Audio synthesized and saved to: {output_audio}")
    elif result.reason == speechsdk.ResultReason.Canceled:
        # Error details live on cancellation_details, not on the result itself
        cancellation_details = result.cancellation_details
        print(f"Synthesis failed: {cancellation_details.reason}; {cancellation_details.error_details}")

def get_result_text(reason, translation_recognition_result, to_language):
    reason_format = {
        speechsdk.ResultReason.TranslatedSpeech:
            f'RECOGNIZED "{from_language}": {translation_recognition_result.text}\n' +
            f'TRANSLATED into "{to_language}": {translation_recognition_result.translations[to_language]}',
        speechsdk.ResultReason.RecognizedSpeech: f'Recognized: "{translation_recognition_result.text}"',
        speechsdk.ResultReason.NoMatch: f'No speech could be recognized: {translation_recognition_result.no_match_details}',
        speechsdk.ResultReason.Canceled: f'Speech Recognition canceled: {translation_recognition_result.cancellation_details}'
    }
    return reason_format.get(reason, 'Unable to recognize speech')
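
# Example single-shot usage (a sketch, not used by the Gradio app below;
# assumes `recognizer` is an already-configured TranslationRecognizer):
#   result = recognizer.recognize_once_async().get()
#   print(get_result_text(result.reason, result, to_languages[0]))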

def process_audio(input_audio_path):
    """
    Process the uploaded audio file:
    1. Translate the audio using continuous recognition.
    2. Synthesize the translated text into audio.
    3. Return the translated text (for preview) and the synthesized audio file.
    """
    # Set up translation configuration
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription=speech_key, region=service_region)
    translation_config.speech_recognition_language = from_language
    for lang in to_languages:
        translation_config.add_target_language(lang)

    # Use the uploaded file's path for translation
    audio_config = speechsdk.audio.AudioConfig(filename=input_audio_path)
    translation_recognizer = speechsdk.translation.TranslationRecognizer(
        translation_config=translation_config, audio_config=audio_config)
    done = False
    translations = []
    stop_called = False  # flag to avoid duplicate stop processing

    def handle_result(evt):
        # Print and store translated segments
        if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
            text_seg = evt.result.translations[to_languages[0]]
            print("Segment:", text_seg)
            translations.append(text_seg)

    def stop_cb(evt):
        nonlocal done, stop_called
        if stop_called:
            return
        stop_called = True
        done = True

    # Connect events
    translation_recognizer.recognized.connect(handle_result)
    translation_recognizer.session_stopped.connect(stop_cb)
    translation_recognizer.canceled.connect(stop_cb)

    # Start recognition
    translation_recognizer.start_continuous_recognition()
    while not done:
        sleep(0.5)
    translation_recognizer.stop_continuous_recognition()

    # Join all translated segments into one final text
    final_text = '\n'.join(translations)
    print("Final Translated Text:\n", final_text)

    # Synthesize the translated text into audio
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    synthesized_audio_file = f"translation_audio_{from_language}_to_{to_languages[0]}_{timestamp}.wav"
    synthesize_audio(final_text, voice="fi-FI-NooraNeural", output_audio=synthesized_audio_file)

    # Return both the translated text and the path to the synthesized audio file
    return final_text, synthesized_audio_file
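
# Example direct call outside the Gradio UI (assumes a local file named input.wav exists):
#   text, audio_path = process_audio("input.wav")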

# Create a Gradio interface
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(sources=["upload"], type="filepath", label="Upload Audio"),
    outputs=[gr.Textbox(label="Translated Text Preview"), gr.Audio(type="filepath", label="Translated Audio")],
    title="Speech Translation and Synthesis",
    description="Upload an audio file containing Chinese speech. The app translates the speech to Finnish and synthesizes the translated text into audio for playback and download."
)

if __name__ == "__main__":
    iface.launch()