import os
import azure.cognitiveservices.speech as speechsdk
from time import sleep
from datetime import datetime
import gradio as gr
# Get environment settings
speech_key, service_region = os.environ['SPEECH__SERVICE__KEY'], os.environ['SPEECH__SERVICE__REGION']
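# Both variables must be set in the environment (e.g. as Space secrets);
# otherwise the lookups above raise KeyError at startup.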

# Define languages: source is Chinese, target is Finnish
from_language, to_languages = 'zh-CN', ['fi-FI']


def synthesize_audio(text, voice="fi-FI-NooraNeural", output_audio="translation_audio.wav"):
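    """Synthesize text with the given neural voice and save it to output_audio (WAV)."""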
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    speech_config.speech_synthesis_voice_name = voice
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_audio)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    result = synthesizer.speak_text_async(text).get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f"Audio synthesized and saved to: {output_audio}")
    elif result.reason == speechsdk.ResultReason.Canceled:
        # Error details live on cancellation_details; the result itself has no error_details attribute
        print("Synthesis failed:", result.cancellation_details.error_details)
    else:
        print("Synthesis failed:", result.reason)
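

# Helper that formats recognition/translation results for logging; it is
# defined here but not called anywhere in the app flow below.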
def get_result_text(reason, translation_recognition_result, to_language):
    reason_format = {
        speechsdk.ResultReason.TranslatedSpeech:
            f'RECOGNIZED "{from_language}": {translation_recognition_result.text}\n' +
            f'TRANSLATED into "{to_language}": {translation_recognition_result.translations[to_language]}',
        speechsdk.ResultReason.RecognizedSpeech: f'Recognized: "{translation_recognition_result.text}"',
        speechsdk.ResultReason.NoMatch: f'No speech could be recognized: {translation_recognition_result.no_match_details}',
        speechsdk.ResultReason.Canceled: f'Speech Recognition canceled: {translation_recognition_result.cancellation_details}'
    }
    return reason_format.get(reason, 'Unable to recognize speech')


def process_audio(input_audio_path):
    """
    Process the uploaded audio file:
    1. Translate the audio using continuous recognition.
    2. Synthesize the translated text into audio.
    3. Return the translated text (for preview) and the synthesized audio file.
    """
    # Set up translation configuration
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription=speech_key, region=service_region)
    translation_config.speech_recognition_language = from_language
    for lang in to_languages:
        translation_config.add_target_language(lang)
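    # The SDK accepts several target languages, but only the first entry in
    # to_languages is used for synthesis and file naming below.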

    # Use the uploaded file's path for translation
    audio_config = speechsdk.audio.AudioConfig(filename=input_audio_path)
    translation_recognizer = speechsdk.translation.TranslationRecognizer(
        translation_config=translation_config, audio_config=audio_config)

    done = False
    translations = []
    stop_called = False  # flag to avoid duplicate stop processing

    def handle_result(evt):
        # Print and store translated segments
        if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
            text_seg = evt.result.translations[to_languages[0]]
            print("Segment: ", text_seg)
            translations.append(text_seg)

    def stop_cb(evt):
        nonlocal done, stop_called
        if stop_called:
            return
        stop_called = True
        done = True
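
    # Both session_stopped (end of the input file) and canceled (errors or
    # end of stream) route to stop_cb, so the wait loop below always exits.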
    # Connect events
    translation_recognizer.recognized.connect(handle_result)
    translation_recognizer.session_stopped.connect(stop_cb)
    translation_recognizer.canceled.connect(stop_cb)

    # Start recognition
    translation_recognizer.start_continuous_recognition()
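    # Recognition runs on background threads; poll until a stop event fires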
    while not done:
        sleep(0.5)
    translation_recognizer.stop_continuous_recognition()

    # Join all translated segments into one final text
    final_text = '\n'.join(translations)
    print("Final Translated Text:\n", final_text)

    # Synthesize the translated text into audio
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    synthesized_audio_file = f"translation_audio_{from_language}_to_{to_languages[0]}_{timestamp}.wav"
    synthesize_audio(final_text, voice="fi-FI-NooraNeural", output_audio=synthesized_audio_file)

    # Return both the translated text and the path to the synthesized audio file.
    return final_text, synthesized_audio_file


# Create a Gradio Interface
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(sources=["upload"], type="filepath", label="Upload Audio"),
    outputs=[gr.Textbox(label="Translated Text Preview"), gr.Audio(type="filepath", label="Translated Audio")],
    title="Speech Translation and Synthesis",
    description="Upload an audio file containing Chinese speech. The app translates the speech to Finnish and synthesizes the translated text into audio for playback and download."
)

if __name__ == "__main__":
    iface.launch()
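
# Example local run (the filename app.py is hypothetical; adjust to match
# this file's actual name):
#   export SPEECH__SERVICE__KEY=<your-key>
#   export SPEECH__SERVICE__REGION=<your-region>
#   python app.py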