import os
import azure.cognitiveservices.speech as speechsdk
from time import sleep
from datetime import datetime
import gradio as gr

# Read the Azure Speech key and region from the environment (both must be set)
speech_key, service_region = os.environ['SPEECH__SERVICE__KEY'], os.environ['SPEECH__SERVICE__REGION']
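# Example setup (shell), assuming a key from the Azure portal and a region such as "westeurope":
#   export SPEECH__SERVICE__KEY="<your-speech-resource-key>"
#   export SPEECH__SERVICE__REGION="<your-region>"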
# Define languages: the source is Chinese (zh-CN) and the translation target is Finnish (fi-FI)
from_language, to_languages = 'zh-CN', ['fi-FI']

def synthesize_audio(text, voice="fi-FI-NooraNeural", output_audio="translation_audio.wav"):
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    speech_config.speech_synthesis_voice_name = voice
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_audio)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    
    result = synthesizer.speak_text_async(text).get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f"Audio synthesized and saved to: {output_audio}")
    elif result.reason == speechsdk.ResultReason.Canceled:
        # Error details live on cancellation_details, not on the result object itself.
        cancellation = result.cancellation_details
        print(f"Synthesis canceled: {cancellation.reason}; details: {cancellation.error_details}")

def get_result_text(reason, translation_recognition_result, to_language):
    """Format a recognition result for logging, keyed by its result reason."""
    reason_format = {
        speechsdk.ResultReason.TranslatedSpeech:
            f'RECOGNIZED "{from_language}": {translation_recognition_result.text}\n' +
            f'TRANSLATED into "{to_language}": {translation_recognition_result.translations[to_language]}',
        speechsdk.ResultReason.RecognizedSpeech: f'Recognized: "{translation_recognition_result.text}"',
        speechsdk.ResultReason.NoMatch: f'No speech could be recognized: {translation_recognition_result.no_match_details}',
        speechsdk.ResultReason.Canceled: f'Speech Recognition canceled: {translation_recognition_result.cancellation_details}'
    }
    return reason_format.get(reason, 'Unable to recognize speech')
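
# Hypothetical usage (not wired into the app below): format and print each
# recognized event from a recognizer callback, e.g.:
#   translation_recognizer.recognized.connect(
#       lambda evt: print(get_result_text(evt.result.reason, evt.result, to_languages[0])))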

def process_audio(input_audio_path):
    """
    Process the uploaded audio file:
    1. Translate the audio using continuous recognition.
    2. Synthesize the translated text into audio.
    3. Return the translated text (for preview) and synthesized audio file.
    """
    # Set up translation configuration
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription=speech_key, region=service_region)
    translation_config.speech_recognition_language = from_language
    for lang in to_languages:
        translation_config.add_target_language(lang)
        
    # Use the uploaded file's path for translation
    audio_config = speechsdk.audio.AudioConfig(filename=input_audio_path)
    translation_recognizer = speechsdk.translation.TranslationRecognizer(
        translation_config=translation_config, audio_config=audio_config)

    done = False
    translations = []
    stop_called = False  # flag to avoid duplicate stop processing

    def handle_result(evt):
        # Print and store translated segments
        if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
            text_seg = evt.result.translations[to_languages[0]]
            print("Segment: ", text_seg)
            translations.append(text_seg)

    def stop_cb(evt):
        nonlocal done, stop_called
        if stop_called:
            return
        stop_called = True
        done = True

    # Connect events
    translation_recognizer.recognized.connect(handle_result)
    translation_recognizer.session_stopped.connect(stop_cb)
    translation_recognizer.canceled.connect(stop_cb)

    # Start recognition
    translation_recognizer.start_continuous_recognition()
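    # Continuous recognition runs on background threads; poll here until the
    # session_stopped or canceled callback flips the flag.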
    while not done:
        sleep(0.5)
    translation_recognizer.stop_continuous_recognition()

    # Join all translated segments into one final text
    final_text = '\n'.join(translations)
    print("Final Translated Text:\n", final_text)

    # Synthesize the translated text into audio
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    synthesized_audio_file = f"translation_audio_{from_language}_to_{to_languages[0]}_{timestamp}.wav"
    synthesize_audio(final_text, voice="fi-FI-NooraNeural", output_audio=synthesized_audio_file)

    # Return both the translated text and the path to the synthesized audio file.
    return final_text, synthesized_audio_file

# Create a Gradio Interface
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(sources=["upload"], type="filepath", label="Upload Audio"),
    outputs=[gr.Textbox(label="Translated Text Preview"), gr.Audio(type="filepath", label="Translated Audio")],
    title="Speech Translation and Synthesis",
    description="Upload an audio file containing Chinese speech. The app translates the speech to Finnish and synthesizes the translated text into audio for playback and download."
)

if __name__ == "__main__":
    iface.launch()
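
# To run (assuming this file is saved as app.py):
#   python app.py
# Gradio serves the UI at http://127.0.0.1:7860 by default.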