import os
import azure.cognitiveservices.speech as speechsdk
from time import sleep
from datetime import datetime
import gradio as gr
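# Requires the azure-cognitiveservices-speech and gradio packages
# (e.g. pip install azure-cognitiveservices-speech gradio).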

# Get Speech service credentials from environment settings
speech_key, service_region = os.environ['SPEECH__SERVICE__KEY'], os.environ['SPEECH__SERVICE__REGION']

# Define languages: the source is Chinese, the target is Finnish
from_language, to_languages = 'zh-CN', ['fi-FI']

def synthesize_audio(text, voice="fi-FI-NooraNeural", output_audio="translation_audio.wav"):
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    speech_config.speech_synthesis_voice_name = voice
    audio_config = speechsdk.audio.AudioOutputConfig(filename=output_audio)
    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    result = synthesizer.speak_text_async(text).get()
    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print(f"Audio synthesized and saved to: {output_audio}")
    elif result.reason == speechsdk.ResultReason.Canceled:
        # Error details live on cancellation_details, not on the result itself
        cancellation_details = result.cancellation_details
        print(f"Synthesis failed: {cancellation_details.reason}; {cancellation_details.error_details}")

def get_result_text(reason, translation_recognition_result, to_language):
    reason_format = {
        speechsdk.ResultReason.TranslatedSpeech:
            f'RECOGNIZED "{from_language}": {translation_recognition_result.text}\n' +
            f'TRANSLATED into "{to_language}": {translation_recognition_result.translations[to_language]}',
        speechsdk.ResultReason.RecognizedSpeech: f'Recognized: "{translation_recognition_result.text}"',
        speechsdk.ResultReason.NoMatch: f'No speech could be recognized: {translation_recognition_result.no_match_details}',
        speechsdk.ResultReason.Canceled: f'Speech Recognition canceled: {translation_recognition_result.cancellation_details}'
    }
    return reason_format.get(reason, 'Unable to recognize speech')
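
# Example single-shot usage (a sketch, not used by the Gradio app below;
# assumes `recognizer` is an already-configured TranslationRecognizer):
#   result = recognizer.recognize_once_async().get()
#   print(get_result_text(result.reason, result, to_languages[0]))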

def process_audio(input_audio_path):
    """
    Process the uploaded audio file:
    1. Translate the audio using continuous recognition.
    2. Synthesize the translated text into audio.
    3. Return the translated text (for preview) and the synthesized audio file.
    """
    # Set up translation configuration
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription=speech_key, region=service_region)
    translation_config.speech_recognition_language = from_language
    for lang in to_languages:
        translation_config.add_target_language(lang)

    # Use the uploaded file's path for translation
    audio_config = speechsdk.audio.AudioConfig(filename=input_audio_path)
    translation_recognizer = speechsdk.translation.TranslationRecognizer(
        translation_config=translation_config, audio_config=audio_config)
    done = False
    translations = []
    stop_called = False  # flag to avoid duplicate stop processing

    def handle_result(evt):
        # Print and store translated segments
        if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
            text_seg = evt.result.translations[to_languages[0]]
            print("Segment:", text_seg)
            translations.append(text_seg)

    def stop_cb(evt):
        nonlocal done, stop_called
        if stop_called:
            return
        stop_called = True
        done = True

    # Connect events
    translation_recognizer.recognized.connect(handle_result)
    translation_recognizer.session_stopped.connect(stop_cb)
    translation_recognizer.canceled.connect(stop_cb)

    # Start recognition
    translation_recognizer.start_continuous_recognition()
    while not done:
        sleep(0.5)
    translation_recognizer.stop_continuous_recognition()

    # Join all translated segments into one final text
    final_text = '\n'.join(translations)
    print("Final Translated Text:\n", final_text)

    # Synthesize the translated text into audio
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    synthesized_audio_file = f"translation_audio_{from_language}_to_{to_languages[0]}_{timestamp}.wav"
    synthesize_audio(final_text, voice="fi-FI-NooraNeural", output_audio=synthesized_audio_file)

    # Return both the translated text and the path to the synthesized audio file
    return final_text, synthesized_audio_file
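
# Example direct call outside the Gradio UI (assumes a local file named input.wav exists):
#   text, audio_path = process_audio("input.wav")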

# Create a Gradio interface
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(sources=["upload"], type="filepath", label="Upload Audio"),
    outputs=[gr.Textbox(label="Translated Text Preview"), gr.Audio(type="filepath", label="Translated Audio")],
    title="Speech Translation and Synthesis",
    description="Upload an audio file containing Chinese speech. The app translates the speech to Finnish and synthesizes the translated text into audio for playback and download."
)

if __name__ == "__main__":
    iface.launch()