Spaces:

gitgato
/

speecht-to-speech

Runtime error

App Files Files Community

speecht-to-speech / app.py

gitgato

Update app.py

8ae49e6 verified over 1 year ago

raw

history blame contribute delete

2.05 kB

	import torch
	from transformers import pipeline
	from datasets import load_dataset
	from transformers import AutoModel
	from transformers import pipeline, SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForTextToSpeech
	import numpy as np
	import gradio as gr

	# Automatic speech recognition pipeline backed by the Whisper base checkpoint.
	# Built once at import time; translate() reuses this single instance.
	pipe = pipeline(
	    task="automatic-speech-recognition",
	    model="openai/whisper-base",
	)

	def translate(audio):
	    """Run the speech recognition pipeline in translation mode.

	    Args:
	        audio: Audio input in any form the ASR pipeline accepts
	            (for example a path to an audio file).

	    Returns:
	        The ``"text"`` entry of the pipeline output, i.e. the translated
	        transcription of the spoken input.
	    """
	    # Generation options go through ``generate_kwargs``: passing
	    # ``max_new_tokens`` as a top-level pipeline argument is deprecated.
	    generation_options = {"task": "translate", "max_new_tokens": 256}
	    outputs = pipe(audio, generate_kwargs=generation_options)
	    return outputs["text"]

	# Load the SpeechT5 processor (text tokenisation), the text-to-speech model
	# and the HiFi-GAN vocoder that turns spectrograms into a waveform.
	processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
	# The checkpoint must be loaded with the text-to-speech head. AutoModel
	# resolves SpeechT5 checkpoints to the bare encoder-decoder, which exposes
	# no speech generation method.
	model = SpeechT5ForTextToSpeech.from_pretrained("gitgato/mabama")
	vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

	# Load the speaker embedding used to condition the synthesised voice.
	# NOTE(review): assumes the first row of the dataset carries an "xvector"
	# column with a single speaker embedding -- confirm against the dataset.
	embeddings_dataset = load_dataset("ovieyra21/mabama-v5", split="train")
	speaker_embeddings = torch.tensor(embeddings_dataset[0]["xvector"]).unsqueeze(0)


	def synthesise(text):
	    """Synthesise speech for *text* with the SpeechT5 model.

	    Args:
	        text: The text to speak.

	    Returns:
	        A NumPy array holding the waveform produced by the vocoder.
	    """
	    inputs = processor(text=text, return_tensors="pt")
	    # generate_speech takes the speaker embeddings as its second argument;
	    # supplying the vocoder makes it return a waveform instead of a
	    # spectrogram.
	    speech = model.generate_speech(
	        inputs["input_ids"],
	        speaker_embeddings,
	        vocoder=vocoder,
	    )
	    # Move to CPU first so the conversion also works if the model is ever
	    # placed on an accelerator.
	    return speech.cpu().numpy()

	# Output audio settings: samples are emitted as 16-bit integers, and
	# max_range is the largest value that integer type can represent.
	target_dtype = np.int16
	max_range = np.iinfo(np.int16).max

	def speech_to_speech_translation(audio):
	    """Translate spoken audio and return the translation as synthesised speech.

	    Args:
	        audio: Audio input forwarded unchanged to ``translate``.

	    Returns:
	        A ``(16000, samples)`` tuple, where ``samples`` is the synthesised
	        waveform converted to ``target_dtype`` integers.
	    """
	    translated_text = translate(audio)
	    waveform = synthesise(translated_text)
	    # Clip to [-1, 1] before scaling: a sample outside that range would
	    # otherwise wrap around when cast to a 16-bit integer.
	    clipped = np.clip(waveform, -1.0, 1.0)
	    pcm_samples = (clipped * max_range).astype(target_dtype)
	    return 16000, pcm_samples

	# Gradio interface: microphone audio in, synthesised translated speech out.
	demo = gr.Interface(
	    fn=speech_to_speech_translation,
	    # With the ``sources=`` API the only valid ``type`` values are "numpy"
	    # and "filepath"; "file" is rejected when the component is built.
	    # "filepath" hands the recording to the callback as a path string.
	    inputs=gr.Audio(sources=["microphone"], type="filepath", label="Input Audio"),
	    outputs=gr.Audio(label="Generated Speech", type="numpy"),
	    title="Speech-to-Speech Translation",
	    description="Translate speech input to synthesized speech output.",
	)

	# Start the web app.
	demo.launch(debug=True)