|
|
|
import gradio as gr |
|
import shutil |
|
import os |
|
import subprocess |
|
import sys |
|
|
|
# One-time bootstrap: prepare the Wav2Lip dependencies before anything else
# is imported from ./src. Use sys.executable (as the rest of this file does
# for its subprocesses) instead of a bare "python", which may resolve to a
# different interpreter than the one running this app. check=True makes a
# failed setup abort immediately instead of continuing with a broken
# environment.
subprocess.run([sys.executable, "src/setup_wav2lip.py"], check=True)

# Make the project's ./src modules importable below.
sys.path.append(os.path.abspath("./src"))
|
|
|
from whisper_audio_transcriber import transcribe_audio, guardar_transcripcion |
|
from call_openai_api import moni as rtff |
|
|
|
|
|
# --- Filesystem layout (all paths resolved to absolute at import time) ---

# Destination for the voice clip recorded in the Gradio UI.
AUDIO_RECORD_PATH = os.path.abspath("./assets/audio/grabacion_gradio.wav")

# Base (silent) video that will be lip-synced to the generated audio.
VIDEO_PATH = os.path.abspath("./assets/video/data_video_sun.mp4")

# Where the Whisper transcription is saved (read back by the OpenAI step).
TRANSCRIPTION_TEXT_PATH = os.path.abspath("./results/transcripcion.txt")

# Temporary WAV written by text_to_speech.py before being published.
RESULT_AUDIO_TEMP_PATH = os.path.abspath("./results/audiov2.wav")

# Final TTS audio location consumed by the inference step.
RESULT_AUDIO_FINAL_PATH = os.path.abspath("./assets/audio/audio.wav")

# Output video produced by run_inference.py.
RESULT_VIDEO_PATH = os.path.abspath("./results/result_voice.mp4")

# Helper scripts executed as subprocesses.
TEXT_TO_SPEECH_PATH = os.path.abspath("./src/text_to_speech.py")

RUN_INFERENCE_PATH = os.path.abspath("./src/run_inference.py")
|
|
|
|
|
def transcribir_con_progreso(audio_path, progreso=gr.Progress()):
    """Transcribe *audio_path* with Whisper, reporting progress to the UI.

    Args:
        audio_path: Path of the WAV file to transcribe.
        progreso: Gradio progress tracker. Declared as a default argument so
            that Gradio injects a live tracker when this function runs inside
            an event handler — a ``gr.Progress()`` constructed in the body is
            never wired to the UI.

    Returns:
        The transcription text (also saved to TRANSCRIPTION_TEXT_PATH).
    """
    # gr.Progress expects fractional values in [0, 1], not percentages.
    progreso(0.0, "Iniciando transcripción...")
    model_name = "openai/whisper-large"
    progreso(0.25, "Cargando modelo Whisper...")
    transcripcion = transcribe_audio(audio_path, model_name)
    progreso(0.75, "Guardando transcripción...")
    guardar_transcripcion(transcripcion, filename=TRANSCRIPTION_TEXT_PATH)
    progreso(1.0, "Transcripción completada.")
    return transcripcion
|
|
|
|
|
def generar_audio_desde_texto():
    """Run the text_to_speech script and publish the WAV it produces.

    Executes ``src/text_to_speech.py`` in a subprocess, then copies the
    temporary audio it writes into the assets folder that the lip-sync
    step reads from.

    Returns:
        The final audio path on success, or ``None`` when the temporary
        audio file was never produced.

    Raises:
        RuntimeError: if the TTS subprocess exits with a non-zero status.
    """
    print("Ejecutando text_to_speech...")
    proceso = subprocess.run(
        [sys.executable, TEXT_TO_SPEECH_PATH],
        capture_output=True,
        text=True,
    )
    print("stdout:", proceso.stdout)
    print("stderr:", proceso.stderr)

    if proceso.returncode != 0:
        raise RuntimeError(f"Error ejecutando text_to_speech.py: {proceso.stderr}")

    # Guard clause: nothing to publish if the script produced no audio.
    if not os.path.exists(RESULT_AUDIO_TEMP_PATH):
        print("Audio temporal no encontrado")
        return None

    os.makedirs(os.path.dirname(RESULT_AUDIO_FINAL_PATH), exist_ok=True)
    shutil.copy(RESULT_AUDIO_TEMP_PATH, RESULT_AUDIO_FINAL_PATH)
    print(f"Audio copiado a: {RESULT_AUDIO_FINAL_PATH}")
    return RESULT_AUDIO_FINAL_PATH
|
|
|
|
|
def procesar_video_audio():
    """Lip-sync the base video to the generated audio via run_inference.py.

    Runs ``src/run_inference.py`` with the final TTS audio and the base
    video as inputs.

    Returns:
        Path of the generated video, or ``None`` when inference produced
        no output file.

    Raises:
        RuntimeError: if the inference subprocess exits with a non-zero
            status. Without this check, a stale ``result_voice.mp4`` left
            over from a previous run could be returned as if it were fresh.
    """
    print("Iniciando procesamiento de video...")
    print("Audio de entrada:", RESULT_AUDIO_FINAL_PATH)
    print("Video de entrada:", VIDEO_PATH)

    result = subprocess.run(
        [sys.executable, RUN_INFERENCE_PATH, "--audio", RESULT_AUDIO_FINAL_PATH, "--video", VIDEO_PATH],
        capture_output=True,
        text=True
    )

    print("stdout:", result.stdout)
    print("stderr:", result.stderr)

    # Mirror the error handling of generar_audio_desde_texto: a failed
    # subprocess must not fall through to the file-existence check below.
    if result.returncode != 0:
        raise RuntimeError(f"Error ejecutando run_inference.py: {result.stderr}")

    if os.path.exists(RESULT_VIDEO_PATH):
        print("Video generado:", RESULT_VIDEO_PATH)
        return RESULT_VIDEO_PATH
    else:
        print("No se generó el video")
        return None
|
|
|
|
|
def flujo_completo(audio_file_path):
    """End-to-end pipeline for one recording: copy the audio, transcribe
    it, get an OpenAI answer, synthesize speech, and lip-sync the video.

    Args:
        audio_file_path: Path of the audio just recorded in the UI.

    Returns:
        A 5-tuple ``(status, recorded_audio, transcription, tts_audio,
        video)``. On any failure the path slots are ``None`` and the
        status/transcription slots carry the error text.
    """
    try:
        # Persist the recording where the rest of the pipeline expects it.
        os.makedirs(os.path.dirname(AUDIO_RECORD_PATH), exist_ok=True)
        shutil.copy(audio_file_path, AUDIO_RECORD_PATH)
        print("Audio grabado copiado a:", AUDIO_RECORD_PATH)

        texto = transcribir_con_progreso(AUDIO_RECORD_PATH)
        print("Texto transcrito:", texto)

        respuesta = rtff(TRANSCRIPTION_TEXT_PATH)
        print("Respuesta de OpenAI:", respuesta)

        audio_generado = generar_audio_desde_texto()
        video_generado = procesar_video_audio()

        return "Grabación recibida", AUDIO_RECORD_PATH, texto, audio_generado, video_generado

    except Exception as e:
        mensaje = f"Error durante el flujo completo: {str(e)}"
        return mensaje, None, f"Error: {str(e)}", None, None
|
|
|
|
|
def interfaz():
    """Build the Gradio UI: base video and recorder on the left, pipeline
    results on the right. Recording a clip triggers the full pipeline."""
    with gr.Blocks() as demo:
        with gr.Row():
            # Left column: looping base video + microphone input + status.
            with gr.Column():
                gr.Video(VIDEO_PATH, loop=True, autoplay=True, height=300, width=500)
                entrada_audio = gr.Audio(label="Graba tu voz", type="filepath", format="wav")
                estado = gr.Textbox(label="Estado", interactive=False)

            # Right column: every artifact the pipeline produces.
            with gr.Column():
                audio_grabado = gr.Audio(label="Audio grabado", interactive=False)
                audio_tts = gr.Audio(label="Audio TTS", interactive=False)
                video_salida = gr.Video(label="Video procesado", interactive=False)
                transcripcion = gr.Textbox(label="Texto transcrito")

        # A new recording fires the whole pipeline and fills the outputs.
        entrada_audio.change(
            flujo_completo,
            inputs=entrada_audio,
            outputs=[estado, audio_grabado, transcripcion, audio_tts, video_salida],
        )

    return demo
|
|
|
|
|
if __name__ == "__main__":
    # Build the UI and start the local Gradio server.
    interfaz().launch()
|
|
|
|