# gradio_ui.py — Gradio front-end for the personalized avatar video app.
# Provenance: Hugging Face Space file, last change "Update gradio_ui.py"
# (commit 5baf180, verified) by jnjj.
import gradio as gr
import os
import re
from whisper_tts import WhisperTTS
from ollama_chatbotTTS import OllamaChat
from text_to_speech import TextToSpeech
from sync_audio_video import AudioVideoSync
# Install Ollama and launch its server in the background at import time.
# NOTE(review): piping a remote install script into `sh` and fire-and-forget
# `os.system` calls are fragile and run on every import — presumably this is
# intended for a trusted, containerized environment (e.g. a HF Space); confirm.
os.system("curl https://ollama.com/install.sh | sh")
os.system("ollama serve &")
# Asset directories: avatar thumbnail images and their matching source videos.
THUMBNAILS_DIR = "thumbnails"
VIDEO_DIR = "sample_video"
def get_thumbnail_images(directory=None):
    """Collect avatar thumbnails as (name, path) pairs.

    Args:
        directory: Folder to scan; defaults to the module-level
            THUMBNAILS_DIR when None (backward-compatible extension).

    Returns:
        list[tuple[str, str]]: (base name without extension, full path)
        for every .png/.jpg/.jpeg file found; [] when the folder does
        not exist (or is not a directory).
    """
    if directory is None:
        directory = THUMBNAILS_DIR
    # isdir (not exists) so a stray plain file with this name yields []
    # instead of crashing os.listdir.
    if not os.path.isdir(directory):
        return []
    return [
        (os.path.splitext(fname)[0], os.path.join(directory, fname))
        for fname in os.listdir(directory)
        if fname.lower().endswith((".png", ".jpg", ".jpeg"))
    ]
# Scan the thumbnails folder once at import time; the avatar radio choices
# shown in the UI are the image base names.
thumbnail_images = get_thumbnail_images()
avatar_names = [name for name, _ in thumbnail_images]
def find_matching_video(file_name, video_dir=None):
    """Find the avatar video whose base name matches *file_name*.

    Args:
        file_name: Avatar name to look for (matched case-insensitively
            against video file stems).
        video_dir: Folder to scan; defaults to the module-level
            VIDEO_DIR when None (backward-compatible extension).

    Returns:
        str | None: Path of the first .mp4/.avi/.mov file whose stem
        equals *file_name*, or None when nothing matches or the folder
        is missing.
    """
    if video_dir is None:
        video_dir = VIDEO_DIR
    if not os.path.isdir(video_dir):
        return None
    target = file_name.lower()  # hoisted: compare once per entry
    for video in os.listdir(video_dir):
        stem, ext = os.path.splitext(video)
        if stem.lower() == target and ext.lower() in (".mp4", ".avi", ".mov"):
            return os.path.join(video_dir, video)
    return None
def update_avatar_display(selected_name, images=None):
    """Return the thumbnail path for the selected avatar name.

    Args:
        selected_name: Avatar name chosen in the radio component.
        images: Optional iterable of (name, path) pairs; defaults to the
            module-level thumbnail_images (backward-compatible extension).

    Returns:
        str | None: The matching image path, or None when no entry has
        that name.
    """
    if images is None:
        images = thumbnail_images
    return next((path for name, path in images if name == selected_name), None)
def check_enable_process_button(selected_name, audio_file, transcribed_text):
    """Gate the process button: enabled only when an avatar is selected
    AND there is some input (an audio file or non-blank text)."""
    has_input = bool(audio_file) or bool(transcribed_text.strip())
    return gr.update(interactive=bool(selected_name) and has_input)
def process_pipeline(audio_file, transcribed_text, selected_name):
    """Run the full avatar pipeline as a streaming generator.

    Yields 4-tuples of (transcribed_text, chatbot_response, tts_audio,
    video_output) after each stage so the UI can update incrementally.
    Warning strings are emitted in place of the missing component when a
    stage cannot proceed.
    """
    # Stage 1 — speech-to-text, only when an audio file was supplied.
    if audio_file:
        transcribed_text = WhisperTTS().transcribe_audio(audio_file)
        yield transcribed_text, "", None, None

    # Stage 2 — refuse to continue without usable text.
    if not transcribed_text.strip():
        yield "Warning: Please provide valid text.", "", None, None
        return

    # Stage 3 — chatbot reply; drop the reasoning-tag markers.
    # NOTE(review): this removes only the <think>/</think> tags, leaving
    # any text between them in the reply — confirm that is intended.
    reply = OllamaChat().get_response(transcribed_text)
    reply = re.sub(r"<think>|</think>", "", reply).strip()
    yield transcribed_text, reply, None, None
    if not reply:
        yield transcribed_text, "Warning: No chatbot response.", None, None
        return

    # Stage 4 — synthesize speech from the reply.
    speech_path = TextToSpeech().synthesize(reply)
    yield transcribed_text, reply, speech_path, None

    # Stage 5 — lip-sync the avatar's source video with the speech.
    if not selected_name:
        yield transcribed_text, reply, speech_path, "Warning: Select an avatar."
        return
    source_video = find_matching_video(selected_name)
    if not source_video:
        yield transcribed_text, reply, speech_path, "Warning: No matching video."
        return
    synced = AudioVideoSync().sync_audio_video(source_video, speech_path)
    yield transcribed_text, reply, speech_path, synced
def build_demo() -> gr.Blocks:
    """Assemble the Gradio UI and wire its events.

    Returns:
        gr.Blocks: The queued application, ready to launch.
    """
    with gr.Blocks() as app:
        gr.Markdown("## Personalized Avatar Video")
        with gr.Row():
            with gr.Column():
                mic_in = gr.Audio(type="filepath", label="Audio Input")
                text_box = gr.Textbox(label="Edit and Process Text")
                reply_box = gr.Textbox(label="Assistant Response")
                gr.Markdown("### Select an Avatar")
                avatar_choice = gr.Radio(choices=avatar_names, label="Select an Avatar")
                avatar_img = gr.Image(label="Selected Avatar", width=150, height=150)
                generate_btn = gr.Button("Generate Lip-Sync Video", interactive=False)
            with gr.Column():
                speech_out = gr.Audio(label="Generated Speech")
                video_player = gr.Video(label="Final Lip-Synced Video")

        # Event wiring: preview the chosen avatar, gate the button on any
        # input change, and stream the pipeline on click.
        avatar_choice.change(
            update_avatar_display, inputs=[avatar_choice], outputs=[avatar_img]
        )
        gate_inputs = [avatar_choice, mic_in, text_box]
        for component in gate_inputs:
            component.change(
                check_enable_process_button,
                inputs=gate_inputs,
                outputs=[generate_btn],
            )
        generate_btn.click(
            process_pipeline,
            inputs=[mic_in, text_box, avatar_choice],
            outputs=[text_box, reply_box, speech_out, video_player],
        )
    # Queue requests so the generator pipeline can stream partial results.
    return app.queue(max_size=100000)
if __name__ == "__main__":
    demo = build_demo()
    # 0.0.0.0 exposes the server on all interfaces (needed in containers);
    # share=True additionally opens a public Gradio tunnel URL.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        inbrowser=True,
    )