File size: 4,516 Bytes
0711651
 
5baf180
0711651
 
 
 
 
5baf180
 
0711651
 
5baf180
0711651
 
 
 
 
 
5baf180
 
 
 
 
0711651
 
 
 
 
 
 
 
 
5baf180
 
0711651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5baf180
0711651
5baf180
 
 
 
 
0711651
 
 
 
5baf180
 
 
 
 
 
 
0711651
 
 
5baf180
0711651
5baf180
 
0711651
5baf180
0711651
5baf180
0711651
 
5baf180
 
 
0711651
 
 
5baf180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0711651
 
5baf180
0711651
 
 
5baf180
 
0711651
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import gradio as gr
import os
import re
from whisper_tts import WhisperTTS
from ollama_chatbotTTS import OllamaChat
from text_to_speech import TextToSpeech
from sync_audio_video import AudioVideoSync

# Install the Ollama runtime and start its server in the background.
# NOTE(review): piping a remote install script straight into `sh` executes
# unreviewed code, and `os.system` ignores non-zero exit codes — consider
# pinning a release and using subprocess with explicit error checking.
os.system("curl https://ollama.com/install.sh | sh")
os.system("ollama serve &")

# Folders scanned for avatar thumbnails and their matching source videos.
THUMBNAILS_DIR = "thumbnails"
VIDEO_DIR = "sample_video"

def get_thumbnail_images(directory=None):
    """Collect the available avatar thumbnails from *directory*.

    Args:
        directory: Folder to scan. Defaults to the module-level
            THUMBNAILS_DIR when None (backward compatible with the
            original zero-argument call).

    Returns:
        A list of ``(name, path)`` tuples, where *name* is the file's
        basename without extension, for every ``.png``/``.jpg``/``.jpeg``
        file found. Returns an empty list when the folder is missing or
        is not a directory.
    """
    directory = THUMBNAILS_DIR if directory is None else directory
    # isdir (not exists) so a stray file with this name can't crash listdir.
    if not os.path.isdir(directory):
        return []
    return [
        (os.path.splitext(f)[0], os.path.join(directory, f))
        for f in os.listdir(directory)
        if f.lower().endswith((".png", ".jpg", ".jpeg"))
    ]

# Scan the thumbnail folder once at import time; the Radio widget only
# needs the bare avatar names, the display callback needs the full pairs.
thumbnail_images = get_thumbnail_images()
avatar_names = [pair[0] for pair in thumbnail_images]

def find_matching_video(file_name, video_dir=None):
    """Locate the source video whose basename matches *file_name*.

    Matching is case-insensitive on the basename and restricted to the
    ``.mp4``/``.avi``/``.mov`` extensions.

    Args:
        file_name: Avatar name to look for (without extension).
        video_dir: Folder to search. Defaults to the module-level
            VIDEO_DIR when None (backward compatible).

    Returns:
        Full path of the first matching video file, or None when the
        folder is missing or nothing matches.
    """
    video_dir = VIDEO_DIR if video_dir is None else video_dir
    # Lowercase once outside the loop instead of per candidate.
    target = file_name.lower()
    if not os.path.isdir(video_dir):
        return None
    for video in os.listdir(video_dir):
        name, ext = os.path.splitext(video)
        if name.lower() == target and ext.lower() in (".mp4", ".avi", ".mov"):
            return os.path.join(video_dir, video)
    return None

def update_avatar_display(selected_name, images=None):
    """Return the thumbnail path for the avatar called *selected_name*.

    Args:
        selected_name: Avatar name chosen in the Radio component.
        images: Optional iterable of ``(name, path)`` pairs to search.
            Defaults to the module-level ``thumbnail_images`` list
            (backward compatible with the single-argument Gradio call).

    Returns:
        The image path of the first matching name, or None if no
        thumbnail matches (clears the Image component).
    """
    images = thumbnail_images if images is None else images
    for name, img_path in images:
        if name == selected_name:
            return img_path
    return None

def check_enable_process_button(selected_name, audio_file, transcribed_text):
    """Toggle the process button: enabled only when an avatar is selected
    and there is some input (an audio file or non-blank text)."""
    has_input = bool(audio_file) or bool(transcribed_text.strip())
    enabled = bool(selected_name) and has_input
    return gr.update(interactive=enabled)

def process_pipeline(audio_file, transcribed_text, selected_name):
    """Run the full audio -> text -> chatbot -> TTS -> lip-sync pipeline.

    Implemented as a generator so Gradio streams partial results into the
    four output components as each stage finishes.

    Args:
        audio_file: Path of the recorded/uploaded audio, or None.
        transcribed_text: Text typed or edited by the user (may be empty).
        selected_name: Avatar name picked in the Radio widget, or None.

    Yields:
        4-tuples ``(transcribed_text, chatbot_response, tts_audio, video)``;
        fields not yet produced are None. Warning strings are yielded into
        the corresponding output slot on validation failure.
    """
    # 1) If audio was provided, its transcription overrides the text box.
    if audio_file:
        whisper = WhisperTTS()
        transcribed_text = whisper.transcribe_audio(audio_file)
        yield transcribed_text, "", None, None

    # 2) Bail out early when there is no usable text.
    if not transcribed_text.strip():
        yield "Warning: Please provide valid text.", "", None, None
        return

    # 3) Query the chatbot. Only the <think>/</think> *tags* are stripped;
    # NOTE(review): the reasoning text between them is kept — confirm
    # whether the content should go too (r"<think>.*?</think>" + DOTALL).
    ollama = OllamaChat()
    resp = ollama.get_response(transcribed_text)
    resp = re.sub(r"<think>|</think>", "", resp).strip()
    yield transcribed_text, resp, None, None

    if not resp:
        yield transcribed_text, "Warning: No chatbot response.", None, None
        return

    # 4) Synthesize speech from the chatbot response.
    tts = TextToSpeech()
    audio_out = tts.synthesize(resp)
    yield transcribed_text, resp, audio_out, None

    # 5) Lip-sync the matching avatar video with the generated audio.
    if not selected_name:
        yield transcribed_text, resp, audio_out, "Warning: Select an avatar."
        return

    vid_in = find_matching_video(selected_name)
    if not vid_in:
        yield transcribed_text, resp, audio_out, "Warning: No matching video."
        return

    sync = AudioVideoSync()
    vid_out = sync.sync_audio_video(vid_in, audio_out)
    yield transcribed_text, resp, audio_out, vid_out

def build_demo() -> gr.Blocks:
    """Assemble the Gradio UI and wire its event handlers.

    Layout: left column holds the inputs (audio, editable text, avatar
    picker, process button); right column holds the generated speech and
    the final lip-synced video.

    Returns:
        A queued gr.Blocks app ready to be launched.
    """
    with gr.Blocks() as demo:
        gr.Markdown("## Personalized Avatar Video")

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(type="filepath", label="Audio Input")
                transcribed_text = gr.Textbox(label="Edit and Process Text")
                chatbot_resp = gr.Textbox(label="Assistant Response")
                gr.Markdown("### Select an Avatar")
                # avatar_names is computed at import time from THUMBNAILS_DIR.
                selected_avatar = gr.Radio(choices=avatar_names, label="Select an Avatar")
                avatar_display = gr.Image(label="Selected Avatar", width=150, height=150)
                # Starts disabled; enabled once avatar + input are present.
                process_btn = gr.Button("Generate Lip-Sync Video", interactive=False)

            with gr.Column():
                tts_audio = gr.Audio(label="Generated Speech")
                video_out = gr.Video(label="Final Lip-Synced Video")

        # Wire events: preview the chosen avatar, and re-check the enable
        # condition whenever any of the three inputs changes.
        selected_avatar.change(update_avatar_display, inputs=[selected_avatar], outputs=[avatar_display])
        for inp in (selected_avatar, audio_input, transcribed_text):
            inp.change(check_enable_process_button, 
                       inputs=[selected_avatar, audio_input, transcribed_text], 
                       outputs=[process_btn])

        # process_pipeline is a generator, so outputs stream in stages.
        process_btn.click(
            process_pipeline,
            inputs=[audio_input, transcribed_text, selected_avatar],
            outputs=[transcribed_text, chatbot_resp, tts_audio, video_out],
        )

    # Enable the request queue (required for generator/streaming outputs).
    # NOTE(review): max_size=100000 is effectively unbounded — confirm intent.
    demo = demo.queue(max_size=100000)
    return demo

if __name__ == "__main__":
    demo = build_demo()
    # Bind on all interfaces at the standard Gradio port; share=True
    # requests a public tunnel URL, inbrowser opens a local browser tab.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        inbrowser=True,
    )