File size: 4,516 Bytes
0711651
 
5baf180
0711651
 
 
 
 
5baf180
 
0711651
 
5baf180
0711651
 
 
 
 
 
5baf180
 
 
 
 
0711651
 
 
 
 
 
 
 
 
5baf180
 
0711651
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5baf180
0711651
5baf180
 
 
 
 
0711651
 
 
 
5baf180
 
 
 
 
 
 
0711651
 
 
5baf180
0711651
5baf180
 
0711651
5baf180
0711651
5baf180
0711651
 
5baf180
 
 
0711651
 
 
5baf180
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0711651
 
5baf180
0711651
 
 
5baf180
 
0711651
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import gradio as gr
import os
import re
from whisper_tts import WhisperTTS
from ollama_chatbotTTS import OllamaChat
from text_to_speech import TextToSpeech
from sync_audio_video import AudioVideoSync

# Install the Ollama runtime and start its server in the background.
# NOTE(review): piping a remote install script straight into `sh` executes
# unreviewed code, and `os.system` ignores non-zero exit codes — consider
# pinning a release and using subprocess with explicit error checking.
os.system("curl https://ollama.com/install.sh | sh")
os.system("ollama serve &")

# Folders scanned for avatar thumbnails and their matching source videos.
THUMBNAILS_DIR = "thumbnails"
VIDEO_DIR = "sample_video"

def get_thumbnail_images(directory=None):
    """Collect the available avatar thumbnails from *directory*.

    Args:
        directory: Folder to scan. Defaults to the module-level
            THUMBNAILS_DIR when None (backward compatible with the
            original zero-argument call).

    Returns:
        A list of ``(name, path)`` tuples, where *name* is the file's
        basename without extension, for every ``.png``/``.jpg``/``.jpeg``
        file found. Returns an empty list when the folder is missing or
        is not a directory.
    """
    directory = THUMBNAILS_DIR if directory is None else directory
    # isdir (not exists) so a stray file with this name can't crash listdir.
    if not os.path.isdir(directory):
        return []
    return [
        (os.path.splitext(f)[0], os.path.join(directory, f))
        for f in os.listdir(directory)
        if f.lower().endswith((".png", ".jpg", ".jpeg"))
    ]

# Scan the thumbnail folder once at import time; the Radio widget only
# needs the bare avatar names, the display callback needs the full pairs.
thumbnail_images = get_thumbnail_images()
avatar_names = [pair[0] for pair in thumbnail_images]

def find_matching_video(file_name, video_dir=None):
    """Locate the source video whose basename matches *file_name*.

    Matching is case-insensitive on the basename and restricted to the
    ``.mp4``/``.avi``/``.mov`` extensions.

    Args:
        file_name: Avatar name to look for (without extension).
        video_dir: Folder to search. Defaults to the module-level
            VIDEO_DIR when None (backward compatible).

    Returns:
        Full path of the first matching video file, or None when the
        folder is missing or nothing matches.
    """
    video_dir = VIDEO_DIR if video_dir is None else video_dir
    # Lowercase once outside the loop instead of per candidate.
    target = file_name.lower()
    if not os.path.isdir(video_dir):
        return None
    for video in os.listdir(video_dir):
        name, ext = os.path.splitext(video)
        if name.lower() == target and ext.lower() in (".mp4", ".avi", ".mov"):
            return os.path.join(video_dir, video)
    return None

def update_avatar_display(selected_name, images=None):
    """Return the thumbnail path for the avatar called *selected_name*.

    Args:
        selected_name: Avatar name chosen in the Radio component.
        images: Optional iterable of ``(name, path)`` pairs to search.
            Defaults to the module-level ``thumbnail_images`` list
            (backward compatible with the single-argument Gradio call).

    Returns:
        The image path of the first matching name, or None if no
        thumbnail matches (clears the Image component).
    """
    images = thumbnail_images if images is None else images
    for name, img_path in images:
        if name == selected_name:
            return img_path
    return None

def check_enable_process_button(selected_name, audio_file, transcribed_text):
    """Toggle the process button: enabled only when an avatar is selected
    and there is some input (an audio file or non-blank text)."""
    has_input = bool(audio_file) or bool(transcribed_text.strip())
    enabled = bool(selected_name) and has_input
    return gr.update(interactive=enabled)

def process_pipeline(audio_file, transcribed_text, selected_name):
    """Run the full audio -> text -> chatbot -> TTS -> lip-sync pipeline.

    Implemented as a generator so Gradio streams partial results into the
    four output components as each stage finishes.

    Args:
        audio_file: Path of the recorded/uploaded audio, or None.
        transcribed_text: Text typed or edited by the user (may be empty).
        selected_name: Avatar name picked in the Radio widget, or None.

    Yields:
        4-tuples ``(transcribed_text, chatbot_response, tts_audio, video)``;
        fields not yet produced are None. Warning strings are yielded into
        the corresponding output slot on validation failure.
    """
    # 1) If audio was provided, its transcription overrides the text box.
    if audio_file:
        whisper = WhisperTTS()
        transcribed_text = whisper.transcribe_audio(audio_file)
        yield transcribed_text, "", None, None

    # 2) Bail out early when there is no usable text.
    if not transcribed_text.strip():
        yield "Warning: Please provide valid text.", "", None, None
        return

    # 3) Query the chatbot. Only the <think>/</think> *tags* are stripped;
    # NOTE(review): the reasoning text between them is kept — confirm
    # whether the content should go too (r"<think>.*?</think>" + DOTALL).
    ollama = OllamaChat()
    resp = ollama.get_response(transcribed_text)
    resp = re.sub(r"<think>|</think>", "", resp).strip()
    yield transcribed_text, resp, None, None

    if not resp:
        yield transcribed_text, "Warning: No chatbot response.", None, None
        return

    # 4) Synthesize speech from the chatbot response.
    tts = TextToSpeech()
    audio_out = tts.synthesize(resp)
    yield transcribed_text, resp, audio_out, None

    # 5) Lip-sync the matching avatar video with the generated audio.
    if not selected_name:
        yield transcribed_text, resp, audio_out, "Warning: Select an avatar."
        return

    vid_in = find_matching_video(selected_name)
    if not vid_in:
        yield transcribed_text, resp, audio_out, "Warning: No matching video."
        return

    sync = AudioVideoSync()
    vid_out = sync.sync_audio_video(vid_in, audio_out)
    yield transcribed_text, resp, audio_out, vid_out

def build_demo() -> gr.Blocks:
    """Assemble the Gradio UI and wire its event handlers.

    Layout: left column holds the inputs (audio, editable text, avatar
    picker, process button); right column holds the generated speech and
    the final lip-synced video.

    Returns:
        A queued gr.Blocks app ready to be launched.
    """
    with gr.Blocks() as demo:
        gr.Markdown("## Personalized Avatar Video")

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(type="filepath", label="Audio Input")
                transcribed_text = gr.Textbox(label="Edit and Process Text")
                chatbot_resp = gr.Textbox(label="Assistant Response")
                gr.Markdown("### Select an Avatar")
                # avatar_names is computed at import time from THUMBNAILS_DIR.
                selected_avatar = gr.Radio(choices=avatar_names, label="Select an Avatar")
                avatar_display = gr.Image(label="Selected Avatar", width=150, height=150)
                # Starts disabled; enabled once avatar + input are present.
                process_btn = gr.Button("Generate Lip-Sync Video", interactive=False)

            with gr.Column():
                tts_audio = gr.Audio(label="Generated Speech")
                video_out = gr.Video(label="Final Lip-Synced Video")

        # Wire events: preview the chosen avatar, and re-check the enable
        # condition whenever any of the three inputs changes.
        selected_avatar.change(update_avatar_display, inputs=[selected_avatar], outputs=[avatar_display])
        for inp in (selected_avatar, audio_input, transcribed_text):
            inp.change(check_enable_process_button, 
                       inputs=[selected_avatar, audio_input, transcribed_text], 
                       outputs=[process_btn])

        # process_pipeline is a generator, so outputs stream in stages.
        process_btn.click(
            process_pipeline,
            inputs=[audio_input, transcribed_text, selected_avatar],
            outputs=[transcribed_text, chatbot_resp, tts_audio, video_out],
        )

    # Enable the request queue (required for generator/streaming outputs).
    # NOTE(review): max_size=100000 is effectively unbounded — confirm intent.
    demo = demo.queue(max_size=100000)
    return demo

if __name__ == "__main__":
    demo = build_demo()
    # Bind on all interfaces at the standard Gradio port; share=True
    # requests a public tunnel URL, inbrowser opens a local browser tab.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        inbrowser=True,
    )