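"""Gradio demo: voice or text in, lip-synced avatar video out.

Pipeline: Whisper transcribes the user's audio, Ollama generates a chatbot
reply, the reply is synthesized to speech, and the speech is lip-synced onto
the selected avatar's sample video.
"""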
import gradio as gr
import os
import re
from whisper_tts import WhisperTTS
from ollama_chatbotTTS import OllamaChat
from text_to_speech import TextToSpeech
from sync_audio_video import AudioVideoSync
# Install and start Ollama (note: no readiness check, so early requests
# may fail until the server has finished starting)
os.system("curl -fsSL https://ollama.com/install.sh | sh")
os.system("ollama serve &")
# Directories
THUMBNAILS_DIR = "thumbnails"
VIDEO_DIR = "sample_video"
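# Expected assets (assumption based on the lookups below): thumbnails/<avatar>.(png|jpg|jpeg)
# for the picker, and sample_video/<avatar>.(mp4|avi|mov) with the same basename for lip-sync.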
def get_thumbnail_images():
    """Return (name, path) pairs for every image in THUMBNAILS_DIR."""
    if not os.path.exists(THUMBNAILS_DIR):
        return []
    return [
        (os.path.splitext(f)[0], os.path.join(THUMBNAILS_DIR, f))
        for f in os.listdir(THUMBNAILS_DIR)
        if f.lower().endswith((".png", ".jpg", ".jpeg"))
    ]
thumbnail_images = get_thumbnail_images()
avatar_names = [name for name, _ in thumbnail_images]
def find_matching_video(file_name):
    """Find the video in VIDEO_DIR whose basename matches file_name (case-insensitive)."""
    file_name = file_name.lower()
    if not os.path.exists(VIDEO_DIR):
        return None
    for video in os.listdir(VIDEO_DIR):
        name, ext = os.path.splitext(video)
        if name.lower() == file_name and ext.lower() in (".mp4", ".avi", ".mov"):
            return os.path.join(VIDEO_DIR, video)
    return None
def update_avatar_display(selected_name):
    """Show the thumbnail for the avatar picked in the radio group."""
    for name, img_path in thumbnail_images:
        if name == selected_name:
            return img_path
    return None
def check_enable_process_button(selected_name, audio_file, transcribed_text):
    """Enable the process button only once an avatar plus audio or text is provided."""
    if selected_name and (audio_file or transcribed_text.strip()):
        return gr.update(interactive=True)
    return gr.update(interactive=False)
def process_pipeline(audio_file, transcribed_text, selected_name):
    # 1) If audio was provided, transcribe it
    if audio_file:
        whisper = WhisperTTS()
        transcribed_text = whisper.transcribe_audio(audio_file)
        yield transcribed_text, "", None, None
    # 2) Validate the text
    if not transcribed_text.strip():
        yield "Warning: Please provide valid text.", "", None, None
        return
    # 3) Chatbot (strip <think> reasoning tags from the reply)
    ollama = OllamaChat()
    resp = ollama.get_response(transcribed_text)
    resp = re.sub(r"<think>|</think>", "", resp).strip()
    if not resp:
        yield transcribed_text, "Warning: No chatbot response.", None, None
        return
    yield transcribed_text, resp, None, None
    # 4) TTS
    tts = TextToSpeech()
    audio_out = tts.synthesize(resp)
    yield transcribed_text, resp, audio_out, None
    # 5) Video
    if not selected_name:
        yield transcribed_text, resp, audio_out, "Warning: Select an avatar."
        return
    vid_in = find_matching_video(selected_name)
    if not vid_in:
        yield transcribed_text, resp, audio_out, "Warning: No matching video."
        return
    sync = AudioVideoSync()
    vid_out = sync.sync_audio_video(vid_in, audio_out)
    yield transcribed_text, resp, audio_out, vid_out
def build_demo() -> gr.Blocks:
    with gr.Blocks() as demo:
        gr.Markdown("## Personalized Avatar Video")
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(type="filepath", label="Audio Input")
                transcribed_text = gr.Textbox(label="Edit and Process Text")
                chatbot_resp = gr.Textbox(label="Assistant Response")
                gr.Markdown("### Select an Avatar")
                selected_avatar = gr.Radio(choices=avatar_names, label="Select an Avatar")
                avatar_display = gr.Image(label="Selected Avatar", width=150, height=150)
                process_btn = gr.Button("Generate Lip-Sync Video", interactive=False)
            with gr.Column():
                tts_audio = gr.Audio(label="Generated Speech")
                video_out = gr.Video(label="Final Lip-Synced Video")
        # Wire up events
        selected_avatar.change(update_avatar_display, inputs=[selected_avatar], outputs=[avatar_display])
        for inp in (selected_avatar, audio_input, transcribed_text):
            inp.change(check_enable_process_button,
                       inputs=[selected_avatar, audio_input, transcribed_text],
                       outputs=[process_btn])
        process_btn.click(
            process_pipeline,
            inputs=[audio_input, transcribed_text, selected_avatar],
            outputs=[transcribed_text, chatbot_resp, tts_audio, video_out],
        )
    # Configure the request queue
    demo = demo.queue(max_size=100000)
    return demo
if __name__ == "__main__":
    demo = build_demo()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        inbrowser=True,
    )