"""Gradio app: transcribe speech, get a chatbot reply, synthesize TTS audio,
and lip-sync it onto a selected avatar video.

Pipeline: Whisper (STT) -> Ollama (chat) -> TTS -> audio/video sync.
"""

import os
import re

import gradio as gr

from whisper_tts import WhisperTTS
from ollama_chatbotTTS import OllamaChat
from text_to_speech import TextToSpeech
from sync_audio_video import AudioVideoSync

# Install and start Ollama at import time.
# SECURITY NOTE(review): piping a remote script into `sh` executes untrusted
# network content with the app's privileges — consider pinning/verifying the
# installer instead of `curl | sh`.
os.system("curl https://ollama.com/install.sh | sh")
# Trailing '&' backgrounds the server via the shell so the app can continue.
os.system("ollama serve &")

# Directories for avatar thumbnails and their matching source videos.
THUMBNAILS_DIR = "thumbnails"
VIDEO_DIR = "sample_video"


def get_thumbnail_images():
    """Return [(name, path)] for every image file in THUMBNAILS_DIR.

    `name` is the filename without extension; returns [] when the
    directory does not exist.
    """
    if not os.path.exists(THUMBNAILS_DIR):
        return []
    return [
        (os.path.splitext(f)[0], os.path.join(THUMBNAILS_DIR, f))
        for f in os.listdir(THUMBNAILS_DIR)
        if f.lower().endswith((".png", ".jpg", ".jpeg"))
    ]


# Scanned once at import; thumbnails added later require a restart.
thumbnail_images = get_thumbnail_images()
avatar_names = [name for name, _ in thumbnail_images]


def find_matching_video(file_name):
    """Return the path of the video in VIDEO_DIR whose basename matches
    `file_name` (case-insensitive), or None if no match / no directory."""
    file_name = file_name.lower()
    if not os.path.exists(VIDEO_DIR):
        return None
    for video in os.listdir(VIDEO_DIR):
        name, ext = os.path.splitext(video)
        if name.lower() == file_name and ext.lower() in (".mp4", ".avi", ".mov"):
            return os.path.join(VIDEO_DIR, video)
    return None


def update_avatar_display(selected_name):
    """Return the thumbnail path for the selected avatar name, or None."""
    for name, img_path in thumbnail_images:
        if name == selected_name:
            return img_path
    return None


def check_enable_process_button(selected_name, audio_file, transcribed_text):
    """Enable the process button only when an avatar is selected and
    either an audio file or non-blank text is available."""
    # `transcribed_text` may be None before the textbox has ever fired a
    # change event — guard before .strip().
    if selected_name and (audio_file or (transcribed_text or "").strip()):
        return gr.update(interactive=True)
    return gr.update(interactive=False)


def process_pipeline(audio_file, transcribed_text, selected_name):
    """Generator driving the full STT -> chat -> TTS -> lip-sync pipeline.

    Yields progressive (transcribed_text, chatbot_response, tts_audio,
    video) tuples so the UI updates after each stage. Warning strings are
    yielded in place of results when a stage cannot proceed.
    """
    # 1) If audio was provided, transcribe it (overrides the textbox text).
    if audio_file:
        whisper = WhisperTTS()
        transcribed_text = whisper.transcribe_audio(audio_file)
        yield transcribed_text, "", None, None

    # 2) Validate that we have non-blank text to send to the chatbot.
    if not transcribed_text.strip():
        yield "Warning: Please provide valid text.", "", None, None
        return

    # 3) Chatbot response.
    ollama = OllamaChat()
    resp = ollama.get_response(transcribed_text)
    # NOTE(review): the original `re.sub(r"|", "", resp)` matched only the
    # empty string and was a no-op — likely a garbled tag-stripping regex
    # (e.g. for <think>...</think> markers); confirm the intended pattern.
    resp = resp.strip()
    yield transcribed_text, resp, None, None
    if not resp:
        yield transcribed_text, "Warning: No chatbot response.", None, None
        return

    # 4) Text-to-speech synthesis of the chatbot reply.
    tts = TextToSpeech()
    audio_out = tts.synthesize(resp)
    yield transcribed_text, resp, audio_out, None

    # 5) Lip-sync the synthesized audio onto the selected avatar's video.
    if not selected_name:
        yield transcribed_text, resp, audio_out, "Warning: Select an avatar."
        return
    vid_in = find_matching_video(selected_name)
    if not vid_in:
        yield transcribed_text, resp, audio_out, "Warning: No matching video."
        return
    sync = AudioVideoSync()
    vid_out = sync.sync_audio_video(vid_in, audio_out)
    yield transcribed_text, resp, audio_out, vid_out


def build_demo() -> gr.Blocks:
    """Build and return the Gradio Blocks UI with all events wired up."""
    with gr.Blocks() as demo:
        gr.Markdown("## Personalized Avatar Video")
        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(type="filepath", label="Audio Input")
                transcribed_text = gr.Textbox(label="Edit and Process Text")
                chatbot_resp = gr.Textbox(label="Assistant Response")
                gr.Markdown("### Select an Avatar")
                selected_avatar = gr.Radio(choices=avatar_names, label="Select an Avatar")
                avatar_display = gr.Image(label="Selected Avatar", width=150, height=150)
                process_btn = gr.Button("Generate Lip-Sync Video", interactive=False)
            with gr.Column():
                tts_audio = gr.Audio(label="Generated Speech")
                video_out = gr.Video(label="Final Lip-Synced Video")

        # Wire events: thumbnail preview, button enablement, and the pipeline.
        selected_avatar.change(
            update_avatar_display,
            inputs=[selected_avatar],
            outputs=[avatar_display],
        )
        # Any of the three inputs changing re-evaluates button enablement.
        for inp in (selected_avatar, audio_input, transcribed_text):
            inp.change(
                check_enable_process_button,
                inputs=[selected_avatar, audio_input, transcribed_text],
                outputs=[process_btn],
            )
        process_btn.click(
            process_pipeline,
            inputs=[audio_input, transcribed_text, selected_avatar],
            outputs=[transcribed_text, chatbot_resp, tts_audio, video_out],
        )

        # Queue requests so the generator pipeline streams partial results.
        demo = demo.queue(max_size=100000)
    return demo


if __name__ == "__main__":
    demo = build_demo()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=True,
        inbrowser=True,
    )