kc-two committed
Commit adb363a · 1 Parent(s): 5df99cd

v2: simplified UI

adding common util

Files changed (6)
  1. README.md +1 -1
  2. app.py +380 -179
  3. base_task_executor.py +24 -35
  4. cloud_task_executor.py +2 -1
  5. common_util.py +110 -0
  6. requirements.txt +3 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🐨
 colorFrom: blue
 colorTo: red
 sdk: gradio
-sdk_version: 5.4.0
+sdk_version: 5.37.0
 app_file: app.py
 pinned: false
 license: mit
app.py CHANGED
@@ -4,28 +4,28 @@ import argparse
 import glob
 import os
 from pathlib import Path
+import tempfile
 
 import gradio as gr
 
 from cloud_task_executor import CloudTaskExecutor
 from elevenlabs_helper import ElevenLabsHelper
+from common_util import CommonUtil
 
 # ---
 talk_key = "talk"
-valid_base_motion_expressions = [
-    f"{talk_key}-head",
-    f"{talk_key}-neutral",
-    "smile",
-    "approve",
-    "disapprove",
-    "confused",
-    "sad",
-    "surprised",
-]
-
-
-def get_default_base_motion_expression():
-    return valid_base_motion_expressions[0]
+valid_talking_expressions = [
+    f"{talk_key}-head",
+    f"{talk_key}-neutral",
+]
+valid_nontalking_expressions = [
+    "smile",
+    "approve",
+    "disapprove",
+    "confused",
+    "sad",
+    "surprised",
+]
 
 
 # ---
@@ -51,9 +51,6 @@ def get_sorted_filenames_in_dir(dir_path: str, ext: str = ".jpg", throw_if_empty
 # ---
 
 
-description = """Experience a demo of the world's most advanced Text/Audio To Video (TTV) system, crafted by Two AI.
-Sign up with Two AI to gain rapid, long-form generation, API keys, and more!"""
-
 # Core constants
 tmp_dir = "/tmp/gradio"
 data_dir = "./data"
@@ -62,6 +59,9 @@ female_key = "female"
 unknown_key = "unknown"
 media_height = 512
 
+# Global variables
+temp_video_files = set()
+
 # Male/Female
 female_terms = ["Female", "Lady", "Woman"]
 male_terms = ["Male", "Lad", "Man"]
@@ -101,31 +101,35 @@ example_driving_audios_female = get_sorted_filenames_in_dir(
 )
 example_driving_audios = {female_key: example_driving_audios_female, male_key: example_driving_audios_male}
 
-# Driving Text
-audio_text_groups = ["General", "Promotional Messages", "Pronunciation Practice"]
-example_driving_audio_texts = {
-    "General": [
-        "The 2026 World Cup final match is in New York.",
-        "Enhance efficiency and cut costs with AI.",
-        "A bee's wings beat more than 200 times per second.",
-        "2026년 월드컵 결승전은 뉴욕에서 열립니다.",
-        "AI로 효율성을 높이고 비용을 절감하세요.",
-        "벌은 초당 200회 이상의 날개짓을 합니다.",
-        "2026 विश्व कप फाइनल मैच न्यूयॉर्क में होगा।",
-        "AI के साथ दक्षता बढ़ाएं और लागत कम करें।",
-        "मधुमक्खी के पंख सेकंड में 200 बार से अधिक फड़फड़ाते हैं।",
-    ],
-    "Promotional Messages": [
-        "Welcome to our kiosk, where you can easily purchase tickets, or access various services by simply tapping the display!",
-        "Catch all the drama, emotion, and energy in my new film, now available on Netflix—it's a must-watch!",
-        "This season of IPL is full of surprises, and I’d love to see you supporting us as we fight for victory on the ground.",
-        "Transform your health with our latest fitness programs! Join us today and take the first step toward a stronger, energized you.",
-    ],
-    "Pronunciation Practice": [
-        "A big black bug bit a big black dog on his big black nose.",
-        "Fuzzy Wuzzy was a bear. Fuzzy Wuzzy had no hair. Fuzzy Wuzzy wasn't very fuzzy, was he?",
-    ],
-}
+def get_audio_dropdown_choices(audio_paths, base_dir):
+    return [
+        (path.replace(base_dir, "").lstrip("/"), path)
+        for path in audio_paths
+    ]
+
+example_driving_audio_base_dir = os.path.join("./data/input_audio/gradio/")
+example_driving_audio_dropdown_choices = (
+    get_audio_dropdown_choices(example_driving_audios[female_key], example_driving_audio_base_dir) +
+    get_audio_dropdown_choices(example_driving_audios[male_key], example_driving_audio_base_dir)
+)
+
+example_driving_audio_texts = [
+    "The 2026 World Cup final match is in New York.",
+    "Enhance efficiency and cut costs with AI.",
+    "A bee's wings beat more than 200 times per second.",
+    "2026년 월드컵 결승전은 뉴욕에서 열립니다.",
+    "AI로 효율성을 높이고 비용을 절감하세요.",
+    "벌은 초당 200회 이상의 날개짓을 합니다.",
+    "2026 विश्व कप फाइनल मैच न्यूयॉर्क में होगा।",
+    "AI के साथ दक्षता बढ़ाएं और लागत कम करें।",
+    "मधुमक्खी के पंख सेकंड में 200 बार से अधिक फड़फड़ाते हैं।",
+    "Welcome to our kiosk, where you can easily purchase tickets, or access various services by simply tapping the display!",
+    "Catch all the drama, emotion, and energy in my new film, now available on Netflix—it's a must-watch!",
+    "This season of IPL is full of surprises, and I’d love to see you supporting us as we fight for victory on the ground.",
+    "Transform your health with our latest fitness programs! Join us today and take the first step toward a stronger, energized you.",
+    "A big black bug bit a big black dog on his big black nose.",
+    "Fuzzy Wuzzy was a bear. Fuzzy Wuzzy had no hair. Fuzzy Wuzzy wasn't very fuzzy, was he?",
+]
 
 example_showcase_dir = os.path.join(data_dir, "showcase_examples")
 examples_showcase = {
@@ -177,6 +181,11 @@ def update_voices(media_path):
     )
     return driving_input_voice
 
+def update_audio_tabs_visibility(motion_type):
+    if motion_type == "talking":
+        return gr.update(visible=True), gr.update(visible=True)
+    else:
+        return gr.update(visible=False), gr.update(visible=False)
 
 def task_executor_fn(
     input_base_path, base_motion_expression, input_driving_audio_path, driving_text_input, driving_voice_input
@@ -186,94 +195,256 @@ def task_executor_fn(
         input_base_path, base_motion_expression, input_driving_audio_path, driving_text_input, driving_voice_input
     )
 
+def check_and_convert_video_fps(video_path):
+    if not video_path:
+        return None
+
+    try:
+        _, is_video, _, width, height, duration, fps = CommonUtil.get_media_properties(video_path)
+        if not is_video:
+            raise gr.Error("Not a video file")
+
+        if not CommonUtil.check_dim(width, height):
+            min_dim = CommonUtil.valid_min_media_dim
+            max_dim = CommonUtil.valid_max_media_dim
+            raise gr.Error(f"⚠️ Video dimensions must be between {min_dim}-{max_dim} pixels.\n\nCurrent size: {width}(w)x{height}(h)")
+
+        if not CommonUtil.check_duration(duration):
+            min_duration = CommonUtil.valid_min_media_duration
+            max_duration = CommonUtil.valid_max_media_duration
+            raise gr.Error(f"⚠️ Video duration must be between {min_duration}-{max_duration} seconds.\n\nCurrent duration: {duration}s")
+
+        if CommonUtil.check_fps(fps):
+            return video_path
+
+        target_fps = CommonUtil.valid_video_fps
+        print(f"Converting video from {fps}fps to {target_fps}fps: {video_path}")
+
+        temp_dir = tempfile.mkdtemp()
+        base_name = os.path.splitext(os.path.basename(video_path))[0]
+        converted_path = os.path.join(temp_dir, f"{base_name}_{target_fps}fps.mp4")
+
+        CommonUtil.change_video_fps(video_path, converted_path, fps=target_fps)
+
+        temp_video_files.add(converted_path)
+
+        return converted_path
+
+    except gr.Error:
+        # Re-raise gr.Error to show notification
+        raise
+    except Exception as e:
+        print(f"Error processing video FPS: {e}")
+        raise gr.Error(f"Error processing video: {str(e)}")
+
+
+def check_and_validate_image(image_path):
+    """Check and validate image properties"""
+    if not image_path:
+        return None
+
+    try:
+        is_image, _, _, width, height, _, _ = CommonUtil.get_media_properties(image_path)
+
+        if not is_image:
+            raise gr.Error("⚠️ Not an image file. Please upload a valid image file.")
+
+        if not CommonUtil.check_dim(width, height):
+            min_dim = CommonUtil.valid_min_media_dim
+            max_dim = CommonUtil.valid_max_media_dim
+            raise gr.Error(f"⚠️ Image dimensions must be between {min_dim}-{max_dim} pixels.\n\nCurrent size: {width}(w)x{height}(h)")
+
+        return image_path
+
+    except gr.Error:
+        # Re-raise gr.Error to show notification
+        raise
+    except Exception as e:
+        print(f"Error validating image: {e}")
+        raise gr.Error(f"❌ Error processing image: {str(e)}")
+
+def process_video_input(video_path):
+    if not video_path:
+        return None
+
+    converted_path = check_and_convert_video_fps(video_path)
+    print(f"Video processing result: {converted_path}")
+
+    return converted_path
+
+
+def cleanup_temp_video_files():
+    global temp_video_files
+    for temp_file in temp_video_files:
+        try:
+            if os.path.exists(temp_file):
+                os.remove(temp_file)
+                print(f"Cleaned up temporary file: {temp_file}")
+        except Exception as e:
+            print(f"Error cleaning up {temp_file}: {e}")
+
+    # Clear the set
+    temp_video_files.clear()
+
 with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")])) as demo_image:
     with gr.Row():
         # Step 1: Choose Image
         with gr.Column(scale=4):
-            gr.Markdown("### Step 1: Choose Image")
-            gr.Markdown("Upload or select an example image to drive.")
-            with gr.Accordion(open=True, label="Base Image"):
-                base_image_input = gr.Image(type="filepath", sources="upload", height=media_height)
-                gr.Examples(
-                    examples=[[example] for example in example_base_images[female_key]],
-                    inputs=[base_image_input],
-                    cache_examples=False,
-                    label="Female",
-                )
-                gr.Examples(
-                    examples=[[example] for example in example_base_images[male_key]],
-                    inputs=[base_image_input],
-                    cache_examples=False,
-                    label="Male",
-                )
+            gr.Markdown("### STEP 1 - SELECT IMAGE")
+            base_image_input = gr.Image(label="IMAGE", type="filepath", sources="upload", height=media_height, interactive=True)
+            gr.Examples(
+                examples=[[example] for example in example_base_images[female_key]],
+                inputs=[base_image_input],
+                fn=lambda x: x,
+                outputs=[base_image_input],
+                cache_examples=False,
+                label="Female",
+            )
+            gr.Examples(
+                examples=[[example] for example in example_base_images[male_key]],
+                inputs=[base_image_input],
+                fn=lambda x: x,
+                outputs=[base_image_input],
+                cache_examples=False,
+                label="Male",
+            )
 
         # Step 2: Motion and Audio/TTS
         with gr.Column(scale=4):
-            gr.Markdown("### Step 2: Motion and Audio/TTS")
-            gr.Markdown("Select motion and provide audio or text for lip-sync.")
-            with gr.Accordion(open=True, label="Base Motion"):
-                base_motion_expression = gr.Radio(
-                    choices=valid_base_motion_expressions,
-                    label="Select base motion",
-                    value=get_default_base_motion_expression(),
-                )
+            gr.Markdown("### STEP 2 - SELECT MOTION & AUDIO")
+            base_motion_expression = gr.Radio(
+                choices=valid_talking_expressions,
+                value=valid_talking_expressions[0],
+                visible=False,
+            )
+
             with gr.Tabs():
-                with gr.TabItem("Driving Audio: File") as tab_audio_file:
-                    with gr.Accordion(open=True, label="Driving Audio: From File"):
-                        driving_audio_input = gr.Audio(sources=["upload"], type="filepath")
-                        gr.Examples(
-                            examples=[[example] for example in example_driving_audios[female_key]],
-                            inputs=[driving_audio_input],
-                            cache_examples=False,
-                            examples_per_page=18,
-                            label="Female",
-                        )
-                        gr.Examples(
-                            examples=[[example] for example in example_driving_audios[male_key]],
-                            inputs=[driving_audio_input],
-                            cache_examples=False,
-                            examples_per_page=18,
-                            label="Male",
+                with gr.TabItem("TALKING MOTION") as tab_talking_motion:
+                    base_talking_expression = gr.Radio(
+                        choices=valid_talking_expressions,
+                        label="STEP 2.1 - TALKING MOTION",
+                        value=valid_talking_expressions[0],
+                    )
+                with gr.TabItem("EXPRESSION MOTION") as tab_expression_motion:
+                    base_expression_expression = gr.Radio(
+                        choices=valid_nontalking_expressions,
+                        label="STEP 2 - EXPRESSION MOTION",
+                        value=None,
                     )
 
-                with gr.TabItem("Driving Audio: TTS") as tab_audio_tts:
-                    with gr.Accordion(open=True, label="Driving Audio: From Text"):
-                        driving_input_voice = gr.Dropdown(
-                            choices=voices[unknown_key], value=voices[unknown_key][0], label="Voice"
-                        )
-                        driving_text_input = gr.Textbox(
-                            label="Input Text (300 characters max)",
-                            lines=2,
-                        )
-                        for group in audio_text_groups:
-                            gr.Examples(
-                                examples=[[example] for example in example_driving_audio_texts[group]],
-                                inputs=[driving_text_input],
-                                cache_examples=False,
-                                label=group,
-                            )
+            with gr.Tabs():
+                with gr.TabItem("DRIVING AUDIO: FILE") as tab_audio_file:
+                    driving_audio_input = gr.File(label="STEP 2.2 - AUDIO FILE", file_types=[".mp3", ".wav"], type="filepath", height=287)
+                    example_driving_audio_dropdown = gr.Dropdown(
+                        choices=example_driving_audio_dropdown_choices,
+                        value=None,
+                        label="OR SELECT FROM EXAMPLES",
+                        interactive=True,
+                        allow_custom_value=False
+                    )
+
+                    def update_audio_input(selected_audio):
+                        return selected_audio if selected_audio else None
+
+                    example_driving_audio_dropdown.change(
+                        fn=update_audio_input,
+                        inputs=[example_driving_audio_dropdown],
+                        outputs=[driving_audio_input]
+                    )
+
+                with gr.TabItem("DRIVING AUDIO: TTS") as tab_audio_tts:
+                    driving_input_voice = gr.Dropdown(
+                        choices=voices[unknown_key], value=voices[unknown_key][0], label="STEP 2.2 - VOICE"
+                    )
+                    driving_text_input = gr.Textbox(
+                        label="INPUT TEXT (300 characters max)",
+                        lines=2,
+                    )
+                    example_text_dropdown = gr.Dropdown(
+                        choices=example_driving_audio_texts,
+                        value=None,
+                        label="OR SELECT FROM EXAMPLES",
+                        interactive=True,
+                        allow_custom_value=False
+                    )
+
+                    def update_text_input(selected_text):
+                        return selected_text if selected_text else ""
+
+                    example_text_dropdown.change(
+                        fn=update_text_input,
+                        inputs=[example_text_dropdown],
+                        outputs=[driving_text_input]
+                    )
+            process_button_animation = gr.Button("🌟 Generate", variant="primary", elem_classes=["generate-button"])
 
         # Step 3: Result
         with gr.Column(scale=4):
-            gr.Markdown("### Step 3: Result")
-            gr.Markdown("Generate and view the output video.")
-            process_button_animation = gr.Button("🌟 Generate", variant="primary")
-            output_video_i2v = gr.Video(autoplay=True, label="The Output Video", height=media_height)
-            message = gr.Textbox(label="Info")
+            gr.Markdown("### RESULT")
+            output_video_i2v = gr.Video(autoplay=True, label="OUTPUT VIDEO", height=512, show_download_button=True)
+            message = gr.Textbox(label="INFO", max_lines=8)
             process_button_reset = gr.ClearButton(
                 [
                     base_image_input,
+                    base_motion_expression,
+                    base_talking_expression,
+                    base_expression_expression,
                     driving_audio_input,
                     driving_text_input,
                     driving_input_voice,
+                    example_text_dropdown,
+                    example_driving_audio_dropdown,
                     output_video_i2v,
                 ],
                 value="🧹 Clear",
+                variant="secondary",
             )
 
-        base_image_input.change(fn=update_voices, inputs=[base_image_input], outputs=[driving_input_voice])
+    def process_image_and_update_voices(image_path):
+        validated_image = check_and_validate_image(image_path)
+
+        voice_dropdown = update_voices(validated_image)
+
+        return validated_image, voice_dropdown
+
+    base_image_input.change(
+        fn=process_image_and_update_voices,
+        inputs=[base_image_input],
+        outputs=[base_image_input, driving_input_voice]
+    )
+
+    base_talking_expression.change(
+        fn=lambda x: x,
+        inputs=[base_talking_expression],
+        outputs=[base_motion_expression],
+    )
+
+    base_expression_expression.change(
+        fn=lambda x: gr.update(value=x),
+        inputs=[base_expression_expression],
+        outputs=[base_motion_expression],
+    )
+
+    def update_talking_tab():
+        audio_visibility = update_audio_tabs_visibility("talking")
+        return audio_visibility[0], audio_visibility[1], gr.update(choices=valid_talking_expressions, value=valid_talking_expressions[0])
+
+    def update_expression_tab():
+        audio_visibility = update_audio_tabs_visibility("expression")
+        return audio_visibility[1], audio_visibility[0], gr.update(choices=valid_nontalking_expressions, value=valid_nontalking_expressions[0])
+
+    tab_talking_motion.select(
+        fn=update_talking_tab,
+        inputs=[],
+        outputs=[tab_audio_file, tab_audio_tts, base_motion_expression],
+    )
+
+    tab_expression_motion.select(
+        fn=update_expression_tab,
+        inputs=[],
+        outputs=[tab_audio_file, tab_audio_tts, base_motion_expression],
+    )
 
-    # binding functions for buttons
     process_button_animation.click(
         fn=task_executor_fn,
         inputs=[
@@ -291,76 +462,107 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San
     with gr.Row():
         # Step 1: Choose Video
        with gr.Column(scale=4):
-            gr.Markdown("### Step 1: Choose Video")
-            gr.Markdown("Upload or select an example video to drive.")
-            with gr.Accordion(open=True, label="Base Video"):
-                base_video_input = gr.Video(sources="upload", height=media_height, interactive=True)
-                gr.Examples(
-                    examples=[[example] for example in example_source_videos[female_key]],
-                    inputs=[base_video_input],
-                    cache_examples=False,
-                    label="Female",
-                )
-                gr.Examples(
-                    examples=[[example] for example in example_source_videos[male_key]],
-                    inputs=[base_video_input],
-                    cache_examples=False,
-                    label="Male",
-                )
+            gr.Markdown("### STEP 1 - SELECT VIDEO")
+            base_video_input = gr.Video(label="VIDEO", sources="upload", height=media_height, interactive=True)
+            gr.Examples(
+                examples=[[example] for example in example_source_videos[female_key]],
+                inputs=[base_video_input],
+                fn=lambda x: x,
+                outputs=[base_video_input],
+                cache_examples=False,
+                label="Female",
+                elem_id="female-video-examples"
+            )
+            gr.Examples(
+                examples=[[example] for example in example_source_videos[male_key]],
+                inputs=[base_video_input],
+                fn=lambda x: x,
+                outputs=[base_video_input],
+                cache_examples=False,
+                label="Male",
+                elem_id="male-video-examples"
+            )
 
         # Step 2: Audio/TTS
         with gr.Column(scale=4):
-            gr.Markdown("### Step 2: Audio/TTS")
-            gr.Markdown("Provide audio or text for lip-sync.")
+            gr.Markdown("### STEP 2 - SELECT AUDIO")
            with gr.Tabs():
-                with gr.TabItem("Driving Audio: File") as tab_audio_file:
-                    with gr.Accordion(open=True, label="Driving Audio: From File"):
-                        driving_audio_input = gr.Audio(sources=["upload"], type="filepath")
-                        gr.Examples(
-                            examples=[[example] for example in example_driving_audios[female_key]],
-                            inputs=[driving_audio_input],
-                            cache_examples=False,
-                            examples_per_page=18,
-                            label="Female",
-                        )
-                        gr.Examples(
-                            examples=[[example] for example in example_driving_audios[male_key]],
-                            inputs=[driving_audio_input],
-                            cache_examples=False,
-                            examples_per_page=18,
-                            label="Male",
-                        )
-                with gr.TabItem("Driving Audio: TTS") as tab_audio_tts:
-                    with gr.Accordion(open=True, label="Driving Audio: From Text"):
-                        driving_input_voice = gr.Dropdown(
-                            choices=voices[unknown_key], value=voices[unknown_key][0], label="Voice"
-                        )
-                        driving_text_input = gr.Textbox(
-                            label="Input Text (300 characters max)",
-                            lines=2,
-                        )
-                        for group in audio_text_groups:
-                            gr.Examples(
-                                examples=[[example] for example in example_driving_audio_texts[group]],
-                                inputs=[driving_text_input],
-                                cache_examples=False,
-                                label=group,
-                            )
+                with gr.TabItem("DRIVING AUDIO: FILE") as tab_audio_file:
+
+                    driving_audio_input = gr.File(label="AUDIO", file_types=[".mp3", ".wav"], type="filepath", height=454)
+                    example_driving_audio_dropdown = gr.Dropdown(
+                        choices=example_driving_audio_dropdown_choices,
+                        value=None,
+                        label="OR SELECT FROM EXAMPLES",
+                        interactive=True,
+                        allow_custom_value=False
+                    )
+
+                    def update_audio_input(selected_audio):
+                        return selected_audio if selected_audio else None
+
+                    example_driving_audio_dropdown.change(
+                        fn=update_audio_input,
+                        inputs=[example_driving_audio_dropdown],
+                        outputs=[driving_audio_input]
+                    )
+
+                with gr.TabItem("DRIVING AUDIO: TTS") as tab_audio_tts:
+
+                    driving_input_voice = gr.Dropdown(
+                        choices=voices[unknown_key], value=voices[unknown_key][0], label="VOICE"
+                    )
+                    driving_text_input = gr.Textbox(
+                        label="INPUT TEXT (300 characters max)",
+                        lines=5,
+                    )
+                    example_text_dropdown = gr.Dropdown(
+                        choices=example_driving_audio_texts,
+                        value=None,
+                        label="OR SELECT FROM EXAMPLES",
+                        interactive=True,
+                        allow_custom_value=False
+                    )
+
+                    def update_text_input(selected_text):
+                        return selected_text if selected_text else ""
+
+                    example_text_dropdown.change(
+                        fn=update_text_input,
+                        inputs=[example_text_dropdown],
+                        outputs=[driving_text_input]
+                    )
+            process_button_animation = gr.Button("🌟 Generate", variant="primary", elem_classes=["generate-button"])
         # Step 3: Result
         with gr.Column(scale=4):
-            gr.Markdown("### Step 3: Result")
-            gr.Markdown("Generate and view the output video.")
-            process_button_animation = gr.Button("🌟 Generate", variant="primary")
-            output_video_i2v = gr.Video(autoplay=True, label="The Output Video", height=media_height)
-            message = gr.Textbox(label="Info")
-            process_button_reset = gr.ClearButton(
-                [base_video_input, driving_audio_input, driving_text_input, driving_input_voice, output_video_i2v],
-                value="🧹 Clear",
+            gr.Markdown("### RESULT")
+            output_video_i2v = gr.Video(autoplay=True, label="OUTPUT VIDEO", height=512, show_download_button=True)
+            message = gr.Textbox(label="INFO", max_lines=8)
+            process_button_reset = gr.Button("🧹 Clear", variant="secondary")
+
+            def clear_all():
+                cleanup_temp_video_files()
+                return None, None, None, None, None
+
+            process_button_reset.click(
+                fn=clear_all,
+                inputs=[],
+                outputs=[base_video_input, driving_audio_input, driving_text_input, driving_input_voice, output_video_i2v]
             )
 
-        base_video_input.change(fn=update_voices, inputs=[base_video_input], outputs=[driving_input_voice])
+    def process_video_and_update_voices(video_path):
+        processed_video = process_video_input(video_path)
+
+        voice_dropdown = update_voices(processed_video)
+
+        return processed_video, voice_dropdown
+
+    base_video_input.change(
+        fn=process_video_and_update_voices,
+        inputs=[base_video_input],
+        outputs=[base_video_input, driving_input_voice]
+    )
 
-    # binding functions for buttons
     base_motion_expression = gr.Radio(value=None, visible=False)
     process_button_animation.click(
         fn=task_executor_fn,
@@ -376,7 +578,7 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San
     )
 
 with gr.Blocks() as showcase_examples:
-    gr.Markdown("# Make Image Talk")
+    gr.Markdown("# IMAGE TO AVATAR")
     with gr.Row():
         with gr.Column(scale=7):
             for path in examples_showcase["make_image_talk_multilingual"]:
@@ -395,7 +597,7 @@ with gr.Blocks() as showcase_examples:
             for path in examples_showcase['make_image_talk_selfie']:
                 gr.Video(value=path, label=os.path.basename(path), height=430)
 
-    gr.Markdown("# Make Video Talk")
+    gr.Markdown("# VIDEO TO AVATAR")
     with gr.Row():
         with gr.Column(scale=7):
             for path in examples_showcase["make_video_talk_multilingual"]:
@@ -407,7 +609,7 @@ with gr.Blocks() as showcase_examples:
            for path in examples_showcase["make_video_talk_rap_multii"]:
                gr.Video(value=path, label=os.path.basename(path), height=500)
 
-    gr.Markdown("# Dubbing")
+    gr.Markdown("# VIDEO TO AVATAR: DUBBING")
    with gr.Row():
        for path in examples_showcase["dubbing_superpowerman"]:
            gr.Video(value=path, label=os.path.basename(path), height=320)
@@ -415,19 +617,18 @@ with gr.Blocks() as showcase_examples:
        for path in examples_showcase["dubbing_coffee"]:
            gr.Video(value=path, label=os.path.basename(path), height=440)
 
-with gr.Blocks(analytics_enabled=False, css="footer{display:none !important}", title="SUTRA Avatar v2") as demo:
-    gr.Markdown(
-        """
-        ## <img src="https://playground.two.ai/sutra.svg" height="20"/>
-        """
-    )
-    title = "# 🌟 SUTRA Avatar v2 🌟\n## Drive Image or Video with LipSync from Audio or Text"
-    gr.Markdown(title)
-    gr.Markdown(description)
+with gr.Blocks(analytics_enabled=False,
+               css="footer{display:none !important} .generate-button{margin-top:-10px !important;} #female-video-examples .gallery *, #male-video-examples .gallery *{height:142.1px !important; min-height:142.1px !important; max-height:142.1px !important;} #female-video-examples .gallery img, #male-video-examples .gallery img, #female-video-examples .gallery video, #male-video-examples .gallery video{width:80px !important; height:142.1px !important; object-fit:cover !important; min-height:142.1px !important; max-height:142.1px !important;} #female-video-examples .gallery > div, #male-video-examples .gallery > div{width:80px !important; height:142.1px !important; min-height:142.1px !important; max-height:142.1px !important; margin:2px !important;} .logo-left{text-align:left !important; margin:0 !important; padding:0 !important; border:none !important; outline:none !important; box-shadow:none !important; min-height:auto !important; height:auto !important; overflow:visible !important;} .logo-left > div{text-align:left !important; margin:0 !important; padding:0 !important; overflow:visible !important;} .logo-left img{height:45px !important; min-height:45px !important; max-height:45px !important;} .logo-right{text-align:right !important; margin:0 !important; padding:0 !important; border:none !important; outline:none !important; box-shadow:none !important; min-height:auto !important; height:auto !important; overflow:visible !important; display:flex !important; justify-content:flex-end !important; align-items:center !important;} .logo-right > div{text-align:right !important; margin:0 !important; padding:0 !important; overflow:visible !important; width:100% !important; display:flex !important; justify-content:flex-end !important;} .logo-right img{height:70px !important; min-height:70px !important; max-height:70px !important;}",
+               title="SUTRA Avatar v2") as demo:
+    with gr.Row():
+        with gr.Column(scale=10):
+            gr.HTML(value="<img src='data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNTgiIGhlaWdodD0iMTUiIHZpZXdCb3g9IjAgMCA1OCAxNSIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPGcgY2xpcC1wYXRoPSJ1cmwoI2NsaXAwXzNfMikiPgo8cGF0aCBkPSJNNS43MjE5MSAxNC43ODQ0QzIuNDY5NDUgMTQuNzg0NCAwLjYwMjI5NSAxMy4wMDQ0IDAuNjAyMjk1IDkuODY0NDVIMi45MTExNEMyLjkxMTE0IDExLjY4NDQgMy45NTUxNCAxMi43ODQ0IDUuNzAxODMgMTIuNzg0NEM3LjIyNzY4IDEyLjc4NDQgOC4wNzA5MSAxMi4wODQ0IDguMDcwOTEgMTAuODI0NEM4LjA3MDkxIDkuNzY0NDUgNy41Njg5OSA5LjIyNDQ1IDUuOTgyOTEgOC41ODQ0NUwzLjY5NDE0IDcuNjY0NDVDMS45MjczNyA2Ljk4NDQ1IDEuMDAzODMgNS43MjQ0NSAxLjAwMzgzIDMuOTQ0NDVDMS4wMDM4MyAxLjY2NDQ1IDIuODUwOTEgMC4xMDQ0NDYgNS41MDEwNiAwLjEwNDQ0NkM4LjI5MTc2IDAuMTA0NDQ2IDEwLjEzODggMS44NDQ0NSAxMC4xMzg4IDQuNDg0NDVINy44Mjk5OUM3LjgyOTk5IDIuOTQ0NDUgNi45ODY3NiAyLjA2NDQ1IDUuNDIwNzYgMi4wNjQ0NUM0LjA5NTY4IDIuMDY0NDUgMy4zMzI3NiAyLjc0NDQ1IDMuMzMyNzYgMy44MDQ0NUMzLjMzMjc2IDQuNzY0NDUgNC4wMTUzNyA1LjQwNDQ1IDUuNjYxNjggNi4wNjQ0NUw3LjcyOTYgNi45MDQ0NUM5LjQ5NjM3IDcuNjI0NDUgMTAuMzc5OCA4Ljg2NDQ1IDEwLjM3OTggMTAuNzY0NEMxMC4zNzk4IDEzLjE2NDQgOC41MTI2IDE0Ljc4NDQgNS43MjE5MSAxNC43ODQ0Wk0xNy41MDIyIDE0Ljc4NDRDMTQuMzUwMSAxNC43ODQ0IDEyLjI4MjIgMTMuMDY0NCAxMi4yODIyIDkuODA0NDVWMC40NDQ0NDVIMTQuNTkxMVY5LjY2NDQ1QzE0LjU5MTEgMTEuNjg0NCAxNS43MzU0IDEyLjcyNDQgMTcuNTAyMiAxMi43MjQ0QzE5LjI2OSAxMi43MjQ0IDIwLjQxMzQgMTEuNjg0NCAyMC40MTM0IDkuNjY0NDVWMC40NDQ0NDVIMjIuNzIyMlY5LjgwNDQ1QzIyLjcyMjIgMTMuMDY0NCAyMC42NTQzIDE0Ljc4NDQgMTcuNTAyMiAxNC43ODQ0Wk0yOC4wOTU3IDE0LjQ0NDRWMi42MDQ0NUgyNC4wMjAxVjAuNDQ0NDQ1SDM0LjUwMDNWMi42MDQ0NUgzMC40MDQ2VjE0LjQ0NDRIMjguMDk1N1pNNDYuMTA5MyA1LjM0NDQ1QzQ2LjEwOTMgNy41MjQ0NSA0NC45MjQ4IDkuMjQ0NDUgNDMuMTE3OSA5LjkyNDQ1TDQ1Ljg4ODUgMTQuNDQ0NEg0My4yNTg0TDQwLjY4ODUgMTAuMjQ0NEgzOC40MTk5VjE0LjQ0NDRIMzYuMTExVjAuNDQ0NDQ1SDQxLjI5MDlDNDQuMTAxNiAwLjQ0NDQ0NSA0Ni4xMDkzIDIuNDg0NDUgNDYuMTA5MyA1LjM0NDQ1Wk0zOC40MTk5IDIuNTA0NDVWOC4xODQ0NUg0MS4xNTAzQzQyLjY3NjIgOC4xODQ0NSA0My44MDA1IDYuOTg0NDUgNDMuODAwNSA1LjM0NDQ1QzQzLjgwMDUgMy43MDQ0NSA0Mi42NzYyIDIuNTA0NDUgNDEuMTUwMyAyLjUwNDQ1SDM4LjQxOTlaIiBmaWxsPSIjMzA2MEZGIi8+CjxwYXRoIGQ9Ik01NS4zMzgxIDExLjY2NjdMNTQuOTAxNSAxMC4yMzI4SDUwLjU1MDlMNTAuMTE0MiAxMS42NjY3SDQ4LjA5MjZMNTEuOTA5NCAwLjM4ODg4NUg1My41NDI5TDU3LjM1OTggMTEuNjY2N0g1NS4zMzgxWk01MS4wMzYxIDguNTczMzNINTQuNDAwMUw1Mi43MTgxIDIuOTgyNzhMNTEuMDM2MSA4LjU3MzMzWiIgZmlsbD0iIzMwNjBGRiIvPgo8cGF0aCBkPSJNNTggMTIuODMzM0g0Ny40MDM4VjE0LjVINThWMTIuODMzM1oiIGZpbGw9IiMzMDYwRkYiLz4KPC9nPgo8ZGVmcz4KPGNsaXBQYXRoIGlkPSJjbGlwMF8zXzIiPgo8cmVjdCB3aWR0aD0iNTgiIGhlaWdodD0iMTUiIGZpbGw9IndoaXRlIi8+CjwvY2xpcFBhdGg+CjwvZGVmcz4KPC9zdmc+' />", elem_classes=["logo-left"])
+        with gr.Column(scale=2):
+            gr.HTML(value="<img src='data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjAwIiBoZWlnaHQ9IjgwIiB2aWV3Qm94PSIwIDAgMjAwIDgwIiBmaWxsPSJub25lIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPgo8dGV4dCB4PSIxMCIgeT0iNTUiIGZvbnQtZmFtaWx5PSJBcmlhbCwgc2Fucy1zZXJpZiIgZm9udC1zaXplPSI0OCIgZm9udC13ZWlnaHQ9ImJvbGQiIGZpbGw9IiMwMDAwMDAiPkFWQVRBUjwvdGV4dD4KPC9zdmc+' />", elem_classes=["logo-right"])
 
     gr.TabbedInterface(
         interface_list=[demo_image, demo_video, showcase_examples],
-        tab_names=["Drive Image", "Drive Video", "Showcase Examples"],
+        tab_names=["IMAGE to AVATAR", "VIDEO to AVATAR", "SHOWCASE"],
     )
 
 if __name__ == "__main__":
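The dropdown-as-example-picker wiring used in both audio tabs reduces to this minimal standalone sketch (the label/path pair is hypothetical; gr.Dropdown accepts (label, value) tuples, so the short label is displayed while the full path is passed through):

import gradio as gr

# One (label, value) choice; the label is shown, the path is the value.
choices = [("female/hello.wav", "./data/input_audio/gradio/female/hello.wav")]

with gr.Blocks() as sketch:
    audio_file = gr.File(label="AUDIO FILE", type="filepath")
    picker = gr.Dropdown(choices=choices, value=None, label="OR SELECT FROM EXAMPLES")
    # Copy the selected example path into the file input, as app.py does.
    picker.change(fn=lambda v: v if v else None, inputs=[picker], outputs=[audio_file])

if __name__ == "__main__":
    sketch.launch()

Returning None for an empty selection means clearing the dropdown also clears the file input.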
 
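The temporary-file handling in the new video path follows one small lifecycle: re-encode into a fresh temp directory, record the path in the module-level temp_video_files set, and delete everything on Clear. A condensed, runnable sketch of that lifecycle, with an empty-file write standing in for the real CommonUtil.change_video_fps re-encode:

import os
import tempfile

temp_video_files = set()  # module-level registry, as in app.py

def convert_to_temp(src_path, target_fps=30):
    # Write the "converted" file into its own temp directory and track it.
    temp_dir = tempfile.mkdtemp()
    base = os.path.splitext(os.path.basename(src_path))[0]
    dst = os.path.join(temp_dir, f"{base}_{target_fps}fps.mp4")
    open(dst, "wb").close()  # stand-in for the ffmpeg re-encode
    temp_video_files.add(dst)
    return dst

def cleanup_temp_video_files():
    for path in list(temp_video_files):
        if os.path.exists(path):
            os.remove(path)
    temp_video_files.clear()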
base_task_executor.py CHANGED
@@ -35,40 +35,12 @@ def get_name_ext(filepath):
     return name, ext
 
 
-def sanitize_string(string):
-    sanitized_string = re.sub(r"[^A-Za-z0-9]", "", string)
-    max_len = 15
-    return sanitized_string[:max_len]
-
-
-def get_output_video_name(
-    input_base_path, input_driving_path, base_motion_expression, input_driving_audio_path, tag=""
-):
-    if not tag:
-        tag = get_formatted_datetime_name()
-
-    base_name, _ = get_name_ext(input_base_path)
-    base_name = sanitize_string(base_name)
-
-    driving_name = ""
-    if input_driving_path:
-        driving_name, _ = get_name_ext(input_driving_path)
-        driving_name = sanitize_string(driving_name)
-    elif base_motion_expression and is_image(input_base_path):
-        driving_name = base_motion_expression
-
-    audio_name = ""
-    if input_driving_audio_path:
-        audio_name, _ = get_name_ext(input_driving_audio_path)
-        audio_name = sanitize_string(audio_name)
-
-    output_video_name = f"{tag}--b-{base_name}"
-
-    if driving_name:
-        output_video_name += f"--d-{driving_name}"
-
-    if audio_name:
-        output_video_name += f"--a-{audio_name}"
+def get_output_video_name():
+
+    tag = get_formatted_datetime_name()
+
+    output_video_name = f"sutra-avatar-{tag}"
     return output_video_name
 
 
@@ -143,11 +115,10 @@ class BaseTaskExecutor(ABC):
         request_id = get_unique_name(maxd=8, delim="")
         output_video_path = os.path.join(
             self.tmp_dir,
-            get_output_video_name(
-                input_base_path, input_driving_path, base_motion_expression, input_driving_audio_path
-            )
+            get_output_video_name()
             + ".mp4",
         )
+        time_start = time.time()
         result, output_video_path = self.generate(
             input_base_path,
             input_driving_path,
@@ -156,12 +127,30 @@
             output_video_path,
             request_id,
         )
+        time_end = time.time()
+        pipeline_time = int((time_end - time_start) * 1000)
         success = result["success"]
         messages = result["messages"]
 
+        if "tlpMetrics" in result:
+            tlp_metrics = result["tlpMetrics"]
+            if isinstance(tlp_metrics, dict):
+                n_frames = tlp_metrics.get('nFrames', 'N/A')
+                tlp_msec = tlp_metrics.get('tlpMsec', 'N/A')
+                metrics_str = f"Frame per second: 30\nNumber of Frames: {n_frames}\nPipeline Time: {tlp_msec}ms"
+                messages += metrics_str
+            else:
+                messages += f"\n{tlp_metrics}"
+        if "n_frames" in result:
+            n_frames = result.get("n_frames", "N/A")
+            messages += "Frame per second: 30"
+            messages += f"\nNumber of Frames: {n_frames}"
+            messages += f"\nPipeline Time: {pipeline_time}ms"
+
         self.clean(output_dir)
 
         if success:
+            print(f"output_video_path: {output_video_path}")
             return output_video_path, gr.update(visible=True), messages
         else:
             gr.Info("Task could not be completed", duration=4)
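With the simplification above, every result is named sutra-avatar-<timestamp>.mp4 instead of encoding the base, driving, and audio names. An illustration of the scheme; get_formatted_datetime_name is defined elsewhere in this repo, so the strftime format below is only an assumed stand-in:

import datetime

def get_formatted_datetime_name():
    # Assumed stand-in; the repo's real helper may use a different format.
    return datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

def get_output_video_name():
    tag = get_formatted_datetime_name()
    return f"sutra-avatar-{tag}"

print(get_output_video_name() + ".mp4")  # e.g. sutra-avatar-20250101-120000.mp4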
cloud_task_executor.py CHANGED
@@ -112,7 +112,7 @@ class CloudTaskExecutor(BaseTaskExecutor):
         timeout += estimatedWaitSeconds
         start_time = time.time()
 
-        result = {"messages": ''}
+        result = {}
         while True:
             status_reply = self.get_task_status(request_id)
             task_status = status_reply["taskStatus"]
@@ -133,6 +133,7 @@
                 pipe_reply = status_reply["pipeReply"]
                 result["success"] = pipe_reply["status"] == "success"
                 result["messages"] = pipe_reply["messages"]
+                result["tlpMetrics"] = pipe_reply["tlpMetrics"]
                 output_video_path = status_reply["videoURL"]
             else:
                 messages = ""
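One caveat: the added line indexes pipe_reply["tlpMetrics"] directly, so a status reply that omits the field would raise KeyError before result is populated. If older backends can still answer, a guarded read keeps the "tlpMetrics" in result check in base_task_executor.py working unchanged; a small sketch with a sample reply:

# A real pipe_reply comes from the task-status API; this sample omits the new field.
pipe_reply = {"status": "success", "messages": "ok"}

result = {
    "success": pipe_reply["status"] == "success",
    "messages": pipe_reply["messages"],
}
if "tlpMetrics" in pipe_reply:  # tolerate replies without metrics
    result["tlpMetrics"] = pipe_reply["tlpMetrics"]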
common_util.py ADDED
@@ -0,0 +1,110 @@
+import subprocess
+import ffmpeg
+import imagesize
+
+class CommonUtil:
+    valid_image_exts = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp")
+    valid_video_exts = (".mp4", ".mov", ".avi", ".webm")
+    valid_audio_exts = (".mp3", ".wav")
+    valid_template_ext = ".npz"
+
+    valid_min_media_dim = 480  # pixels
+    valid_max_media_dim = 3840
+
+    valid_min_media_duration = 0.1  # seconds
+    valid_max_media_duration = 120  # seconds
+
+    valid_min_sample_rate = 16000
+    valid_max_sample_rate = 44100
+
+    valid_video_fps = 30  # fps
+
+    @staticmethod
+    def check_dim(width, height):
+        min_d = CommonUtil.valid_min_media_dim
+        max_d = CommonUtil.valid_max_media_dim
+        if width < min_d or width > max_d or height < min_d or height > max_d:
+            return False
+        return True
+
+    @staticmethod
+    def check_duration(duration):
+        if duration < CommonUtil.valid_min_media_duration:
+            return False
+
+        if duration > CommonUtil.valid_max_media_duration:
+            return False
+
+        return True
+
+    @staticmethod
+    def check_fps(fps):
+        if fps != CommonUtil.valid_video_fps:
+            return False
+        return True
+
+    @staticmethod
+    def get_audio_stream(video_path):
+        probe = ffmpeg.probe(video_path)
+        return next((stream for stream in probe["streams"] if stream["codec_type"] == "audio"), None)
+
+    @staticmethod
+    def get_video_stream(video_path):
+        probe = ffmpeg.probe(video_path)
+        return next((stream for stream in probe["streams"] if stream["codec_type"] == "video"), None)
+
+    @staticmethod
+    def exec_cmd(cmd):
+        return subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    @staticmethod
+    def get_media_properties(media):
+        is_image = CommonUtil.is_image(media)
+        is_video = CommonUtil.is_video(media)
+        is_audio = CommonUtil.is_audio(media)
+
+        if is_image:
+            width, height = imagesize.get(media)
+            return (is_image, is_video, is_audio, width, height, -1, -1)
+
+        elif is_video:
+            video_stream = CommonUtil.get_video_stream(media)
+            duration = float(video_stream["duration"])
+            width = int(video_stream["width"])
+            height = int(video_stream["height"])
+            sample_rate = video_stream["r_frame_rate"]
+            if sample_rate == "30/1":
+                sample_rate = int(30)
+            return (is_image, is_video, is_audio, width, height, duration, sample_rate)
+
+        elif is_audio:
+            audio_stream = CommonUtil.get_audio_stream(media)
+            duration = float(audio_stream["duration"])
+            sample_rate = int(audio_stream["sample_rate"])
+            return (is_image, is_video, is_audio, -1, -1, duration, sample_rate)
+        else:
+            return (is_image, is_video, is_audio, -1, -1, -1, -1)
+
+    @staticmethod
+    def is_image(file_path):
+        return file_path.lower().endswith(CommonUtil.valid_image_exts)
+
+    @staticmethod
+    def is_video(file_path):
+        return file_path.lower().endswith(CommonUtil.valid_video_exts)
+
+    @staticmethod
+    def is_audio(file_path):
+        return file_path.lower().endswith(CommonUtil.valid_audio_exts)
+
+    @staticmethod
+    def is_template(file_path):
+        if file_path.endswith(CommonUtil.valid_template_ext):
+            return True
+        return False
+
+    @staticmethod
+    def change_video_fps(input_file, output_file, fps=20, codec="libx264", crf=12):
+        cmd = f'ffmpeg -i "{input_file}" -c:v {codec} -crf {crf} -r {fps} "{output_file}" -y'
+        CommonUtil.exec_cmd(cmd)
+
+
requirements.txt CHANGED
@@ -1,3 +1,5 @@
-gradio==5.3.0
+gradio==5.37.0
 elevenlabs==1.8.1
 google-cloud-storage
+ffmpeg-python==0.2.0
+imagesize==1.4.1
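The two new pins back common_util.py, and gradio==5.37.0 matches the sdk_version bump in README.md. ffmpeg-python only wraps the ffmpeg binary, which must be present on PATH at runtime; a quick sanity check:

import shutil

import ffmpeg      # ffmpeg-python==0.2.0
import imagesize   # imagesize==1.4.1

assert shutil.which("ffmpeg"), "ffmpeg binary not found on PATH (required by ffmpeg-python)"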