kc-two committed
Commit adb363a · 1 Parent(s): 5df99cd

v2: simplified UI

adding common util

Files changed (6)
  1. README.md +1 -1
  2. app.py +380 -179
  3. base_task_executor.py +24 -35
  4. cloud_task_executor.py +2 -1
  5. common_util.py +110 -0
  6. requirements.txt +3 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🐨
 colorFrom: blue
 colorTo: red
 sdk: gradio
-sdk_version: 5.4.0
+sdk_version: 5.37.0
 app_file: app.py
 pinned: false
 license: mit
app.py CHANGED
@@ -4,28 +4,28 @@ import argparse
 import glob
 import os
 from pathlib import Path
+import tempfile
 
 import gradio as gr
 
 from cloud_task_executor import CloudTaskExecutor
 from elevenlabs_helper import ElevenLabsHelper
+from common_util import CommonUtil
 
 # ---
 talk_key = "talk"
-valid_base_motion_expressions = [
-    f"{talk_key}-head",
-    f"{talk_key}-neutral",
-    "smile",
-    "approve",
-    "disapprove",
-    "confused",
-    "sad",
-    "surprised",
-]
-
-
-def get_default_base_motion_expression():
-    return valid_base_motion_expressions[0]
+valid_talking_expressions = [
+    f"{talk_key}-head",
+    f"{talk_key}-neutral",
+]
+valid_nontalking_expressions = [
+    "smile",
+    "approve",
+    "disapprove",
+    "confused",
+    "sad",
+    "surprised",
+]
 
 
 # ---
@@ -51,9 +51,6 @@ def get_sorted_filenames_in_dir(dir_path: str, ext: str = ".jpg", throw_if_empty
 # ---
 
 
-description = """Experience a demo of the world's most advanced Text/Audio To Video (TTV) system, crafted by Two AI.
-Sign up with Two AI to gain rapid, long-form generation, API keys, and more!"""
-
 # Core constants
 tmp_dir = "/tmp/gradio"
 data_dir = "./data"
@@ -62,6 +59,9 @@ female_key = "female"
 unknown_key = "unknown"
 media_height = 512
 
+# Global variables
+temp_video_files = set()
+
 # Male/Female
 female_terms = ["Female", "Lady", "Woman"]
 male_terms = ["Male", "Lad", "Man"]
@@ -101,31 +101,35 @@ example_driving_audios_female = get_sorted_filenames_in_dir(
 )
 example_driving_audios = {female_key: example_driving_audios_female, male_key: example_driving_audios_male}
 
-# Driving Text
-audio_text_groups = ["General", "Promotional Messages", "Pronunciation Practice"]
-example_driving_audio_texts = {
-    "General": [
-        "The 2026 World Cup final match is in New York.",
-        "Enhance efficiency and cut costs with AI.",
-        "A bee's wings beat more than 200 times per second.",
-        "2026년 월드컵 결승전은 뉴욕에서 열립니다.",
-        "AI로 효율성을 높이고 비용을 절감하세요.",
-        "벌은 초당 200회 이상의 날개짓을 합니다.",
-        "2026 विश्व कप फाइनल मैच न्यूयॉर्क में होगा।",
-        "AI के साथ दक्षता बढ़ाएं और लागत कम करें।",
-        "मधुमक्खी के पंख सेकंड में 200 बार से अधिक फड़फड़ाते हैं।",
-    ],
-    "Promotional Messages": [
-        "Welcome to our kiosk, where you can easily purchase tickets, or access various services by simply tapping the display!",
-        "Catch all the drama, emotion, and energy in my new film, now available on Netflix—it's a must-watch!",
-        "This season of IPL is full of surprises, and I’d love to see you supporting us as we fight for victory on the ground.",
-        "Transform your health with our latest fitness programs! Join us today and take the first step toward a stronger, energized you.",
-    ],
-    "Pronunciation Practice": [
-        "A big black bug bit a big black dog on his big black nose.",
-        "Fuzzy Wuzzy was a bear. Fuzzy Wuzzy had no hair. Fuzzy Wuzzy wasn't very fuzzy, was he?",
-    ],
-}
+def get_audio_dropdown_choices(audio_paths, base_dir):
+    return [
+        (path.replace(base_dir, "").lstrip("/"), path)
+        for path in audio_paths
+    ]
+
+example_driving_audio_base_dir = os.path.join("./data/input_audio/gradio/")
+example_driving_audio_dropdown_choices = (
+    get_audio_dropdown_choices(example_driving_audios[female_key], example_driving_audio_base_dir) +
+    get_audio_dropdown_choices(example_driving_audios[male_key], example_driving_audio_base_dir)
+)
+
+example_driving_audio_texts = [
+    "The 2026 World Cup final match is in New York.",
+    "Enhance efficiency and cut costs with AI.",
+    "A bee's wings beat more than 200 times per second.",
+    "2026년 월드컵 결승전은 뉴욕에서 열립니다.",
+    "AI로 효율성을 높이고 비용을 절감하세요.",
+    "벌은 초당 200회 이상의 날개짓을 합니다.",
+    "2026 विश्व कप फाइनल मैच न्यूयॉर्क में होगा।",
+    "AI के साथ दक्षता बढ़ाएं और लागत कम करें।",
+    "मधुमक्खी के पंख सेकंड में 200 बार से अधिक फड़फड़ाते हैं।",
+    "Welcome to our kiosk, where you can easily purchase tickets, or access various services by simply tapping the display!",
+    "Catch all the drama, emotion, and energy in my new film, now available on Netflix—it's a must-watch!",
+    "This season of IPL is full of surprises, and I’d love to see you supporting us as we fight for victory on the ground.",
+    "Transform your health with our latest fitness programs! Join us today and take the first step toward a stronger, energized you.",
+    "A big black bug bit a big black dog on his big black nose.",
+    "Fuzzy Wuzzy was a bear. Fuzzy Wuzzy had no hair. Fuzzy Wuzzy wasn't very fuzzy, was he?",
+]
 
 example_showcase_dir = os.path.join(data_dir, "showcase_examples")
 examples_showcase = {
@@ -177,6 +181,11 @@ def update_voices(media_path):
     )
     return driving_input_voice
 
+def update_audio_tabs_visibility(motion_type):
+    if motion_type == "talking":
+        return gr.update(visible=True), gr.update(visible=True)
+    else:
+        return gr.update(visible=False), gr.update(visible=False)
 
 def task_executor_fn(
     input_base_path, base_motion_expression, input_driving_audio_path, driving_text_input, driving_voice_input
@@ -186,94 +195,256 @@ def task_executor_fn(
         input_base_path, base_motion_expression, input_driving_audio_path, driving_text_input, driving_voice_input
     )
 
+def check_and_convert_video_fps(video_path):
+    if not video_path:
+        return None
+
+    try:
+        _, is_video, _, width, height, duration, fps = CommonUtil.get_media_properties(video_path)
+        if not is_video:
+            raise gr.Error("Not a video file")
+
+        if not CommonUtil.check_dim(width, height):
+            min_dim = CommonUtil.valid_min_media_dim
+            max_dim = CommonUtil.valid_max_media_dim
+            raise gr.Error(f"⚠️ Video dimensions must be between {min_dim}-{max_dim} pixels.\n\nCurrent size: {width}(w)x{height}(h)")
+
+        if not CommonUtil.check_duration(duration):
+            min_duration = CommonUtil.valid_min_media_duration
+            max_duration = CommonUtil.valid_max_media_duration
+            raise gr.Error(f"⚠️ Video duration must be between {min_duration}-{max_duration} seconds.\n\nCurrent duration: {duration}s")
+
+        if CommonUtil.check_fps(fps):
+            return video_path
+
+        target_fps = CommonUtil.valid_video_fps
+        print(f"Converting video from {fps}fps to {target_fps}fps: {video_path}")
+
+        temp_dir = tempfile.mkdtemp()
+        base_name = os.path.splitext(os.path.basename(video_path))[0]
+        converted_path = os.path.join(temp_dir, f"{base_name}_{target_fps}fps.mp4")
+
+        CommonUtil.change_video_fps(video_path, converted_path, fps=target_fps)
+
+        temp_video_files.add(converted_path)
+
+        return converted_path
+
+    except gr.Error:
+        # Re-raise gr.Error to show notification
+        raise
+    except Exception as e:
+        print(f"Error processing video FPS: {e}")
+        raise gr.Error(f"Error processing video: {str(e)}")
+
+
+def check_and_validate_image(image_path):
+    """Check and validate image properties"""
+    if not image_path:
+        return None
+
+    try:
+        is_image, _, _, width, height, _, _ = CommonUtil.get_media_properties(image_path)
+
+        if not is_image:
+            raise gr.Error("⚠️ Not an image file. Please upload a valid image file.")
+
+        if not CommonUtil.check_dim(width, height):
+            min_dim = CommonUtil.valid_min_media_dim
+            max_dim = CommonUtil.valid_max_media_dim
+            raise gr.Error(f"⚠️ Image dimensions must be between {min_dim}-{max_dim} pixels.\n\nCurrent size: {width}(w)x{height}(h)")
+
+        return image_path
+
+    except gr.Error:
+        # Re-raise gr.Error to show notification
+        raise
+    except Exception as e:
+        print(f"Error validating image: {e}")
+        raise gr.Error(f"❌ Error processing image: {str(e)}")
+
+def process_video_input(video_path):
+    if not video_path:
+        return None
+
+    converted_path = check_and_convert_video_fps(video_path)
+    print(f"Video processing result: {converted_path}")
+
+    return converted_path
+
+
+def cleanup_temp_video_files():
+    global temp_video_files
+    for temp_file in temp_video_files:
+        try:
+            if os.path.exists(temp_file):
+                os.remove(temp_file)
+                print(f"Cleaned up temporary file: {temp_file}")
+        except Exception as e:
+            print(f"Error cleaning up {temp_file}: {e}")
+
+    # Clear the set
+    temp_video_files.clear()
+
 with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")])) as demo_image:
     with gr.Row():
         # Step 1: Choose Image
         with gr.Column(scale=4):
-            gr.Markdown("### Step 1: Choose Image")
-            gr.Markdown("Upload or select an example image to drive.")
-            with gr.Accordion(open=True, label="Base Image"):
-                base_image_input = gr.Image(type="filepath", sources="upload", height=media_height)
-                gr.Examples(
-                    examples=[[example] for example in example_base_images[female_key]],
-                    inputs=[base_image_input],
-                    cache_examples=False,
-                    label="Female",
-                )
-                gr.Examples(
-                    examples=[[example] for example in example_base_images[male_key]],
-                    inputs=[base_image_input],
-                    cache_examples=False,
-                    label="Male",
-                )
+            gr.Markdown("### STEP 1 - SELECT IMAGE")
+            base_image_input = gr.Image(label="IMAGE", type="filepath", sources="upload", height=media_height, interactive=True)
+            gr.Examples(
+                examples=[[example] for example in example_base_images[female_key]],
+                inputs=[base_image_input],
+                fn=lambda x: x,
+                outputs=[base_image_input],
+                cache_examples=False,
+                label="Female",
+            )
+            gr.Examples(
+                examples=[[example] for example in example_base_images[male_key]],
+                inputs=[base_image_input],
+                fn=lambda x: x,
+                outputs=[base_image_input],
+                cache_examples=False,
+                label="Male",
+            )
 
         # Step 2: Motion and Audio/TTS
         with gr.Column(scale=4):
-            gr.Markdown("### Step 2: Motion and Audio/TTS")
-            gr.Markdown("Select motion and provide audio or text for lip-sync.")
-            with gr.Accordion(open=True, label="Base Motion"):
-                base_motion_expression = gr.Radio(
-                    choices=valid_base_motion_expressions,
-                    label="Select base motion",
-                    value=get_default_base_motion_expression(),
-                )
+            gr.Markdown("### STEP 2 - SELECT MOTION & AUDIO")
+            base_motion_expression = gr.Radio(
+                choices=valid_talking_expressions,
+                value=valid_talking_expressions[0],
+                visible=False,
+            )
+
             with gr.Tabs():
-                with gr.TabItem("Driving Audio: File") as tab_audio_file:
-                    with gr.Accordion(open=True, label="Driving Audio: From File"):
-                        driving_audio_input = gr.Audio(sources=["upload"], type="filepath")
-                        gr.Examples(
-                            examples=[[example] for example in example_driving_audios[female_key]],
-                            inputs=[driving_audio_input],
-                            cache_examples=False,
-                            examples_per_page=18,
-                            label="Female",
-                        )
-                        gr.Examples(
-                            examples=[[example] for example in example_driving_audios[male_key]],
-                            inputs=[driving_audio_input],
-                            cache_examples=False,
-                            examples_per_page=18,
-                            label="Male",
+                with gr.TabItem("TALKING MOTION") as tab_talking_motion:
+                    base_talking_expression = gr.Radio(
+                        choices=valid_talking_expressions,
+                        label="STEP 2.1 - TALKING MOTION",
+                        value=valid_talking_expressions[0],
+                    )
+                with gr.TabItem("EXPRESSION MOTION") as tab_expression_motion:
+                    base_expression_expression = gr.Radio(
+                        choices=valid_nontalking_expressions,
+                        label="STEP 2 - EXPRESSION MOTION",
+                        value=None,
                     )
 
-                with gr.TabItem("Driving Audio: TTS") as tab_audio_tts:
-                    with gr.Accordion(open=True, label="Driving Audio: From Text"):
-                        driving_input_voice = gr.Dropdown(
-                            choices=voices[unknown_key], value=voices[unknown_key][0], label="Voice"
-                        )
-                        driving_text_input = gr.Textbox(
-                            label="Input Text (300 characters max)",
-                            lines=2,
-                        )
-                        for group in audio_text_groups:
-                            gr.Examples(
-                                examples=[[example] for example in example_driving_audio_texts[group]],
-                                inputs=[driving_text_input],
-                                cache_examples=False,
-                                label=group,
-                            )
+            with gr.Tabs():
+                with gr.TabItem("DRIVING AUDIO: FILE") as tab_audio_file:
+                    driving_audio_input = gr.File(label="STEP 2.2 - AUDIO FILE", file_types=[".mp3", ".wav"], type="filepath", height=287)
+                    example_driving_audio_dropdown = gr.Dropdown(
+                        choices=example_driving_audio_dropdown_choices,
+                        value=None,
+                        label="OR SELECT FROM EXAMPLES",
+                        interactive=True,
+                        allow_custom_value=False
+                    )
+
+                    def update_audio_input(selected_audio):
+                        return selected_audio if selected_audio else None
+
+                    example_driving_audio_dropdown.change(
+                        fn=update_audio_input,
+                        inputs=[example_driving_audio_dropdown],
+                        outputs=[driving_audio_input]
+                    )
+
+                with gr.TabItem("DRIVING AUDIO: TTS") as tab_audio_tts:
+                    driving_input_voice = gr.Dropdown(
+                        choices=voices[unknown_key], value=voices[unknown_key][0], label="STEP 2.2 - VOICE"
+                    )
+                    driving_text_input = gr.Textbox(
+                        label="INPUT TEXT (300 characters max)",
+                        lines=2,
+                    )
+                    example_text_dropdown = gr.Dropdown(
+                        choices=example_driving_audio_texts,
+                        value=None,
+                        label="OR SELECT FROM EXAMPLES",
+                        interactive=True,
+                        allow_custom_value=False
+                    )
+
+                    def update_text_input(selected_text):
+                        return selected_text if selected_text else ""
+
+                    example_text_dropdown.change(
+                        fn=update_text_input,
+                        inputs=[example_text_dropdown],
+                        outputs=[driving_text_input]
+                    )
+            process_button_animation = gr.Button("🌟 Generate", variant="primary", elem_classes=["generate-button"])
 
         # Step 3: Result
         with gr.Column(scale=4):
-            gr.Markdown("### Step 3: Result")
-            gr.Markdown("Generate and view the output video.")
-            process_button_animation = gr.Button("🌟 Generate", variant="primary")
-            output_video_i2v = gr.Video(autoplay=True, label="The Output Video", height=media_height)
-            message = gr.Textbox(label="Info")
+            gr.Markdown("### RESULT")
+            output_video_i2v = gr.Video(autoplay=True, label="OUTPUT VIDEO", height=512, show_download_button=True)
+            message = gr.Textbox(label="INFO", max_lines=8)
             process_button_reset = gr.ClearButton(
                 [
                     base_image_input,
+                    base_motion_expression,
+                    base_talking_expression,
+                    base_expression_expression,
                     driving_audio_input,
                     driving_text_input,
                     driving_input_voice,
+                    example_text_dropdown,
+                    example_driving_audio_dropdown,
                     output_video_i2v,
                 ],
                 value="🧹 Clear",
+                variant="secondary",
             )
 
-        base_image_input.change(fn=update_voices, inputs=[base_image_input], outputs=[driving_input_voice])
+    def process_image_and_update_voices(image_path):
+        validated_image = check_and_validate_image(image_path)
+
+        voice_dropdown = update_voices(validated_image)
+
+        return validated_image, voice_dropdown
+
+    base_image_input.change(
+        fn=process_image_and_update_voices,
+        inputs=[base_image_input],
+        outputs=[base_image_input, driving_input_voice]
+    )
+
+    base_talking_expression.change(
+        fn=lambda x: x,
+        inputs=[base_talking_expression],
+        outputs=[base_motion_expression],
+    )
+
+    base_expression_expression.change(
+        fn=lambda x: gr.update(value=x),
+        inputs=[base_expression_expression],
+        outputs=[base_motion_expression],
+    )
+
+    def update_talking_tab():
+        audio_visibility = update_audio_tabs_visibility("talking")
+        return audio_visibility[0], audio_visibility[1], gr.update(choices=valid_talking_expressions, value=valid_talking_expressions[0])
+
+    def update_expression_tab():
+        audio_visibility = update_audio_tabs_visibility("expression")
+        return audio_visibility[1], audio_visibility[0], gr.update(choices=valid_nontalking_expressions, value=valid_nontalking_expressions[0])
+
+    tab_talking_motion.select(
+        fn=update_talking_tab,
+        inputs=[],
+        outputs=[tab_audio_file, tab_audio_tts, base_motion_expression],
+    )
+
+    tab_expression_motion.select(
+        fn=update_expression_tab,
+        inputs=[],
+        outputs=[tab_audio_file, tab_audio_tts, base_motion_expression],
+    )
 
-    # binding functions for buttons
     process_button_animation.click(
         fn=task_executor_fn,
         inputs=[
@@ -291,76 +462,107 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San
     with gr.Row():
         # Step 1: Choose Video
        with gr.Column(scale=4):
-            gr.Markdown("### Step 1: Choose Video")
-            gr.Markdown("Upload or select an example video to drive.")
-            with gr.Accordion(open=True, label="Base Video"):
-                base_video_input = gr.Video(sources="upload", height=media_height, interactive=True)
-                gr.Examples(
-                    examples=[[example] for example in example_source_videos[female_key]],
-                    inputs=[base_video_input],
-                    cache_examples=False,
-                    label="Female",
-                )
-                gr.Examples(
-                    examples=[[example] for example in example_source_videos[male_key]],
-                    inputs=[base_video_input],
-                    cache_examples=False,
-                    label="Male",
-                )
+            gr.Markdown("### STEP 1 - SELECT VIDEO")
+            base_video_input = gr.Video(label="VIDEO", sources="upload", height=media_height, interactive=True)
+            gr.Examples(
+                examples=[[example] for example in example_source_videos[female_key]],
+                inputs=[base_video_input],
+                fn=lambda x: x,
+                outputs=[base_video_input],
+                cache_examples=False,
+                label="Female",
+                elem_id="female-video-examples"
+            )
+            gr.Examples(
+                examples=[[example] for example in example_source_videos[male_key]],
+                inputs=[base_video_input],
+                fn=lambda x: x,
+                outputs=[base_video_input],
+                cache_examples=False,
+                label="Male",
+                elem_id="male-video-examples"
+            )
 
         # Step 2: Audio/TTS
         with gr.Column(scale=4):
-            gr.Markdown("### Step 2: Audio/TTS")
-            gr.Markdown("Provide audio or text for lip-sync.")
+            gr.Markdown("### STEP 2 - SELECT AUDIO")
            with gr.Tabs():
-                with gr.TabItem("Driving Audio: File") as tab_audio_file:
-                    with gr.Accordion(open=True, label="Driving Audio: From File"):
-                        driving_audio_input = gr.Audio(sources=["upload"], type="filepath")
-                        gr.Examples(
-                            examples=[[example] for example in example_driving_audios[female_key]],
-                            inputs=[driving_audio_input],
-                            cache_examples=False,
-                            examples_per_page=18,
-                            label="Female",
-                        )
-                        gr.Examples(
-                            examples=[[example] for example in example_driving_audios[male_key]],
-                            inputs=[driving_audio_input],
-                            cache_examples=False,
-                            examples_per_page=18,
-                            label="Male",
-                        )
-                with gr.TabItem("Driving Audio: TTS") as tab_audio_tts:
-                    with gr.Accordion(open=True, label="Driving Audio: From Text"):
-                        driving_input_voice = gr.Dropdown(
-                            choices=voices[unknown_key], value=voices[unknown_key][0], label="Voice"
-                        )
-                        driving_text_input = gr.Textbox(
-                            label="Input Text (300 characters max)",
-                            lines=2,
-                        )
-                        for group in audio_text_groups:
-                            gr.Examples(
-                                examples=[[example] for example in example_driving_audio_texts[group]],
-                                inputs=[driving_text_input],
-                                cache_examples=False,
-                                label=group,
-                            )
+                with gr.TabItem("DRIVING AUDIO: FILE") as tab_audio_file:
+
+                    driving_audio_input = gr.File(label="AUDIO", file_types=[".mp3", ".wav"], type="filepath", height=454)
+                    example_driving_audio_dropdown = gr.Dropdown(
+                        choices=example_driving_audio_dropdown_choices,
+                        value=None,
+                        label="OR SELECT FROM EXAMPLES",
+                        interactive=True,
+                        allow_custom_value=False
+                    )
+
+                    def update_audio_input(selected_audio):
+                        return selected_audio if selected_audio else None
+
+                    example_driving_audio_dropdown.change(
+                        fn=update_audio_input,
+                        inputs=[example_driving_audio_dropdown],
+                        outputs=[driving_audio_input]
+                    )
+
+                with gr.TabItem("DRIVING AUDIO: TTS") as tab_audio_tts:
+
+                    driving_input_voice = gr.Dropdown(
+                        choices=voices[unknown_key], value=voices[unknown_key][0], label="VOICE"
+                    )
+                    driving_text_input = gr.Textbox(
+                        label="INPUT TEXT (300 characters max)",
+                        lines=5,
+                    )
+                    example_text_dropdown = gr.Dropdown(
+                        choices=example_driving_audio_texts,
+                        value=None,
+                        label="OR SELECT FROM EXAMPLES",
+                        interactive=True,
+                        allow_custom_value=False
+                    )
+
+                    def update_text_input(selected_text):
+                        return selected_text if selected_text else ""
+
+                    example_text_dropdown.change(
+                        fn=update_text_input,
+                        inputs=[example_text_dropdown],
+                        outputs=[driving_text_input]
+                    )
+            process_button_animation = gr.Button("🌟 Generate", variant="primary", elem_classes=["generate-button"])
         # Step 3: Result
         with gr.Column(scale=4):
-            gr.Markdown("### Step 3: Result")
-            gr.Markdown("Generate and view the output video.")
-            process_button_animation = gr.Button("🌟 Generate", variant="primary")
-            output_video_i2v = gr.Video(autoplay=True, label="The Output Video", height=media_height)
-            message = gr.Textbox(label="Info")
-            process_button_reset = gr.ClearButton(
-                [base_video_input, driving_audio_input, driving_text_input, driving_input_voice, output_video_i2v],
-                value="🧹 Clear",
+            gr.Markdown("### RESULT")
+            output_video_i2v = gr.Video(autoplay=True, label="OUTPUT VIDEO", height=512, show_download_button=True)
+            message = gr.Textbox(label="INFO", max_lines=8)
+            process_button_reset = gr.Button("🧹 Clear", variant="secondary")
+
+            def clear_all():
+                cleanup_temp_video_files()
+                return None, None, None, None, None
+
+            process_button_reset.click(
+                fn=clear_all,
+                inputs=[],
+                outputs=[base_video_input, driving_audio_input, driving_text_input, driving_input_voice, output_video_i2v]
             )
 
-        base_video_input.change(fn=update_voices, inputs=[base_video_input], outputs=[driving_input_voice])
+    def process_video_and_update_voices(video_path):
+        processed_video = process_video_input(video_path)
+
+        voice_dropdown = update_voices(processed_video)
+
+        return processed_video, voice_dropdown
+
+    base_video_input.change(
+        fn=process_video_and_update_voices,
+        inputs=[base_video_input],
+        outputs=[base_video_input, driving_input_voice]
+    )
 
-    # binding functions for buttons
     base_motion_expression = gr.Radio(value=None, visible=False)
     process_button_animation.click(
         fn=task_executor_fn,
@@ -376,7 +578,7 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San
     )
 
 with gr.Blocks() as showcase_examples:
-    gr.Markdown("# Make Image Talk")
+    gr.Markdown("# IMAGE TO AVATAR")
     with gr.Row():
         with gr.Column(scale=7):
             for path in examples_showcase["make_image_talk_multilingual"]:
@@ -395,7 +597,7 @@ with gr.Blocks() as showcase_examples:
             for path in examples_showcase['make_image_talk_selfie']:
                 gr.Video(value=path, label=os.path.basename(path), height=430)
 
-    gr.Markdown("# Make Video Talk")
+    gr.Markdown("# VIDEO TO AVATAR")
     with gr.Row():
         with gr.Column(scale=7):
             for path in examples_showcase["make_video_talk_multilingual"]:
@@ -407,7 +609,7 @@ with gr.Blocks() as showcase_examples:
            for path in examples_showcase["make_video_talk_rap_multii"]:
                gr.Video(value=path, label=os.path.basename(path), height=500)
 
-    gr.Markdown("# Dubbing")
+    gr.Markdown("# VIDEO TO AVATAR: DUBBING")
    with gr.Row():
        for path in examples_showcase["dubbing_superpowerman"]:
            gr.Video(value=path, label=os.path.basename(path), height=320)
@@ -415,19 +617,18 @@ with gr.Blocks() as showcase_examples:
        for path in examples_showcase["dubbing_coffee"]:
            gr.Video(value=path, label=os.path.basename(path), height=440)
 
-with gr.Blocks(analytics_enabled=False, css="footer{display:none !important}", title="SUTRA Avatar v2") as demo:
-    gr.Markdown(
-        """
-        ## <img src="https://playground.two.ai/sutra.svg" height="20"/>
-        """
-    )
-    title = "# 🌟 SUTRA Avatar v2 🌟\n## Drive Image or Video with LipSync from Audio or Text"
-    gr.Markdown(title)
-    gr.Markdown(description)
+with gr.Blocks(analytics_enabled=False,
+               css="footer{display:none !important} .generate-button{margin-top:-10px !important;} #female-video-examples .gallery *, #male-video-examples .gallery *{height:142.1px !important; min-height:142.1px !important; max-height:142.1px !important;} #female-video-examples .gallery img, #male-video-examples .gallery img, #female-video-examples .gallery video, #male-video-examples .gallery video{width:80px !important; height:142.1px !important; object-fit:cover !important; min-height:142.1px !important; max-height:142.1px !important;} #female-video-examples .gallery > div, #male-video-examples .gallery > div{width:80px !important; height:142.1px !important; min-height:142.1px !important; max-height:142.1px !important; margin:2px !important;} .logo-left{text-align:left !important; margin:0 !important; padding:0 !important; border:none !important; outline:none !important; box-shadow:none !important; min-height:auto !important; height:auto !important; overflow:visible !important;} .logo-left > div{text-align:left !important; margin:0 !important; padding:0 !important; overflow:visible !important;} .logo-left img{height:45px !important; min-height:45px !important; max-height:45px !important;} .logo-right{text-align:right !important; margin:0 !important; padding:0 !important; border:none !important; outline:none !important; box-shadow:none !important; min-height:auto !important; height:auto !important; overflow:visible !important; display:flex !important; justify-content:flex-end !important; align-items:center !important;} .logo-right > div{text-align:right !important; margin:0 !important; padding:0 !important; overflow:visible !important; width:100% !important; display:flex !important; justify-content:flex-end !important;} .logo-right img{height:70px !important; min-height:70px !important; max-height:70px !important;}",
+               title="SUTRA Avatar v2") as demo:
+    with gr.Row():
+        with gr.Column(scale=10):
+            gr.HTML(value="<img src='data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNTgiIGhlaWdodD0iMTUiIHZpZXdCb3g9IjAgMCA1OCAxNSIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPGcgY2xpcC1wYXRoPSJ1cmwoI2NsaXAwXzNfMikiPgo8cGF0aCBkPSJNNS43MjE5MSAxNC43ODQ0QzIuNDY5NDUgMTQuNzg0NCAwLjYwMjI5NSAxMy4wMDQ0IDAuNjAyMjk1IDkuODY0NDVIMi45MTExNEMyLjkxMTE0IDExLjY4NDQgMy45NTUxNCAxMi43ODQ0IDUuNzAxODMgMTIuNzg0NEM3LjIyNzY4IDEyLjc4NDQgOC4wNzA5MSAxMi4wODQ0IDguMDcwOTEgMTAuODI0NEM4LjA3MDkxIDkuNzY0NDUgNy41Njg5OSA5LjIyNDQ1IDUuOTgyOTEgOC41ODQ0NUwzLjY5NDE0IDcuNjY0NDVDMS45MjczNyA2Ljk4NDQ1IDEuMDAzODMgNS43MjQ0NSAxLjAwMzgzIDMuOTQ0NDVDMS4wMDM4MyAxLjY2NDQ1IDIuODUwOTEgMC4xMDQ0NDYgNS41MDEwNiAwLjEwNDQ0NkM4LjI5MTc2IDAuMTA0NDQ2IDEwLjEzODggMS44NDQ0NSAxMC4xMzg4IDQuNDg0NDVINy44Mjk5OUM3LjgyOTk5IDIuOTQ0NDUgNi45ODY3NiAyLjA2NDQ1IDUuNDIwNzYgMi4wNjQ0NUM0LjA5NTY4IDIuMDY0NDUgMy4zMzI3NiAyLjc0NDQ1IDMuMzMyNzYgMy44MDQ0NUMzLjMzMjc2IDQuNzY0NDUgNC4wMTUzNyA1LjQwNDQ1IDUuNjYxNjggNi4wNjQ0NUw3LjcyOTYgNi45MDQ0NUM5LjQ5NjM3IDcuNjI0NDUgMTAuMzc5OCA4Ljg2NDQ1IDEwLjM3OTggMTAuNzY0NEMxMC4zNzk4IDEzLjE2NDQgOC41MTI2IDE0Ljc4NDQgNS43MjE5MSAxNC43ODQ0Wk0xNy41MDIyIDE0Ljc4NDRDMTQuMzUwMSAxNC43ODQ0IDEyLjI4MjIgMTMuMDY0NCAxMi4yODIyIDkuODA0NDVWMC40NDQ0NDVIMTQuNTkxMVY5LjY2NDQ1QzE0LjU5MTEgMTEuNjg0NCAxNS43MzU0IDEyLjcyNDQgMTcuNTAyMiAxMi43MjQ0QzE5LjI2OSAxMi43MjQ0IDIwLjQxMzQgMTEuNjg0NCAyMC40MTM0IDkuNjY0NDVWMC40NDQ0NDVIMjIuNzIyMlY5LjgwNDQ1QzIyLjcyMjIgMTMuMDY0NCAyMC42NTQzIDE0Ljc4NDQgMTcuNTAyMiAxNC43ODQ0Wk0yOC4wOTU3IDE0LjQ0NDRWMi42MDQ0NUgyNC4wMjAxVjAuNDQ0NDQ1SDM0LjUwMDNWMi42MDQ0NUgzMC40MDQ2VjE0LjQ0NDRIMjguMDk1N1pNNDYuMTA5MyA1LjM0NDQ1QzQ2LjEwOTMgNy41MjQ0NSA0NC45MjQ4IDkuMjQ0NDUgNDMuMTE3OSA5LjkyNDQ1TDQ1Ljg4ODUgMTQuNDQ0NEg0My4yNTg0TDQwLjY4ODUgMTAuMjQ0NEgzOC40MTk5VjE0LjQ0NDRIMzYuMTExVjAuNDQ0NDQ1SDQxLjI5MDlDNDQuMTAxNiAwLjQ0NDQ0NSA0Ni4xMDkzIDIuNDg0NDUgNDYuMTA5MyA1LjM0NDQ1Wk0zOC40MTk5IDIuNTA0NDVWOC4xODQ0NUg0MS4xNTAzQzQyLjY3NjIgOC4xODQ0NSA0My44MDA1IDYuOTg0NDUgNDMuODAwNSA1LjM0NDQ1QzQzLjgwMDUgMy43MDQ0NSA0Mi42NzYyIDIuNTA0NDUgNDEuMTUwMyAyLjUwNDQ1SDM4LjQxOTlaIiBmaWxsPSIjMzA2MEZGIi8+CjxwYXRoIGQ9Ik01NS4zMzgxIDExLjY2NjdMNTQuOTAxNSAxMC4yMzI4SDUwLjU1MDlMNTAuMTE0MiAxMS42NjY3SDQ4LjA5MjZMNTEuOTA5NCAwLjM4ODg4NUg1My41NDI5TDU3LjM1OTggMTEuNjY2N0g1NS4zMzgxWk01MS4wMzYxIDguNTczMzNINTQuNDAwMUw1Mi43MTgxIDIuOTgyNzhMNTEuMDM2MSA4LjU3MzMzWiIgZmlsbD0iIzMwNjBGRiIvPgo8cGF0aCBkPSJNNTggMTIuODMzM0g0Ny40MDM4VjE0LjVINThWMTIuODMzM1oiIGZpbGw9IiMzMDYwRkYiLz4KPC9nPgo8ZGVmcz4KPGNsaXBQYXRoIGlkPSJjbGlwMF8zXzIiPgo8cmVjdCB3aWR0aD0iNTgiIGhlaWdodD0iMTUiIGZpbGw9IndoaXRlIi8+CjwvY2xpcFBhdGg+CjwvZGVmcz4KPC9zdmc+' />", elem_classes=["logo-left"])
+        with gr.Column(scale=2):
+            gr.HTML(value="<img src='data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjAwIiBoZWlnaHQ9IjgwIiB2aWV3Qm94PSIwIDAgMjAwIDgwIiBmaWxsPSJub25lIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPgo8dGV4dCB4PSIxMCIgeT0iNTUiIGZvbnQtZmFtaWx5PSJBcmlhbCwgc2Fucy1zZXJpZiIgZm9udC1zaXplPSI0OCIgZm9udC13ZWlnaHQ9ImJvbGQiIGZpbGw9IiMwMDAwMDAiPkFWQVRBUjwvdGV4dD4KPC9zdmc+' />", elem_classes=["logo-right"])
 
     gr.TabbedInterface(
         interface_list=[demo_image, demo_video, showcase_examples],
-        tab_names=["Drive Image", "Drive Video", "Showcase Examples"],
+        tab_names=["IMAGE to AVATAR", "VIDEO to AVATAR", "SHOWCASE"],
     )
 
 if __name__ == "__main__":
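The dropdown-as-example-picker wiring used in both audio tabs reduces to this minimal standalone sketch (the label/path pair is hypothetical; gr.Dropdown accepts (label, value) tuples, so the short label is displayed while the full path is passed through):

import gradio as gr

# One (label, value) choice; the label is shown, the path is the value.
choices = [("female/hello.wav", "./data/input_audio/gradio/female/hello.wav")]

with gr.Blocks() as sketch:
    audio_file = gr.File(label="AUDIO FILE", type="filepath")
    picker = gr.Dropdown(choices=choices, value=None, label="OR SELECT FROM EXAMPLES")
    # Copy the selected example path into the file input, as app.py does.
    picker.change(fn=lambda v: v if v else None, inputs=[picker], outputs=[audio_file])

if __name__ == "__main__":
    sketch.launch()

Returning None for an empty selection means clearing the dropdown also clears the file input.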
 
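The temporary-file handling in the new video path follows one small lifecycle: re-encode into a fresh temp directory, record the path in the module-level temp_video_files set, and delete everything on Clear. A condensed, runnable sketch of that lifecycle, with an empty-file write standing in for the real CommonUtil.change_video_fps re-encode:

import os
import tempfile

temp_video_files = set()  # module-level registry, as in app.py

def convert_to_temp(src_path, target_fps=30):
    # Write the "converted" file into its own temp directory and track it.
    temp_dir = tempfile.mkdtemp()
    base = os.path.splitext(os.path.basename(src_path))[0]
    dst = os.path.join(temp_dir, f"{base}_{target_fps}fps.mp4")
    open(dst, "wb").close()  # stand-in for the ffmpeg re-encode
    temp_video_files.add(dst)
    return dst

def cleanup_temp_video_files():
    for path in list(temp_video_files):
        if os.path.exists(path):
            os.remove(path)
    temp_video_files.clear()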
base_task_executor.py CHANGED
@@ -35,40 +35,12 @@ def get_name_ext(filepath):
     return name, ext
 
 
-def sanitize_string(string):
-    sanitized_string = re.sub(r"[^A-Za-z0-9]", "", string)
-    max_len = 15
-    return sanitized_string[:max_len]
-
-
-def get_output_video_name(
-    input_base_path, input_driving_path, base_motion_expression, input_driving_audio_path, tag=""
-):
-    if not tag:
-        tag = get_formatted_datetime_name()
-
-    base_name, _ = get_name_ext(input_base_path)
-    base_name = sanitize_string(base_name)
-
-    driving_name = ""
-    if input_driving_path:
-        driving_name, _ = get_name_ext(input_driving_path)
-        driving_name = sanitize_string(driving_name)
-    elif base_motion_expression and is_image(input_base_path):
-        driving_name = base_motion_expression
-
-    audio_name = ""
-    if input_driving_audio_path:
-        audio_name, _ = get_name_ext(input_driving_audio_path)
-        audio_name = sanitize_string(audio_name)
-
-    output_video_name = f"{tag}--b-{base_name}"
-
-    if driving_name:
-        output_video_name += f"--d-{driving_name}"
-
-    if audio_name:
-        output_video_name += f"--a-{audio_name}"
+def get_output_video_name():
+
+    tag = get_formatted_datetime_name()
+
+    output_video_name = f"sutra-avatar-{tag}"
     return output_video_name
 
 
@@ -143,11 +115,10 @@ class BaseTaskExecutor(ABC):
         request_id = get_unique_name(maxd=8, delim="")
         output_video_path = os.path.join(
             self.tmp_dir,
-            get_output_video_name(
-                input_base_path, input_driving_path, base_motion_expression, input_driving_audio_path
-            )
+            get_output_video_name()
             + ".mp4",
         )
+        time_start = time.time()
         result, output_video_path = self.generate(
             input_base_path,
             input_driving_path,
@@ -156,12 +127,30 @@
             output_video_path,
             request_id,
         )
+        time_end = time.time()
+        pipeline_time = int((time_end - time_start) * 1000)
         success = result["success"]
         messages = result["messages"]
 
+        if "tlpMetrics" in result:
+            tlp_metrics = result["tlpMetrics"]
+            if isinstance(tlp_metrics, dict):
+                n_frames = tlp_metrics.get('nFrames', 'N/A')
+                tlp_msec = tlp_metrics.get('tlpMsec', 'N/A')
+                metrics_str = f"Frame per second: 30\nNumber of Frames: {n_frames}\nPipeline Time: {tlp_msec}ms"
+                messages += metrics_str
+            else:
+                messages += f"\n{tlp_metrics}"
+        if "n_frames" in result:
+            n_frames = result.get("n_frames", "N/A")
+            messages += "Frame per second: 30"
+            messages += f"\nNumber of Frames: {n_frames}"
+            messages += f"\nPipeline Time: {pipeline_time}ms"
+
         self.clean(output_dir)
 
         if success:
+            print(f"output_video_path: {output_video_path}")
             return output_video_path, gr.update(visible=True), messages
         else:
             gr.Info("Task could not be completed", duration=4)
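With the simplification above, every result is named sutra-avatar-<timestamp>.mp4 instead of encoding the base, driving, and audio names. An illustration of the scheme; get_formatted_datetime_name is defined elsewhere in this repo, so the strftime format below is only an assumed stand-in:

import datetime

def get_formatted_datetime_name():
    # Assumed stand-in; the repo's real helper may use a different format.
    return datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

def get_output_video_name():
    tag = get_formatted_datetime_name()
    return f"sutra-avatar-{tag}"

print(get_output_video_name() + ".mp4")  # e.g. sutra-avatar-20250101-120000.mp4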
cloud_task_executor.py CHANGED
@@ -112,7 +112,7 @@ class CloudTaskExecutor(BaseTaskExecutor):
         timeout += estimatedWaitSeconds
         start_time = time.time()
 
-        result = {"messages": ''}
+        result = {}
         while True:
             status_reply = self.get_task_status(request_id)
             task_status = status_reply["taskStatus"]
@@ -133,6 +133,7 @@
                 pipe_reply = status_reply["pipeReply"]
                 result["success"] = pipe_reply["status"] == "success"
                 result["messages"] = pipe_reply["messages"]
+                result["tlpMetrics"] = pipe_reply["tlpMetrics"]
                 output_video_path = status_reply["videoURL"]
             else:
                 messages = ""
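One caveat: the added line indexes pipe_reply["tlpMetrics"] directly, so a status reply that omits the field would raise KeyError before result is populated. If older backends can still answer, a guarded read keeps the "tlpMetrics" in result check in base_task_executor.py working unchanged; a small sketch with a sample reply:

# A real pipe_reply comes from the task-status API; this sample omits the new field.
pipe_reply = {"status": "success", "messages": "ok"}

result = {
    "success": pipe_reply["status"] == "success",
    "messages": pipe_reply["messages"],
}
if "tlpMetrics" in pipe_reply:  # tolerate replies without metrics
    result["tlpMetrics"] = pipe_reply["tlpMetrics"]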
common_util.py ADDED
@@ -0,0 +1,110 @@
+import subprocess
+import ffmpeg
+import imagesize
+
+class CommonUtil:
+    valid_image_exts = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp")
+    valid_video_exts = (".mp4", ".mov", ".avi", ".webm")
+    valid_audio_exts = (".mp3", ".wav")
+    valid_template_ext = ".npz"
+
+    valid_min_media_dim = 480  # pixels
+    valid_max_media_dim = 3840
+
+    valid_min_media_duration = 0.1  # seconds
+    valid_max_media_duration = 120  # seconds
+
+    valid_min_sample_rate = 16000
+    valid_max_sample_rate = 44100
+
+    valid_video_fps = 30  # fps
+
+    @staticmethod
+    def check_dim(width, height):
+        min_d = CommonUtil.valid_min_media_dim
+        max_d = CommonUtil.valid_max_media_dim
+        if width < min_d or width > max_d or height < min_d or height > max_d:
+            return False
+        return True
+
+    @staticmethod
+    def check_duration(duration):
+        if duration < CommonUtil.valid_min_media_duration:
+            return False
+
+        if duration > CommonUtil.valid_max_media_duration:
+            return False
+
+        return True
+
+    @staticmethod
+    def check_fps(fps):
+        if fps != CommonUtil.valid_video_fps:
+            return False
+        return True
+
+    @staticmethod
+    def get_audio_stream(video_path):
+        probe = ffmpeg.probe(video_path)
+        return next((stream for stream in probe["streams"] if stream["codec_type"] == "audio"), None)
+
+    @staticmethod
+    def get_video_stream(video_path):
+        probe = ffmpeg.probe(video_path)
+        return next((stream for stream in probe["streams"] if stream["codec_type"] == "video"), None)
+
+    @staticmethod
+    def exec_cmd(cmd):
+        return subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    @staticmethod
+    def get_media_properties(media):
+        is_image = CommonUtil.is_image(media)
+        is_video = CommonUtil.is_video(media)
+        is_audio = CommonUtil.is_audio(media)
+
+        if is_image:
+            width, height = imagesize.get(media)
+            return (is_image, is_video, is_audio, width, height, -1, -1)
+
+        elif is_video:
+            video_stream = CommonUtil.get_video_stream(media)
+            duration = float(video_stream["duration"])
+            width = int(video_stream["width"])
+            height = int(video_stream["height"])
+            sample_rate = video_stream["r_frame_rate"]
+            if sample_rate == "30/1":
+                sample_rate = int(30)
+            return (is_image, is_video, is_audio, width, height, duration, sample_rate)
+
+        elif is_audio:
+            audio_stream = CommonUtil.get_audio_stream(media)
+            duration = float(audio_stream["duration"])
+            sample_rate = int(audio_stream["sample_rate"])
+            return (is_image, is_video, is_audio, -1, -1, duration, sample_rate)
+        else:
+            return (is_image, is_video, is_audio, -1, -1, -1, -1)
+
+    @staticmethod
+    def is_image(file_path):
+        return file_path.lower().endswith(CommonUtil.valid_image_exts)
+
+    @staticmethod
+    def is_video(file_path):
+        return file_path.lower().endswith(CommonUtil.valid_video_exts)
+
+    @staticmethod
+    def is_audio(file_path):
+        return file_path.lower().endswith(CommonUtil.valid_audio_exts)
+
+    @staticmethod
+    def is_template(file_path):
+        if file_path.endswith(CommonUtil.valid_template_ext):
+            return True
+        return False
+
+    @staticmethod
+    def change_video_fps(input_file, output_file, fps=20, codec="libx264", crf=12):
+        cmd = f'ffmpeg -i "{input_file}" -c:v {codec} -crf {crf} -r {fps} "{output_file}" -y'
+        CommonUtil.exec_cmd(cmd)
+
+
requirements.txt CHANGED
@@ -1,3 +1,5 @@
-gradio==5.3.0
+gradio==5.37.0
 elevenlabs==1.8.1
 google-cloud-storage
+ffmpeg-python==0.2.0
+imagesize==1.4.1
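The two new pins back common_util.py, and gradio==5.37.0 matches the sdk_version bump in README.md. ffmpeg-python only wraps the ffmpeg binary, which must be present on PATH at runtime; a quick sanity check:

import shutil

import ffmpeg      # ffmpeg-python==0.2.0
import imagesize   # imagesize==1.4.1

assert shutil.which("ffmpeg"), "ffmpeg binary not found on PATH (required by ffmpeg-python)"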