v2: simplified UI
adding common util
- README.md +1 -1
- app.py +380 -179
- base_task_executor.py +24 -35
- cloud_task_executor.py +2 -1
- common_util.py +110 -0
- requirements.txt +3 -1
README.md
CHANGED
@@ -4,7 +4,7 @@ emoji: 🐨
 colorFrom: blue
 colorTo: red
 sdk: gradio
-sdk_version: 5.
+sdk_version: 5.37.0
 app_file: app.py
 pinned: false
 license: mit
app.py
CHANGED
@@ -4,28 +4,28 @@ import argparse
 import glob
 import os
 from pathlib import Path
+import tempfile
 
 import gradio as gr
 
 from cloud_task_executor import CloudTaskExecutor
 from elevenlabs_helper import ElevenLabsHelper
+from common_util import CommonUtil
 
 # ---
 talk_key = "talk"
-def get_default_base_motion_expression():
-    return valid_base_motion_expressions[0]
+valid_talking_expressions = [
+    f"{talk_key}-head",
+    f"{talk_key}-neutral",
+]
+valid_nontalking_expressions = [
+    "smile",
+    "approve",
+    "disapprove",
+    "confused",
+    "sad",
+    "surprised",
+]
 
 
 # ---
@@ -51,9 +51,6 @@ def get_sorted_filenames_in_dir(dir_path: str, ext: str = ".jpg", throw_if_empty
 # ---
 
 
-description = """Experience a demo of the world's most advanced Text/Audio To Video (TTV) system, crafted by Two AI.
-Sign up with Two AI to gain rapid, long-form generation, API keys, and more!"""
-
 # Core constants
 tmp_dir = "/tmp/gradio"
 data_dir = "./data"
@@ -62,6 +59,9 @@ female_key = "female"
 unknown_key = "unknown"
 media_height = 512
 
+# Global variables
+temp_video_files = set()
+
 # Male/Female
 female_terms = ["Female", "Lady", "Woman"]
 male_terms = ["Male", "Lad", "Man"]
@@ -101,31 +101,35 @@ example_driving_audios_female = get_sorted_filenames_in_dir(
 )
 example_driving_audios = {female_key: example_driving_audios_female, male_key: example_driving_audios_male}
 
+def get_audio_dropdown_choices(audio_paths, base_dir):
+    return [
+        (path.replace(base_dir, "").lstrip("/"), path)
+        for path in audio_paths
+    ]
+
+example_driving_audio_base_dir = os.path.join("./data/input_audio/gradio/")
+example_driving_audio_dropdown_choices = (
+    get_audio_dropdown_choices(example_driving_audios[female_key], example_driving_audio_base_dir) +
+    get_audio_dropdown_choices(example_driving_audios[male_key], example_driving_audio_base_dir)
+)
+
+example_driving_audio_texts = [
+    "The 2026 World Cup final match is in New York.",
+    "Enhance efficiency and cut costs with AI.",
+    "A bee's wings beat more than 200 times per second.",
+    "2026년 월드컵 결승전은 뉴욕에서 열립니다.",
+    "AI로 효율성을 높이고 비용을 절감하세요.",
+    "벌은 초당 200회 이상의 날개짓을 합니다.",
+    "2026 विश्व कप फाइनल मैच न्यूयॉर्क में होगा।",
+    "AI के साथ दक्षता बढ़ाएं और लागत कम करें।",
+    "मधुमक्खी के पंख सेकंड में 200 बार से अधिक फड़फड़ाते हैं।",
+    "Welcome to our kiosk, where you can easily purchase tickets, or access various services by simply tapping the display!",
+    "Catch all the drama, emotion, and energy in my new film, now available on Netflix—it's a must-watch!",
+    "This season of IPL is full of surprises, and I’d love to see you supporting us as we fight for victory on the ground.",
+    "Transform your health with our latest fitness programs! Join us today and take the first step toward a stronger, energized you.",
+    "A big black bug bit a big black dog on his big black nose.",
+    "Fuzzy Wuzzy was a bear. Fuzzy Wuzzy had no hair. Fuzzy Wuzzy wasn't very fuzzy, was he?",
+]
 
 example_showcase_dir = os.path.join(data_dir, "showcase_examples")
 examples_showcase = {
@@ -177,6 +181,11 @@ def update_voices(media_path):
     )
     return driving_input_voice
 
+def update_audio_tabs_visibility(motion_type):
+    if motion_type == "talking":
+        return gr.update(visible=True), gr.update(visible=True)
+    else:
+        return gr.update(visible=False), gr.update(visible=False)
 
 def task_executor_fn(
     input_base_path, base_motion_expression, input_driving_audio_path, driving_text_input, driving_voice_input
@@ -186,94 +195,256 @@ def task_executor_fn(
         input_base_path, base_motion_expression, input_driving_audio_path, driving_text_input, driving_voice_input
     )
 
+def check_and_convert_video_fps(video_path):
+    if not video_path:
+        return None
+
+    try:
+        _, is_video, _, width, height, duration, fps = CommonUtil.get_media_properties(video_path)
+        if not is_video:
+            raise gr.Error("Not a video file")
+
+        if not CommonUtil.check_dim(width, height):
+            min_dim = CommonUtil.valid_min_media_dim
+            max_dim = CommonUtil.valid_max_media_dim
+            raise gr.Error(f"⚠️ Video dimensions must be between {min_dim}-{max_dim} pixels.\n\nCurrent size: {width}(w)x{height}(h)")
+
+        if not CommonUtil.check_duration(duration):
+            min_duration = CommonUtil.valid_min_media_duration
+            max_duration = CommonUtil.valid_max_media_duration
+            raise gr.Error(f"⚠️ Video duration must be between {min_duration}-{max_duration} seconds.\n\nCurrent duration: {duration}s")
+
+        if CommonUtil.check_fps(fps):
+            return video_path
+
+        target_fps = CommonUtil.valid_video_fps
+        print(f"Converting video from {fps}fps to {target_fps}fps: {video_path}")
+
+        temp_dir = tempfile.mkdtemp()
+        base_name = os.path.splitext(os.path.basename(video_path))[0]
+        converted_path = os.path.join(temp_dir, f"{base_name}_{target_fps}fps.mp4")
+
+        CommonUtil.change_video_fps(video_path, converted_path, fps=target_fps)
+
+        temp_video_files.add(converted_path)
+
+        return converted_path
+
+    except gr.Error:
+        # Re-raise gr.Error to show notification
+        raise
+    except Exception as e:
+        print(f"Error processing video FPS: {e}")
+        raise gr.Error(f"Error processing video: {str(e)}")
+
+
+def check_and_validate_image(image_path):
+    """Check and validate image properties"""
+    if not image_path:
+        return None
+
+    try:
+        is_image, _, _, width, height, _, _ = CommonUtil.get_media_properties(image_path)
+
+        if not is_image:
+            raise gr.Error("⚠️ Not an image file. Please upload a valid image file.")
+
+        if not CommonUtil.check_dim(width, height):
+            min_dim = CommonUtil.valid_min_media_dim
+            max_dim = CommonUtil.valid_max_media_dim
+            raise gr.Error(f"⚠️ Image dimensions must be between {min_dim}-{max_dim} pixels.\n\nCurrent size: {width}(w)x{height}(h)")
+
+        return image_path
+
+    except gr.Error:
+        # Re-raise gr.Error to show notification
+        raise
+    except Exception as e:
+        print(f"Error validating image: {e}")
+        raise gr.Error(f"❌ Error processing image: {str(e)}")
+
+def process_video_input(video_path):
+    if not video_path:
+        return None
+
+    converted_path = check_and_convert_video_fps(video_path)
+    print(f"Video processing result: {converted_path}")
+
+    return converted_path
+
+
+def cleanup_temp_video_files():
+    global temp_video_files
+    for temp_file in temp_video_files:
+        try:
+            if os.path.exists(temp_file):
+                os.remove(temp_file)
+                print(f"Cleaned up temporary file: {temp_file}")
+        except Exception as e:
+            print(f"Error cleaning up {temp_file}: {e}")
+
+    # Clear the set
+    temp_video_files.clear()
+
 with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta Sans")])) as demo_image:
     with gr.Row():
         # Step 1: Choose Image
         with gr.Column(scale=4):
-            gr.Markdown("###
-            gr.
+            gr.Markdown("### STEP 1 - SELECT IMAGE")
+            base_image_input = gr.Image(label="IMAGE", type="filepath", sources="upload", height=media_height, interactive=True)
+            gr.Examples(
+                examples=[[example] for example in example_base_images[female_key]],
+                inputs=[base_image_input],
+                fn=lambda x: x,
+                outputs=[base_image_input],
+                cache_examples=False,
+                label="Female",
+            )
+            gr.Examples(
+                examples=[[example] for example in example_base_images[male_key]],
+                inputs=[base_image_input],
+                fn=lambda x: x,
+                outputs=[base_image_input],
+                cache_examples=False,
+                label="Male",
+            )
 
         # Step 2: Motion and Audio/TTS
         with gr.Column(scale=4):
-            gr.Markdown("###
-            gr.
-            )
+            gr.Markdown("### STEP 2 - SELECT MOTION & AUDIO")
+            base_motion_expression = gr.Radio(
+                choices=valid_talking_expressions,
+                value=valid_talking_expressions[0],
+                visible=False,
+            )
+
             with gr.Tabs():
-                with gr.TabItem("
-                    examples=[[example] for example in example_driving_audios[male_key]],
-                    inputs=[driving_audio_input],
-                    cache_examples=False,
-                    examples_per_page=18,
-                    label="Male",
+                with gr.TabItem("TALKING MOTION") as tab_talking_motion:
+                    base_talking_expression = gr.Radio(
+                        choices=valid_talking_expressions,
+                        label="STEP 2.1 - TALKING MOTION",
+                        value=valid_talking_expressions[0],
+                    )
+                with gr.TabItem("EXPRESSION MOTION") as tab_expression_motion:
+                    base_expression_expression = gr.Radio(
+                        choices=valid_nontalking_expressions,
+                        label="STEP 2 - EXPRESSION MOTION",
+                        value=None,
                     )
 
+            with gr.Tabs():
+                with gr.TabItem("DRIVING AUDIO: FILE") as tab_audio_file:
+                    driving_audio_input = gr.File(label="STEP 2.2 - AUDIO FILE", file_types=[".mp3", ".wav"], type="filepath", height=287)
+                    example_driving_audio_dropdown = gr.Dropdown(
+                        choices=example_driving_audio_dropdown_choices,
+                        value=None,
+                        label="OR SELECT FROM EXAMPLES",
+                        interactive=True,
+                        allow_custom_value=False
+                    )
+
+                    def update_audio_input(selected_audio):
+                        return selected_audio if selected_audio else None
+
+                    example_driving_audio_dropdown.change(
+                        fn=update_audio_input,
+                        inputs=[example_driving_audio_dropdown],
+                        outputs=[driving_audio_input]
+                    )
+
+                with gr.TabItem("DRIVING AUDIO: TTS") as tab_audio_tts:
+                    driving_input_voice = gr.Dropdown(
+                        choices=voices[unknown_key], value=voices[unknown_key][0], label="STEP 2.2 - VOICE"
+                    )
+                    driving_text_input = gr.Textbox(
+                        label="INPUT TEXT (300 characters max)",
+                        lines=2,
+                    )
+                    example_text_dropdown = gr.Dropdown(
+                        choices=example_driving_audio_texts,
+                        value=None,
+                        label="OR SELECT FROM EXAMPLES",
+                        interactive=True,
+                        allow_custom_value=False
+                    )
+
+                    def update_text_input(selected_text):
+                        return selected_text if selected_text else ""
+
+                    example_text_dropdown.change(
+                        fn=update_text_input,
+                        inputs=[example_text_dropdown],
+                        outputs=[driving_text_input]
+                    )
+            process_button_animation = gr.Button("🌟 Generate", variant="primary", elem_classes=["generate-button"])
 
         # Step 3: Result
         with gr.Column(scale=4):
-            gr.Markdown("###
-            gr.
-            output_video_i2v = gr.Video(autoplay=True, label="The Output Video", height=media_height)
-            message = gr.Textbox(label="Info")
+            gr.Markdown("### RESULT")
+            output_video_i2v = gr.Video(autoplay=True, label="OUTPUT VIDEO", height=512, show_download_button=True)
+            message = gr.Textbox(label="INFO", max_lines=8)
             process_button_reset = gr.ClearButton(
                 [
                     base_image_input,
+                    base_motion_expression,
+                    base_talking_expression,
+                    base_expression_expression,
                     driving_audio_input,
                     driving_text_input,
                     driving_input_voice,
+                    example_text_dropdown,
+                    example_driving_audio_dropdown,
                     output_video_i2v,
                 ],
                 value="🧹 Clear",
+                variant="secondary",
             )
 
+            def process_image_and_update_voices(image_path):
+                validated_image = check_and_validate_image(image_path)
+
+                voice_dropdown = update_voices(validated_image)
+
+                return validated_image, voice_dropdown
+
+            base_image_input.change(
+                fn=process_image_and_update_voices,
+                inputs=[base_image_input],
+                outputs=[base_image_input, driving_input_voice]
+            )
+
+            base_talking_expression.change(
+                fn=lambda x: x,
+                inputs=[base_talking_expression],
+                outputs=[base_motion_expression],
+            )
+
+            base_expression_expression.change(
+                fn=lambda x: gr.update(value=x),
+                inputs=[base_expression_expression],
+                outputs=[base_motion_expression],
+            )
+
+            def update_talking_tab():
+                audio_visibility = update_audio_tabs_visibility("talking")
+                return audio_visibility[0], audio_visibility[1], gr.update(choices=valid_talking_expressions, value=valid_talking_expressions[0])
+
+            def update_expression_tab():
+                audio_visibility = update_audio_tabs_visibility("expression")
+                return audio_visibility[1], audio_visibility[0], gr.update(choices=valid_nontalking_expressions, value=valid_nontalking_expressions[0])
+
+            tab_talking_motion.select(
+                fn=update_talking_tab,
+                inputs=[],
+                outputs=[tab_audio_file, tab_audio_tts, base_motion_expression],
+            )
+
+            tab_expression_motion.select(
+                fn=update_expression_tab,
+                inputs=[],
+                outputs=[tab_audio_file, tab_audio_tts, base_motion_expression],
+            )
 
-    # binding functions for buttons
     process_button_animation.click(
         fn=task_executor_fn,
         inputs=[
@@ -291,76 +462,107 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San
     with gr.Row():
         # Step 1: Choose Video
        with gr.Column(scale=4):
-            gr.Markdown("###
-            gr.
+            gr.Markdown("### STEP 1 - SELECT VIDEO")
+            base_video_input = gr.Video(label="VIDEO", sources="upload", height=media_height, interactive=True)
+            gr.Examples(
+                examples=[[example] for example in example_source_videos[female_key]],
+                inputs=[base_video_input],
+                fn=lambda x: x,
+                outputs=[base_video_input],
+                cache_examples=False,
+                label="Female",
+                elem_id="female-video-examples"
+            )
+            gr.Examples(
+                examples=[[example] for example in example_source_videos[male_key]],
+                inputs=[base_video_input],
+                fn=lambda x: x,
+                outputs=[base_video_input],
+                cache_examples=False,
+                label="Male",
+                elem_id="male-video-examples"
+            )
 
         # Step 2: Audio/TTS
         with gr.Column(scale=4):
-            gr.Markdown("###
-            gr.Markdown("Provide audio or text for lip-sync.")
+            gr.Markdown("### STEP 2 - SELECT AUDIO")
             with gr.Tabs():
-                with gr.TabItem("
+                with gr.TabItem("DRIVING AUDIO: FILE") as tab_audio_file:
+
+                    driving_audio_input = gr.File(label="AUDIO", file_types=[".mp3", ".wav"], type="filepath", height=454)
+                    example_driving_audio_dropdown = gr.Dropdown(
+                        choices=example_driving_audio_dropdown_choices,
+                        value=None,
+                        label="OR SELECT FROM EXAMPLES",
+                        interactive=True,
+                        allow_custom_value=False
+                    )
+
+                    def update_audio_input(selected_audio):
+                        return selected_audio if selected_audio else None
+
+                    example_driving_audio_dropdown.change(
+                        fn=update_audio_input,
+                        inputs=[example_driving_audio_dropdown],
+                        outputs=[driving_audio_input]
+                    )
+
+                with gr.TabItem("DRIVING AUDIO: TTS") as tab_audio_tts:
+
+                    driving_input_voice = gr.Dropdown(
+                        choices=voices[unknown_key], value=voices[unknown_key][0], label="VOICE"
+                    )
+                    driving_text_input = gr.Textbox(
+                        label="INPUT TEXT (300 characters max)",
+                        lines=5,
+                    )
+                    example_text_dropdown = gr.Dropdown(
+                        choices=example_driving_audio_texts,
+                        value=None,
+                        label="OR SELECT FROM EXAMPLES",
+                        interactive=True,
+                        allow_custom_value=False
+                    )
+
+                    def update_text_input(selected_text):
+                        return selected_text if selected_text else ""
+
+                    example_text_dropdown.change(
+                        fn=update_text_input,
+                        inputs=[example_text_dropdown],
+                        outputs=[driving_text_input]
+                    )
+            process_button_animation = gr.Button("🌟 Generate", variant="primary", elem_classes=["generate-button"])
         # Step 3: Result
         with gr.Column(scale=4):
-            gr.Markdown("###
-            gr.
+            gr.Markdown("### RESULT")
+            output_video_i2v = gr.Video(autoplay=True, label="OUTPUT VIDEO", height=512, show_download_button=True)
+            message = gr.Textbox(label="INFO", max_lines=8)
+            process_button_reset = gr.Button("🧹 Clear", variant="secondary")
+
+            def clear_all():
+                cleanup_temp_video_files()
+                return None, None, None, None, None
+
+            process_button_reset.click(
+                fn=clear_all,
+                inputs=[],
+                outputs=[base_video_input, driving_audio_input, driving_text_input, driving_input_voice, output_video_i2v]
             )
 
+    def process_video_and_update_voices(video_path):
+        processed_video = process_video_input(video_path)
+
+        voice_dropdown = update_voices(processed_video)
+
+        return processed_video, voice_dropdown
+
+    base_video_input.change(
+        fn=process_video_and_update_voices,
+        inputs=[base_video_input],
+        outputs=[base_video_input, driving_input_voice]
+    )
 
-    # binding functions for buttons
     base_motion_expression = gr.Radio(value=None, visible=False)
     process_button_animation.click(
         fn=task_executor_fn,
@@ -376,7 +578,7 @@ with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Plus Jakarta San
     )
 
 with gr.Blocks() as showcase_examples:
-    gr.Markdown("#
+    gr.Markdown("# IMAGE TO AVATAR")
     with gr.Row():
         with gr.Column(scale=7):
             for path in examples_showcase["make_image_talk_multilingual"]:
@@ -395,7 +597,7 @@ with gr.Blocks() as showcase_examples:
             for path in examples_showcase['make_image_talk_selfie']:
                 gr.Video(value=path, label=os.path.basename(path), height=430)
 
-    gr.Markdown("#
+    gr.Markdown("# VIDEO TO AVATAR")
    with gr.Row():
         with gr.Column(scale=7):
             for path in examples_showcase["make_video_talk_multilingual"]:
@@ -407,7 +609,7 @@ with gr.Blocks() as showcase_examples:
             for path in examples_showcase["make_video_talk_rap_multii"]:
                 gr.Video(value=path, label=os.path.basename(path), height=500)
 
-    gr.Markdown("#
+    gr.Markdown("# VIDEO TO AVATAR: DUBBING")
     with gr.Row():
         for path in examples_showcase["dubbing_superpowerman"]:
             gr.Video(value=path, label=os.path.basename(path), height=320)
@@ -415,19 +617,18 @@
         for path in examples_showcase["dubbing_coffee"]:
             gr.Video(value=path, label=os.path.basename(path), height=440)
 
-with gr.Blocks(analytics_enabled=False,
-    gr.Markdown(description)
+with gr.Blocks(analytics_enabled=False,
+               css="footer{display:none !important} .generate-button{margin-top:-10px !important;} #female-video-examples .gallery *, #male-video-examples .gallery *{height:142.1px !important; min-height:142.1px !important; max-height:142.1px !important;} #female-video-examples .gallery img, #male-video-examples .gallery img, #female-video-examples .gallery video, #male-video-examples .gallery video{width:80px !important; height:142.1px !important; object-fit:cover !important; min-height:142.1px !important; max-height:142.1px !important;} #female-video-examples .gallery > div, #male-video-examples .gallery > div{width:80px !important; height:142.1px !important; min-height:142.1px !important; max-height:142.1px !important; margin:2px !important;} .logo-left{text-align:left !important; margin:0 !important; padding:0 !important; border:none !important; outline:none !important; box-shadow:none !important; min-height:auto !important; height:auto !important; overflow:visible !important;} .logo-left > div{text-align:left !important; margin:0 !important; padding:0 !important; overflow:visible !important;} .logo-left img{height:45px !important; min-height:45px !important; max-height:45px !important;} .logo-right{text-align:right !important; margin:0 !important; padding:0 !important; border:none !important; outline:none !important; box-shadow:none !important; min-height:auto !important; height:auto !important; overflow:visible !important; display:flex !important; justify-content:flex-end !important; align-items:center !important;} .logo-right > div{text-align:right !important; margin:0 !important; padding:0 !important; overflow:visible !important; width:100% !important; display:flex !important; justify-content:flex-end !important;} .logo-right img{height:70px !important; min-height:70px !important; max-height:70px !important;}",
+               title="SUTRA Avatar v2") as demo:
+    with gr.Row():
+        with gr.Column(scale=10):
+            gr.HTML(value="<img src='data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNTgiIGhlaWdodD0iMTUiIHZpZXdCb3g9IjAgMCA1OCAxNSIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj4KPGcgY2xpcC1wYXRoPSJ1cmwoI2NsaXAwXzNfMikiPgo8cGF0aCBkPSJNNS43MjE5MSAxNC43ODQ0QzIuNDY5NDUgMTQuNzg0NCAwLjYwMjI5NSAxMy4wMDQ0IDAuNjAyMjk1IDkuODY0NDVIMi45MTExNEMyLjkxMTE0IDExLjY4NDQgMy45NTUxNCAxMi43ODQ0IDUuNzAxODMgMTIuNzg0NEM3LjIyNzY4IDEyLjc4NDQgOC4wNzA5MSAxMi4wODQ0IDguMDcwOTEgMTAuODI0NEM4LjA3MDkxIDkuNzY0NDUgNy41Njg5OSA5LjIyNDQ1IDUuOTgyOTEgOC41ODQ0NUwzLjY5NDE0IDcuNjY0NDVDMS45MjczNyA2Ljk4NDQ1IDEuMDAzODMgNS43MjQ0NSAxLjAwMzgzIDMuOTQ0NDVDMS4wMDM4MyAxLjY2NDQ1IDIuODUwOTEgMC4xMDQ0NDYgNS41MDEwNiAwLjEwNDQ0NkM4LjI5MTc2IDAuMTA0NDQ2IDEwLjEzODggMS44NDQ0NSAxMC4xMzg4IDQuNDg0NDVINy44Mjk5OUM3LjgyOTk5IDIuOTQ0NDUgNi45ODY3NiAyLjA2NDQ1IDUuNDIwNzYgMi4wNjQ0NUM0LjA5NTY4IDIuMDY0NDUgMy4zMzI3NiAyLjc0NDQ1IDMuMzMyNzYgMy44MDQ0NUMzLjMzMjc2IDQuNzY0NDUgNC4wMTUzNyA1LjQwNDQ1IDUuNjYxNjggNi4wNjQ0NUw3LjcyOTYgNi45MDQ0NUM5LjQ5NjM3IDcuNjI0NDUgMTAuMzc5OCA4Ljg2NDQ1IDEwLjM3OTggMTAuNzY0NEMxMC4zNzk4IDEzLjE2NDQgOC41MTI2IDE0Ljc4NDQgNS43MjE5MSAxNC43ODQ0Wk0xNy41MDIyIDE0Ljc4NDRDMTQuMzUwMSAxNC43ODQ0IDEyLjI4MjIgMTMuMDY0NCAxMi4yODIyIDkuODA0NDVWMC40NDQ0NDVIMTQuNTkxMVY5LjY2NDQ1QzE0LjU5MTEgMTEuNjg0NCAxNS43MzU0IDEyLjcyNDQgMTcuNTAyMiAxMi43MjQ0QzE5LjI2OSAxMi43MjQ0IDIwLjQxMzQgMTEuNjg0NCAyMC40MTM0IDkuNjY0NDVWMC40NDQ0NDVIMjIuNzIyMlY5LjgwNDQ1QzIyLjcyMjIgMTMuMDY0NCAyMC42NTQzIDE0Ljc4NDQgMTcuNTAyMiAxNC43ODQ0Wk0yOC4wOTU3IDE0LjQ0NDRWMi42MDQ0NUgyNC4wMjAxVjAuNDQ0NDQ1SDM0LjUwMDNWMi42MDQ0NUgzMC40MDQ2VjE0LjQ0NDRIMjguMDk1N1pNNDYuMTA5MyA1LjM0NDQ1QzQ2LjEwOTMgNy41MjQ0NSA0NC45MjQ4IDkuMjQ0NDUgNDMuMTE3OSA5LjkyNDQ1TDQ1Ljg4ODUgMTQuNDQ0NEg0My4yNTg0TDQwLjY4ODUgMTAuMjQ0NEgzOC40MTk5VjE0LjQ0NDRIMzYuMTExVjAuNDQ0NDQ1SDQxLjI5MDlDNDQuMTAxNiAwLjQ0NDQ0NSA0Ni4xMDkzIDIuNDg0NDUgNDYuMTA5MyA1LjM0NDQ1Wk0zOC40MTk5IDIuNTA0NDVWOC4xODQ0NUg0MS4xNTAzQzQyLjY3NjIgOC4xODQ0NSA0My44MDA1IDYuOTg0NDUgNDMuODAwNSA1LjM0NDQ1QzQzLjgwMDUgMy43MDQ0NSA0Mi42NzYyIDIuNTA0NDUgNDEuMTUwMyAyLjUwNDQ1SDM4LjQxOTlaIiBmaWxsPSIjMzA2MEZGIi8+CjxwYXRoIGQ9Ik01NS4zMzgxIDExLjY2NjdMNTQuOTAxNSAxMC4yMzI4SDUwLjU1MDlMNTAuMTE0MiAxMS42NjY3SDQ4LjA5MjZMNTEuOTA5NCAwLjM4ODg4NUg1My41NDI5TDU3LjM1OTggMTEuNjY2N0g1NS4zMzgxWk01MS4wMzYxIDguNTczMzNINTQuNDAwMUw1Mi43MTgxIDIuOTgyNzhMNTEuMDM2MSA4LjU3MzMzWiIgZmlsbD0iIzMwNjBGRiIvPgo8cGF0aCBkPSJNNTggMTIuODMzM0g0Ny40MDM4VjE0LjVINThWMTIuODMzM1oiIGZpbGw9IiMzMDYwRkYiLz4KPC9nPgo8ZGVmcz4KPGNsaXBQYXRoIGlkPSJjbGlwMF8zXzIiPgo8cmVjdCB3aWR0aD0iNTgiIGhlaWdodD0iMTUiIGZpbGw9IndoaXRlIi8+CjwvY2xpcFBhdGg+CjwvZGVmcz4KPC9zdmc+' />", elem_classes=["logo-left"])
+        with gr.Column(scale=2):
+            gr.HTML(value="<img src='data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjAwIiBoZWlnaHQ9IjgwIiB2aWV3Qm94PSIwIDAgMjAwIDgwIiBmaWxsPSJub25lIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPgo8dGV4dCB4PSIxMCIgeT0iNTUiIGZvbnQtZmFtaWx5PSJBcmlhbCwgc2Fucy1zZXJpZiIgZm9udC1zaXplPSI0OCIgZm9udC13ZWlnaHQ9ImJvbGQiIGZpbGw9IiMwMDAwMDAiPkFWQVRBUjwvdGV4dD4KPC9zdmc+' />", elem_classes=["logo-right"])
 
     gr.TabbedInterface(
         interface_list=[demo_image, demo_video, showcase_examples],
-        tab_names=["
+        tab_names=["IMAGE to AVATAR", "VIDEO to AVATAR", "SHOWCASE"],
     )
 
 if __name__ == "__main__":
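A note on the demo_image wiring above: tab selection, not a separate toggle, decides whether the audio inputs apply. `update_talking_tab` and `update_expression_tab` both call `update_audio_tabs_visibility` and return the two visibility updates in swapped order, so the audio tabs show for talking motion and hide for expression-only motion. A minimal, self-contained sketch of that pattern, assuming gradio 5.x (component names here are illustrative, not the app's):

    import gradio as gr

    with gr.Blocks() as demo:
        with gr.Tabs():
            with gr.TabItem("TALKING MOTION") as tab_talking:
                gr.Markdown("Talking motion drives the lips from audio or TTS.")
            with gr.TabItem("EXPRESSION MOTION") as tab_expression:
                gr.Markdown("Expression-only motion needs no driving audio.")
        # Hypothetical stand-in for the app's audio inputs.
        audio_file = gr.File(label="AUDIO", visible=True)

        # gr.update(visible=...) patches a property without recreating the component.
        tab_talking.select(fn=lambda: gr.update(visible=True), inputs=[], outputs=[audio_file])
        tab_expression.select(fn=lambda: gr.update(visible=False), inputs=[], outputs=[audio_file])

    if __name__ == "__main__":
        demo.launch()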
base_task_executor.py
CHANGED
@@ -35,40 +35,12 @@ def get_name_ext(filepath):
     return name, ext
 
 
-def
-    return sanitized_string[:max_len]
 
-def get_output_video_name(
-    input_base_path, input_driving_path, base_motion_expression, input_driving_audio_path, tag=""
-):
-    if not tag:
-        tag = get_formatted_datetime_name()
-
-    base_name, _ = get_name_ext(input_base_path)
-    base_name = sanitize_string(base_name)
-
-    driving_name = ""
-    if input_driving_path:
-        driving_name, _ = get_name_ext(input_driving_path)
-        driving_name = sanitize_string(driving_name)
-    elif base_motion_expression and is_image(input_base_path):
-        driving_name = base_motion_expression
-
-    audio_name = ""
-    if input_driving_audio_path:
-        audio_name, _ = get_name_ext(input_driving_audio_path)
-        audio_name = sanitize_string(audio_name)
-
-    output_video_name = f"{tag}--b-{base_name}"
-
-    if driving_name:
-        output_video_name += f"--d-{driving_name}"
-
-    if audio_name:
-        output_video_name += f"--a-{audio_name}"
+def get_output_video_name():
+    tag = get_formatted_datetime_name()
+    output_video_name = f"sutra-avatar-{tag}"
     return output_video_name
 
 
@@ -143,11 +115,10 @@ class BaseTaskExecutor(ABC):
         request_id = get_unique_name(maxd=8, delim="")
         output_video_path = os.path.join(
             self.tmp_dir,
-            get_output_video_name(
-                input_base_path, input_driving_path, base_motion_expression, input_driving_audio_path
-            )
+            get_output_video_name()
             + ".mp4",
         )
+        time_start = time.time()
         result, output_video_path = self.generate(
             input_base_path,
             input_driving_path,
@@ -156,12 +127,30 @@ class BaseTaskExecutor(ABC):
             output_video_path,
             request_id,
         )
+        time_end = time.time()
+        pipeline_time = int((time_end - time_start) * 1000)
         success = result["success"]
         messages = result["messages"]
 
+        if "tlpMetrics" in result:
+            tlp_metrics = result["tlpMetrics"]
+            if isinstance(tlp_metrics, dict):
+                n_frames = tlp_metrics.get('nFrames', 'N/A')
+                tlp_msec = tlp_metrics.get('tlpMsec', 'N/A')
+                metrics_str = f"Frame per second: 30\nNumber of Frames: {n_frames}\nPipeline Time: {tlp_msec}ms"
+                messages += metrics_str
+            else:
+                messages += f"\n{tlp_metrics}"
+        if "n_frames" in result:
+            n_frames = result.get("n_frames", "N/A")
+            messages += "Frame per second: 30"
+            messages += f"\nNumber of Frames: {n_frames}"
+            messages += f"\nPipeline Time: {pipeline_time}ms"
+
         self.clean(output_dir)
 
         if success:
+            print(f"output_video_path: {output_video_path}")
             return output_video_path, gr.update(visible=True), messages
         else:
             gr.Info("Task could not be completed", duration=4)
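The executor now brackets `self.generate(...)` with wall-clock stamps and appends frame and latency figures to the user-facing message, preferring the server-reported `tlpMetrics` when the result carries one; the hunk assumes `time` is already imported in base_task_executor.py. A standalone sketch of the same formatting logic (`format_metrics` is a hypothetical consolidation, not code from this commit):

    import time

    def format_metrics(result, pipeline_time_ms):
        # Prefer server-side metrics; fall back to the locally measured duration.
        messages = result.get("messages", "")
        tlp_metrics = result.get("tlpMetrics")
        if isinstance(tlp_metrics, dict):
            messages += (
                f"Frame per second: 30\n"
                f"Number of Frames: {tlp_metrics.get('nFrames', 'N/A')}\n"
                f"Pipeline Time: {tlp_metrics.get('tlpMsec', 'N/A')}ms"
            )
        else:
            messages += f"\nPipeline Time: {pipeline_time_ms}ms"
        return messages

    time_start = time.time()
    # ... generate() would run here ...
    pipeline_time = int((time.time() - time_start) * 1000)
    print(format_metrics({"messages": "success\n", "tlpMetrics": {"nFrames": 150, "tlpMsec": 4200}}, pipeline_time))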
cloud_task_executor.py
CHANGED
@@ -112,7 +112,7 @@ class CloudTaskExecutor(BaseTaskExecutor):
         timeout += estimatedWaitSeconds
         start_time = time.time()
 
-        result = {
+        result = {}
         while True:
             status_reply = self.get_task_status(request_id)
             task_status = status_reply["taskStatus"]
@@ -133,6 +133,7 @@ class CloudTaskExecutor(BaseTaskExecutor):
             pipe_reply = status_reply["pipeReply"]
             result["success"] = pipe_reply["status"] == "success"
             result["messages"] = pipe_reply["messages"]
+            result["tlpMetrics"] = pipe_reply["tlpMetrics"]
             output_video_path = status_reply["videoURL"]
         else:
             messages = ""
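`result` is now initialized to an empty dict before the polling loop, and `tlpMetrics` is copied straight out of `pipeReply`; if an older backend omits that key, the plain subscript raises KeyError inside the loop. A more defensive variant, sketched under the assumption that the reply schema may vary (this is not the committed code):

    def extract_result(pipe_reply):
        return {
            "success": pipe_reply.get("status") == "success",
            "messages": pipe_reply.get("messages", ""),
            # {} keeps the downstream isinstance(..., dict) check working.
            "tlpMetrics": pipe_reply.get("tlpMetrics", {}),
        }

    print(extract_result({"status": "success", "messages": "done"}))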
common_util.py
ADDED
@@ -0,0 +1,110 @@
+import subprocess
+import ffmpeg
+import imagesize
+
+class CommonUtil:
+    valid_image_exts = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp")
+    valid_video_exts = (".mp4", ".mov", ".avi", ".webm")
+    valid_audio_exts = (".mp3", ".wav")
+    valid_template_ext = ".npz"
+
+    valid_min_media_dim = 480  # pixels
+    valid_max_media_dim = 3840
+
+    valid_min_media_duration = 0.1  # seconds
+    valid_max_media_duration = 120  # seconds
+
+    valid_min_sample_rate = 16000
+    valid_max_sample_rate = 44100
+
+    valid_video_fps = 30  # fps
+
+    @staticmethod
+    def check_dim(width, height):
+        min_d = CommonUtil.valid_min_media_dim
+        max_d = CommonUtil.valid_max_media_dim
+        if width < min_d or width > max_d or height < min_d or height > max_d:
+            return False
+        return True
+
+    @staticmethod
+    def check_duration(duration):
+        if duration < CommonUtil.valid_min_media_duration:
+            return False
+
+        if duration > CommonUtil.valid_max_media_duration:
+            return False
+
+        return True
+
+    @staticmethod
+    def check_fps(fps):
+        if fps != CommonUtil.valid_video_fps:
+            return False
+        return True
+
+    @staticmethod
+    def get_audio_stream(video_path):
+        probe = ffmpeg.probe(video_path)
+        return next((stream for stream in probe["streams"] if stream["codec_type"] == "audio"), None)
+
+    @staticmethod
+    def get_video_stream(video_path):
+        probe = ffmpeg.probe(video_path)
+        return next((stream for stream in probe["streams"] if stream["codec_type"] == "video"), None)
+
+    @staticmethod
+    def exec_cmd(cmd):
+        return subprocess.run(cmd, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    @staticmethod
+    def get_media_properties(media):
+        is_image = CommonUtil.is_image(media)
+        is_video = CommonUtil.is_video(media)
+        is_audio = CommonUtil.is_audio(media)
+
+        if is_image:
+            width, height = imagesize.get(media)
+            return (is_image, is_video, is_audio, width, height, -1, -1)
+
+        elif is_video:
+            video_stream = CommonUtil.get_video_stream(media)
+            duration = float(video_stream["duration"])
+            width = int(video_stream["width"])
+            height = int(video_stream["height"])
+            sample_rate = video_stream["r_frame_rate"]
+            if sample_rate == "30/1":
+                sample_rate = int(30)
+            return (is_image, is_video, is_audio, width, height, duration, sample_rate)
+
+        elif is_audio:
+            audio_stream = CommonUtil.get_audio_stream(media)
+            duration = float(audio_stream["duration"])
+            sample_rate = int(audio_stream["sample_rate"])
+            return (is_image, is_video, is_audio, -1, -1, duration, sample_rate)
+        else:
+            return (is_image, is_video, is_audio, -1, -1, -1, -1)
+
+    @staticmethod
+    def is_image(file_path):
+        return file_path.lower().endswith(CommonUtil.valid_image_exts)
+
+    @staticmethod
+    def is_video(file_path):
+        return file_path.lower().endswith(CommonUtil.valid_video_exts)
+
+    @staticmethod
+    def is_audio(file_path):
+        return file_path.lower().endswith(CommonUtil.valid_audio_exts)
+
+    @staticmethod
+    def is_template(file_path):
+        if file_path.endswith(CommonUtil.valid_template_ext):
+            return True
+        return False
+
+    @staticmethod
+    def change_video_fps(input_file, output_file, fps=20, codec="libx264", crf=12):
+        cmd = f'ffmpeg -i "{input_file}" -c:v {codec} -crf {crf} -r {fps} "{output_file}" -y'
+        CommonUtil.exec_cmd(cmd)
+
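How app.py consumes the new utility, as a sketch (the paths are hypothetical): `get_media_properties` returns a 7-tuple with -1 for fields that do not apply to the media type, so callers unpack positionally. One quirk worth knowing: for video the returned rate is ffprobe's `r_frame_rate` string (for example "24000/1001") unless it is exactly "30/1", so `check_fps` compares that string to the integer 30 and correctly returns False for anything other than 30 fps, which is what triggers the re-encode.

    from common_util import CommonUtil

    path = "sample_video.mp4"  # hypothetical input; ffmpeg.probe needs a real file
    is_image, is_video, is_audio, width, height, duration, fps = CommonUtil.get_media_properties(path)

    if is_video and not CommonUtil.check_fps(fps):
        # Re-encode to the pipeline's required 30 fps; needs the ffmpeg binary on PATH.
        CommonUtil.change_video_fps(path, "sample_video_30fps.mp4", fps=CommonUtil.valid_video_fps)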
requirements.txt
CHANGED
@@ -1,3 +1,5 @@
-gradio==5.
+gradio==5.37.0
 elevenlabs==1.8.1
 google-cloud-storage
+ffmpeg-python==0.2.0
+imagesize==1.4.1
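The new pins add ffmpeg-python and imagesize for the media checks in common_util.py. ffmpeg-python only wraps the system ffmpeg binary, so deployments also need ffmpeg itself on PATH; a small startup check (a sketch, not part of the commit) fails fast when it is missing:

    import shutil

    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg binary not found on PATH; ffmpeg.probe and CommonUtil.change_video_fps will fail")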