Update app.py
Improve robustness of transcript voiceover generation
- Fix 'Invalid dimensions (0,)' crash when no audio segments are generated
- Prevent 'list index out of range' by aligning result mapping with input indices
- Now supports subtitle-only mode and partial voiceover failures gracefully
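
The index-alignment fix is the standard "futures dict + pre-seeded placeholders" pattern. A minimal, self-contained sketch of that pattern (hypothetical worker and data, not the actual app.py code):

import concurrent.futures

def process_entry(entry, idx):
    # Hypothetical worker: one entry fails to simulate a partial TTS failure.
    if entry == "boom":
        raise RuntimeError("synthesis failed")
    return idx, f"clip-{entry}", 1.5  # (index, clip, duration)

entries = ["a", "boom", "c"]

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map each future back to the index of the entry it was built from.
    futures = {executor.submit(process_entry, e, i): i
               for i, e in enumerate(entries)}
    # One placeholder per entry, so a crashed worker still occupies its slot.
    result_map = {i: (None, 0.0) for i in range(len(entries))}
    for future in concurrent.futures.as_completed(futures):
        idx = futures[future]
        try:
            _i, clip, dur = future.result()
            result_map[idx] = (clip, dur)
        except Exception as e:
            print(f"[Entry {idx}] failed: {e}")  # placeholder stays

print([result_map[i] for i in sorted(result_map)])
# [('clip-a', 1.5), (None, 0.0), ('clip-c', 1.5)]

A crashed worker never shrinks the result set: its slot keeps the placeholder, so positional lookups like result_map[i] stay valid, which is exactly what the old append-then-sort list version could not guarantee.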
app.py
CHANGED
@@ -1050,34 +1050,78 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
         return f"Error loading XTTS model: {e}"
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
-                   for i, entry in enumerate(translated_json)]
-
-        results = []
+        # futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
+        #            for i, entry in enumerate(translated_json)]
+
+        # results = []
+        # for future in concurrent.futures.as_completed(futures):
+        #     try:
+        #         i, txt_clip, audio_segment, actual_duration, error = future.result()
+        #         results.append((i, txt_clip, audio_segment, actual_duration))
+        #         if error:
+        #             error_messages.append(f"[Entry {i}] {error}")
+        #     except Exception as e:
+        #         err = f"❌ Unexpected error in future result: {e}"
+        #         error_messages.append(err)
+        # Use a dict so each future maps back to its input index; any failure leaves its (None, None, 0) placeholder in place
+        futures = {
+            executor.submit(
+                process_entry, entry, idx, tts_model, video.w, video.h,
+                process_mode, target_language, font_path, speaker_sample_paths
+            ): idx
+            for idx, entry in enumerate(translated_json)
+        }
+
+        # Seed a placeholder for every entry first so index lookups can never go out of range
+        result_map = {idx: (None, None, 0) for idx in range(len(translated_json))}
+
         for future in concurrent.futures.as_completed(futures):
+            idx = futures[future]
             try:
-                i, txt_clip, audio_segment, actual_duration, error = future.result()
-                results.append((i, txt_clip, audio_segment, actual_duration))
-                if error:
-                    error_messages.append(f"[Entry {i}] {error}")
+                _idx, txt, aud, dur, err = future.result()
+                result_map[idx] = (txt, aud, dur)
+                if err:
+                    error_messages.append(f"[Entry {idx}] {err}")
             except Exception as e:
-                err = f"❌ Unexpected error in future result: {e}"
-                error_messages.append(err)
+                # A worker that raises still keeps its placeholder slot, so later indexing stays in range
+                error_messages.append(f"[Entry {idx}] unexpected error: {e}")
+
+    # results.sort(key=lambda x: x[0])
+    # text_clips = [clip for _, clip, _, _ in results if clip]
+    # generated_durations = [dur for _, _, _, dur in results if dur > 0]
+
+    # Build text_clips / generated_durations from result_map in input order
+    ordered_idx = sorted(result_map.keys())
+    text_clips = [
+        result_map[i][0] for i in ordered_idx
+        if result_map[i][0]
+    ]
+    generated_durations = [
+        result_map[i][2] for i in ordered_idx
+        if result_map[i][2] > 0
+    ]
 
-    results.sort(key=lambda x: x[0])
-    text_clips = [clip for _, clip, _, _ in results if clip]
-    generated_durations = [dur for _, _, _, dur in results if dur > 0]
 
     # Align using optimization (modifies translated_json in-place)
-    translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
-
-    # Set aligned timings
+    if generated_durations:
+        translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
+    else:
+        logger.warning("No generated audio; skipping alignment optimisation.")
+
+    # Set aligned timings
+    # audio_segments = []
+    # for i, entry in enumerate(translated_json):
+    #     segment = results[i][2]  # AudioFileClip
+    #     if segment:
+    #         segment = segment.set_start(entry['start']).set_duration(entry['end'] - entry['start'])
+    #         audio_segments.append(segment)
     audio_segments = []
     for i, entry in enumerate(translated_json):
-        segment = results[i][2]  # AudioFileClip
-        if segment:
-            segment = segment.set_start(entry['start']).set_duration(entry['end'] - entry['start'])
-            audio_segments.append(segment)
+        _, seg, _dur = result_map[i]  # seg is an AudioFileClip (or None)
+        if seg:
+            audio_segments.append(
+                seg.set_start(entry["start"]).set_duration(entry["end"] - entry["start"])
+            )
 
     final_video = CompositeVideoClip([video] + text_clips)
 
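
The "Invalid dimensions (0,)" fix above guards solve_optimal_alignment; the same guard presumably belongs wherever audio_segments is composited after this hunk, since compositing an empty list of audio clips is a typical source of zero-dimension errors. A minimal sketch, assuming moviepy 1.x's CompositeAudioClip and set_audio (attach_voiceover itself is a hypothetical helper, not app.py code):

from moviepy.editor import CompositeAudioClip

def attach_voiceover(final_video, audio_segments):
    # Subtitle-only runs produce no audio segments at all; skip the audio
    # step and keep the video's existing track instead of compositing an
    # empty clip list.
    if audio_segments:
        return final_video.set_audio(CompositeAudioClip(audio_segments))
    return final_video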