CindyChen19 committed (verified)
Commit 129e2e0 · Parent(s): 6da3f79

Update app.py


Improve robustness of transcript voiceover generation

- Fix 'Invalid dimensions (0,)' crash when no audio segments are generated
- Prevent 'list index out of range' by aligning result mapping with input indices
- Handle subtitle-only mode and partial voiceover failures gracefully
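
The index-keyed pattern behind these fixes can be shown in isolation. Below is a minimal, self-contained sketch with a dummy process_entry and dummy data (the real worker returns a text clip, an audio clip, a duration, and an error): futures are keyed by input index and every index is pre-seeded with a placeholder, so a failed entry can no longer shift later results or shrink the list.

    import concurrent.futures

    def process_entry(entry, idx):
        # Stand-in for the real TTS/text-clip worker; may raise.
        if entry is None:
            raise ValueError("empty entry")
        return idx, f"clip-{idx}", f"audio-{idx}", 1.5, None

    entries = ["hello", None, "world"]

    with concurrent.futures.ThreadPoolExecutor() as executor:
        # Key each future by its input index.
        futures = {executor.submit(process_entry, e, i): i for i, e in enumerate(entries)}
        # Pre-seed one placeholder per entry; a failed worker leaves
        # (None, None, 0) behind instead of shrinking the result list.
        result_map = {i: (None, None, 0) for i in range(len(entries))}
        for future in concurrent.futures.as_completed(futures):
            idx = futures[future]
            try:
                _idx, txt, aud, dur, err = future.result()
                result_map[idx] = (txt, aud, dur)
            except Exception as exc:
                print(f"[Entry {idx}] failed: {exc}")

    print(result_map)  # entry 1 keeps its placeholder; entries 0 and 2 are filled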

Files changed (1)
  1. app.py +64 -20
app.py CHANGED
@@ -1050,34 +1050,78 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
         return f"Error loading XTTS model: {e}"
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
-                   for i, entry in enumerate(translated_json)]
-
-        results = []
+        # futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
+        #            for i, entry in enumerate(translated_json)]
+
+        # results = []
+        # for future in concurrent.futures.as_completed(futures):
+        #     try:
+        #         i, txt_clip, audio_segment, actual_duration, error = future.result()
+        #         results.append((i, txt_clip, audio_segment, actual_duration))
+        #         if error:
+        #             error_messages.append(f"[Entry {i}] {error}")
+        #     except Exception as e:
+        #         err = f"❌ Unexpected error in future result: {e}"
+        #         error_messages.append(err)
+        # Key each future by its input index; any failure leaves a (None, None, 0) placeholder.
+        futures = {
+            executor.submit(
+                process_entry, entry, idx, tts_model, video.w, video.h,
+                process_mode, target_language, font_path, speaker_sample_paths
+            ): idx
+            for idx, entry in enumerate(translated_json)
+        }
+
+        # Seed a placeholder for every entry up front so later lookups stay in bounds.
+        result_map = {idx: (None, None, 0) for idx in range(len(translated_json))}
+
         for future in concurrent.futures.as_completed(futures):
+            idx = futures[future]
             try:
-                i, txt_clip, audio_segment, actual_duration, error = future.result()
-                results.append((i, txt_clip, audio_segment, actual_duration))
-                if error:
-                    error_messages.append(f"[Entry {i}] {error}")
+                _idx, txt, aud, dur, err = future.result()
+                result_map[idx] = (txt, aud, dur)
+                if err:
+                    error_messages.append(f"[Entry {idx}] {err}")
             except Exception as e:
-                err = f"❌ Unexpected error in future result: {e}"
-                error_messages.append(err)
+                # A worker that raises still occupies its slot, so no index can go out of range.
+                error_messages.append(f"[Entry {idx}] unexpected error: {e}")
+
+        # results.sort(key=lambda x: x[0])
+        # text_clips = [clip for _, clip, _, _ in results if clip]
+        # generated_durations = [dur for _, _, _, dur in results if dur > 0]
+
+        # Build text_clips / generated_durations from result_map.
+        ordered_idx = sorted(result_map.keys())
+        text_clips = [
+            result_map[i][0] for i in ordered_idx
+            if result_map[i][0]
+        ]
+        generated_durations = [
+            result_map[i][2] for i in ordered_idx
+            if result_map[i][2] > 0
+        ]
 
-        results.sort(key=lambda x: x[0])
-        text_clips = [clip for _, clip, _, _ in results if clip]
-        generated_durations = [dur for _, _, _, dur in results if dur > 0]
 
         # Align using optimization (modifies translated_json in-place)
-        translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
-
-        # Set aligned timings
+        if generated_durations:
+            translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
+        else:
+            logger.warning("No generated audio; skipping alignment optimization.")
+
+        # Set aligned timings
+        # audio_segments = []
+        # for i, entry in enumerate(translated_json):
+        #     segment = results[i][2]  # AudioFileClip
+        #     if segment:
+        #         segment = segment.set_start(entry['start']).set_duration(entry['end'] - entry['start'])
+        #         audio_segments.append(segment)
         audio_segments = []
         for i, entry in enumerate(translated_json):
+            _, seg, _dur = result_map[i]  # seg is the generated AudioFileClip (or None)
+            if seg:
+                audio_segments.append(
+                    seg.set_start(entry["start"]).set_duration(entry["end"] - entry["start"])
+                )
 
     final_video = CompositeVideoClip([video] + text_clips)
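
The hunk ends before any audio is attached to final_video, but the "no audio segments" case named in the commit message suggests the same guard belongs where the audio track is composited. A sketch of that guard, assuming the moviepy 1.x API already used in the diff (CompositeAudioClip, set_audio); this illustrates the pattern and is not code from the commit:

    from moviepy.editor import CompositeAudioClip

    # Hypothetical continuation, not part of this hunk: only build a composite
    # audio track when at least one segment survived. An empty clip list is
    # the "no audio segments" case the commit message associates with the
    # 'Invalid dimensions (0,)' crash.
    if audio_segments:
        final_video = final_video.set_audio(CompositeAudioClip(audio_segments))
    # else: subtitle-only mode; keep final_video's existing audio untouched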