Update app.py
Improve robustness of transcript voiceover generation
- Fix 'Invalid dimensions (0,)' crash when no audio segments are generated
- Prevent 'list index out of range' by aligning result mapping with input indices
- Now supports subtitle-only mode and partial voiceover failures gracefully
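
The index-alignment fix is the standard "futures dict + pre-seeded placeholders" pattern. A minimal, self-contained sketch of that pattern (hypothetical worker and data, not the actual app.py code):

import concurrent.futures

def process_entry(entry, idx):
    # Hypothetical worker: one entry fails to simulate a partial TTS failure.
    if entry == "boom":
        raise RuntimeError("synthesis failed")
    return idx, f"clip-{entry}", 1.5  # (index, clip, duration)

entries = ["a", "boom", "c"]

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Map each future back to the index of the entry it was built from.
    futures = {executor.submit(process_entry, e, i): i
               for i, e in enumerate(entries)}
    # One placeholder per entry, so a crashed worker still occupies its slot.
    result_map = {i: (None, 0.0) for i in range(len(entries))}
    for future in concurrent.futures.as_completed(futures):
        idx = futures[future]
        try:
            _i, clip, dur = future.result()
            result_map[idx] = (clip, dur)
        except Exception as e:
            print(f"[Entry {idx}] failed: {e}")  # placeholder stays

print([result_map[i] for i in sorted(result_map)])
# [('clip-a', 1.5), (None, 0.0), ('clip-c', 1.5)]

A crashed worker never shrinks the result set: its slot keeps the placeholder, so positional lookups like result_map[i] stay valid, which is exactly what the old append-then-sort list version could not guarantee.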
app.py
CHANGED
@@ -1050,34 +1050,78 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
         return f"Error loading XTTS model: {e}"
 
     with concurrent.futures.ThreadPoolExecutor() as executor:
-        futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
-                   for i, entry in enumerate(translated_json)]
-
-        results = []
+        # futures = [executor.submit(process_entry, entry, i, tts_model, video.w, video.h, process_mode, target_language, font_path, speaker_sample_paths)
+        #            for i, entry in enumerate(translated_json)]
+
+        # results = []
+        # for future in concurrent.futures.as_completed(futures):
+        #     try:
+        #         i, txt_clip, audio_segment, actual_duration, error = future.result()
+        #         results.append((i, txt_clip, audio_segment, actual_duration))
+        #         if error:
+        #             error_messages.append(f"[Entry {i}] {error}")
+        #     except Exception as e:
+        #         err = f"❌ Unexpected error in future result: {e}"
+        #         error_messages.append(err)
+        # Use a dict so each future maps back to its input index; any failure leaves its (None, None, 0) placeholder in place
+        futures = {
+            executor.submit(
+                process_entry, entry, idx, tts_model, video.w, video.h,
+                process_mode, target_language, font_path, speaker_sample_paths
+            ): idx
+            for idx, entry in enumerate(translated_json)
+        }
+
+        # Seed a placeholder for every entry first so index lookups can never go out of range
+        result_map = {idx: (None, None, 0) for idx in range(len(translated_json))}
+
         for future in concurrent.futures.as_completed(futures):
+            idx = futures[future]
             try:
-                i, txt_clip, audio_segment, actual_duration, error = future.result()
-                results.append((i, txt_clip, audio_segment, actual_duration))
-                if error:
-                    error_messages.append(f"[Entry {i}] {error}")
+                _idx, txt, aud, dur, err = future.result()
+                result_map[idx] = (txt, aud, dur)
+                if err:
+                    error_messages.append(f"[Entry {idx}] {err}")
             except Exception as e:
-                err = f"❌ Unexpected error in future result: {e}"
-                error_messages.append(err)
+                # A worker that raises still keeps its placeholder slot, so later indexing stays in range
+                error_messages.append(f"[Entry {idx}] unexpected error: {e}")
+
+    # results.sort(key=lambda x: x[0])
+    # text_clips = [clip for _, clip, _, _ in results if clip]
+    # generated_durations = [dur for _, _, _, dur in results if dur > 0]
+
+    # Build text_clips / generated_durations from result_map in input order
+    ordered_idx = sorted(result_map.keys())
+    text_clips = [
+        result_map[i][0] for i in ordered_idx
+        if result_map[i][0]
+    ]
+    generated_durations = [
+        result_map[i][2] for i in ordered_idx
+        if result_map[i][2] > 0
+    ]
 
-    results.sort(key=lambda x: x[0])
-    text_clips = [clip for _, clip, _, _ in results if clip]
-    generated_durations = [dur for _, _, _, dur in results if dur > 0]
 
     # Align using optimization (modifies translated_json in-place)
-    translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
-
-    # Set aligned timings
+    if generated_durations:
+        translated_json = solve_optimal_alignment(translated_json, generated_durations, video.duration)
+    else:
+        logger.warning("No generated audio; skipping alignment optimisation.")
+
+    # Set aligned timings
+    # audio_segments = []
+    # for i, entry in enumerate(translated_json):
+    #     segment = results[i][2]  # AudioFileClip
+    #     if segment:
+    #         segment = segment.set_start(entry['start']).set_duration(entry['end'] - entry['start'])
+    #         audio_segments.append(segment)
     audio_segments = []
     for i, entry in enumerate(translated_json):
-        segment = results[i][2]  # AudioFileClip
-        if segment:
-            segment = segment.set_start(entry['start']).set_duration(entry['end'] - entry['start'])
-            audio_segments.append(segment)
+        _, seg, _dur = result_map[i]  # seg is an AudioFileClip (or None)
+        if seg:
+            audio_segments.append(
+                seg.set_start(entry["start"]).set_duration(entry["end"] - entry["start"])
+            )
 
     final_video = CompositeVideoClip([video] + text_clips)
 
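
The "Invalid dimensions (0,)" fix above guards solve_optimal_alignment; the same guard presumably belongs wherever audio_segments is composited after this hunk, since compositing an empty list of audio clips is a typical source of zero-dimension errors. A minimal sketch, assuming moviepy 1.x's CompositeAudioClip and set_audio (attach_voiceover itself is a hypothetical helper, not app.py code):

from moviepy.editor import CompositeAudioClip

def attach_voiceover(final_video, audio_segments):
    # Subtitle-only runs produce no audio segments at all; skip the audio
    # step and keep the video's existing track instead of compositing an
    # empty clip list.
    if audio_segments:
        return final_video.set_audio(CompositeAudioClip(audio_segments))
    return final_video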