studio_V1_4_asr_GPT

Running

App Files Community

qqwjq1981 committed on Apr 27

Commit

b83a564

verified ·

1 Parent(s): 15a60a2

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -11

app.py CHANGED Viewed

@@ -547,7 +547,7 @@ def post_edit_transcribed_segments(transcription_json, video_path,
                                    num_workers=4):
     """
     Given WhisperX transcription (transcription_json) and video,
-    use OCR subtitles to post-correct and merge the transcriptions.
     """
     # Step 1: Extract OCR subtitles
@@ -556,18 +556,19 @@ def post_edit_transcribed_segments(transcription_json, video_path,
     # Step 2: Collapse repetitive OCR
     collapsed_ocr = collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90)
-    # Step 3: Merge OCR with WhisperX
     merged_segments = []
-    for entry in transcription_json:
         start = entry.get("start", 0)
         end = entry.get("end", 0)
         base_text = entry.get("text", "")
-        best_match = None
         best_score = -1
-        for ocr in collapsed_ocr:
             # Check time overlap
             time_overlap = not (ocr["end"] < start - time_tolerance or ocr["start"] > end + time_tolerance)
             if not time_overlap:
@@ -577,22 +578,41 @@ def post_edit_transcribed_segments(transcription_json, video_path,
             sim = fuzz.ratio(ocr["text"], base_text)
             if sim > best_score:
                 best_score = sim
-                best_match = ocr
-        # If good match found, replace the original text
         updated_entry = entry.copy()
-        if best_match and best_score >= text_similarity_threshold:
-            updated_entry["text"] = best_match["text"]
             updated_entry["ocr_matched"] = True
             updated_entry["ocr_similarity"] = best_score
         else:
             updated_entry["ocr_matched"] = False
             updated_entry["ocr_similarity"] = best_score if best_score >= 0 else None
         merged_segments.append(updated_entry)
-    print(f"✅ Post-editing completed: {len(merged_segments)} segments")
-    return merged_segments
 def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
     logger.debug(f"Processing entry {i}: {entry}")

                                    num_workers=4):
     """
     Given WhisperX transcription (transcription_json) and video,
+    use OCR subtitles to post-correct and safely insert missing captions.
     """
     # Step 1: Extract OCR subtitles
     # Step 2: Collapse repetitive OCR
     collapsed_ocr = collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90)
+    # Step 3: Refine existing WhisperX segments (Phase 1)
     merged_segments = []
+    used_ocr_indices = set()
+    for entry_idx, entry in enumerate(transcription_json):
         start = entry.get("start", 0)
         end = entry.get("end", 0)
         base_text = entry.get("text", "")
+        best_match_idx = None
         best_score = -1
+        for ocr_idx, ocr in enumerate(collapsed_ocr):
             # Check time overlap
             time_overlap = not (ocr["end"] < start - time_tolerance or ocr["start"] > end + time_tolerance)
             if not time_overlap:
             sim = fuzz.ratio(ocr["text"], base_text)
             if sim > best_score:
                 best_score = sim
+                best_match_idx = ocr_idx
+        # Update WhisperX segment if matched
         updated_entry = entry.copy()
+        if best_match_idx is not None and best_score >= text_similarity_threshold:
+            updated_entry["text"] = collapsed_ocr[best_match_idx]["text"]
             updated_entry["ocr_matched"] = True
             updated_entry["ocr_similarity"] = best_score
+            used_ocr_indices.add(best_match_idx)
         else:
             updated_entry["ocr_matched"] = False
             updated_entry["ocr_similarity"] = best_score if best_score >= 0 else None
         merged_segments.append(updated_entry)
+    # Step 4: Insert unused OCR segments (Phase 2)
+    inserted_segments = []
+    for ocr_idx, ocr in enumerate(collapsed_ocr):
+        if ocr_idx not in used_ocr_indices:
+            inserted_segment = {
+                "start": ocr["start"],
+                "end": ocr["end"],
+                "text": ocr["text"],
+                "ocr_only": True
+            }
+            inserted_segments.append(inserted_segment)
+    # Step 5: Combine and sort
+    final_segments = merged_segments + inserted_segments
+    final_segments = sorted(final_segments, key=lambda x: x["start"])
+    print(f"✅ Post-editing completed: {len(final_segments)} total segments "
+          f"({len(inserted_segments)} OCR-only inserted)")
+    return final_segments
 def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
     logger.debug(f"Processing entry {i}: {entry}")