Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -547,7 +547,7 @@ def post_edit_transcribed_segments(transcription_json, video_path,
|
|
547 |
num_workers=4):
|
548 |
"""
|
549 |
Given WhisperX transcription (transcription_json) and video,
|
550 |
-
use OCR subtitles to post-correct and
|
551 |
"""
|
552 |
|
553 |
# Step 1: Extract OCR subtitles
|
@@ -556,18 +556,19 @@ def post_edit_transcribed_segments(transcription_json, video_path,
|
|
556 |
# Step 2: Collapse repetitive OCR
|
557 |
collapsed_ocr = collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90)
|
558 |
|
559 |
-
# Step 3:
|
560 |
merged_segments = []
|
|
|
561 |
|
562 |
-
for entry in transcription_json:
|
563 |
start = entry.get("start", 0)
|
564 |
end = entry.get("end", 0)
|
565 |
base_text = entry.get("text", "")
|
566 |
|
567 |
-
|
568 |
best_score = -1
|
569 |
|
570 |
-
for ocr in collapsed_ocr:
|
571 |
# Check time overlap
|
572 |
time_overlap = not (ocr["end"] < start - time_tolerance or ocr["start"] > end + time_tolerance)
|
573 |
if not time_overlap:
|
@@ -577,22 +578,41 @@ def post_edit_transcribed_segments(transcription_json, video_path,
|
|
577 |
sim = fuzz.ratio(ocr["text"], base_text)
|
578 |
if sim > best_score:
|
579 |
best_score = sim
|
580 |
-
|
581 |
|
582 |
-
#
|
583 |
updated_entry = entry.copy()
|
584 |
-
if
|
585 |
-
updated_entry["text"] =
|
586 |
updated_entry["ocr_matched"] = True
|
587 |
updated_entry["ocr_similarity"] = best_score
|
|
|
588 |
else:
|
589 |
updated_entry["ocr_matched"] = False
|
590 |
updated_entry["ocr_similarity"] = best_score if best_score >= 0 else None
|
591 |
|
592 |
merged_segments.append(updated_entry)
|
593 |
|
594 |
-
|
595 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
596 |
|
597 |
def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
|
598 |
logger.debug(f"Processing entry {i}: {entry}")
|
|
|
547 |
num_workers=4):
|
548 |
"""
|
549 |
Given WhisperX transcription (transcription_json) and video,
|
550 |
+
use OCR subtitles to post-correct and safely insert missing captions.
|
551 |
"""
|
552 |
|
553 |
# Step 1: Extract OCR subtitles
|
|
|
556 |
# Step 2: Collapse repetitive OCR
|
557 |
collapsed_ocr = collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90)
|
558 |
|
559 |
+
# Step 3: Refine existing WhisperX segments (Phase 1)
|
560 |
merged_segments = []
|
561 |
+
used_ocr_indices = set()
|
562 |
|
563 |
+
for entry_idx, entry in enumerate(transcription_json):
|
564 |
start = entry.get("start", 0)
|
565 |
end = entry.get("end", 0)
|
566 |
base_text = entry.get("text", "")
|
567 |
|
568 |
+
best_match_idx = None
|
569 |
best_score = -1
|
570 |
|
571 |
+
for ocr_idx, ocr in enumerate(collapsed_ocr):
|
572 |
# Check time overlap
|
573 |
time_overlap = not (ocr["end"] < start - time_tolerance or ocr["start"] > end + time_tolerance)
|
574 |
if not time_overlap:
|
|
|
578 |
sim = fuzz.ratio(ocr["text"], base_text)
|
579 |
if sim > best_score:
|
580 |
best_score = sim
|
581 |
+
best_match_idx = ocr_idx
|
582 |
|
583 |
+
# Update WhisperX segment if matched
|
584 |
updated_entry = entry.copy()
|
585 |
+
if best_match_idx is not None and best_score >= text_similarity_threshold:
|
586 |
+
updated_entry["text"] = collapsed_ocr[best_match_idx]["text"]
|
587 |
updated_entry["ocr_matched"] = True
|
588 |
updated_entry["ocr_similarity"] = best_score
|
589 |
+
used_ocr_indices.add(best_match_idx)
|
590 |
else:
|
591 |
updated_entry["ocr_matched"] = False
|
592 |
updated_entry["ocr_similarity"] = best_score if best_score >= 0 else None
|
593 |
|
594 |
merged_segments.append(updated_entry)
|
595 |
|
596 |
+
# Step 4: Insert unused OCR segments (Phase 2)
|
597 |
+
inserted_segments = []
|
598 |
+
for ocr_idx, ocr in enumerate(collapsed_ocr):
|
599 |
+
if ocr_idx not in used_ocr_indices:
|
600 |
+
inserted_segment = {
|
601 |
+
"start": ocr["start"],
|
602 |
+
"end": ocr["end"],
|
603 |
+
"text": ocr["text"],
|
604 |
+
"ocr_only": True
|
605 |
+
}
|
606 |
+
inserted_segments.append(inserted_segment)
|
607 |
+
|
608 |
+
# Step 5: Combine and sort
|
609 |
+
final_segments = merged_segments + inserted_segments
|
610 |
+
final_segments = sorted(final_segments, key=lambda x: x["start"])
|
611 |
+
|
612 |
+
print(f"✅ Post-editing completed: {len(final_segments)} total segments "
|
613 |
+
f"({len(inserted_segments)} OCR-only inserted)")
|
614 |
+
|
615 |
+
return final_segments
|
616 |
|
617 |
def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
|
618 |
logger.debug(f"Processing entry {i}: {entry}")
|