Update app.py
app.py CHANGED
@@ -668,13 +668,19 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
         logger.debug(f"[OCR Collapsed {idx}] {seg['start']:.2f}s - {seg['end']:.2f}s: {seg['text'][:50]}...")
     return collapsed

-def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_threshold=80, replace_threshold=90):
+def merge_speaker_and_time_from_whisperx(
+    ocr_json,
+    whisperx_json,
+    replace_threshold=90,
+    time_tolerance=1.0
+):
     merged = []
     used_whisperx = set()
+    whisperx_used_flags = [False] * len(whisperx_json)

+    # Step 1: Attempt to match each OCR entry to a WhisperX entry
     for ocr in ocr_json:
-        ocr_start = ocr["start"]
-        ocr_end = ocr["end"]
+        ocr_start, ocr_end = ocr["start"], ocr["end"]
         ocr_text = ocr["text"]

         best_match = None
@@ -685,11 +691,9 @@ def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_thres
             wx_start, wx_end = wx["start"], wx["end"]
             wx_text = wx["text"]

-            if idx in used_whisperx:
-                continue  # Already matched
-
-            time_center_diff = abs((ocr_start + ocr_end)/2 - (wx_start + wx_end)/2)
-            if time_center_diff > 3:
+            # Check for time overlap
+            overlap = not (ocr_end < wx_start - time_tolerance or ocr_start > wx_end + time_tolerance)
+            if not overlap:
                 continue

             sim = fuzz.ratio(ocr_text, wx_text)
@@ -698,23 +702,83 @@ def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_thres
                 best_match = wx
                 best_idx = idx

-        new_entry = copy.deepcopy(ocr)
-        if best_match:
-            new_entry["speaker"] = best_match.get("speaker", "UNKNOWN")
-            new_entry["ocr_similarity"] = best_score
+        if best_match and best_score >= replace_threshold:
+            # Replace WhisperX segment with higher quality OCR text
+            new_segment = copy.deepcopy(best_match)
+            new_segment["text"] = ocr_text
+            new_segment["ocr_replaced"] = True
+            new_segment["ocr_similarity"] = best_score
+            whisperx_used_flags[best_idx] = True
+            merged.append(new_segment)
+        else:
+            # No replacement, check if this OCR is outside WhisperX time coverage
+            covered = any(
+                abs((ocr_start + ocr_end)/2 - (wx["start"] + wx["end"])/2) < time_tolerance
+                for wx in whisperx_json
+            )
+            if not covered:
+                new_segment = copy.deepcopy(ocr)
+                new_segment["ocr_added"] = True
+                new_segment["speaker"] = "UNKNOWN"
+                merged.append(new_segment)

-            if best_score >= replace_threshold:
-                new_entry["start"] = best_match["start"]
-                new_entry["end"] = best_match["end"]
-                used_whisperx.add(best_idx)  # Mark used
+    # Step 2: Add untouched WhisperX segments
+    for idx, wx in enumerate(whisperx_json):
+        if not whisperx_used_flags[idx]:
+            merged.append(wx)

-        else:
-            new_entry["speaker"] = "UNKNOWN"
-            new_entry["ocr_similarity"] = None
+    # Step 3: Sort all merged segments
+    merged = sorted(merged, key=lambda x: x["start"])

-        merged.append(new_entry)
     return merged

+# def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_threshold=80, replace_threshold=90):
+#     merged = []
+#     used_whisperx = set()
+
+#     for ocr in ocr_json:
+#         ocr_start = ocr["start"]
+#         ocr_end = ocr["end"]
+#         ocr_text = ocr["text"]
+
+#         best_match = None
+#         best_score = -1
+#         best_idx = None
+
+#         for idx, wx in enumerate(whisperx_json):
+#             wx_start, wx_end = wx["start"], wx["end"]
+#             wx_text = wx["text"]
+
+#             if idx in used_whisperx:
+#                 continue  # Already matched
+
+#             time_center_diff = abs((ocr_start + ocr_end)/2 - (wx_start + wx_end)/2)
+#             if time_center_diff > 3:
+#                 continue
+
+#             sim = fuzz.ratio(ocr_text, wx_text)
+#             if sim > best_score:
+#                 best_score = sim
+#                 best_match = wx
+#                 best_idx = idx
+
+#         new_entry = copy.deepcopy(ocr)
+#         if best_match:
+#             new_entry["speaker"] = best_match.get("speaker", "UNKNOWN")
+#             new_entry["ocr_similarity"] = best_score
+
+#             if best_score >= replace_threshold:
+#                 new_entry["start"] = best_match["start"]
+#                 new_entry["end"] = best_match["end"]
+#                 used_whisperx.add(best_idx)  # Mark used
+
+#         else:
+#             new_entry["speaker"] = "UNKNOWN"
+#             new_entry["ocr_similarity"] = None
+
+#         merged.append(new_entry)
+#     return merged
+
 def realign_ocr_segments(merged_ocr_json, min_gap=0.2):
     """
     Realign OCR segments to avoid overlaps using midpoint-based adjustment.