Update app.py
app.py CHANGED
@@ -668,33 +668,34 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
     return collapsed

 def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_threshold=80, replace_threshold=90):
-    """
-    Given OCR and WhisperX segments, merge speaker ID and optionally replace time.
-    """
     merged = []
+    used_whisperx = set()

     for ocr in ocr_json:
         ocr_start = ocr["start"]
         ocr_end = ocr["end"]
         ocr_text = ocr["text"]
-
+
         best_match = None
         best_score = -1
+        best_idx = None

-        for wx in whisperx_json:
+        for idx, wx in enumerate(whisperx_json):
             wx_start, wx_end = wx["start"], wx["end"]
             wx_text = wx["text"]
-
-
+
+            if idx in used_whisperx:
+                continue  # Already matched
+
             time_center_diff = abs((ocr_start + ocr_end)/2 - (wx_start + wx_end)/2)
-            if time_center_diff > 3:
+            if time_center_diff > 3:
                 continue

-            # Text similarity
             sim = fuzz.ratio(ocr_text, wx_text)
             if sim > best_score:
                 best_score = sim
                 best_match = wx
+                best_idx = idx

         new_entry = copy.deepcopy(ocr)
         if best_match:
@@ -704,16 +705,15 @@ def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_thres
             if best_score >= replace_threshold:
                 new_entry["start"] = best_match["start"]
                 new_entry["end"] = best_match["end"]
+                used_whisperx.add(best_idx)  # Mark used

         else:
             new_entry["speaker"] = "UNKNOWN"
             new_entry["ocr_similarity"] = None

         merged.append(new_entry)
-
     return merged

-
 def realign_ocr_segments(merged_ocr_json, min_gap=0.2):
     """
     Realign OCR segments to avoid overlaps using midpoint-based adjustment.