qqwjq1981 committed on
Commit
66933cf
·
verified ·
1 Parent(s): 46034f5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -20
app.py CHANGED
@@ -668,13 +668,19 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
668
  logger.debug(f"[OCR Collapsed {idx}] {seg['start']:.2f}s - {seg['end']:.2f}s: {seg['text'][:50]}...")
669
  return collapsed
670
 
671
- def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_threshold=80, replace_threshold=90):
 
 
 
 
 
672
  merged = []
673
  used_whisperx = set()
 
674
 
 
675
  for ocr in ocr_json:
676
- ocr_start = ocr["start"]
677
- ocr_end = ocr["end"]
678
  ocr_text = ocr["text"]
679
 
680
  best_match = None
@@ -685,11 +691,9 @@ def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_thres
685
  wx_start, wx_end = wx["start"], wx["end"]
686
  wx_text = wx["text"]
687
 
688
- if idx in used_whisperx:
689
- continue # Already matched
690
-
691
- time_center_diff = abs((ocr_start + ocr_end)/2 - (wx_start + wx_end)/2)
692
- if time_center_diff > 3:
693
  continue
694
 
695
  sim = fuzz.ratio(ocr_text, wx_text)
@@ -698,23 +702,83 @@ def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_thres
698
  best_match = wx
699
  best_idx = idx
700
 
701
- new_entry = copy.deepcopy(ocr)
702
- if best_match:
703
- new_entry["speaker"] = best_match.get("speaker", "UNKNOWN")
704
- new_entry["ocr_similarity"] = best_score
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
705
 
706
- if best_score >= replace_threshold:
707
- new_entry["start"] = best_match["start"]
708
- new_entry["end"] = best_match["end"]
709
- used_whisperx.add(best_idx) # Mark used
710
 
711
- else:
712
- new_entry["speaker"] = "UNKNOWN"
713
- new_entry["ocr_similarity"] = None
714
 
715
- merged.append(new_entry)
716
  return merged
717
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
718
  def realign_ocr_segments(merged_ocr_json, min_gap=0.2):
719
  """
720
  Realign OCR segments to avoid overlaps using midpoint-based adjustment.
 
668
  logger.debug(f"[OCR Collapsed {idx}] {seg['start']:.2f}s - {seg['end']:.2f}s: {seg['text'][:50]}...")
669
  return collapsed
670
 
671
+ def merge_speaker_and_time_from_whisperx(
672
+ ocr_json,
673
+ whisperx_json,
674
+ replace_threshold=90,
675
+ time_tolerance=1.0
676
+ ):
677
  merged = []
678
  used_whisperx = set()
679
+ whisperx_used_flags = [False] * len(whisperx_json)
680
 
681
+ # Step 1: Attempt to match each OCR entry to a WhisperX entry
682
  for ocr in ocr_json:
683
+ ocr_start, ocr_end = ocr["start"], ocr["end"]
 
684
  ocr_text = ocr["text"]
685
 
686
  best_match = None
 
691
  wx_start, wx_end = wx["start"], wx["end"]
692
  wx_text = wx["text"]
693
 
694
+ # Check for time overlap
695
+ overlap = not (ocr_end < wx_start - time_tolerance or ocr_start > wx_end + time_tolerance)
696
+ if not overlap:
 
 
697
  continue
698
 
699
  sim = fuzz.ratio(ocr_text, wx_text)
 
702
  best_match = wx
703
  best_idx = idx
704
 
705
+ if best_match and best_score >= replace_threshold:
706
+ # Replace WhisperX segment with higher quality OCR text
707
+ new_segment = copy.deepcopy(best_match)
708
+ new_segment["text"] = ocr_text
709
+ new_segment["ocr_replaced"] = True
710
+ new_segment["ocr_similarity"] = best_score
711
+ whisperx_used_flags[best_idx] = True
712
+ merged.append(new_segment)
713
+ else:
714
+ # No replacement, check if this OCR is outside WhisperX time coverage
715
+ covered = any(
716
+ abs((ocr_start + ocr_end)/2 - (wx["start"] + wx["end"])/2) < time_tolerance
717
+ for wx in whisperx_json
718
+ )
719
+ if not covered:
720
+ new_segment = copy.deepcopy(ocr)
721
+ new_segment["ocr_added"] = True
722
+ new_segment["speaker"] = "UNKNOWN"
723
+ merged.append(new_segment)
724
 
725
+ # Step 2: Add untouched WhisperX segments
726
+ for idx, wx in enumerate(whisperx_json):
727
+ if not whisperx_used_flags[idx]:
728
+ merged.append(wx)
729
 
730
+ # Step 3: Sort all merged segments
731
+ merged = sorted(merged, key=lambda x: x["start"])
 
732
 
 
733
  return merged
734
 
735
+ # def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_threshold=80, replace_threshold=90):
736
+ # merged = []
737
+ # used_whisperx = set()
738
+
739
+ # for ocr in ocr_json:
740
+ # ocr_start = ocr["start"]
741
+ # ocr_end = ocr["end"]
742
+ # ocr_text = ocr["text"]
743
+
744
+ # best_match = None
745
+ # best_score = -1
746
+ # best_idx = None
747
+
748
+ # for idx, wx in enumerate(whisperx_json):
749
+ # wx_start, wx_end = wx["start"], wx["end"]
750
+ # wx_text = wx["text"]
751
+
752
+ # if idx in used_whisperx:
753
+ # continue # Already matched
754
+
755
+ # time_center_diff = abs((ocr_start + ocr_end)/2 - (wx_start + wx_end)/2)
756
+ # if time_center_diff > 3:
757
+ # continue
758
+
759
+ # sim = fuzz.ratio(ocr_text, wx_text)
760
+ # if sim > best_score:
761
+ # best_score = sim
762
+ # best_match = wx
763
+ # best_idx = idx
764
+
765
+ # new_entry = copy.deepcopy(ocr)
766
+ # if best_match:
767
+ # new_entry["speaker"] = best_match.get("speaker", "UNKNOWN")
768
+ # new_entry["ocr_similarity"] = best_score
769
+
770
+ # if best_score >= replace_threshold:
771
+ # new_entry["start"] = best_match["start"]
772
+ # new_entry["end"] = best_match["end"]
773
+ # used_whisperx.add(best_idx) # Mark used
774
+
775
+ # else:
776
+ # new_entry["speaker"] = "UNKNOWN"
777
+ # new_entry["ocr_similarity"] = None
778
+
779
+ # merged.append(new_entry)
780
+ # return merged
781
+
782
  def realign_ocr_segments(merged_ocr_json, min_gap=0.2):
783
  """
784
  Realign OCR segments to avoid overlaps using midpoint-based adjustment.