qqwjq1981 committed on
Commit
c0f8674
·
verified ·
1 Parent(s): dd10881

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -11
app.py CHANGED
@@ -668,33 +668,34 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
668
  return collapsed
669
 
670
  def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_threshold=80, replace_threshold=90):
671
- """
672
- Given OCR and WhisperX segments, merge speaker ID and optionally replace time.
673
- """
674
  merged = []
 
675
 
676
  for ocr in ocr_json:
677
  ocr_start = ocr["start"]
678
  ocr_end = ocr["end"]
679
  ocr_text = ocr["text"]
680
-
681
  best_match = None
682
  best_score = -1
 
683
 
684
- for wx in whisperx_json:
685
  wx_start, wx_end = wx["start"], wx["end"]
686
  wx_text = wx["text"]
687
-
688
- # Time overlap (soft constraint)
 
 
689
  time_center_diff = abs((ocr_start + ocr_end)/2 - (wx_start + wx_end)/2)
690
- if time_center_diff > 3: # skip if too far
691
  continue
692
 
693
- # Text similarity
694
  sim = fuzz.ratio(ocr_text, wx_text)
695
  if sim > best_score:
696
  best_score = sim
697
  best_match = wx
 
698
 
699
  new_entry = copy.deepcopy(ocr)
700
  if best_match:
@@ -704,16 +705,15 @@ def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_thres
704
  if best_score >= replace_threshold:
705
  new_entry["start"] = best_match["start"]
706
  new_entry["end"] = best_match["end"]
 
707
 
708
  else:
709
  new_entry["speaker"] = "UNKNOWN"
710
  new_entry["ocr_similarity"] = None
711
 
712
  merged.append(new_entry)
713
-
714
  return merged
715
 
716
-
717
  def realign_ocr_segments(merged_ocr_json, min_gap=0.2):
718
  """
719
  Realign OCR segments to avoid overlaps using midpoint-based adjustment.
 
668
  return collapsed
669
 
670
  def merge_speaker_and_time_from_whisperx(ocr_json, whisperx_json, text_sim_threshold=80, replace_threshold=90):
 
 
 
671
  merged = []
672
+ used_whisperx = set()
673
 
674
  for ocr in ocr_json:
675
  ocr_start = ocr["start"]
676
  ocr_end = ocr["end"]
677
  ocr_text = ocr["text"]
678
+
679
  best_match = None
680
  best_score = -1
681
+ best_idx = None
682
 
683
+ for idx, wx in enumerate(whisperx_json):
684
  wx_start, wx_end = wx["start"], wx["end"]
685
  wx_text = wx["text"]
686
+
687
+ if idx in used_whisperx:
688
+ continue # Already matched
689
+
690
  time_center_diff = abs((ocr_start + ocr_end)/2 - (wx_start + wx_end)/2)
691
+ if time_center_diff > 3:
692
  continue
693
 
 
694
  sim = fuzz.ratio(ocr_text, wx_text)
695
  if sim > best_score:
696
  best_score = sim
697
  best_match = wx
698
+ best_idx = idx
699
 
700
  new_entry = copy.deepcopy(ocr)
701
  if best_match:
 
705
  if best_score >= replace_threshold:
706
  new_entry["start"] = best_match["start"]
707
  new_entry["end"] = best_match["end"]
708
+ used_whisperx.add(best_idx) # Mark used
709
 
710
  else:
711
  new_entry["speaker"] = "UNKNOWN"
712
  new_entry["ocr_similarity"] = None
713
 
714
  merged.append(new_entry)
 
715
  return merged
716
 
 
717
  def realign_ocr_segments(merged_ocr_json, min_gap=0.2):
718
  """
719
  Realign OCR segments to avoid overlaps using midpoint-based adjustment.