qqwjq1981 committed on
Commit
7d826f9
·
verified ·
1 Parent(s): a83dd80

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -5
app.py CHANGED
@@ -601,6 +601,7 @@ def post_edit_transcribed_segments(transcription_json, video_path,
601
  interval_sec=interval_sec,
602
  num_workers=num_workers
603
  )
 
604
  # Step 2: Collapse repetitive OCR
605
  collapsed_ocr = collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90)
606
 
@@ -617,18 +618,15 @@ def post_edit_transcribed_segments(transcription_json, video_path,
617
  best_score = -1
618
 
619
  for ocr_idx, ocr in enumerate(collapsed_ocr):
620
- # Check time overlap
621
  time_overlap = not (ocr["end"] < start - time_tolerance or ocr["start"] > end + time_tolerance)
622
  if not time_overlap:
623
  continue
624
 
625
- # Text similarity
626
  sim = fuzz.ratio(ocr["text"], base_text)
627
  if sim > best_score:
628
  best_score = sim
629
  best_match_idx = ocr_idx
630
 
631
- # Update WhisperX segment if matched
632
  updated_entry = entry.copy()
633
  if best_match_idx is not None and best_score >= text_similarity_threshold:
634
  updated_entry["text"] = collapsed_ocr[best_match_idx]["text"]
@@ -645,11 +643,23 @@ def post_edit_transcribed_segments(transcription_json, video_path,
645
  inserted_segments = []
646
  for ocr_idx, ocr in enumerate(collapsed_ocr):
647
  if ocr_idx not in used_ocr_indices:
 
 
 
 
 
 
 
 
 
 
 
 
648
  inserted_segment = {
649
  "start": ocr["start"],
650
  "end": ocr["end"],
651
  "text": ocr["text"],
652
- "ocr_only": True
653
  }
654
  inserted_segments.append(inserted_segment)
655
 
@@ -658,10 +668,11 @@ def post_edit_transcribed_segments(transcription_json, video_path,
658
  final_segments = sorted(final_segments, key=lambda x: x["start"])
659
 
660
  print(f"✅ Post-editing completed: {len(final_segments)} total segments "
661
- f"({len(inserted_segments)} OCR-only inserted)")
662
 
663
  return final_segments
664
 
 
665
  def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
666
  logger.debug(f"Processing entry {i}: {entry}")
667
  error_message = None
 
601
  interval_sec=interval_sec,
602
  num_workers=num_workers
603
  )
604
+
605
  # Step 2: Collapse repetitive OCR
606
  collapsed_ocr = collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90)
607
 
 
618
  best_score = -1
619
 
620
  for ocr_idx, ocr in enumerate(collapsed_ocr):
 
621
  time_overlap = not (ocr["end"] < start - time_tolerance or ocr["start"] > end + time_tolerance)
622
  if not time_overlap:
623
  continue
624
 
 
625
  sim = fuzz.ratio(ocr["text"], base_text)
626
  if sim > best_score:
627
  best_score = sim
628
  best_match_idx = ocr_idx
629
 
 
630
  updated_entry = entry.copy()
631
  if best_match_idx is not None and best_score >= text_similarity_threshold:
632
  updated_entry["text"] = collapsed_ocr[best_match_idx]["text"]
 
643
  inserted_segments = []
644
  for ocr_idx, ocr in enumerate(collapsed_ocr):
645
  if ocr_idx not in used_ocr_indices:
646
+ # Try to assign the speaker based on nearby merged segments
647
+ nearby_speakers = []
648
+ for seg in merged_segments:
649
+ if abs(seg["start"] - ocr["start"]) <= 2.0 or abs(seg["end"] - ocr["end"]) <= 2.0:
650
+ if "speaker" in seg:
651
+ nearby_speakers.append(seg["speaker"])
652
+
653
+ if nearby_speakers:
654
+ assigned_speaker = nearby_speakers[0] # Take the first nearby speaker
655
+ else:
656
+ assigned_speaker = "SPEAKER_00"
657
+
658
  inserted_segment = {
659
  "start": ocr["start"],
660
  "end": ocr["end"],
661
  "text": ocr["text"],
662
+ "speaker": assigned_speaker
663
  }
664
  inserted_segments.append(inserted_segment)
665
 
 
668
  final_segments = sorted(final_segments, key=lambda x: x["start"])
669
 
670
  print(f"✅ Post-editing completed: {len(final_segments)} total segments "
671
+ f"({len(inserted_segments)} OCR-inserted segments)")
672
 
673
  return final_segments
674
 
675
+
676
  def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
677
  logger.debug(f"Processing entry {i}: {entry}")
678
  error_message = None