qqwjq1981 committed · Commit b83a564 · verified · 1 Parent(s): 15a60a2

Update app.py

Files changed (1)
  1. app.py +31 -11
app.py CHANGED
@@ -547,7 +547,7 @@ def post_edit_transcribed_segments(transcription_json, video_path,
                                    num_workers=4):
     """
     Given WhisperX transcription (transcription_json) and video,
-    use OCR subtitles to post-correct and merge the transcriptions.
+    use OCR subtitles to post-correct and safely insert missing captions.
     """
 
     # Step 1: Extract OCR subtitles
@@ -556,18 +556,19 @@ def post_edit_transcribed_segments(transcription_json, video_path,
     # Step 2: Collapse repetitive OCR
     collapsed_ocr = collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90)
 
-    # Step 3: Merge OCR with WhisperX
+    # Step 3: Refine existing WhisperX segments (Phase 1)
     merged_segments = []
+    used_ocr_indices = set()
 
-    for entry in transcription_json:
+    for entry_idx, entry in enumerate(transcription_json):
         start = entry.get("start", 0)
         end = entry.get("end", 0)
         base_text = entry.get("text", "")
 
-        best_match = None
+        best_match_idx = None
         best_score = -1
 
-        for ocr in collapsed_ocr:
+        for ocr_idx, ocr in enumerate(collapsed_ocr):
             # Check time overlap
             time_overlap = not (ocr["end"] < start - time_tolerance or ocr["start"] > end + time_tolerance)
             if not time_overlap:
@@ -577,22 +578,41 @@ def post_edit_transcribed_segments(transcription_json, video_path,
             sim = fuzz.ratio(ocr["text"], base_text)
             if sim > best_score:
                 best_score = sim
-                best_match = ocr
+                best_match_idx = ocr_idx
 
-        # If good match found, replace the original text
+        # Update WhisperX segment if matched
         updated_entry = entry.copy()
-        if best_match and best_score >= text_similarity_threshold:
-            updated_entry["text"] = best_match["text"]
+        if best_match_idx is not None and best_score >= text_similarity_threshold:
+            updated_entry["text"] = collapsed_ocr[best_match_idx]["text"]
             updated_entry["ocr_matched"] = True
             updated_entry["ocr_similarity"] = best_score
+            used_ocr_indices.add(best_match_idx)
         else:
             updated_entry["ocr_matched"] = False
             updated_entry["ocr_similarity"] = best_score if best_score >= 0 else None
 
         merged_segments.append(updated_entry)
 
-    print(f"✅ Post-editing completed: {len(merged_segments)} segments")
-    return merged_segments
+    # Step 4: Insert unused OCR segments (Phase 2)
+    inserted_segments = []
+    for ocr_idx, ocr in enumerate(collapsed_ocr):
+        if ocr_idx not in used_ocr_indices:
+            inserted_segment = {
+                "start": ocr["start"],
+                "end": ocr["end"],
+                "text": ocr["text"],
+                "ocr_only": True
+            }
+            inserted_segments.append(inserted_segment)
+
+    # Step 5: Combine and sort
+    final_segments = merged_segments + inserted_segments
+    final_segments = sorted(final_segments, key=lambda x: x["start"])
+
+    print(f"✅ Post-editing completed: {len(final_segments)} total segments "
+          f"({len(inserted_segments)} OCR-only inserted)")
+
+    return final_segments
 
 def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
     logger.debug(f"Processing entry {i}: {entry}")