Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -601,6 +601,7 @@ def post_edit_transcribed_segments(transcription_json, video_path,
|
|
601 |
interval_sec=interval_sec,
|
602 |
num_workers=num_workers
|
603 |
)
|
|
|
604 |
# Step 2: Collapse repetitive OCR
|
605 |
collapsed_ocr = collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90)
|
606 |
|
@@ -617,18 +618,15 @@ def post_edit_transcribed_segments(transcription_json, video_path,
|
|
617 |
best_score = -1
|
618 |
|
619 |
for ocr_idx, ocr in enumerate(collapsed_ocr):
|
620 |
-
# Check time overlap
|
621 |
time_overlap = not (ocr["end"] < start - time_tolerance or ocr["start"] > end + time_tolerance)
|
622 |
if not time_overlap:
|
623 |
continue
|
624 |
|
625 |
-
# Text similarity
|
626 |
sim = fuzz.ratio(ocr["text"], base_text)
|
627 |
if sim > best_score:
|
628 |
best_score = sim
|
629 |
best_match_idx = ocr_idx
|
630 |
|
631 |
-
# Update WhisperX segment if matched
|
632 |
updated_entry = entry.copy()
|
633 |
if best_match_idx is not None and best_score >= text_similarity_threshold:
|
634 |
updated_entry["text"] = collapsed_ocr[best_match_idx]["text"]
|
@@ -645,11 +643,23 @@ def post_edit_transcribed_segments(transcription_json, video_path,
|
|
645 |
inserted_segments = []
|
646 |
for ocr_idx, ocr in enumerate(collapsed_ocr):
|
647 |
if ocr_idx not in used_ocr_indices:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
648 |
inserted_segment = {
|
649 |
"start": ocr["start"],
|
650 |
"end": ocr["end"],
|
651 |
"text": ocr["text"],
|
652 |
-
"
|
653 |
}
|
654 |
inserted_segments.append(inserted_segment)
|
655 |
|
@@ -658,10 +668,11 @@ def post_edit_transcribed_segments(transcription_json, video_path,
|
|
658 |
final_segments = sorted(final_segments, key=lambda x: x["start"])
|
659 |
|
660 |
print(f"✅ Post-editing completed: {len(final_segments)} total segments "
|
661 |
-
f"({len(inserted_segments)} OCR-
|
662 |
|
663 |
return final_segments
|
664 |
|
|
|
665 |
def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
|
666 |
logger.debug(f"Processing entry {i}: {entry}")
|
667 |
error_message = None
|
|
|
601 |
interval_sec=interval_sec,
|
602 |
num_workers=num_workers
|
603 |
)
|
604 |
+
|
605 |
# Step 2: Collapse repetitive OCR
|
606 |
collapsed_ocr = collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90)
|
607 |
|
|
|
618 |
best_score = -1
|
619 |
|
620 |
for ocr_idx, ocr in enumerate(collapsed_ocr):
|
|
|
621 |
time_overlap = not (ocr["end"] < start - time_tolerance or ocr["start"] > end + time_tolerance)
|
622 |
if not time_overlap:
|
623 |
continue
|
624 |
|
|
|
625 |
sim = fuzz.ratio(ocr["text"], base_text)
|
626 |
if sim > best_score:
|
627 |
best_score = sim
|
628 |
best_match_idx = ocr_idx
|
629 |
|
|
|
630 |
updated_entry = entry.copy()
|
631 |
if best_match_idx is not None and best_score >= text_similarity_threshold:
|
632 |
updated_entry["text"] = collapsed_ocr[best_match_idx]["text"]
|
|
|
643 |
inserted_segments = []
|
644 |
for ocr_idx, ocr in enumerate(collapsed_ocr):
|
645 |
if ocr_idx not in used_ocr_indices:
|
646 |
+
# Try to assign the speaker based on nearby merged segments
|
647 |
+
nearby_speakers = []
|
648 |
+
for seg in merged_segments:
|
649 |
+
if abs(seg["start"] - ocr["start"]) <= 2.0 or abs(seg["end"] - ocr["end"]) <= 2.0:
|
650 |
+
if "speaker" in seg:
|
651 |
+
nearby_speakers.append(seg["speaker"])
|
652 |
+
|
653 |
+
if nearby_speakers:
|
654 |
+
assigned_speaker = nearby_speakers[0] # Take the first nearby speaker
|
655 |
+
else:
|
656 |
+
assigned_speaker = "SPEAKER_00"
|
657 |
+
|
658 |
inserted_segment = {
|
659 |
"start": ocr["start"],
|
660 |
"end": ocr["end"],
|
661 |
"text": ocr["text"],
|
662 |
+
"speaker": assigned_speaker
|
663 |
}
|
664 |
inserted_segments.append(inserted_segment)
|
665 |
|
|
|
668 |
final_segments = sorted(final_segments, key=lambda x: x["start"])
|
669 |
|
670 |
print(f"✅ Post-editing completed: {len(final_segments)} total segments "
|
671 |
+
f"({len(inserted_segments)} OCR-inserted segments)")
|
672 |
|
673 |
return final_segments
|
674 |
|
675 |
+
|
676 |
def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
|
677 |
logger.debug(f"Processing entry {i}: {entry}")
|
678 |
error_message = None
|