Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -209,7 +209,7 @@ def transcribe_video_with_speakers(video_path):
|
|
209 |
"start": segment["start"],
|
210 |
"end": segment["end"],
|
211 |
"text": segment["text"],
|
212 |
-
"speaker": segment
|
213 |
}
|
214 |
for segment in result["segments"]
|
215 |
]
|
@@ -541,6 +541,7 @@ def extract_ocr_subtitles_parallel(video_path, transcription_json, interval_sec=
|
|
541 |
cap.release()
|
542 |
|
543 |
ocr_results = []
|
|
|
544 |
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
|
545 |
futures = [executor.submit(ocr_frame_worker, frame) for frame in frames]
|
546 |
|
@@ -550,8 +551,9 @@ def extract_ocr_subtitles_parallel(video_path, transcription_json, interval_sec=
|
|
550 |
if result["text"]:
|
551 |
ocr_results.append(result)
|
552 |
except Exception as e:
|
553 |
-
|
554 |
|
|
|
555 |
return ocr_results
|
556 |
|
557 |
|
@@ -574,6 +576,12 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
|
|
574 |
current = {"start": time, "end": time, "text": text}
|
575 |
if current:
|
576 |
collapsed.append(current)
|
|
|
|
|
|
|
|
|
|
|
|
|
577 |
return collapsed
|
578 |
|
579 |
def post_edit_transcribed_segments(transcription_json, video_path,
|
@@ -673,7 +681,7 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
673 |
desired_duration = entry["end"] - entry["start"]
|
674 |
desired_speed = entry['speed'] #calibrated_speed(entry['translated'], desired_duration)
|
675 |
|
676 |
-
speaker = entry.get("speaker", "
|
677 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
678 |
|
679 |
if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in tts_model.synthesizer.tts_model.language_manager.name_to_id.keys():
|
|
|
209 |
"start": segment["start"],
|
210 |
"end": segment["end"],
|
211 |
"text": segment["text"],
|
212 |
+
"speaker": segment.get("speaker", "SPEAKER_00")
|
213 |
}
|
214 |
for segment in result["segments"]
|
215 |
]
|
|
|
541 |
cap.release()
|
542 |
|
543 |
ocr_results = []
|
544 |
+
ocr_failures = 0 # Count OCR failures
|
545 |
with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
|
546 |
futures = [executor.submit(ocr_frame_worker, frame) for frame in frames]
|
547 |
|
|
|
551 |
if result["text"]:
|
552 |
ocr_results.append(result)
|
553 |
except Exception as e:
|
554 |
+
ocr_failures += 1
|
555 |
|
556 |
+
logger.info(f"✅ OCR extraction completed: {len(ocr_results)} frames successful, {ocr_failures} frames failed.")
|
557 |
return ocr_results
|
558 |
|
559 |
|
|
|
576 |
current = {"start": time, "end": time, "text": text}
|
577 |
if current:
|
578 |
collapsed.append(current)
|
579 |
+
|
580 |
+
# Log collapsed OCR summary
|
581 |
+
logger.info(f"✅ OCR subtitles collapsed into {len(collapsed)} segments.")
|
582 |
+
for idx, seg in enumerate(collapsed):
|
583 |
+
logger.debug(f"[OCR Collapsed {idx}] {seg['start']:.2f}s - {seg['end']:.2f}s: {seg['text'][:50]}...")
|
584 |
+
|
585 |
return collapsed
|
586 |
|
587 |
def post_edit_transcribed_segments(transcription_json, video_path,
|
|
|
681 |
desired_duration = entry["end"] - entry["start"]
|
682 |
desired_speed = entry['speed'] #calibrated_speed(entry['translated'], desired_duration)
|
683 |
|
684 |
+
speaker = entry.get("speaker", "SPEAKER_00")
|
685 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
686 |
|
687 |
if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in tts_model.synthesizer.tts_model.language_manager.name_to_id.keys():
|