qqwjq1981 committed on
Commit
a83dd80
·
verified ·
1 Parent(s): f8b99f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -3
app.py CHANGED
@@ -209,7 +209,7 @@ def transcribe_video_with_speakers(video_path):
209
  "start": segment["start"],
210
  "end": segment["end"],
211
  "text": segment["text"],
212
- "speaker": segment["speaker"]
213
  }
214
  for segment in result["segments"]
215
  ]
@@ -541,6 +541,7 @@ def extract_ocr_subtitles_parallel(video_path, transcription_json, interval_sec=
541
  cap.release()
542
 
543
  ocr_results = []
 
544
  with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
545
  futures = [executor.submit(ocr_frame_worker, frame) for frame in frames]
546
 
@@ -550,8 +551,9 @@ def extract_ocr_subtitles_parallel(video_path, transcription_json, interval_sec=
550
  if result["text"]:
551
  ocr_results.append(result)
552
  except Exception as e:
553
- print(f"⚠️ OCR worker failed: {e}")
554
 
 
555
  return ocr_results
556
 
557
 
@@ -574,6 +576,12 @@ def collapse_ocr_subtitles(ocr_json, text_similarity_threshold=90):
574
  current = {"start": time, "end": time, "text": text}
575
  if current:
576
  collapsed.append(current)
 
 
 
 
 
 
577
  return collapsed
578
 
579
  def post_edit_transcribed_segments(transcription_json, video_path,
@@ -673,7 +681,7 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
673
  desired_duration = entry["end"] - entry["start"]
674
  desired_speed = entry['speed'] #calibrated_speed(entry['translated'], desired_duration)
675
 
676
- speaker = entry.get("speaker", "default")
677
  speaker_wav_path = f"speaker_{speaker}_sample.wav"
678
 
679
  if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in tts_model.synthesizer.tts_model.language_manager.name_to_id.keys():
 
209
  "start": segment["start"],
210
  "end": segment["end"],
211
  "text": segment["text"],
212
+ "speaker": segment.get("speaker", "SPEAKER_00")
213
  }
214
  for segment in result["segments"]
215
  ]
 
541
  cap.release()
542
 
543
  ocr_results = []
544
+ ocr_failures = 0 # Count OCR failures
545
  with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
546
  futures = [executor.submit(ocr_frame_worker, frame) for frame in frames]
547
 
 
551
  if result["text"]:
552
  ocr_results.append(result)
553
  except Exception as e:
554
+ ocr_failures += 1
555
 
556
+ logger.info(f"✅ OCR extraction completed: {len(ocr_results)} frames successful, {ocr_failures} frames failed.")
557
  return ocr_results
558
 
559
 
 
576
  current = {"start": time, "end": time, "text": text}
577
  if current:
578
  collapsed.append(current)
579
+
580
+ # Log collapsed OCR summary
581
+ logger.info(f"✅ OCR subtitles collapsed into {len(collapsed)} segments.")
582
+ for idx, seg in enumerate(collapsed):
583
+ logger.debug(f"[OCR Collapsed {idx}] {seg['start']:.2f}s - {seg['end']:.2f}s: {seg['text'][:50]}...")
584
+
585
  return collapsed
586
 
587
  def post_edit_transcribed_segments(transcription_json, video_path,
 
681
  desired_duration = entry["end"] - entry["start"]
682
  desired_speed = entry['speed'] #calibrated_speed(entry['translated'], desired_duration)
683
 
684
+ speaker = entry.get("speaker", "SPEAKER_00")
685
  speaker_wav_path = f"speaker_{speaker}_sample.wav"
686
 
687
  if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in tts_model.synthesizer.tts_model.language_manager.name_to_id.keys():