qqwjq1981 committed on
Commit
1c18b3d
·
verified ·
1 Parent(s): 635fb63

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -65
app.py CHANGED
@@ -38,7 +38,9 @@ import wave
38
  import librosa
39
  import noisereduce as nr
40
  import soundfile as sf
41
-
 
 
42
 
43
  logger = logging.getLogger(__name__)
44
 
@@ -511,70 +513,141 @@ def solve_optimal_alignment(original_segments, generated_durations, total_durati
511
 
512
  return original_segments
513
 
514
def get_frame_image_bytes(video, t):
    """Return the frame of ``video`` at time ``t`` encoded as JPEG bytes.

    ``video`` only needs a ``get_frame(t)`` method whose result
    ``Image.fromarray`` accepts.
    """
    jpeg_buffer = io.BytesIO()
    Image.fromarray(video.get_frame(t)).save(jpeg_buffer, format='JPEG')
    return jpeg_buffer.getvalue()
520
-
521
def post_edit_segment(entry, image_bytes):
    """Polish one subtitle segment with GPT-4o, using a video frame as context.

    Args:
        entry: Segment dict. ``original``, ``translated``, ``speaker``,
            ``start`` and ``end`` are read with ``.get``; ``original`` and
            ``translated`` are overwritten in place when the model returns
            non-empty replacements.
        image_bytes: Encoded frame from the segment (assumed to be JPEG, as
            the data URL below declares), or a falsy value to send the
            prompt without an image.

    Returns:
        The same ``entry`` object. Any failure is logged and the entry is
        returned without further changes (deliberate best effort).
    """
    import base64  # local import: only needed to build the image data URL

    try:
        system_prompt = """You are a multilingual assistant helping polish subtitles and voiceover content.
Your job is to fix punctuation, validate meaning, improve tone, and ensure the translation matches the speaker's intended message."""

        user_prompt = f"""
Original (source) transcript: {entry.get("original", "")}
Translated version: {entry.get("translated", "")}
Speaker ID: {entry.get("speaker", "")}
Time: {entry.get("start")} - {entry.get("end")}

Please:
1. Add correct punctuation and sentence boundaries.
2. Improve fluency and tone of the translated text.
3. Ensure the meaning is preserved from the original.
4. Use the attached image frame to infer emotion or setting.

Return the revised original and translated texts in the following format:
Original: <edited original>
Translated: <edited translation>
"""
        # Raw bytes under an "image" key cannot be serialised into the
        # request; images must be sent as an `image_url` content part
        # carrying a base64 data URL.
        user_content = [{"type": "text", "text": user_prompt}]
        if image_bytes:
            encoded_image = base64.b64encode(image_bytes).decode("ascii")
            user_content.append({
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{encoded_image}"},
            })

        response = ChatCompletion.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
            ],
        )

        output = response.choices[0].message.content.strip()
        for raw_line in output.splitlines():
            line = raw_line.strip()  # tolerate indented reply lines
            if line.startswith("Original:"):
                revised = line[len("Original:"):].strip()
                # Never wipe existing text with an empty reply.
                if revised:
                    entry['original'] = revised
            elif line.startswith("Translated:"):
                revised = line[len("Translated:"):].strip()
                if revised:
                    entry['translated'] = revised

        return entry
    except Exception as e:  # best effort: keep the unedited segment
        logger.warning("Post-editing failed for segment: %s", e)
        return entry
562
-
563
-
564
def post_edit_translated_segments(translated_json, video_path):
    """Post-edit every translated segment using a frame from its midpoint.

    Args:
        translated_json: Iterable of segment dicts with numeric ``start`` and
            ``end`` keys; each is passed to ``post_edit_segment``.
        video_path: Path of the video the segments belong to.

    Returns:
        A list with the result of ``post_edit_segment`` for each segment, in
        input order.
    """
    entries = list(translated_json)

    video = VideoFileClip(video_path)
    try:
        # Frames are grabbed one after another on this thread so the single
        # clip is never read concurrently.
        # NOTE(review): assumes the clip's reader is not safe for parallel
        # seeks — confirm against the moviepy version in use.
        frames = [
            get_frame_image_bytes(video, (entry['start'] + entry['end']) / 2)
            for entry in entries
        ]
    finally:
        # Release the clip even when a frame cannot be decoded.
        video.close()

    # Only the network-bound model calls are fanned out to threads.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        return list(executor.map(post_edit_segment, entries, frames))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
578
 
579
  def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
580
  logger.debug(f"Processing entry {i}: {entry}")
 
38
  import librosa
39
  import noisereduce as nr
40
  import soundfile as sf
41
+ from paddleocr import PaddleOCR
42
+ import cv2
43
+ from rapidfuzz import fuzz
44
 
45
  logger = logging.getLogger(__name__)
46
 
 
513
 
514
  return original_segments
515
 
516
def extract_subtitles_with_ocr(video_path):
    """Sample about one frame per second from a video and OCR its text.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.

    Returns:
        A list of ``{"time": seconds, "text": str}`` dicts, one per sampled
        frame in which any text was recognised, in playback order. The list
        is empty when no frame can be read or the reported frame rate is
        unusable.
    """
    ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # Change `lang` as needed
    vidcap = cv2.VideoCapture(video_path)
    subtitles = []

    try:
        fps = vidcap.get(cv2.CAP_PROP_FPS)
        # A zero, negative, NaN or infinite rate cannot be used to derive
        # timestamps (and would divide by zero below).
        if not 0 < fps < float("inf"):
            logger.warning("Unusable frame rate %r for %s; skipping OCR", fps, video_path)
            return subtitles

        # int(fps) is 0 for rates below 1 fps; clamp so the modulo is defined.
        frame_step = max(1, int(fps))

        frame_id = 0
        success, image = vidcap.read()
        while success:
            if frame_id % frame_step == 0:  # OCR 1 frame per second (adjust if needed)
                result = ocr.ocr(image, cls=True)
                # The per-image entry is None when no text was detected.
                detections = (result[0] if result else None) or []
                combined_text = " ".join(line[1][0] for line in detections).strip()
                if combined_text:
                    subtitles.append({
                        "time": frame_id / fps,
                        "text": combined_text,
                    })

            frame_id += 1
            success, image = vidcap.read()
    finally:
        # Always free the capture handle, even if OCR raises mid-video.
        vidcap.release()

    return subtitles
541
+
542
def align_subtitles_to_transcripts(ocr_subtitles, whisperx_segments, max_time_gap=2.0):
    """Pair each OCR subtitle with the most similar nearby transcript segment.

    Args:
        ocr_subtitles: Iterable of dicts with ``time`` (seconds) and ``text``.
        whisperx_segments: Sequence of dicts with ``start``, ``end`` and
            ``text``; it is scanned once per OCR entry.
        max_time_gap: How far (seconds) outside a segment's start or end an
            OCR timestamp may lie and still be considered. Defaults to 2.0.

    Returns:
        A list of dicts with ``whisper_text``, ``ocr_text``, ``start``,
        ``end`` and ``similarity`` (the ``fuzz.ratio`` score), one per OCR
        entry that had at least one candidate segment.
    """
    aligned_pairs = []

    for ocr_entry in ocr_subtitles:
        ocr_time = ocr_entry["time"]
        ocr_text = ocr_entry["text"]
        best_score = -1
        best_segment = None

        for seg in whisperx_segments:
            start, end = seg["start"], seg["end"]
            # A frame sampled inside the segment is a candidate too; checking
            # only the distance to the endpoints would miss the middle of any
            # segment longer than twice the gap.
            inside = start <= ocr_time <= end
            near_edge = (
                abs(start - ocr_time) < max_time_gap
                or abs(end - ocr_time) < max_time_gap
            )
            if not (inside or near_edge):
                continue

            score = fuzz.ratio(seg["text"], ocr_text)
            if score > best_score:
                best_score, best_segment = score, seg

        if best_segment is not None:
            aligned_pairs.append({
                "whisper_text": best_segment["text"],
                "ocr_text": ocr_text,
                "start": best_segment["start"],
                "end": best_segment["end"],
                "similarity": best_score,
            })

    return aligned_pairs
568
+
569
def correct_transcripts_with_ocr(aligned_pairs, similarity_threshold=80):
    """Choose OCR or transcript text for each aligned pair.

    Args:
        aligned_pairs: Iterable of dicts with ``whisper_text``, ``ocr_text``,
            ``start``, ``end`` and ``similarity``.
        similarity_threshold: The OCR text is used only when ``similarity``
            is strictly greater than this value; otherwise the transcript
            text is kept. Defaults to 80.

    Returns:
        A list of ``{"start", "end", "text"}`` dicts, one per input pair, in
        input order.
    """
    return [
        {
            "start": pair["start"],
            "end": pair["end"],
            # Trust OCR only when it closely agrees with the transcript.
            "text": (
                pair["ocr_text"]
                if pair["similarity"] > similarity_threshold
                else pair["whisper_text"]
            ),
        }
        for pair in aligned_pairs
    ]
586
+
587
+ # def get_frame_image_bytes(video, t):
588
+ # frame = video.get_frame(t)
589
+ # img = Image.fromarray(frame)
590
+ # buf = io.BytesIO()
591
+ # img.save(buf, format='JPEG')
592
+ # return buf.getvalue()
593
+
594
+ # def post_edit_segment(entry, image_bytes):
595
+ # try:
596
+ # system_prompt = """You are a multilingual assistant helping polish subtitles and voiceover content.
597
+ # Your job is to fix punctuation, validate meaning, improve tone, and ensure the translation matches the speaker's intended message."""
598
+
599
+ # user_prompt = f"""
600
+ # Original (source) transcript: {entry.get("original", "")}
601
+ # Translated version: {entry.get("translated", "")}
602
+ # Speaker ID: {entry.get("speaker", "")}
603
+ # Time: {entry.get("start")} - {entry.get("end")}
604
+
605
+ # Please:
606
+ # 1. Add correct punctuation and sentence boundaries.
607
+ # 2. Improve fluency and tone of the translated text.
608
+ # 3. Ensure the meaning is preserved from the original.
609
+ # 4. Use the attached image frame to infer emotion or setting.
610
+
611
+ # Return the revised original and translated texts in the following format:
612
+ # Original: <edited original>
613
+ # Translated: <edited translation>
614
+ # """
615
+ # response = ChatCompletion.create(
616
+ # model="gpt-4o",
617
+ # messages=[
618
+ # {"role": "system", "content": system_prompt},
619
+ # {"role": "user", "content": user_prompt, "image": image_bytes}
620
+ # ]
621
+ # )
622
+
623
+ # output = response.choices[0].message.content.strip()
624
+ # lines = output.splitlines()
625
+ # for line in lines:
626
+ # if line.startswith("Original:"):
627
+ # entry['original'] = line[len("Original:"):].strip()
628
+ # elif line.startswith("Translated:"):
629
+ # entry['translated'] = line[len("Translated:"):].strip()
630
+
631
+ # return entry
632
+ # except Exception as e:
633
+ # print(f"Post-editing failed for segment: {e}")
634
+ # return entry
635
+
636
+
637
+ # def post_edit_translated_segments(translated_json, video_path):
638
+ # video = VideoFileClip(video_path)
639
+
640
+ # def process(entry):
641
+ # mid_time = (entry['start'] + entry['end']) / 2
642
+ # image_bytes = get_frame_image_bytes(video, mid_time)
643
+ # entry = post_edit_segment(entry, image_bytes)
644
+ # return entry
645
+
646
+ # with concurrent.futures.ThreadPoolExecutor() as executor:
647
+ # edited = list(executor.map(process, translated_json))
648
+
649
+ # video.close()
650
+ # return edited
651
 
652
  def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
653
  logger.debug(f"Processing entry {i}: {entry}")