Update app.py
app.py CHANGED
@@ -38,7 +38,9 @@ import wave
 import librosa
 import noisereduce as nr
 import soundfile as sf
-
+from paddleocr import PaddleOCR
+import cv2
+from rapidfuzz import fuzz

 logger = logging.getLogger(__name__)

@@ -511,70 +513,141 @@ def solve_optimal_alignment(original_segments, generated_durations, total_durati

     return original_segments

-def
+def extract_subtitles_with_ocr(video_path):
+    ocr = PaddleOCR(use_angle_cls=True, lang="ch")  # Change `lang` as needed
+    vidcap = cv2.VideoCapture(video_path)
+    fps = vidcap.get(cv2.CAP_PROP_FPS)
+
+    subtitles = []
+    frame_id = 0
+    success, image = vidcap.read()
+
+    while success:
+        if frame_id % int(fps) == 0:  # OCR 1 frame per second (adjust if needed)
+            result = ocr.ocr(image, cls=True)
+            texts = [line[1][0] for line in result[0]]  # Get text parts
+            combined_text = " ".join(texts).strip()
+            if combined_text:
+                subtitles.append({
+                    "time": frame_id / fps,
+                    "text": combined_text
+                })
+
+        frame_id += 1
+        success, image = vidcap.read()
+
+    vidcap.release()
+    return subtitles
+
+def align_subtitles_to_transcripts(ocr_subtitles, whisperx_segments):
+    aligned_pairs = []
+
+    for ocr_entry in ocr_subtitles:
+        ocr_time = ocr_entry["time"]
+        best_score = -1
+        best_segment = None
+
+        for seg in whisperx_segments:
+            # Only consider segments close in time (within +/- 2s)
+            if abs(seg["start"] - ocr_time) < 2.0 or abs(seg["end"] - ocr_time) < 2.0:
+                score = fuzz.ratio(seg["text"], ocr_entry["text"])
+                if score > best_score:
+                    best_score = score
+                    best_segment = seg
+
+        if best_segment:
+            aligned_pairs.append({
+                "whisper_text": best_segment["text"],
+                "ocr_text": ocr_entry["text"],
+                "start": best_segment["start"],
+                "end": best_segment["end"],
+                "similarity": best_score
+            })
+
+    return aligned_pairs
+
+def correct_transcripts_with_ocr(aligned_pairs):
+    corrected_segments = []
+
+    for pair in aligned_pairs:
+        if pair["similarity"] > 80:
+            # Trust OCR more if they are close
+            corrected_text = pair["ocr_text"]
+        else:
+            corrected_text = pair["whisper_text"]
+
+        corrected_segments.append({
+            "start": pair["start"],
+            "end": pair["end"],
+            "text": corrected_text
+        })
+
+    return corrected_segments
+
+# def get_frame_image_bytes(video, t):
+#     frame = video.get_frame(t)
+#     img = Image.fromarray(frame)
+#     buf = io.BytesIO()
+#     img.save(buf, format='JPEG')
+#     return buf.getvalue()
+
+# def post_edit_segment(entry, image_bytes):
+#     try:
+#         system_prompt = """You are a multilingual assistant helping polish subtitles and voiceover content.
+#         Your job is to fix punctuation, validate meaning, improve tone, and ensure the translation matches the speaker's intended message."""
+
+#         user_prompt = f"""
+#         Original (source) transcript: {entry.get("original", "")}
+#         Translated version: {entry.get("translated", "")}
+#         Speaker ID: {entry.get("speaker", "")}
+#         Time: {entry.get("start")} - {entry.get("end")}
+
+#         Please:
+#         1. Add correct punctuation and sentence boundaries.
+#         2. Improve fluency and tone of the translated text.
+#         3. Ensure the meaning is preserved from the original.
+#         4. Use the attached image frame to infer emotion or setting.
+
+#         Return the revised original and translated texts in the following format:
+#         Original: <edited original>
+#         Translated: <edited translation>
+#         """
+#         response = ChatCompletion.create(
+#             model="gpt-4o",
+#             messages=[
+#                 {"role": "system", "content": system_prompt},
+#                 {"role": "user", "content": user_prompt, "image": image_bytes}
+#             ]
+#         )
+
+#         output = response.choices[0].message.content.strip()
+#         lines = output.splitlines()
+#         for line in lines:
+#             if line.startswith("Original:"):
+#                 entry['original'] = line[len("Original:"):].strip()
+#             elif line.startswith("Translated:"):
+#                 entry['translated'] = line[len("Translated:"):].strip()
+
+#         return entry
+#     except Exception as e:
+#         print(f"Post-editing failed for segment: {e}")
+#         return entry
+
+
+# def post_edit_translated_segments(translated_json, video_path):
+#     video = VideoFileClip(video_path)
+
+#     def process(entry):
+#         mid_time = (entry['start'] + entry['end']) / 2
+#         image_bytes = get_frame_image_bytes(video, mid_time)
+#         entry = post_edit_segment(entry, image_bytes)
+#         return entry
+
+#     with concurrent.futures.ThreadPoolExecutor() as executor:
+#         edited = list(executor.map(process, translated_json))
+
+#     video.close()
+#     return edited

 def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
     logger.debug(f"Processing entry {i}: {entry}")