Update app.py

app.py CHANGED
@@ -158,33 +158,6 @@ def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
     background.export(background_audio_path, format="wav")
     return background_audio_path
 
-# def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
-#     pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
-#     vad_result = pipeline(audio_path)
-
-#     full_audio = AudioSegment.from_wav(audio_path)
-#     full_duration_sec = len(full_audio) / 1000.0
-
-#     current_time = 0.0
-#     result_audio = AudioSegment.empty()
-
-#     for segment in vad_result.itersegments():
-#         # Background segment before the speech
-#         if current_time < segment.start:
-#             bg = full_audio[int(current_time * 1000):int(segment.start * 1000)]
-#             result_audio += bg
-#         # Add silence for the speech duration
-#         silence_duration = segment.end - segment.start
-#         result_audio += AudioSegment.silent(duration=int(silence_duration * 1000))
-#         current_time = segment.end
-
-#     # Handle any remaining background after the last speech
-#     if current_time < full_duration_sec:
-#         result_audio += full_audio[int(current_time * 1000):]
-
-#     result_audio.export(background_audio_path, format="wav")
-#     return background_audio_path
-
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
     video = VideoFileClip(video_path)
@@ -620,56 +593,6 @@ def post_edit_transcribed_segments(transcription_json, video_path,
 
     print(f"✅ Post-editing completed: {len(merged_segments)} segments")
     return merged_segments
-
-# def get_frame_image_bytes(video, t):
-#     frame = video.get_frame(t)
-#     img = Image.fromarray(frame)
-#     buf = io.BytesIO()
-#     img.save(buf, format='JPEG')
-#     return buf.getvalue()
-
-# def post_edit_segment(entry, image_bytes):
-#     try:
-#         system_prompt = """You are a multilingual assistant helping polish subtitles and voiceover content.
-# Your job is to fix punctuation, validate meaning, improve tone, and ensure the translation matches the speaker's intended message."""
-
-#         user_prompt = f"""
-# Original (source) transcript: {entry.get("original", "")}
-# Translated version: {entry.get("translated", "")}
-# Speaker ID: {entry.get("speaker", "")}
-# Time: {entry.get("start")} - {entry.get("end")}
-
-# Please:
-# 1. Add correct punctuation and sentence boundaries.
-# 2. Improve fluency and tone of the translated text.
-# 3. Ensure the meaning is preserved from the original.
-# 4. Use the attached image frame to infer emotion or setting.
-
-# Return the revised original and translated texts in the following format:
-# Original: <edited original>
-# Translated: <edited translation>
-# """
-#         response = ChatCompletion.create(
-#             model="gpt-4o",
-#             messages=[
-#                 {"role": "system", "content": system_prompt},
-#                 {"role": "user", "content": user_prompt, "image": image_bytes}
-#             ]
-#         )
-
-#         output = response.choices[0].message.content.strip()
-#         lines = output.splitlines()
-#         for line in lines:
-#             if line.startswith("Original:"):
-#                 entry['original'] = line[len("Original:"):].strip()
-#             elif line.startswith("Translated:"):
-#                 entry['translated'] = line[len("Translated:"):].strip()
-
-#         return entry
-#     except Exception as e:
-#         print(f"Post-editing failed for segment: {e}")
-#         return entry
-
 
 def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
     logger.debug(f"Processing entry {i}: {entry}")
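
One note on the post_edit_segment helper removed above: it passed the frame through a bare "image" key on the chat message, which is not a field the OpenAI chat API accepts; with the openai>=1.0 client, image input goes in as a base64 data URL inside a structured content list. A hedged sketch of that shape, reusing the removed block's prompt-and-parse logic (the model name and line-prefix parsing come from the removed code; everything else is an assumption, not the Space's actual implementation):

import base64
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def post_edit_segment(entry, image_bytes):
    # Prompt mirrors the removed block, trimmed to the fields it parses back out.
    user_prompt = (
        f'Original (source) transcript: {entry.get("original", "")}\n'
        f'Translated version: {entry.get("translated", "")}\n'
        "Fix punctuation, improve fluency, preserve meaning, and return:\n"
        "Original: <edited original>\nTranslated: <edited translation>"
    )
    image_b64 = base64.b64encode(image_bytes).decode()
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": user_prompt},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
            ],
        }],
    )
    # Same line-prefix parsing as the removed code.
    for line in response.choices[0].message.content.strip().splitlines():
        if line.startswith("Original:"):
            entry["original"] = line[len("Original:"):].strip()
        elif line.startswith("Translated:"):
            entry["translated"] = line[len("Translated:"):].strip()
    return entry
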
@@ -975,7 +898,7 @@ def upload_and_manage(file, target_language, process_mode):
     transcription_json, source_language = transcribe_video_with_speakers(file.name)
     logger.info(f"Transcription completed. Detected source language: {source_language}")
 
-    transcription_json_merged =
+    transcription_json_merged = post_edit_transcribed_segments(transcription_json, file.name)
     # Step 2: Translate the transcription
     logger.info(f"Translating transcription from {source_language} to {target_language}...")
     translated_json_raw = translate_text(transcription_json_merged, source_language, target_language)