qqwjq1981 committed
Commit 15a60a2 · verified · 1 Parent(s): 3898559

Update app.py

Files changed (1):
  1. app.py +1 -78
app.py CHANGED
@@ -158,33 +158,6 @@ def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
      background.export(background_audio_path, format="wav")
      return background_audio_path
 
- # def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
- #     pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
- #     vad_result = pipeline(audio_path)
-
- #     full_audio = AudioSegment.from_wav(audio_path)
- #     full_duration_sec = len(full_audio) / 1000.0
-
- #     current_time = 0.0
- #     result_audio = AudioSegment.empty()
-
- #     for segment in vad_result.itersegments():
- #         # Background segment before the speech
- #         if current_time < segment.start:
- #             bg = full_audio[int(current_time * 1000):int(segment.start * 1000)]
- #             result_audio += bg
- #         # Add silence for the speech duration
- #         silence_duration = segment.end - segment.start
- #         result_audio += AudioSegment.silent(duration=int(silence_duration * 1000))
- #         current_time = segment.end
-
- #     # Handle any remaining background after the last speech
- #     if current_time < full_duration_sec:
- #         result_audio += full_audio[int(current_time * 1000):]
-
- #     result_audio.export(background_audio_path, format="wav")
- #     return background_audio_path
-
  def transcribe_video_with_speakers(video_path):
      # Extract audio from video
      video = VideoFileClip(video_path)
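Note on the block removed above: it was an older, fully commented-out variant of segment_background_audio that ran pyannote voice-activity detection and replaced detected speech with silence, keeping only the background track. The retained implementation still returns background_audio_path, so downstream code can mix that background under a dubbed speech track. The sketch below shows one such caller; the file names are placeholders and the overlay step is an assumption for illustration, not something this commit adds.

from pydub import AudioSegment

# Isolate the non-speech background from the original audio (function kept above).
background_path = segment_background_audio("original_audio.wav")  # placeholder input path

# Assumed usage: lay a dubbed speech track over the preserved background.
background = AudioSegment.from_wav(background_path)
dubbed_speech = AudioSegment.from_wav("dubbed_speech.wav")        # placeholder dubbed track
mixed = background.overlay(dubbed_speech)
mixed.export("dubbed_with_background.wav", format="wav")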
@@ -620,56 +593,6 @@ def post_edit_transcribed_segments(transcription_json, video_path,
 
      print(f"✅ Post-editing completed: {len(merged_segments)} segments")
      return merged_segments
-
- # def get_frame_image_bytes(video, t):
- #     frame = video.get_frame(t)
- #     img = Image.fromarray(frame)
- #     buf = io.BytesIO()
- #     img.save(buf, format='JPEG')
- #     return buf.getvalue()
-
- # def post_edit_segment(entry, image_bytes):
- #     try:
- #         system_prompt = """You are a multilingual assistant helping polish subtitles and voiceover content.
- #         Your job is to fix punctuation, validate meaning, improve tone, and ensure the translation matches the speaker's intended message."""
-
- #         user_prompt = f"""
- #         Original (source) transcript: {entry.get("original", "")}
- #         Translated version: {entry.get("translated", "")}
- #         Speaker ID: {entry.get("speaker", "")}
- #         Time: {entry.get("start")} - {entry.get("end")}
-
- #         Please:
- #         1. Add correct punctuation and sentence boundaries.
- #         2. Improve fluency and tone of the translated text.
- #         3. Ensure the meaning is preserved from the original.
- #         4. Use the attached image frame to infer emotion or setting.
-
- #         Return the revised original and translated texts in the following format:
- #         Original: <edited original>
- #         Translated: <edited translation>
- #         """
- #         response = ChatCompletion.create(
- #             model="gpt-4o",
- #             messages=[
- #                 {"role": "system", "content": system_prompt},
- #                 {"role": "user", "content": user_prompt, "image": image_bytes}
- #             ]
- #         )
-
- #         output = response.choices[0].message.content.strip()
- #         lines = output.splitlines()
- #         for line in lines:
- #             if line.startswith("Original:"):
- #                 entry['original'] = line[len("Original:"):].strip()
- #             elif line.startswith("Translated:"):
- #                 entry['translated'] = line[len("Translated:"):].strip()
-
- #         return entry
- #     except Exception as e:
- #         print(f"Post-editing failed for segment: {e}")
- #         return entry
-
 
  def process_entry(entry, i, tts_model, video_width, video_height, process_mode, target_language, font_path, speaker_sample_paths=None):
      logger.debug(f"Processing entry {i}: {entry}")
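Note on the second removed block: the commented-out post_edit_segment tried to send each segment plus a video frame to GPT-4o via ChatCompletion.create with a bare "image" field, which is not a supported message field in the OpenAI chat API, so it could not have run as written. If this per-segment post-editing were ever reinstated, a sketch using the current openai client with a base64 image content part might look like the following; the prompt and output parsing mirror the removed code, and frame_bytes is assumed to come from a helper like the removed get_frame_image_bytes.

import base64
from openai import OpenAI

client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def post_edit_segment(entry, frame_bytes):
    """Sketch: polish one segment with GPT-4o, passing a JPEG frame as a data URL."""
    image_b64 = base64.b64encode(frame_bytes).decode("utf-8")
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "You polish subtitles and voiceover translations."},
            {"role": "user", "content": [
                {"type": "text", "text": (
                    f"Original: {entry.get('original', '')}\n"
                    f"Translated: {entry.get('translated', '')}\n"
                    "Return lines starting with 'Original:' and 'Translated:'."
                )},
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"}},
            ]},
        ],
    )
    # Parse the "Original:" / "Translated:" lines, as the removed code did.
    for line in response.choices[0].message.content.splitlines():
        if line.startswith("Original:"):
            entry["original"] = line[len("Original:"):].strip()
        elif line.startswith("Translated:"):
            entry["translated"] = line[len("Translated:"):].strip()
    return entry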
@@ -975,7 +898,7 @@ def upload_and_manage(file, target_language, process_mode):
      transcription_json, source_language = transcribe_video_with_speakers(file.name)
      logger.info(f"Transcription completed. Detected source language: {source_language}")
 
-     transcription_json_merged = post_edit_translated_segments(transcription_json, file.name)
+     transcription_json_merged = post_edit_transcribed_segments(transcription_json, file.name)
      # Step 2: Translate the transcription
      logger.info(f"Translating transcription from {source_language} to {target_language}...")
      translated_json_raw = translate_text(transcription_json_merged, source_language, target_language)
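The only functional change in this commit is the corrected call above: upload_and_manage now invokes post_edit_transcribed_segments (the function whose tail appears in the second hunk) instead of post_edit_translated_segments, presumably a stale name, and it does so before translation so that translate_text receives the merged, post-edited segments. A condensed sketch of the resulting step order, with the rest of the function body omitted:

# Step 1: transcribe and diarize the uploaded file
transcription_json, source_language = transcribe_video_with_speakers(file.name)

# Step 1b: merge / clean up the transcribed segments before translating them
transcription_json_merged = post_edit_transcribed_segments(transcription_json, file.name)

# Step 2: translate the post-edited segments
translated_json_raw = translate_text(transcription_json_merged, source_language, target_language)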
 