Update app.py
app.py CHANGED

@@ -34,8 +34,11 @@ from TTS.api import TTS
 import torch
 from pydub import AudioSegment
 from pyannote.audio import Pipeline
-import traceback
 import wave
+import librosa
+import noisereduce as nr
+import soundfile as sf
+
 
 logger = logging.getLogger(__name__)
 
@@ -154,32 +157,12 @@ def segment_background_audio(audio_path, background_audio_path="background_segme
     vocals.export(speech_audio_path, format="wav")
     return background_audio_path, speech_audio_path
 
-
-
-
-
-
-
-
-    # current_time = 0.0
-    # result_audio = AudioSegment.empty()
-
-    # for segment in vad_result.itersegments():
-    #     # Background segment before the speech
-    #     if current_time < segment.start:
-    #         bg = full_audio[int(current_time * 1000):int(segment.start * 1000)]
-    #         result_audio += bg
-    #     # Add silence for the speech duration
-    #     silence_duration = segment.end - segment.start
-    #     result_audio += AudioSegment.silent(duration=int(silence_duration * 1000))
-    #     current_time = segment.end
-
-    # # Handle any remaining background after the last speech
-    # if current_time < full_duration_sec:
-    #     result_audio += full_audio[int(current_time * 1000):]
-
-    # result_audio.export(background_audio_path, format="wav")
-    # return background_audio_path
+def denoise_audio_array(audio_array, sr=16000):
+    """
+    Denoise an audio numpy array directly.
+    """
+    y_denoised = nr.reduce_noise(y=audio_array, sr=sr)
+    return y_denoised
 
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
@@ -250,8 +233,22 @@ def transcribe_video_with_speakers(video_path):
     speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
     combined_clip = concatenate_audioclips(speaker_clips)
     truncated_clip = combined_clip.subclip(0, min(30, combined_clip.duration))
-
-
+
+    # Step 1: Get audio array from the clip
+    fps = 16000  # target sampling rate
+    audio_array = truncated_clip.to_soundarray(fps=fps)
+
+    # If stereo → convert to mono
+    if audio_array.ndim == 2:
+        audio_array = np.mean(audio_array, axis=1)
+
+    # Step 2: Apply denoising
+    denoised_audio_array = denoise_audio_array(audio_array, sr=fps)
+
+    # Step 3: Save denoised audio directly
+    clean_sample_path = f"speaker_{speaker}_sample.wav"
+    sf.write(clean_sample_path, denoised_audio_array, fps)
+
     speaker_sample_paths[speaker] = sample_path
     logger.info(f"Created sample for {speaker}: {sample_path}")
 
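For reference, a minimal standalone sketch of the denoising path this commit introduces (librosa load, noisereduce, soundfile write). The file names "noisy_sample.wav" and "clean_sample.wav" are placeholders for illustration and are not part of the Space; the helper mirrors the denoise_audio_array added in app.py.

import librosa
import noisereduce as nr
import soundfile as sf

def denoise_audio_array(audio_array, sr=16000):
    """Denoise a mono audio numpy array, as in the app.py helper."""
    return nr.reduce_noise(y=audio_array, sr=sr)

if __name__ == "__main__":
    # Load any WAV as mono at the same 16 kHz target rate used in the Space
    audio, sr = librosa.load("noisy_sample.wav", sr=16000, mono=True)
    cleaned = denoise_audio_array(audio, sr=sr)
    sf.write("clean_sample.wav", cleaned, sr)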