qqwjq1981 committed (verified)
Commit 7c1237e · 1 Parent(s): 99a54ce

Update app.py

Files changed (1):
app.py (+26 -29)
app.py CHANGED
@@ -34,8 +34,11 @@ from TTS.api import TTS
import torch
from pydub import AudioSegment
from pyannote.audio import Pipeline
- import traceback
import wave
+ import librosa
+ import noisereduce as nr
+ import soundfile as sf
+

logger = logging.getLogger(__name__)

@@ -154,32 +157,12 @@ def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
    vocals.export(speech_audio_path, format="wav")
    return background_audio_path, speech_audio_path

- # def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
- #     pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
- #     vad_result = pipeline(audio_path)
-
- #     full_audio = AudioSegment.from_wav(audio_path)
- #     full_duration_sec = len(full_audio) / 1000.0
-
- #     current_time = 0.0
- #     result_audio = AudioSegment.empty()
-
- #     for segment in vad_result.itersegments():
- #         # Background segment before the speech
- #         if current_time < segment.start:
- #             bg = full_audio[int(current_time * 1000):int(segment.start * 1000)]
- #             result_audio += bg
- #         # Add silence for the speech duration
- #         silence_duration = segment.end - segment.start
- #         result_audio += AudioSegment.silent(duration=int(silence_duration * 1000))
- #         current_time = segment.end
-
- #     # Handle any remaining background after the last speech
- #     if current_time < full_duration_sec:
- #         result_audio += full_audio[int(current_time * 1000):]
-
- #     result_audio.export(background_audio_path, format="wav")
- #     return background_audio_path
+ def denoise_audio_array(audio_array, sr=16000):
+     """
+     Denoise an audio numpy array directly.
+     """
+     y_denoised = nr.reduce_noise(y=audio_array, sr=sr)
+     return y_denoised

def transcribe_video_with_speakers(video_path):
    # Extract audio from video
@@ -250,8 +233,22 @@ def transcribe_video_with_speakers(video_path):
        speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
        combined_clip = concatenate_audioclips(speaker_clips)
        truncated_clip = combined_clip.subclip(0, min(30, combined_clip.duration))
-         sample_path = f"speaker_{speaker}_sample.wav"
-         truncated_clip.write_audiofile(sample_path)
+
+         # Step 1: Get audio array from the clip
+         fps = 16000  # target sampling rate
+         audio_array = truncated_clip.to_soundarray(fps=fps)
+
+         # If stereo → convert to mono
+         if audio_array.ndim == 2:
+             audio_array = np.mean(audio_array, axis=1)
+
+         # Step 2: Apply denoising
+         denoised_audio_array = denoise_audio_array(audio_array, sr=fps)
+
+         # Step 3: Save denoised audio directly
+         clean_sample_path = f"speaker_{speaker}_sample.wav"
+         sf.write(clean_sample_path, denoised_audio_array, fps)
+
        speaker_sample_paths[speaker] = sample_path
        logger.info(f"Created sample for {speaker}: {sample_path}")
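The new `denoise_audio_array` helper is a thin wrapper around noisereduce's spectral-gating denoiser. Below is a minimal, self-contained sketch of how it can be exercised on its own; the file names and the librosa-based loading are illustrative assumptions, not code from app.py (librosa is added to the imports above, but its use is not visible in the hunks shown).

```python
import librosa
import noisereduce as nr
import soundfile as sf

def denoise_audio_array(audio_array, sr=16000):
    """Denoise an audio numpy array directly (mirrors the helper added in this commit)."""
    return nr.reduce_noise(y=audio_array, sr=sr)

# Illustrative usage: load a mono 16 kHz recording, denoise it, write the result.
noisy, sr = librosa.load("noisy_sample.wav", sr=16000, mono=True)  # hypothetical input file
clean = denoise_audio_array(noisy, sr=sr)
sf.write("denoised_sample.wav", clean, sr)                         # hypothetical output file
```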
 
 
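The last hunk replaces `truncated_clip.write_audiofile(sample_path)` with an explicit array pipeline: render the clip to a numpy array at 16 kHz, downmix stereo to mono, denoise, and write the WAV with soundfile. Two details are not visible in the hunks shown: the added code relies on `np` (numpy) being imported elsewhere in app.py, and the unchanged context lines still reference `sample_path` while the new code writes `clean_sample_path`, which would raise a NameError unless `sample_path` is still assigned somewhere else in that loop. The sketch below is a self-contained rendering of the new flow under those assumptions; `write_denoised_sample` and the file paths are hypothetical, not taken from app.py.

```python
import numpy as np
import noisereduce as nr
import soundfile as sf
from moviepy.editor import AudioFileClip

def write_denoised_sample(audio_clip, out_path, fps=16000, max_seconds=30):
    """Write a denoised mono WAV sample (at most max_seconds long) from a moviepy audio clip."""
    clip = audio_clip.subclip(0, min(max_seconds, audio_clip.duration))
    audio_array = clip.to_soundarray(fps=fps)   # shape (n_samples,) or (n_samples, n_channels)
    if audio_array.ndim == 2:                   # stereo -> mono
        audio_array = np.mean(audio_array, axis=1)
    denoised = nr.reduce_noise(y=audio_array, sr=fps)
    sf.write(out_path, denoised, fps)
    return out_path

# Illustrative usage with hypothetical paths:
# sample_path = write_denoised_sample(AudioFileClip("speaker_audio.wav"), "speaker_SPEAKER_00_sample.wav")
```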