Update app.py
app.py CHANGED

@@ -34,8 +34,11 @@ from TTS.api import TTS
 import torch
 from pydub import AudioSegment
 from pyannote.audio import Pipeline
-import traceback
 import wave
+import librosa
+import noisereduce as nr
+import soundfile as sf
+
 
 logger = logging.getLogger(__name__)
 
@@ -154,32 +157,12 @@ def segment_background_audio(audio_path, background_audio_path="background_segme
     vocals.export(speech_audio_path, format="wav")
     return background_audio_path, speech_audio_path
 
-
-
-
-
-
-
-
-    # current_time = 0.0
-    # result_audio = AudioSegment.empty()
-
-    # for segment in vad_result.itersegments():
-    #     # Background segment before the speech
-    #     if current_time < segment.start:
-    #         bg = full_audio[int(current_time * 1000):int(segment.start * 1000)]
-    #         result_audio += bg
-    #     # Add silence for the speech duration
-    #     silence_duration = segment.end - segment.start
-    #     result_audio += AudioSegment.silent(duration=int(silence_duration * 1000))
-    #     current_time = segment.end
-
-    # # Handle any remaining background after the last speech
-    # if current_time < full_duration_sec:
-    #     result_audio += full_audio[int(current_time * 1000):]
-
-    # result_audio.export(background_audio_path, format="wav")
-    # return background_audio_path
+def denoise_audio_array(audio_array, sr=16000):
+    """
+    Denoise an audio numpy array directly.
+    """
+    y_denoised = nr.reduce_noise(y=audio_array, sr=sr)
+    return y_denoised
 
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
@@ -250,8 +233,22 @@ def transcribe_video_with_speakers(video_path):
     speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
     combined_clip = concatenate_audioclips(speaker_clips)
     truncated_clip = combined_clip.subclip(0, min(30, combined_clip.duration))
-
-
+
+    # Step 1: Get audio array from the clip
+    fps = 16000  # target sampling rate
+    audio_array = truncated_clip.to_soundarray(fps=fps)
+
+    # If stereo → convert to mono
+    if audio_array.ndim == 2:
+        audio_array = np.mean(audio_array, axis=1)
+
+    # Step 2: Apply denoising
+    denoised_audio_array = denoise_audio_array(audio_array, sr=fps)
+
+    # Step 3: Save denoised audio directly
+    clean_sample_path = f"speaker_{speaker}_sample.wav"
+    sf.write(clean_sample_path, denoised_audio_array, fps)
+
     speaker_sample_paths[speaker] = sample_path
     logger.info(f"Created sample for {speaker}: {sample_path}")
 
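For reference, a minimal standalone sketch of the denoising path this commit introduces (librosa load, noisereduce, soundfile write). The file names "noisy_sample.wav" and "clean_sample.wav" are placeholders for illustration and are not part of the Space; the helper mirrors the denoise_audio_array added in app.py.

import librosa
import noisereduce as nr
import soundfile as sf

def denoise_audio_array(audio_array, sr=16000):
    """Denoise a mono audio numpy array, as in the app.py helper."""
    return nr.reduce_noise(y=audio_array, sr=sr)

if __name__ == "__main__":
    # Load any WAV as mono at the same 16 kHz target rate used in the Space
    audio, sr = librosa.load("noisy_sample.wav", sr=16000, mono=True)
    cleaned = denoise_audio_array(audio, sr=sr)
    sf.write("clean_sample.wav", cleaned, sr)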