qqwjq1981 committed on
Commit 3898559 · verified · 1 Parent(s): b5a9cd7

Update app.py

Files changed (1)
  1. app.py +45 -80
app.py CHANGED
@@ -131,7 +131,7 @@ def handle_feedback(feedback):
     conn.commit()
     return "Thank you for your feedback!", None
 
-def segment_background_audio(audio_path, background_audio_path="background_segments.wav", speech_audio_path="speech_segment.wav"):
+def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
     """
     Uses Demucs to separate audio and extract background (non-vocal) parts.
     Merges drums, bass, and other stems into a single background track.
@@ -148,7 +148,6 @@ def segment_background_audio(audio_path, background_audio_path="background_segme
     stem_dir = os.path.join("separated", "htdemucs", filename)
 
     # Step 3: Load and merge background stems
-    vocals = AudioSegment.from_wav(os.path.join(stem_dir, "vocals.wav"))
     drums = AudioSegment.from_wav(os.path.join(stem_dir, "drums.wav"))
     bass = AudioSegment.from_wav(os.path.join(stem_dir, "bass.wav"))
     other = AudioSegment.from_wav(os.path.join(stem_dir, "other.wav"))
@@ -157,15 +156,34 @@ def segment_background_audio(audio_path, background_audio_path="background_segme
 
     # Step 4: Export the merged background
     background.export(background_audio_path, format="wav")
-    vocals.export(speech_audio_path, format="wav")
-    return background_audio_path, speech_audio_path
+    return background_audio_path
 
-def denoise_audio_array(audio_array, sr=16000):
-    """
-    Denoise an audio numpy array directly.
-    """
-    y_denoised = nr.reduce_noise(y=audio_array, sr=sr)
-    return y_denoised
+# def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
+#     pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
+#     vad_result = pipeline(audio_path)
+
+#     full_audio = AudioSegment.from_wav(audio_path)
+#     full_duration_sec = len(full_audio) / 1000.0
+
+#     current_time = 0.0
+#     result_audio = AudioSegment.empty()
+
+#     for segment in vad_result.itersegments():
+#         # Background segment before the speech
+#         if current_time < segment.start:
+#             bg = full_audio[int(current_time * 1000):int(segment.start * 1000)]
+#             result_audio += bg
+#         # Add silence for the speech duration
+#         silence_duration = segment.end - segment.start
+#         result_audio += AudioSegment.silent(duration=int(silence_duration * 1000))
+#         current_time = segment.end
+
+#     # Handle any remaining background after the last speech
+#     if current_time < full_duration_sec:
+#         result_audio += full_audio[int(current_time * 1000):]
+
+#     result_audio.export(background_audio_path, format="wav")
+#     return background_audio_path
 
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
@@ -174,7 +192,7 @@ def transcribe_video_with_speakers(video_path):
     video.audio.write_audiofile(audio_path)
     logger.info(f"Audio extracted from video: {audio_path}")
 
-    segment_result, speech_audio_path = segment_background_audio(audio_path)
+    segment_result = segment_background_audio(audio_path)
     print(f"Saved non-speech (background) audio to local")
 
     # Set up device
@@ -187,7 +205,7 @@ def transcribe_video_with_speakers(video_path):
     logger.info("WhisperX model loaded")
 
     # Transcribe
-    result = model.transcribe(speech_audio_path, chunk_size=6, print_progress = True)
+    result = model.transcribe(audio_path, chunk_size=6, print_progress = True)
     logger.info("Audio transcription completed")
 
     # Get the detected language
@@ -195,12 +213,12 @@ def transcribe_video_with_speakers(video_path):
     logger.debug(f"Detected language: {detected_language}")
     # Alignment
     model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-    result = whisperx.align(result["segments"], model_a, metadata, speech_audio_path, device)
+    result = whisperx.align(result["segments"], model_a, metadata, audio_path, device)
     logger.info("Transcription alignment completed")
 
     # Diarization (works independently of Whisper model size)
     diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
-    diarize_segments = diarize_model(speech_audio_path)
+    diarize_segments = diarize_model(audio_path)
     logger.info("Speaker diarization completed")
 
     # Assign speakers
@@ -220,84 +238,31 @@ def transcribe_video_with_speakers(video_path):
         }
         for segment in result["segments"]
     ]
-
+
     # Collect audio for each speaker
     speaker_audio = {}
-    logger.info("🔎 Start collecting valid audio segments per speaker...")
-
-    for idx, segment in enumerate(result["segments"]):
+    for segment in result["segments"]:
         speaker = segment["speaker"]
-        start = segment["start"]
-        end = segment["end"]
-
-        if end > start and (end - start) > 0.05:  # Require >50ms duration
-            if speaker not in speaker_audio:
-                speaker_audio[speaker] = [(start, end)]
-            else:
-                speaker_audio[speaker].append((start, end))
-
-            logger.debug(f"Segment {idx}: Added to speaker {speaker} [{start:.2f}s → {end:.2f}s]")
-        else:
-            logger.warning(f"⚠️ Segment {idx} discarded: invalid duration ({start:.2f}s → {end:.2f}s)")
-
+        if speaker not in speaker_audio:
+            speaker_audio[speaker] = []
+        speaker_audio[speaker].append((segment["start"], segment["end"]))
+
     # Collapse and truncate speaker audio
     speaker_sample_paths = {}
-    audio_clip = AudioFileClip(speech_audio_path)
-
-    logger.info(f"🔎 Found {len(speaker_audio)} speakers with valid segments. Start creating speaker samples...")
-
+    audio_clip = AudioFileClip(audio_path)
     for speaker, segments in speaker_audio.items():
-        logger.info(f"🔹 Speaker {speaker}: {len(segments)} valid segments")
-
         speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
-        if not speaker_clips:
-            logger.warning(f"⚠️ No valid audio clips for speaker {speaker}. Skipping sample creation.")
-            continue
-
-        if len(speaker_clips) == 1:
-            logger.debug(f"Speaker {speaker}: Only one clip, skipping concatenation.")
-            combined_clip = speaker_clips[0]
-        else:
-            logger.debug(f"Speaker {speaker}: Concatenating {len(speaker_clips)} clips.")
-            combined_clip = concatenate_audioclips(speaker_clips)
-
+        combined_clip = concatenate_audioclips(speaker_clips)
         truncated_clip = combined_clip.subclip(0, min(30, combined_clip.duration))
-        logger.debug(f"Speaker {speaker}: Truncated to {truncated_clip.duration:.2f} seconds.")
-
-        # Step 1: Get audio array from the clip
-        fps = 16000  # target sampling rate
-        audio_array = truncated_clip.to_soundarray(fps=fps)
-
-        if audio_array.ndim == 2:
-            logger.debug(f"Speaker {speaker}: Stereo detected, converting to mono.")
-            audio_array = np.mean(audio_array, axis=1)
-
-        # Step 2: Apply denoising
-        denoised_audio_array = denoise_audio_array(audio_array, sr=fps)
-
-        if isinstance(denoised_audio_array, (list, tuple)):
-            logger.debug(f"Speaker {speaker}: Denoising returned a sequence, concatenating.")
-            # Concatenate the arrays along the first axis (samples)
-            try:
-                denoised_audio_array = np.concatenate(denoised_audio_array, axis=0)
-            except ValueError as e:
-                logger.error(f"Failed to concatenate denoised audio segments for {speaker}: {e}")
-                # Decide how to handle this - maybe skip saving the sample?
-                continue  # Skip saving this sample if concatenation fails
-
-        # Step 3: Save denoised audio directly
         sample_path = f"speaker_{speaker}_sample.wav"
-        sf.write(sample_path, denoised_audio_array, fps)
-
+        truncated_clip.write_audiofile(sample_path)
        speaker_sample_paths[speaker] = sample_path
-        logger.info(f"Created and saved sample for {speaker}: {sample_path}")
-
-    # Cleanup
-    logger.info("🧹 Closing audio clip and removing temporary files...")
+        logger.info(f"Created sample for {speaker}: {sample_path}")
+
+    # Clean up
     video.close()
     audio_clip.close()
-    os.remove(speech_audio_path)
-    logger.info("✅ Finished processing all speaker samples.")
+    os.remove(audio_path)
 
     return transcript_with_speakers, detected_language
 
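
For reference, a minimal sketch of the full segment_background_audio helper as the hunks above imply it. Only the stem directory layout, the AudioSegment.from_wav loads, and the final export appear in this diff; the Demucs CLI invocation in Step 1 and the overlay-based merge are assumptions, not part of the commit.

# Hypothetical reconstruction for illustration; the Demucs invocation and the
# overlay-based merge are assumptions, the rest mirrors the diff context above.
import os
import subprocess
from pydub import AudioSegment

def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
    # Step 1 (assumed): run Demucs (htdemucs model) to split the audio into stems.
    subprocess.run(["python", "-m", "demucs", "-n", "htdemucs", audio_path], check=True)

    # Step 2 (assumed): Demucs writes stems under separated/htdemucs/<basename>/.
    filename = os.path.splitext(os.path.basename(audio_path))[0]
    stem_dir = os.path.join("separated", "htdemucs", filename)

    # Step 3: Load and merge background stems; vocals are intentionally skipped
    # in this commit, so only drums, bass, and other are kept.
    drums = AudioSegment.from_wav(os.path.join(stem_dir, "drums.wav"))
    bass = AudioSegment.from_wav(os.path.join(stem_dir, "bass.wav"))
    other = AudioSegment.from_wav(os.path.join(stem_dir, "other.wav"))
    background = drums.overlay(bass).overlay(other)  # assumed merge strategy

    # Step 4: Export the merged background and return its path.
    background.export(background_audio_path, format="wav")
    return background_audio_path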
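
The hunks also reroute transcription, alignment, and diarization to the original audio_path. Below is a compact, generic WhisperX sketch of that transcribe → align → diarize → assign-speakers flow, useful as a sanity check; the model name, compute_type, and the assign_word_speakers call come from WhisperX's documented usage and are assumptions, since those lines are not shown in this diff.

# Generic WhisperX flow for reference; transcribe_with_speakers_sketch is a
# hypothetical helper, not the app.py function.
import whisperx

def transcribe_with_speakers_sketch(audio_path, hf_api_key, device="cuda"):
    # Transcribe directly from the extracted audio file (as the new code does).
    model = whisperx.load_model("large-v2", device, compute_type="float16")  # assumed model/compute_type
    result = model.transcribe(audio_path, chunk_size=6, print_progress=True)
    detected_language = result["language"]

    # Word-level alignment in the detected language.
    model_a, metadata = whisperx.load_align_model(language_code=detected_language, device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio_path, device)

    # Speaker diarization, then attach speaker labels to the aligned segments.
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
    diarize_segments = diarize_model(audio_path)
    result = whisperx.assign_word_speakers(diarize_segments, result)  # assumed assignment step
    return result["segments"], detected_language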