qqwjq1981 committed
Commit b5a9cd7 · verified · 1 Parent(s): d79df48

Update app.py

Files changed (1)
  1. app.py +48 -17
app.py CHANGED
@@ -220,53 +220,84 @@ def transcribe_video_with_speakers(video_path):
         }
         for segment in result["segments"]
     ]
-
+
     # Collect audio for each speaker
    speaker_audio = {}
-    for segment in result["segments"]:
+    logger.info("🔎 Start collecting valid audio segments per speaker...")
+
+    for idx, segment in enumerate(result["segments"]):
         speaker = segment["speaker"]
-        end = segment["end"]
         start = segment["start"]
+        end = segment["end"]
+
         if end > start and (end - start) > 0.05:  # Require >50ms duration
             if speaker not in speaker_audio:
-                speaker_audio[speaker] = [(segment["start"], segment["end"])]
+                speaker_audio[speaker] = [(start, end)]
             else:
-                speaker_audio[speaker].append((segment["start"], segment["end"]))
+                speaker_audio[speaker].append((start, end))
+
+            logger.debug(f"Segment {idx}: Added to speaker {speaker} [{start:.2f}s → {end:.2f}s]")
+        else:
+            logger.warning(f"⚠️ Segment {idx} discarded: invalid duration ({start:.2f}s → {end:.2f}s)")
+
     # Collapse and truncate speaker audio
     speaker_sample_paths = {}
     audio_clip = AudioFileClip(speech_audio_path)
+
+    logger.info(f"🔎 Found {len(speaker_audio)} speakers with valid segments. Start creating speaker samples...")
+
     for speaker, segments in speaker_audio.items():
+        logger.info(f"🔹 Speaker {speaker}: {len(segments)} valid segments")
+
         speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
-        # Add a check to ensure speaker_clips is not empty
         if not speaker_clips:
-            logger.warning(f"No valid audio segments found for speaker {speaker} meeting the duration requirement. Skipping sample creation.")
-            continue  # Skip the rest of the loop for this speaker
-
-        combined_clip = concatenate_audioclips(speaker_clips)
+            logger.warning(f"⚠️ No valid audio clips for speaker {speaker}. Skipping sample creation.")
+            continue
+
+        if len(speaker_clips) == 1:
+            logger.debug(f"Speaker {speaker}: Only one clip, skipping concatenation.")
+            combined_clip = speaker_clips[0]
+        else:
+            logger.debug(f"Speaker {speaker}: Concatenating {len(speaker_clips)} clips.")
+            combined_clip = concatenate_audioclips(speaker_clips)
+
         truncated_clip = combined_clip.subclip(0, min(30, combined_clip.duration))
+        logger.debug(f"Speaker {speaker}: Truncated to {truncated_clip.duration:.2f} seconds.")

         # Step 1: Get audio array from the clip
         fps = 16000  # target sampling rate
         audio_array = truncated_clip.to_soundarray(fps=fps)
-
-        # If stereo → convert to mono
+
         if audio_array.ndim == 2:
+            logger.debug(f"Speaker {speaker}: Stereo detected, converting to mono.")
             audio_array = np.mean(audio_array, axis=1)
-
+
         # Step 2: Apply denoising
         denoised_audio_array = denoise_audio_array(audio_array, sr=fps)

+        if isinstance(denoised_audio_array, (list, tuple)):
+            logger.debug(f"Speaker {speaker}: Denoising returned a sequence, concatenating.")
+            # Concatenate the arrays along the first axis (samples)
+            try:
+                denoised_audio_array = np.concatenate(denoised_audio_array, axis=0)
+            except ValueError as e:
+                logger.error(f"Failed to concatenate denoised audio segments for {speaker}: {e}")
+                # Decide how to handle this - maybe skip saving the sample?
+                continue  # Skip saving this sample if concatenation fails
+
         # Step 3: Save denoised audio directly
         sample_path = f"speaker_{speaker}_sample.wav"
         sf.write(sample_path, denoised_audio_array, fps)
-
+
         speaker_sample_paths[speaker] = sample_path
-        logger.info(f"Created sample for {speaker}: {sample_path}")
-
-    # Clean up
+        logger.info(f"Created and saved sample for {speaker}: {sample_path}")
+
+    # Cleanup
+    logger.info("🧹 Closing audio clip and removing temporary files...")
     video.close()
     audio_clip.close()
     os.remove(speech_audio_path)
+    logger.info("✅ Finished processing all speaker samples.")

     return transcript_with_speakers, detected_language
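
Note: denoise_audio_array is called in this hunk but defined elsewhere in app.py. A minimal sketch of what such a helper could look like, assuming the noisereduce package for spectral-gating noise reduction (the implementation below is an assumption, not the commit's actual code):

import numpy as np
import noisereduce as nr  # assumed dependency, not shown in this hunk

def denoise_audio_array(audio_array: np.ndarray, sr: int = 16000) -> np.ndarray:
    """Hypothetical denoiser: spectral gating over a mono float array."""
    # noisereduce estimates the noise profile from the signal itself and
    # returns a single ndarray with the same number of samples
    return nr.reduce_noise(y=audio_array, sr=sr)

A helper of this shape returns one ndarray, so the new isinstance(denoised_audio_array, (list, tuple)) guard in the hunk is purely defensive: it only triggers if the denoiser is later changed to process and return audio in chunks.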
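
To sanity-check one of the written samples, something like the following works (illustrative only; the SPEAKER_00 label assumes pyannote-style speaker labels in result["segments"]):

import soundfile as sf

# Read back a sample produced by the loop above; filenames follow the
# f"speaker_{speaker}_sample.wav" pattern used in the hunk
data, rate = sf.read("speaker_SPEAKER_00_sample.wav")
assert rate == 16000   # written at the 16 kHz target rate
assert data.ndim == 1  # stereo input was averaged down to mono
print(f"{len(data) / rate:.2f}s of audio")  # capped at 30 s by the truncation step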