Update app.py

app.py CHANGED

@@ -131,7 +131,7 @@ def handle_feedback(feedback):
     conn.commit()
     return "Thank you for your feedback!", None

-def segment_background_audio(audio_path, background_audio_path="background_segments.wav"
+def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
     """
     Uses Demucs to separate audio and extract background (non-vocal) parts.
     Merges drums, bass, and other stems into a single background track.
@@ -148,7 +148,6 @@ def segment_background_audio(audio_path, background_audio_path="background_segme
     stem_dir = os.path.join("separated", "htdemucs", filename)

     # Step 3: Load and merge background stems
-    vocals = AudioSegment.from_wav(os.path.join(stem_dir, "vocals.wav"))
     drums = AudioSegment.from_wav(os.path.join(stem_dir, "drums.wav"))
     bass = AudioSegment.from_wav(os.path.join(stem_dir, "bass.wav"))
     other = AudioSegment.from_wav(os.path.join(stem_dir, "other.wav"))
@@ -157,15 +156,34 @@ def segment_background_audio(audio_path, background_audio_path="background_segme

     # Step 4: Export the merged background
     background.export(background_audio_path, format="wav")
-
-    return background_audio_path, speech_audio_path
+    return background_audio_path

-def
-
-
-
-
-
+# def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
+#     pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
+#     vad_result = pipeline(audio_path)
+
+#     full_audio = AudioSegment.from_wav(audio_path)
+#     full_duration_sec = len(full_audio) / 1000.0
+
+#     current_time = 0.0
+#     result_audio = AudioSegment.empty()
+
+#     for segment in vad_result.itersegments():
+#         # Background segment before the speech
+#         if current_time < segment.start:
+#             bg = full_audio[int(current_time * 1000):int(segment.start * 1000)]
+#             result_audio += bg
+#         # Add silence for the speech duration
+#         silence_duration = segment.end - segment.start
+#         result_audio += AudioSegment.silent(duration=int(silence_duration * 1000))
+#         current_time = segment.end
+
+#     # Handle any remaining background after the last speech
+#     if current_time < full_duration_sec:
+#         result_audio += full_audio[int(current_time * 1000):]
+
+#     result_audio.export(background_audio_path, format="wav")
+#     return background_audio_path

 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
@@ -174,7 +192,7 @@ def transcribe_video_with_speakers(video_path):
     video.audio.write_audiofile(audio_path)
     logger.info(f"Audio extracted from video: {audio_path}")

-    segment_result
+    segment_result = segment_background_audio(audio_path)
     print(f"Saved non-speech (background) audio to local")

     # Set up device
@@ -187,7 +205,7 @@ def transcribe_video_with_speakers(video_path):
     logger.info("WhisperX model loaded")

     # Transcribe
-    result = model.transcribe(
+    result = model.transcribe(audio_path, chunk_size=6, print_progress = True)
     logger.info("Audio transcription completed")

     # Get the detected language
@@ -195,12 +213,12 @@ def transcribe_video_with_speakers(video_path):
     logger.debug(f"Detected language: {detected_language}")
     # Alignment
     model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-    result = whisperx.align(result["segments"], model_a, metadata,
+    result = whisperx.align(result["segments"], model_a, metadata, audio_path, device)
     logger.info("Transcription alignment completed")

     # Diarization (works independently of Whisper model size)
     diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
-    diarize_segments = diarize_model(
+    diarize_segments = diarize_model(audio_path)
     logger.info("Speaker diarization completed")

     # Assign speakers
@@ -220,84 +238,31 @@
         }
         for segment in result["segments"]
     ]
-
+
     # Collect audio for each speaker
     speaker_audio = {}
-
-
-    for idx, segment in enumerate(result["segments"]):
+    for segment in result["segments"]:
         speaker = segment["speaker"]
-
-
-
-
-            if speaker not in speaker_audio:
-                speaker_audio[speaker] = [(start, end)]
-            else:
-                speaker_audio[speaker].append((start, end))
-
-            logger.debug(f"Segment {idx}: Added to speaker {speaker} [{start:.2f}s → {end:.2f}s]")
-        else:
-            logger.warning(f"⚠️ Segment {idx} discarded: invalid duration ({start:.2f}s → {end:.2f}s)")
-
+        if speaker not in speaker_audio:
+            speaker_audio[speaker] = []
+        speaker_audio[speaker].append((segment["start"], segment["end"]))
+
     # Collapse and truncate speaker audio
     speaker_sample_paths = {}
-    audio_clip = AudioFileClip(
-
-    logger.info(f"🔎 Found {len(speaker_audio)} speakers with valid segments. Start creating speaker samples...")
-
+    audio_clip = AudioFileClip(audio_path)
     for speaker, segments in speaker_audio.items():
-        logger.info(f"🔹 Speaker {speaker}: {len(segments)} valid segments")
-
         speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
-
-            logger.warning(f"⚠️ No valid audio clips for speaker {speaker}. Skipping sample creation.")
-            continue
-
-        if len(speaker_clips) == 1:
-            logger.debug(f"Speaker {speaker}: Only one clip, skipping concatenation.")
-            combined_clip = speaker_clips[0]
-        else:
-            logger.debug(f"Speaker {speaker}: Concatenating {len(speaker_clips)} clips.")
-            combined_clip = concatenate_audioclips(speaker_clips)
-
+        combined_clip = concatenate_audioclips(speaker_clips)
         truncated_clip = combined_clip.subclip(0, min(30, combined_clip.duration))
-        logger.debug(f"Speaker {speaker}: Truncated to {truncated_clip.duration:.2f} seconds.")
-
-        # Step 1: Get audio array from the clip
-        fps = 16000 # target sampling rate
-        audio_array = truncated_clip.to_soundarray(fps=fps)
-
-        if audio_array.ndim == 2:
-            logger.debug(f"Speaker {speaker}: Stereo detected, converting to mono.")
-            audio_array = np.mean(audio_array, axis=1)
-
-        # Step 2: Apply denoising
-        denoised_audio_array = denoise_audio_array(audio_array, sr=fps)
-
-        if isinstance(denoised_audio_array, (list, tuple)):
-            logger.debug(f"Speaker {speaker}: Denoising returned a sequence, concatenating.")
-            # Concatenate the arrays along the first axis (samples)
-            try:
-                denoised_audio_array = np.concatenate(denoised_audio_array, axis=0)
-            except ValueError as e:
-                logger.error(f"Failed to concatenate denoised audio segments for {speaker}: {e}")
-                # Decide how to handle this - maybe skip saving the sample?
-                continue # Skip saving this sample if concatenation fails
-
-        # Step 3: Save denoised audio directly
         sample_path = f"speaker_{speaker}_sample.wav"
-
-
+        truncated_clip.write_audiofile(sample_path)
         speaker_sample_paths[speaker] = sample_path
-        logger.info(f"
-
-    #
-    logger.info("🧹 Closing audio clip and removing temporary files...")
+        logger.info(f"Created sample for {speaker}: {sample_path}")
+
+    # Clean up
     video.close()
     audio_clip.close()
-    os.remove(
-    logger.info("✅ Finished processing all speaker samples.")
+    os.remove(audio_path)

     return transcript_with_speakers, detected_language

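For orientation, a minimal usage sketch of the two functions this commit touches. It assumes app.py is importable as a module and that the input files exist locally; both the import and the file names are illustrative assumptions, not part of the diff.

# Illustrative sketch only: assumes app.py is importable and the input files exist;
# neither assumption comes from the diff above.
from app import segment_background_audio, transcribe_video_with_speakers

# The updated segment_background_audio() separates the audio with Demucs, merges the
# drums/bass/other stems, and now returns a single path to the merged background track.
background_path = segment_background_audio("example_audio.wav")  # hypothetical input file
print(f"Background track written to: {background_path}")

# transcribe_video_with_speakers() now calls segment_background_audio() on the extracted
# audio, then runs WhisperX transcription, alignment, and speaker diarization.
transcript, detected_language = transcribe_video_with_speakers("example_video.mp4")  # hypothetical input file
print(f"Detected language: {detected_language}")
for segment in transcript:
    print(segment)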