Spaces:

rc19477
/

Speech_Enhancement_Mamba

Running on Zero

roychao19477 committed 13 days ago

Commit

3c23ad1

1 Parent(s): 9d66cc0

Upload

Files changed (1) hide show

app.py CHANGED Viewed

@@ -53,20 +53,34 @@ def enhance(filepath):
             wav = librosa.resample(wav, orig_sr=orig_sr, target_sr=16000)
         x = torch.from_numpy(wav).float().to(device)
         norm = torch.sqrt(len(x)/torch.sum(x**2))
-        x = (x * norm).unsqueeze(0)
-        # STFT → model → ISTFT
-        amp, pha, _ = mag_phase_stft(x, 400, 100, 400, 0.3)
-        amp2, pha2, _ = model(amp, pha)
-        out = mag_phase_istft(amp2, pha2, 400, 100, 400, 0.3)
-        out = (out / norm).squeeze().cpu().numpy()
         # back to original rate
         if orig_sr != 16000:
             out = librosa.resample(out, orig_sr=16000, target_sr=orig_sr)
         # write file
-        sf.write("enhanced.wav", out, orig_sr)
         # spectrograms
         fig, axs = plt.subplots(1, 2, figsize=(10, 4))

             wav = librosa.resample(wav, orig_sr=orig_sr, target_sr=16000)
         x = torch.from_numpy(wav).float().to(device)
         norm = torch.sqrt(len(x)/torch.sum(x**2))
+        #x = (x * norm).unsqueeze(0)
+        x = (x * norm)
+        # split into 4s segments (64000 samples)
+        segment_len = 4 * 16000
+        chunks = x.split(segment_len)
+        enhanced_chunks = []
+        for chunk in chunks:
+            if len(chunk) < segment_len:
+                pad = torch.zeros(segment_len - len(chunk), device=chunk.device)
+                chunk = torch.cat([chunk, pad])
+            chunk = chunk.unsqueeze(0)
+            amp, pha, _ = mag_phase_stft(chunk, 400, 100, 400, 0.3)
+            amp2, pha2, _ = model(amp, pha)
+            out = mag_phase_istft(amp2, pha2, 400, 100, 400, 0.3)
+            out = (out / norm).squeeze(0)
+            enhanced_chunks.append(out)
+        out = torch.cat(enhanced_chunks)[:len(x)].cpu().numpy()  # trim padding
         # back to original rate
         if orig_sr != 16000:
             out = librosa.resample(out, orig_sr=16000, target_sr=orig_sr)
         # write file
+        sf.write("enhanced.wav", out, sr=orig_sr)
         # spectrograms
         fig, axs = plt.subplots(1, 2, figsize=(10, 4))