Spaces:

rc19477
/

Speech_Enhancement_Mamba

Running on Zero

App Files Files Community

roychao19477 committed on May 30

Commit

3af0ebe

1 Parent(s): 56efbc8

Upload

Browse files

Files changed (1) hide show

app.py +28 -20

app.py CHANGED Viewed

@@ -44,48 +44,56 @@ sdict  = torch.load(ckpt, map_location=device)
 model.load_state_dict(sdict["generator"])
 model.eval()
 @spaces.GPU
 def enhance(filepath):
     with torch.no_grad():
-        # load & (if needed) resample to model SR
         wav, orig_sr = librosa.load(filepath, sr=None)
         if orig_sr != 16000:
             wav = librosa.resample(wav, orig_sr, 16000)
-        # normalize → tensor
         x = torch.from_numpy(wav).float().to(device)
         norm = torch.sqrt(len(x)/torch.sum(x**2))
-        x = (x*norm).unsqueeze(0)
         # STFT → model → ISTFT
-        amp ,pha , _ = mag_phase_stft(x, 400, 100, 400, 0.3)
-        with torch.no_grad():
-            amp2, pha2, comp = model(amp, pha)
         out = mag_phase_istft(amp2, pha2, 400, 100, 400, 0.3)
-        out = (out/norm).squeeze().cpu().numpy()
         # back to original rate
         if orig_sr != 16000:
-            out = librosa.resample(out, 16000, orig_sr, 'PCM_16')
         # write file
         sf.write("enhanced.wav", out, orig_sr)
-        # build spectrogram
-        D = librosa.stft(out, n_fft=1024, hop_length=512)
-        S = librosa.amplitude_to_db(np.abs(D), ref=np.max)
-        fig, ax = plt.subplots(figsize=(6,3))
-        librosa.display.specshow(S, sr=orig_sr, hop_length=512, x_axis="time", y_axis="hz", ax=ax)
-        ax.set_title("Enhanced Spectrogram")
-        plt.colorbar(format="%+2.0f dB", ax=ax)
-        return "enhanced.wav"#, fig
 with gr.Blocks() as demo:
     gr.Markdown(ABOUT)
-    input_audio = gr.Audio(label="Input Audio", type="filepath")
     enhance_btn = gr.Button("Enhance")
     output_audio = gr.Audio(label="Enhanced Audio", type="filepath")
-    enhance_btn.click(fn=enhance, inputs=input_audio, outputs=output_audio)
 demo.queue().launch()

 model.load_state_dict(sdict["generator"])
 model.eval()
 @spaces.GPU
 def enhance(filepath):
     with torch.no_grad():
+        # load & resample
         wav, orig_sr = librosa.load(filepath, sr=None)
         if orig_sr != 16000:
             wav = librosa.resample(wav, orig_sr, 16000)
         x = torch.from_numpy(wav).float().to(device)
         norm = torch.sqrt(len(x)/torch.sum(x**2))
+        x = (x * norm).unsqueeze(0)
         # STFT → model → ISTFT
+        amp, pha, _ = mag_phase_stft(x, 400, 100, 400, 0.3)
+        amp2, pha2, _ = model(amp, pha)
         out = mag_phase_istft(amp2, pha2, 400, 100, 400, 0.3)
+        out = (out / norm).squeeze().cpu().numpy()
         # back to original rate
         if orig_sr != 16000:
+            out = librosa.resample(out, 16000, orig_sr)
         # write file
         sf.write("enhanced.wav", out, orig_sr)
+        # spectrograms
+        fig, axs = plt.subplots(1, 2, figsize=(10, 4))
+        # noisy
+        D_noisy = librosa.stft(wav, n_fft=1024, hop_length=512)
+        S_noisy = librosa.amplitude_to_db(np.abs(D_noisy), ref=np.max)
+        librosa.display.specshow(S_noisy, sr=orig_sr, hop_length=512, x_axis="time", y_axis="hz", ax=axs[0])
+        axs[0].set_title("Noisy Spectrogram")
+        # enhanced
+        D_clean = librosa.stft(out, n_fft=1024, hop_length=512)
+        S_clean = librosa.amplitude_to_db(np.abs(D_clean), ref=np.max)
+        librosa.display.specshow(S_clean, sr=orig_sr, hop_length=512, x_axis="time", y_axis="hz", ax=axs[1])
+        axs[1].set_title("Enhanced Spectrogram")
+        plt.tight_layout()
+    return "enhanced.wav", fig
 with gr.Blocks() as demo:
     gr.Markdown(ABOUT)
+    input_audio = gr.Audio(label="Input Audio", type="filepath", interactive=True)
     enhance_btn = gr.Button("Enhance")
     output_audio = gr.Audio(label="Enhanced Audio", type="filepath")
+    plot_output = gr.Plot(label="Spectrograms")
+    enhance_btn.click(fn=enhance, inputs=input_audio, outputs=[output_audio, plot_output])
 demo.queue().launch()