Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -70,8 +70,14 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(tra
|
|
70 |
|
71 |
# Loading
|
72 |
audio, sampling_rate = load_wav(audio_path)
|
|
|
|
|
73 |
audio, spec = get_mel_spectrogram_from_audio(audio)
|
|
|
|
|
74 |
norm_spec = normalize_spectrogram(spec)
|
|
|
|
|
75 |
# norm_spec = norm_spec[:,:, width_start:width_start+width]
|
76 |
norm_spec = pad_spec(norm_spec, 1024)
|
77 |
norm_spec = normalize(norm_spec) # normalize to [-1, 1], because the pipeline does not normalize torch.Tensor inputs
|
@@ -166,8 +172,14 @@ def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.
|
|
166 |
|
167 |
# Loading
|
168 |
audio, sampling_rate = load_wav(audio_path)
|
|
|
|
|
169 |
audio, spec = get_mel_spectrogram_from_audio(audio)
|
|
|
|
|
170 |
norm_spec = normalize_spectrogram(spec)
|
|
|
|
|
171 |
norm_spec = pad_spec(norm_spec, 1024)
|
172 |
norm_spec = normalize(norm_spec) # normalize to [-1, 1], because the pipeline does not normalize torch.Tensor inputs
|
173 |
|
@@ -206,10 +218,6 @@ def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.
|
|
206 |
post_masked_spec = denormalize(masked_spec).to(device, dtype)
|
207 |
denorm_masked_spec = denormalize_spectrogram(post_masked_spec)
|
208 |
denorm_masked_spec_audio = vocoder.inference(denorm_masked_spec)
|
209 |
-
|
210 |
-
# Rescale generated spectrogram to match original range
|
211 |
-
output_spec = (output_spec - output_spec.min()) / (output_spec.max() - output_spec.min()) # Normalize to [0,1]
|
212 |
-
output_spec = output_spec * (norm_spec.max() - norm_spec.min()) + norm_spec.min() # Rescale to match input range
|
213 |
|
214 |
denorm_spec = denormalize_spectrogram(output_spec)
|
215 |
denorm_spec_audio = vocoder.inference(denorm_spec)
|
@@ -218,19 +226,9 @@ def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.
|
|
218 |
|
219 |
# Ensure correct shape
|
220 |
denorm_spec_audio = denorm_spec_audio.flatten() # Converts (1, N) → (N,)
|
221 |
-
|
222 |
-
print("Original spectrogram min/max:", norm_spec.min().item(), norm_spec.max().item())
|
223 |
-
print("Generated spectrogram min/max:", output_spec.min().item(), output_spec.max().item())
|
224 |
-
|
225 |
-
global_max = max(np.max(np.abs(raw_chunk_audio)), np.max(np.abs(denorm_spec_audio)))
|
226 |
-
denorm_spec_audio = denorm_spec_audio / global_max # Normalize using a shared scale
|
227 |
-
|
228 |
-
print("Masked spectrogram min/max:", masked_spec.min().item(), masked_spec.max().item())
|
229 |
-
print("Output spectrogram min/max:", output_spec.min().item(), output_spec.max().item())
|
230 |
|
231 |
# Save as WAV
|
232 |
-
sf.write("raw_output.wav", raw_chunk_audio, 16000)
|
233 |
-
sf.write("masked_raw_output.wav", denorm_masked_spec_audio, 16000)
|
234 |
sf.write("generated_output.wav", denorm_spec_audio, 16000)
|
235 |
|
236 |
# Save input spectrogram image
|
@@ -241,7 +239,7 @@ def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.
|
|
241 |
output_spec_image_path = "output_spectrogram.png"
|
242 |
color_output_spec_image.save(output_spec_image_path)
|
243 |
|
244 |
-
return "raw_output.wav", input_spec_image_path, color_output_spec_image
|
245 |
|
246 |
def load_input_spectrogram(audio_path):
|
247 |
# Loading
|
@@ -387,11 +385,6 @@ with gr.Blocks(css=css) as demo:
|
|
387 |
input_spectrogram_inp = gr.Image(label="Input Spectrogram")
|
388 |
output_spectrogram_inp = gr.Image(label="Output Spectrogram")
|
389 |
|
390 |
-
with gr.Accordion("Raw Processed audio", open=False):
|
391 |
-
with gr.Column():
|
392 |
-
raw_out_audio = gr.Audio(label="RAW Audio")
|
393 |
-
raw_masked_out_audio = gr.Audio(label="RAW Masked Audio")
|
394 |
-
|
395 |
gr.Examples(
|
396 |
examples = [
|
397 |
["A siren ringing with a vehicle speeding closer", "./notebooks/examples/inpainting/IvfaKPDWC00_160.wav"],
|
@@ -426,7 +419,7 @@ with gr.Blocks(css=css) as demo:
|
|
426 |
submit_btn_inp.click(
|
427 |
fn = infer_inp,
|
428 |
inputs = [prompt_inp, audio_in_inp, mask_start_point, mask_end_point],
|
429 |
-
outputs = [audio_out_inp, input_spectrogram_inp, output_spectrogram_inp
|
430 |
)
|
431 |
|
432 |
demo.queue().launch(show_api=False, show_error=True)
|
|
|
70 |
|
71 |
# Loading
|
72 |
audio, sampling_rate = load_wav(audio_path)
|
73 |
+
print(f"Raw audio min/max: {audio.min()}, {audio.max()}")
|
74 |
+
|
75 |
audio, spec = get_mel_spectrogram_from_audio(audio)
|
76 |
+
print(f"Spectrogram min/max before normalization: {spec.min()}, {spec.max()}")
|
77 |
+
|
78 |
norm_spec = normalize_spectrogram(spec)
|
79 |
+
print(f"Spectrogram min/max after normalization: {norm_spec.min()}, {norm_spec.max()}")
|
80 |
+
|
81 |
# norm_spec = norm_spec[:,:, width_start:width_start+width]
|
82 |
norm_spec = pad_spec(norm_spec, 1024)
|
83 |
norm_spec = normalize(norm_spec) # normalize to [-1, 1], because the pipeline does not normalize torch.Tensor inputs
|
|
|
172 |
|
173 |
# Loading
|
174 |
audio, sampling_rate = load_wav(audio_path)
|
175 |
+
print(f"Raw audio min/max: {audio.min()}, {audio.max()}")
|
176 |
+
|
177 |
audio, spec = get_mel_spectrogram_from_audio(audio)
|
178 |
+
print(f"Spectrogram min/max before normalization: {spec.min()}, {spec.max()}")
|
179 |
+
|
180 |
norm_spec = normalize_spectrogram(spec)
|
181 |
+
print(f"Spectrogram min/max after normalization: {norm_spec.min()}, {norm_spec.max()}")
|
182 |
+
|
183 |
norm_spec = pad_spec(norm_spec, 1024)
|
184 |
norm_spec = normalize(norm_spec) # normalize to [-1, 1], because the pipeline does not normalize torch.Tensor inputs
|
185 |
|
|
|
218 |
post_masked_spec = denormalize(masked_spec).to(device, dtype)
|
219 |
denorm_masked_spec = denormalize_spectrogram(post_masked_spec)
|
220 |
denorm_masked_spec_audio = vocoder.inference(denorm_masked_spec)
|
|
|
|
|
|
|
|
|
221 |
|
222 |
denorm_spec = denormalize_spectrogram(output_spec)
|
223 |
denorm_spec_audio = vocoder.inference(denorm_spec)
|
|
|
226 |
|
227 |
# Ensure correct shape
|
228 |
denorm_spec_audio = denorm_spec_audio.flatten() # Converts (1, N) → (N,)
|
229 |
+
denorm_spec_audio = denorm_spec_audio / np.max(np.abs(denorm_spec_audio)) # Scale between -1 and 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
230 |
|
231 |
# Save as WAV
|
|
|
|
|
232 |
sf.write("generated_output.wav", denorm_spec_audio, 16000)
|
233 |
|
234 |
# Save input spectrogram image
|
|
|
239 |
output_spec_image_path = "output_spectrogram.png"
|
240 |
color_output_spec_image.save(output_spec_image_path)
|
241 |
|
242 |
+
return "raw_output.wav", input_spec_image_path, color_output_spec_image
|
243 |
|
244 |
def load_input_spectrogram(audio_path):
|
245 |
# Loading
|
|
|
385 |
input_spectrogram_inp = gr.Image(label="Input Spectrogram")
|
386 |
output_spectrogram_inp = gr.Image(label="Output Spectrogram")
|
387 |
|
|
|
|
|
|
|
|
|
|
|
388 |
gr.Examples(
|
389 |
examples = [
|
390 |
["A siren ringing with a vehicle speeding closer", "./notebooks/examples/inpainting/IvfaKPDWC00_160.wav"],
|
|
|
419 |
submit_btn_inp.click(
|
420 |
fn = infer_inp,
|
421 |
inputs = [prompt_inp, audio_in_inp, mask_start_point, mask_end_point],
|
422 |
+
outputs = [audio_out_inp, input_spectrogram_inp, output_spectrogram_inp]
|
423 |
)
|
424 |
|
425 |
demo.queue().launch(show_api=False, show_error=True)
|