fffiloni committed on
Commit
3c3e8e9
·
verified ·
1 Parent(s): 05091a2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -22
app.py CHANGED
@@ -70,8 +70,14 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(tra
70
 
71
  # Loading
72
  audio, sampling_rate = load_wav(audio_path)
 
 
73
  audio, spec = get_mel_spectrogram_from_audio(audio)
 
 
74
  norm_spec = normalize_spectrogram(spec)
 
 
75
  # norm_spec = norm_spec[:,:, width_start:width_start+width]
76
  norm_spec = pad_spec(norm_spec, 1024)
77
  norm_spec = normalize(norm_spec) # normalize to [-1, 1], because pipeline do not normalize for torch.Tensor input
@@ -166,8 +172,14 @@ def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.
166
 
167
  # Loading
168
  audio, sampling_rate = load_wav(audio_path)
 
 
169
  audio, spec = get_mel_spectrogram_from_audio(audio)
 
 
170
  norm_spec = normalize_spectrogram(spec)
 
 
171
  norm_spec = pad_spec(norm_spec, 1024)
172
  norm_spec = normalize(norm_spec) # normalize to [-1, 1], because pipeline do not normalize for torch.Tensor input
173
 
@@ -206,10 +218,6 @@ def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.
206
  post_masked_spec = denormalize(masked_spec).to(device, dtype)
207
  denorm_masked_spec = denormalize_spectrogram(post_masked_spec)
208
  denorm_masked_spec_audio = vocoder.inference(denorm_masked_spec)
209
-
210
- # Rescale generated spectrogram to match original range
211
- output_spec = (output_spec - output_spec.min()) / (output_spec.max() - output_spec.min()) # Normalize to [0,1]
212
- output_spec = output_spec * (norm_spec.max() - norm_spec.min()) + norm_spec.min() # Rescale to match input range
213
 
214
  denorm_spec = denormalize_spectrogram(output_spec)
215
  denorm_spec_audio = vocoder.inference(denorm_spec)
@@ -218,19 +226,9 @@ def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.
218
 
219
  # Ensure correct shape
220
  denorm_spec_audio = denorm_spec_audio.flatten() # Converts (1, N) → (N,)
221
-
222
- print("Original spectrogram min/max:", norm_spec.min().item(), norm_spec.max().item())
223
- print("Generated spectrogram min/max:", output_spec.min().item(), output_spec.max().item())
224
-
225
- global_max = max(np.max(np.abs(raw_chunk_audio)), np.max(np.abs(denorm_spec_audio)))
226
- denorm_spec_audio = denorm_spec_audio / global_max # Normalize using a shared scale
227
-
228
- print("Masked spectrogram min/max:", masked_spec.min().item(), masked_spec.max().item())
229
- print("Output spectrogram min/max:", output_spec.min().item(), output_spec.max().item())
230
 
231
  # Save as WAV
232
- sf.write("raw_output.wav", raw_chunk_audio, 16000)
233
- sf.write("masked_raw_output.wav", denorm_masked_spec_audio, 16000)
234
  sf.write("generated_output.wav", denorm_spec_audio, 16000)
235
 
236
  # Save input spectrogram image
@@ -241,7 +239,7 @@ def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.
241
  output_spec_image_path = "output_spectrogram.png"
242
  color_output_spec_image.save(output_spec_image_path)
243
 
244
- return "raw_output.wav", input_spec_image_path, color_output_spec_image, "raw_output.wav", "masked_raw_output.wav"
245
 
246
  def load_input_spectrogram(audio_path):
247
  # Loading
@@ -387,11 +385,6 @@ with gr.Blocks(css=css) as demo:
387
  input_spectrogram_inp = gr.Image(label="Input Spectrogram")
388
  output_spectrogram_inp = gr.Image(label="Output Spectrogram")
389
 
390
- with gr.Accordion("Raw Processed audio", open=False):
391
- with gr.Column():
392
- raw_out_audio = gr.Audio(label="RAW Audio")
393
- raw_masked_out_audio = gr.Audio(label="RAW Masked Audio")
394
-
395
  gr.Examples(
396
  examples = [
397
  ["A siren ringing with a vehicle speeding closer", "./notebooks/examples/inpainting/IvfaKPDWC00_160.wav"],
@@ -426,7 +419,7 @@ with gr.Blocks(css=css) as demo:
426
  submit_btn_inp.click(
427
  fn = infer_inp,
428
  inputs = [prompt_inp, audio_in_inp, mask_start_point, mask_end_point],
429
- outputs = [audio_out_inp, input_spectrogram_inp, output_spectrogram_inp, raw_out_audio, raw_masked_out_audio]
430
  )
431
 
432
  demo.queue().launch(show_api=False, show_error=True)
 
70
 
71
  # Loading
72
  audio, sampling_rate = load_wav(audio_path)
73
+ print(f"Raw audio min/max: {audio.min()}, {audio.max()}")
74
+
75
  audio, spec = get_mel_spectrogram_from_audio(audio)
76
+ print(f"Spectrogram min/max before normalization: {spec.min()}, {spec.max()}")
77
+
78
  norm_spec = normalize_spectrogram(spec)
79
+ print(f"Spectrogram min/max after normalization: {norm_spec.min()}, {norm_spec.max()}")
80
+
81
  # norm_spec = norm_spec[:,:, width_start:width_start+width]
82
  norm_spec = pad_spec(norm_spec, 1024)
83
  norm_spec = normalize(norm_spec) # normalize to [-1, 1], because pipeline do not normalize for torch.Tensor input
 
172
 
173
  # Loading
174
  audio, sampling_rate = load_wav(audio_path)
175
+ print(f"Raw audio min/max: {audio.min()}, {audio.max()}")
176
+
177
  audio, spec = get_mel_spectrogram_from_audio(audio)
178
+ print(f"Spectrogram min/max before normalization: {spec.min()}, {spec.max()}")
179
+
180
  norm_spec = normalize_spectrogram(spec)
181
+ print(f"Spectrogram min/max after normalization: {norm_spec.min()}, {norm_spec.max()}")
182
+
183
  norm_spec = pad_spec(norm_spec, 1024)
184
  norm_spec = normalize(norm_spec) # normalize to [-1, 1], because pipeline do not normalize for torch.Tensor input
185
 
 
218
  post_masked_spec = denormalize(masked_spec).to(device, dtype)
219
  denorm_masked_spec = denormalize_spectrogram(post_masked_spec)
220
  denorm_masked_spec_audio = vocoder.inference(denorm_masked_spec)
 
 
 
 
221
 
222
  denorm_spec = denormalize_spectrogram(output_spec)
223
  denorm_spec_audio = vocoder.inference(denorm_spec)
 
226
 
227
  # Ensure correct shape
228
  denorm_spec_audio = denorm_spec_audio.flatten() # Converts (1, N) → (N,)
229
+ denorm_spec_audio = denorm_spec_audio / np.max(np.abs(denorm_spec_audio)) # Scale between -1 and 1
 
 
 
 
 
 
 
 
230
 
231
  # Save as WAV
 
 
232
  sf.write("generated_output.wav", denorm_spec_audio, 16000)
233
 
234
  # Save input spectrogram image
 
239
  output_spec_image_path = "output_spectrogram.png"
240
  color_output_spec_image.save(output_spec_image_path)
241
 
242
+ return "raw_output.wav", input_spec_image_path, color_output_spec_image
243
 
244
  def load_input_spectrogram(audio_path):
245
  # Loading
 
385
  input_spectrogram_inp = gr.Image(label="Input Spectrogram")
386
  output_spectrogram_inp = gr.Image(label="Output Spectrogram")
387
 
 
 
 
 
 
388
  gr.Examples(
389
  examples = [
390
  ["A siren ringing with a vehicle speeding closer", "./notebooks/examples/inpainting/IvfaKPDWC00_160.wav"],
 
419
  submit_btn_inp.click(
420
  fn = infer_inp,
421
  inputs = [prompt_inp, audio_in_inp, mask_start_point, mask_end_point],
422
+ outputs = [audio_out_inp, input_spectrogram_inp, output_spectrogram_inp]
423
  )
424
 
425
  demo.queue().launch(show_api=False, show_error=True)