import spaces
import gradio as gr
import torch, os
import wave
import librosa
import numpy as np
from scipy.io.wavfile import write
from PIL import Image
import matplotlib.pyplot as plt
from huggingface_hub import snapshot_download
import soundfile as sf
from auffusion_pipeline import AuffusionPipeline

from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline
from converter import load_wav, mel_spectrogram, normalize_spectrogram, denormalize_spectrogram, Generator, get_mel_spectrogram_from_audio
from utils import pad_spec, image_add_color, torch_to_pil, normalize, denormalize, prepare_mask_and_masked_image


def resample_audio(input_audio, original_sr, target_sr=16000):
    """
    Resample the audio to the target sample rate (16000 Hz by default).

    Args:
        - input_audio (numpy array): The raw audio data.
        - original_sr (int): The original sample rate of the input audio.
        - target_sr (int): The target sample rate (default is 16000 Hz).

    Returns:
        - numpy array: The resampled audio.
    """
    if original_sr != target_sr:
        # Resample the audio using librosa
        return librosa.resample(input_audio, orig_sr=original_sr, target_sr=target_sr)
    # If the sample rate already matches the target, no resampling is needed
    return input_audio


def save_spectrogram_image(spectrogram, filename):
    """Save a spectrogram as an image."""
    plt.figure(figsize=(10, 4))
    plt.imshow(spectrogram.squeeze(), aspect='auto', origin='lower', cmap='magma')
    plt.axis('off')  # Hide axes for a cleaner image
    plt.savefig(filename, bbox_inches='tight', pad_inches=0)
    plt.close()


@spaces.GPU
def infer(prompt, progress=gr.Progress(track_tqdm=True)):
    """
    Generate audio from a textual prompt using AuffusionPipeline.

    Args:
        prompt (str): Text description of the desired audio content.
        progress (gr.Progress, optional): Progress tracker for UI feedback.

    Returns:
        str: The file path to the generated WAV audio file.
    """
    pipeline = AuffusionPipeline.from_pretrained("auffusion/auffusion")
    output = pipeline(prompt=prompt)
    audio = output.audios[0]
    sf.write(f"{prompt}.wav", audio, samplerate=16000)
    return f"{prompt}.wav"


@spaces.GPU
def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(track_tqdm=True)):
    """
    Perform audio-to-audio transformation with image-to-image style generation.

    Args:
        prompt (str): Text prompt guiding the audio transformation.
        audio_path (str): File path to the input WAV audio reference.
        desired_strength (float): Strength of prompt influence in [0.0, 1.0].
        progress (gr.Progress, optional): Progress tracker for UI feedback.

    Returns:
        tuple:
            - str: File path of the generated output WAV audio.
            - str: File path of the input spectrogram image (PNG).
            - str: File path of the output spectrogram image (PNG).
""" # Load your audio file input_audio, original_sr = librosa.load(audio_path, sr=None) # Load with original sampling rate resampled_audio = resample_audio(input_audio, original_sr, target_sr=16000) # Save the resampled audio to a new file sf.write('resampled_audio.wav', resampled_audio, 16000) audio_path = 'resampled_audio.wav' pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter" dtype = torch.float16 device = "cuda" if not os.path.isdir(pretrained_model_name_or_path): pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path) vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder="vocoder") vocoder = vocoder.to(device=device, dtype=dtype) pipe = StableDiffusionImg2ImgPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype) pipe = pipe.to(device) width_start, width = 0, 160 strength_list = [desired_strength] prompt = prompt seed = 42 # Loading audio, sampling_rate = load_wav(audio_path) audio, spec = get_mel_spectrogram_from_audio(audio) # Normalize the spectrogram norm_spec = normalize_spectrogram(spec) # norm_spec = norm_spec[:,:, width_start:width_start+width] norm_spec = pad_spec(norm_spec, 1024) norm_spec = normalize(norm_spec) # normalize to [-1, 1], because pipeline do not normalize for torch.Tensor input # raw_image = image_add_color(torch_to_pil(norm_spec[:,:,:width])) raw_image = image_add_color(torch_to_pil(norm_spec)) # Generation for different strength image_list = [] audio_list = [] generator = torch.Generator(device=device).manual_seed(seed) for strength in strength_list: with torch.autocast("cuda"): output_spec = pipe( prompt=prompt, image=norm_spec, num_inference_steps=100, generator=generator, output_type="pt", strength=strength, guidance_scale=7.5 ).images[0] # add to image_list # output_spec = output_spec[:, :, :width] output_spec_image = torch_to_pil(output_spec) color_output_spec_image = image_add_color(output_spec_image) image_list.append(color_output_spec_image) # add to audio_list denorm_spec = denormalize_spectrogram(output_spec) denorm_spec_audio = vocoder.inference(denorm_spec) audio_list.append(denorm_spec_audio) # Display # Concat image with different strength & add interval between images with black color concat_image_list = [] for i in range(len(image_list)): if i == len(image_list) - 1: concat_image_list.append(np.array(image_list[i])) else: concat_image_list.append(np.concatenate([np.array(image_list[i]), np.ones((256, 20, 3))*0], axis=1)) concat_image = np.concatenate(concat_image_list, axis=1) concat_image = Image.fromarray(np.uint8(concat_image)) ### Concat audio concat_audio_list = [np.concatenate([audio, np.zeros((1, 16000))], axis=1) for audio in audio_list] concat_audio = np.concatenate(concat_audio_list, axis=1) print("audio_path:", audio_path) print("width_start:", width_start, "width:", width) print("text prompt:", prompt) print("strength_list:", strength_list) # Ensure correct shape concat_audio = concat_audio.flatten() # Converts (1, N) → (N,) # Normalize the audio to prevent clipping or excessive loudness concat_audio = concat_audio / np.max(np.abs(concat_audio)) # Scale between -1 and 1 # Save as WAV sf.write("output.wav", concat_audio, 16000) # Save input spectrogram image input_spec_image_path = "input_spectrogram.png" raw_image.save(input_spec_image_path) # Save concatenated spectrogram image output_spec_image_path = "output_spectrogram.png" concat_image.save(output_spec_image_path) return "output.wav", input_spec_image_path, output_spec_image_path @spaces.GPU 

@spaces.GPU
def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)):
    """
    Perform audio inpainting on a masked spectrogram region guided by a prompt.

    Args:
        prompt (str): Text prompt describing the desired inpainted audio content.
        audio_path (str): File path to the input WAV audio reference.
        mask_start_point (int): Start index of the mask region in the spectrogram.
        mask_end_point (int): End index of the mask region in the spectrogram.
        progress (gr.Progress, optional): Progress tracker for UI feedback.

    Returns:
        tuple:
            - str: File path of the generated inpainted output WAV audio.
            - str: File path of the input spectrogram image (PNG).
            - PIL.Image.Image: The output spectrogram image with the inpainted region.
    """
    # Load the audio file at its original sampling rate, then resample to 16 kHz
    input_audio, original_sr = librosa.load(audio_path, sr=None)
    resampled_audio = resample_audio(input_audio, original_sr, target_sr=16000)

    # Save the resampled audio to a new file and use it as the working input
    sf.write('resampled_audio.wav', resampled_audio, 16000)
    audio_path = 'resampled_audio.wav'

    pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
    dtype = torch.float16
    device = "cuda"

    if not os.path.isdir(pretrained_model_name_or_path):
        pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)

    vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder="vocoder")
    vocoder = vocoder.to(device=device, dtype=dtype)

    pipe = StableDiffusionInpaintPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)
    pipe = pipe.to(device)

    width_start, width = mask_start_point, mask_end_point - mask_start_point
    seed = 42

    # Load audio and compute its mel spectrogram
    audio, sampling_rate = load_wav(audio_path)
    audio, spec = get_mel_spectrogram_from_audio(audio)

    # Normalize the spectrogram, pad it to 1024 frames, and scale it to [-1, 1]
    # (the pipeline does not normalize torch.Tensor inputs)
    norm_spec = normalize_spectrogram(spec)
    norm_spec = pad_spec(norm_spec, 1024)
    norm_spec = normalize(norm_spec)

    raw_image = image_add_color(torch_to_pil(norm_spec))

    # Add mask
    mask = torch.zeros_like(norm_spec)[:1, ...]
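    # Masking convention: columns set to 1 (width_start .. width_start + width) are
    # regenerated by the inpainting pipeline, while columns left at 0 keep the original audio.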
    mask[:, :, width_start:width_start + width] = 1
    mask_image = torch_to_pil(mask)

    mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask)
    masked_spec_image = torch_to_pil(masked_spec)

    # Color the masked spectrogram and paint the masked area black
    color_masked_spec_image = image_add_color(masked_spec_image)
    color_masked_spec_image = np.array(color_masked_spec_image)
    color_masked_spec_image[:, width_start:width_start + width, :] = 0
    color_masked_spec_image = Image.fromarray(color_masked_spec_image)

    # Generation
    generator = torch.Generator(device=device).manual_seed(seed)
    with torch.autocast("cuda"):
        output_spec = pipe(
            prompt=prompt,
            image=norm_spec,
            mask_image=mask,
            num_inference_steps=100,
            generator=generator,
            height=256,
            width=1024,
            output_type="pt"
        ).images[0]

    output_spec_image = torch_to_pil(output_spec)
    color_output_spec_image = image_add_color(output_spec_image)

    # Reconstruct audio for the raw and masked spectrograms (computed for inspection;
    # only the generated audio is saved and returned)
    post_norm_spec = denormalize(norm_spec).to(device, dtype)
    raw_chunk_spec = denormalize_spectrogram(post_norm_spec)
    raw_chunk_audio = vocoder.inference(raw_chunk_spec)

    post_masked_spec = denormalize(masked_spec).to(device, dtype)
    denorm_masked_spec = denormalize_spectrogram(post_masked_spec)
    denorm_masked_spec_audio = vocoder.inference(denorm_masked_spec)

    # Reconstruct audio for the generated spectrogram
    denorm_spec = denormalize_spectrogram(output_spec)
    denorm_spec_audio = vocoder.inference(denorm_spec)

    # Ensure correct shape: (1, N) → (N,)
    denorm_spec_audio = denorm_spec_audio.flatten()
    # Normalize the audio to prevent clipping or excessive loudness (scale to [-1, 1])
    denorm_spec_audio = denorm_spec_audio / np.max(np.abs(denorm_spec_audio))

    # Save as WAV
    sf.write("generated_output.wav", denorm_spec_audio, 16000)

    # Save input spectrogram image
    input_spec_image_path = "input_spectrogram.png"
    raw_image.save(input_spec_image_path)

    # Save output spectrogram image
    output_spec_image_path = "output_spectrogram.png"
    color_output_spec_image.save(output_spec_image_path)

    return "generated_output.wav", input_spec_image_path, color_output_spec_image


def load_input_spectrogram(audio_path):
    # Load audio and compute its normalized, padded spectrogram
    audio, sampling_rate = load_wav(audio_path)
    audio, spec = get_mel_spectrogram_from_audio(audio)
    norm_spec = normalize_spectrogram(spec)
    norm_spec = pad_spec(norm_spec, 1024)
    norm_spec = normalize(norm_spec)  # scale to [-1, 1]

    raw_image = image_add_color(torch_to_pil(norm_spec))

    # Save input spectrogram image
    input_spec_image_path = "input_spectrogram.png"
    raw_image.save(input_spec_image_path)

    return input_spec_image_path


@spaces.GPU
def preview_masked_area(audio_path, mask_start_point, mask_end_point):
    # Load audio and compute its normalized, padded spectrogram
    audio, sampling_rate = load_wav(audio_path)
    audio, spec = get_mel_spectrogram_from_audio(audio)
    norm_spec = normalize_spectrogram(spec)
    norm_spec = pad_spec(norm_spec, 1024)
    norm_spec = normalize(norm_spec)  # scale to [-1, 1]

    # Add mask
    width_start, width = mask_start_point, mask_end_point - mask_start_point
    mask = torch.zeros_like(norm_spec)[:1, ...]
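    # Same masking convention as infer_inp: 1 marks the time columns selected by the
    # sliders; they are blacked out below so the user can preview the region to inpaint.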
    mask[:, :, width_start:width_start + width] = 1
    mask_image = torch_to_pil(mask)

    mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask)
    masked_spec_image = torch_to_pil(masked_spec)

    # Color the masked spectrogram and paint the masked area black
    color_masked_spec_image = image_add_color(masked_spec_image)
    color_masked_spec_image = np.array(color_masked_spec_image)
    color_masked_spec_image[:, width_start:width_start + width, :] = 0
    color_masked_spec_image = Image.fromarray(color_masked_spec_image)

    # Save the masked spectrogram image
    masked_spec_image_path = "masked_spectrogram.png"
    color_masked_spec_image.save(masked_spec_image_path)

    return masked_spec_image_path


def load_inpaint_example(prompt_inp, audio_path):
    in_spec_path = load_input_spectrogram(audio_path)
    masked_spec_path = preview_masked_area(audio_path, 256, 768)
    return in_spec_path, masked_spec_path


css = """
div#col-container{
    margin: 0 auto;
    max-width: 640px;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# Auffusion")
        gr.Markdown("Auffusion generates realistic audio from textual prompts, including human sounds, animal sounds, natural and artificial sounds, and sound effects.")
        gr.HTML("""
        Duplicate this Space
""") with gr.Tab("Text-to-Audio"): prompt = gr.Textbox(label="Prompt") submit_btn = gr.Button("Submit") audio_out = gr.Audio(label="Audio Ressult") gr.Examples( examples = [ "Rolling thunder with lightning strikes", "Two gunshots followed by birds chirping", "A train whistle blowing in the distance" ], inputs = [prompt], cache_examples=False ) submit_btn.click( fn = infer, inputs = [prompt], outputs = [audio_out], show_api=True ) with gr.Tab("Audio-to-Audio"): prompt_img2img = gr.Textbox(label="Prompt") audio_in_img2img = gr.Audio(label="Audio Reference", type="filepath", format="wav") prompt_strength = gr.Slider(label="Prompt Strength", minimum=0.0, maximum=1.0, step=0.1, value=0.7) submit_btn_img2img = gr.Button("Submit") audio_out_img2img = gr.Audio(label="Audio Ressult") with gr.Accordion("Compare Spectrograms", open=False): with gr.Column(): input_spectrogram = gr.Image(label="Input Spectrogram") output_spectrogram = gr.Image(label="Output Spectrogram") gr.Examples( examples = [ ["Ambulance siren", "./notebooks/examples/img2img/GIOApFAWDOc_160.wav"], ["A cat is moewing", "./notebooks/examples/img2img/YniwgMbB6tpQ_01.wav"], ["A car racing", "./notebooks/examples/img2img/_GI7meqlYZk_30.wav"] ], inputs = [prompt_img2img, audio_in_img2img], cache_examples=False ) submit_btn_img2img.click( fn = infer_img2img, inputs = [prompt_img2img, audio_in_img2img, prompt_strength], outputs = [audio_out_img2img, input_spectrogram, output_spectrogram], show_api=True ) with gr.Tab("Audio InPainting"): prompt_inp = gr.Textbox(label="Prompt") audio_in_inp = gr.Audio(label="Audio Reference", type="filepath", format="wav") audio_in_spec = gr.Image(label="Audio IN spectrogram") mask_start_point = gr.Slider(label="Mask Start point", minimum=0, maximum=1024, step=1, value=256) mask_end_point = gr.Slider(label="Mask End point", minimum=0, maximum=1024, step=1, value=768) preview_mask_btn = gr.Button("Preview Mask") masked_spec_preview = gr.Image(label="Spectrogram Mask Preview") submit_btn_inp = gr.Button("Submit") audio_out_inp = gr.Audio(label="Audio Ressult") with gr.Accordion("Compare Spectrograms", open=False): with gr.Column(): input_spectrogram_inp = gr.Image(label="Input Spectrogram") output_spectrogram_inp = gr.Image(label="Output Spectrogram") gr.Examples( examples = [ ["A siren ringing with a vehicle speeding closer", "./notebooks/examples/inpainting/IvfaKPDWC00_160.wav"], ["A woman speaking", "./notebooks/examples/inpainting/9z8XIRyUq9Q_30.wav"], ["An infant crying", "./notebooks/examples/inpainting/14ekd4nkpwc_28.wav"], ["A dog barking and growling", "./notebooks/examples/inpainting/3ek-xLwr05Q_30.wav"] ], fn = load_inpaint_example, inputs = [prompt_inp, audio_in_inp], outputs = [audio_in_spec, masked_spec_preview], cache_examples = False ) audio_in_inp.upload( fn = load_input_spectrogram, inputs = [audio_in_inp], outputs = [audio_in_spec], show_api=False ) audio_in_inp.stop_recording( fn = load_input_spectrogram, inputs = [audio_in_inp], outputs = [audio_in_spec], show_api=False ) preview_mask_btn.click( fn = preview_masked_area, inputs = [audio_in_inp, mask_start_point, mask_end_point], outputs = [masked_spec_preview], show_api=False ) submit_btn_inp.click( fn = infer_inp, inputs = [prompt_inp, audio_in_inp, mask_start_point, mask_end_point], outputs = [audio_out_inp, input_spectrogram_inp, output_spectrogram_inp], show_api=False ) demo.queue().launch(ssr_mode=False, mcp_server=True, show_error=True)