import gradio as gr
import numpy as np
import torch
from einops import rearrange

from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond

# ---------- Load model ----------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_REPO = "stabilityai/stable-audio-open-small"  # accept the license once on the model page

# Download + load (cached after the first run)
model, model_config = get_pretrained_model(MODEL_REPO)
SAMPLE_RATE = int(model_config["sample_rate"])  # 44100
SAMPLE_SIZE = int(model_config["sample_size"])  # internal window; duration is set via conditioning
model = model.to(DEVICE)
model.eval()


def clamp_seconds(seconds: float) -> int:
    # Clamp to 1–11 s (model cap); the old name suggested it returned a
    # sample size, but it returns whole seconds for conditioning.
    return int(max(1.0, min(float(seconds), 11.0)))


@torch.inference_mode()
def generate_sfx(prompt, seconds, steps, cfg_scale, sampler):
    if not prompt or not prompt.strip():
        return None, "Enter a descriptive prompt (e.g., 'footsteps on gravel, outdoors, distant')."

    seconds = clamp_seconds(seconds)

    # Conditioning per the stable-audio-tools API
    conditioning = [{
        "prompt": prompt.strip(),
        "seconds_total": seconds,
    }]

    # Fast, CPU-friendly defaults: steps=8–12 is a good range;
    # the pingpong sampler is efficient on CPU.
    output = generate_diffusion_cond(
        model=model,
        steps=int(steps),
        cfg_scale=float(cfg_scale),
        conditioning=conditioning,
        sample_size=SAMPLE_SIZE,
        sampler_type=sampler,
        device=DEVICE,
    )

    # output shape: (B, C, N) with B=1 here; flatten the batch dim -> (C, N)
    audio = rearrange(output, "b d n -> d (b n)")

    # Peak-normalize to [-1, 1] float32
    audio = audio.to(torch.float32)
    peak = torch.max(torch.abs(audio))
    if peak > 0:
        audio = (audio / peak).clamp(-1, 1)

    # Gradio expects (sr, np.ndarray of shape [N] or [N, C]); provide stereo [N, 2].
    # Converting to int16 avoids float-scaling ambiguity across Gradio versions.
    audio_np = (audio.cpu().numpy().T * 32767).astype(np.int16)
    return (SAMPLE_RATE, audio_np), "Done."


EXAMPLES = [
    "Footsteps on gravel, outdoors, medium pace, natural ambience",
    "Heavy metal door slam with long metallic reverb, industrial",
    "Rain on window, occasional distant thunder, calm night",
    "Camera shutter click, mechanical, clean studio",
    "Sci-fi laser blast, short, bright, synthetic fizz",
]

with gr.Blocks(title="Professor Treviño — Text→SFX (Free)") as demo:
    gr.Markdown(
        "### Text-to-Sound Effects — Free, no login\n"
        "Enter a descriptive prompt and generate up to ~11 s of stereo audio @ 44.1 kHz."
    )
    with gr.Row():
        prompt = gr.Textbox(label="Prompt", placeholder="e.g., footsteps on gravel, outdoors, distant")
    with gr.Row():
        seconds = gr.Slider(3, 11, value=6, step=1, label="Duration (seconds)")
        steps = gr.Slider(6, 16, value=8, step=1, label="Diffusion steps (higher = better/slower)")
    with gr.Row():
        cfg_scale = gr.Slider(0.5, 4.0, value=1.0, step=0.1, label="Guidance (CFG scale)")
        sampler = gr.Dropdown(choices=["pingpong", "heun", "dpmpp-2m"], value="pingpong", label="Sampler")
    btn = gr.Button("Generate")
    audio_out = gr.Audio(label="Output", type="numpy")
    status = gr.Markdown()

    btn.click(
        fn=generate_sfx,
        inputs=[prompt, seconds, steps, cfg_scale, sampler],
        outputs=[audio_out, status],
    )
    gr.Examples(EXAMPLES, inputs=[prompt], cache_examples=False)

# Gradio 4 removed queue()'s concurrency_count; default_concurrency_limit is
# the current equivalent for capping simultaneous generations.
demo.queue(default_concurrency_limit=1, max_size=8).launch()
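
# ---------- Quickstart (assumed setup) ----------
# A minimal sketch of the environment this script assumes; the pip package
# names are the published ones, but versions are left to you:
#   pip install gradio torch einops stable-audio-tools
#   huggingface-cli login   # accept the license on the model page first
#   python app.py           # "app.py" is a hypothetical filename for this script
#
# Optional programmatic use: a sketch of calling generate_sfx() outside the UI
# and writing a 16-bit stereo WAV with the stdlib wave module. The prompt,
# path, and function name are illustrative assumptions; call it from a
# notebook, or launch with demo.launch(prevent_thread_lock=True) so the
# script is not blocked by the server.
def save_demo_clip(path: str = "demo_sfx.wav") -> str:
    import wave

    (sr, audio_np), status = generate_sfx(
        "footsteps on gravel, outdoors, distant",
        seconds=4, steps=8, cfg_scale=1.0, sampler="pingpong",
    )
    with wave.open(path, "wb") as wf:
        wf.setnchannels(audio_np.shape[1])  # stereo -> 2 channels
        wf.setsampwidth(2)                  # int16 samples -> 2 bytes each
        wf.setframerate(sr)                 # 44100 Hz from the model config
        wf.writeframes(audio_np.tobytes())  # (N, 2) int16 is already interleaved
    return status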