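# NOTE: the original first lines were page-status residue copied from the
# Hugging Face Spaces listing ("Spaces: Runtime error / Runtime error");
# they are not part of the program.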
import gradio as gr | |
import numpy as np | |
import torch | |
from einops import rearrange | |
from stable_audio_tools import get_pretrained_model | |
from stable_audio_tools.inference.generation import generate_diffusion_cond | |
# ---------- Load model ----------
# Run on the GPU when one is visible to torch, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_REPO = "stabilityai/stable-audio-open-small"  # accept license once on the model page

# Download + load (cached on first run)
model, model_config = get_pretrained_model(MODEL_REPO)

# Audio geometry is read from the model's own config rather than hard-coded.
SAMPLE_RATE = int(model_config["sample_rate"])  # expected to be 44100 for this repo
SAMPLE_SIZE = int(model_config["sample_size"])  # internal size; seconds are passed via conditioning

# Move the weights to the chosen device and switch to inference mode.
model = model.to(DEVICE)
model.eval()
def tta_seconds_to_sample_size(seconds: float) -> int:
    """Clamp a requested duration to the supported range, in whole seconds.

    Despite the name, the return value is a number of *seconds* (not a
    sample count): the input is converted to ``float``, clamped to the
    1-11 second range, and truncated toward zero.

    Args:
        seconds: Requested duration; anything ``float()`` accepts.

    Returns:
        An ``int`` between 1 and 11 inclusive.
    """
    # Clamp to 1-11s (model cap)
    clamped = max(1.0, min(float(seconds), 11.0))
    return int(clamped)
def generate_sfx(prompt, seconds, steps, cfg_scale, sampler):
    """Generate a sound effect from a text prompt (Gradio click handler).

    Args:
        prompt: Text description of the sound; leading/trailing whitespace
            is ignored.
        seconds: Requested clip length. It is clamped to whole seconds by
            ``tta_seconds_to_sample_size`` before use.
        steps: Number of diffusion steps (cast to ``int``).
        cfg_scale: Guidance scale (cast to ``float``).
        sampler: Sampler name, forwarded unchanged as ``sampler_type``.

    Returns:
        A ``(audio, status)`` pair. ``audio`` is ``(SAMPLE_RATE, samples)``
        with ``samples`` a float32 NumPy array laid out ``(N, C)`` and
        peak-normalised into ``[-1, 1]``; it is ``None`` when the prompt is
        empty, in which case ``status`` carries the hint for the user.
    """
    text = prompt.strip() if prompt else ""
    if not text:
        return None, "Enter a descriptive prompt (e.g., 'footsteps on gravel, outdoors, distant')."

    seconds = tta_seconds_to_sample_size(seconds)

    # Conditioning per stable-audio-tools API
    conditioning = [{
        "prompt": text,
        "seconds_total": seconds,
    }]

    # Fast, CPU-friendly defaults:
    # steps=8-12 is a good range; pingpong sampler is efficient on CPU.
    # Inference only, so make sure no autograd graph is recorded.
    with torch.no_grad():
        output = generate_diffusion_cond(
            model=model,
            steps=int(steps),
            cfg_scale=float(cfg_scale),
            conditioning=conditioning,
            sample_size=SAMPLE_SIZE,
            sampler_type=sampler,
            device=DEVICE,
        )

    # output shape: (B, C, N) -> here B=1. Make it (C, N)
    audio = rearrange(output, "b d n -> d (b n)").detach().to(torch.float32)

    # Generation is always requested for the full SAMPLE_SIZE window, whatever
    # `seconds` is, so cut the result down to the duration the user asked for.
    # Trimming happens before normalisation so the peak is measured on the
    # audio that is actually returned.
    audio = audio[:, : seconds * SAMPLE_RATE]

    # Normalize to [-1, 1] float32; skip the division for an all-zero signal.
    peak = torch.max(torch.abs(audio))
    if peak > 0:
        audio = (audio / peak).clamp(-1, 1)

    # Gradio expects (sr, np.ndarray [N] or [N, C]); transpose (C, N) -> (N, C)
    audio_np = audio.cpu().numpy().T
    return (SAMPLE_RATE, audio_np), "Done."
# Example prompts offered in the UI to pre-fill the prompt box.
EXAMPLES = [
    "Footsteps on gravel, outdoors, medium pace, natural ambience",
    "Heavy metal door slam with long metallic reverb, industrial",
    "Rain on window, occasional distant thunder, calm night",
    "Camera shutter click, mechanical, clean studio",
    "Sci-fi laser blast, short, bright, synthetic fizz",
]
# ---------- UI ----------
# Layout: prompt row, duration/steps row, guidance/sampler row, then the
# generate button, the audio player and a status line.
with gr.Blocks(title="Professor Treviño — Text→SFX (Free)") as demo:
    gr.Markdown("### Text-to-Sound Effects — Free, no login\nEnter a descriptive prompt and generate up to ~11s stereo @ 44.1 kHz.")

    with gr.Row():
        prompt = gr.Textbox(label="Prompt", placeholder="e.g., footsteps on gravel, outdoors, distant")

    with gr.Row():
        seconds = gr.Slider(3, 11, value=6, step=1, label="Duration (seconds)")
        steps = gr.Slider(6, 16, value=8, step=1, label="Diffusion steps (higher = better/slower)")

    with gr.Row():
        cfg_scale = gr.Slider(0.5, 4.0, value=1.0, step=0.1, label="Guidance (CFG scale)")
        # NOTE(review): confirm that every sampler name offered here is accepted
        # by generate_diffusion_cond for this model; the value is passed through
        # unchanged as `sampler_type`.
        sampler = gr.Dropdown(choices=["pingpong", "heun", "dpmpp-2m"], value="pingpong", label="Sampler")

    btn = gr.Button("Generate")
    audio_out = gr.Audio(label="Output", type="numpy")
    status = gr.Markdown()

    # Event wiring and examples must be registered inside the Blocks context.
    btn.click(fn=generate_sfx, inputs=[prompt, seconds, steps, cfg_scale, sampler], outputs=[audio_out, status])

    # Examples only pre-fill the prompt box; nothing is run or cached for them.
    gr.Examples(EXAMPLES, [prompt], [], fn=None, cache_examples=False)

# `concurrency_count` is rejected by Gradio 4+, so it is no longer passed.
# One job at a time is already the default: Gradio 3.x defaults
# concurrency_count to 1, and later versions default the per-event
# concurrency limit to 1.
demo.queue(max_size=8).launch()