import gradio as gr
import numpy as np
import torch
from einops import rearrange

from stable_audio_tools import get_pretrained_model
from stable_audio_tools.inference.generation import generate_diffusion_cond

# ---------- Load model ----------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_REPO = "stabilityai/stable-audio-open-small"  # accept the license once on the model page

# Download + load (cached after the first run)
model, model_config = get_pretrained_model(MODEL_REPO)
SAMPLE_RATE = int(model_config["sample_rate"])  # 44100
SAMPLE_SIZE = int(model_config["sample_size"])  # internal window; duration is set via conditioning
model = model.to(DEVICE)
model.eval()


def clamp_seconds(seconds: float) -> int:
    # Clamp to 1–11 s (model cap); the old name suggested it returned a
    # sample size, but it returns whole seconds for conditioning.
    return int(max(1.0, min(float(seconds), 11.0)))


@torch.inference_mode()
def generate_sfx(prompt, seconds, steps, cfg_scale, sampler):
    if not prompt or not prompt.strip():
        return None, "Enter a descriptive prompt (e.g., 'footsteps on gravel, outdoors, distant')."

    seconds = clamp_seconds(seconds)

    # Conditioning per the stable-audio-tools API
    conditioning = [{
        "prompt": prompt.strip(),
        "seconds_total": seconds,
    }]

    # Fast, CPU-friendly defaults: steps=8–12 is a good range;
    # the pingpong sampler is efficient on CPU.
    output = generate_diffusion_cond(
        model=model,
        steps=int(steps),
        cfg_scale=float(cfg_scale),
        conditioning=conditioning,
        sample_size=SAMPLE_SIZE,
        sampler_type=sampler,
        device=DEVICE,
    )

    # output shape: (B, C, N) with B=1 here; flatten the batch dim -> (C, N)
    audio = rearrange(output, "b d n -> d (b n)")

    # Peak-normalize to [-1, 1] float32
    audio = audio.to(torch.float32)
    peak = torch.max(torch.abs(audio))
    if peak > 0:
        audio = (audio / peak).clamp(-1, 1)

    # Gradio expects (sr, np.ndarray of shape [N] or [N, C]); provide stereo [N, 2].
    # Converting to int16 avoids float-scaling ambiguity across Gradio versions.
    audio_np = (audio.cpu().numpy().T * 32767).astype(np.int16)
    return (SAMPLE_RATE, audio_np), "Done."


EXAMPLES = [
    "Footsteps on gravel, outdoors, medium pace, natural ambience",
    "Heavy metal door slam with long metallic reverb, industrial",
    "Rain on window, occasional distant thunder, calm night",
    "Camera shutter click, mechanical, clean studio",
    "Sci-fi laser blast, short, bright, synthetic fizz",
]

with gr.Blocks(title="Professor Treviño — Text→SFX (Free)") as demo:
    gr.Markdown(
        "### Text-to-Sound Effects — Free, no login\n"
        "Enter a descriptive prompt and generate up to ~11 s of stereo audio @ 44.1 kHz."
    )
    with gr.Row():
        prompt = gr.Textbox(label="Prompt", placeholder="e.g., footsteps on gravel, outdoors, distant")
    with gr.Row():
        seconds = gr.Slider(3, 11, value=6, step=1, label="Duration (seconds)")
        steps = gr.Slider(6, 16, value=8, step=1, label="Diffusion steps (higher = better/slower)")
    with gr.Row():
        cfg_scale = gr.Slider(0.5, 4.0, value=1.0, step=0.1, label="Guidance (CFG scale)")
        sampler = gr.Dropdown(choices=["pingpong", "heun", "dpmpp-2m"], value="pingpong", label="Sampler")
    btn = gr.Button("Generate")
    audio_out = gr.Audio(label="Output", type="numpy")
    status = gr.Markdown()

    btn.click(
        fn=generate_sfx,
        inputs=[prompt, seconds, steps, cfg_scale, sampler],
        outputs=[audio_out, status],
    )
    gr.Examples(EXAMPLES, inputs=[prompt], cache_examples=False)

# Gradio 4 removed queue()'s concurrency_count; default_concurrency_limit is
# the current equivalent for capping simultaneous generations.
demo.queue(default_concurrency_limit=1, max_size=8).launch()
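
# ---------- Quickstart (assumed setup) ----------
# A minimal sketch of the environment this script assumes; the pip package
# names are the published ones, but versions are left to you:
#   pip install gradio torch einops stable-audio-tools
#   huggingface-cli login   # accept the license on the model page first
#   python app.py           # "app.py" is a hypothetical filename for this script
#
# Optional programmatic use: a sketch of calling generate_sfx() outside the UI
# and writing a 16-bit stereo WAV with the stdlib wave module. The prompt,
# path, and function name are illustrative assumptions; call it from a
# notebook, or launch with demo.launch(prevent_thread_lock=True) so the
# script is not blocked by the server.
def save_demo_clip(path: str = "demo_sfx.wav") -> str:
    import wave

    (sr, audio_np), status = generate_sfx(
        "footsteps on gravel, outdoors, distant",
        seconds=4, steps=8, cfg_scale=1.0, sampler="pingpong",
    )
    with wave.open(path, "wb") as wf:
        wf.setnchannels(audio_np.shape[1])  # stereo -> 2 channels
        wf.setsampwidth(2)                  # int16 samples -> 2 bytes each
        wf.setframerate(sr)                 # 44100 Hz from the model config
        wf.writeframes(audio_np.tobytes())  # (N, 2) int16 is already interleaved
    return status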