#!/usr/bin/env python3
"""RND1 Diffusion Model Demo for Hugging Face Spaces with ZeroGPU."""

import random
from typing import Iterator

import gradio as gr
import numpy as np
import spaces
import torch
from transformers import AutoTokenizer

# Populated by load_model() at startup; the model lives at module scope so the
# ZeroGPU-decorated generation function can reach it.
model = None
tokenizer = None
device = "cuda"


def set_seed(seed: int):
    """Seed all RNGs so repeated runs with the same settings match."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def load_model():
    global model, tokenizer

    from rnd.configuration_rnd import RND1Config
    from rnd.modeling_rnd import RND1LM

    model_path = "radicalnumerics/RND1-Base-0910"

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    print("Loading model...")
    cfg = RND1Config.from_pretrained(model_path)
    cfg.model_type = "rnd1"
    cfg.attn_implementation = "sdpa"
    cfg.moe_backend = "hf"

    model = RND1LM.from_pretrained(
        model_path,
        config=cfg,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
        use_safetensors=True,
        low_cpu_mem_usage=True,
    )
    model.eval()
    print("Model loaded successfully!")


@spaces.GPU(duration=120)
def generate_with_intermediate_steps(
    prompt: str,
    mode: str,
    num_steps: int,
    max_new_tokens: int,
    temperature: float,
    top_k: int,
    top_p: float,
    seed: int,
    show_intermediate: bool,
) -> Iterator[tuple[str, str]]:
    if not prompt.strip():
        yield "Please enter a prompt.", "Error"
        return

    # Keep the last valid output around so the UI never flashes blank.
    last_output = ""
    last_status = "Initializing..."
    yield last_output, last_status

    set_seed(seed)

    # Task mode wraps the prompt in a Q&A template; completion mode (or a
    # prompt that already carries the template) is passed through unchanged.
    if mode == "task" and not prompt.strip().startswith("Question:"):
        formatted_prompt = f"Question: {prompt}\n"
    else:
        formatted_prompt = prompt

    last_status = "Tokenizing..."
    yield last_output, last_status

    inputs = tokenizer(formatted_prompt, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)

    from rnd.generation_config import RND1GenerationConfig

    # This demo treats temperature == 1.0 as greedy decoding, matching the
    # slider hint in the UI.
    greedy = (temperature == 1.0)
    mask_token_id = 151669

    generator = torch.Generator(device=device)
    generator.manual_seed(seed)

    if show_intermediate:
        # Show the output at EVERY step count from 1 to num_steps. Each
        # iteration re-runs the full diffusion from the same seed with a
        # larger step budget, so successive yields show how the sample
        # converges (at the cost of quadratic total work).
        for current_step in range(1, num_steps + 1):
            generator.manual_seed(seed)

            gen_config = RND1GenerationConfig(
                max_new_tokens=max_new_tokens,
                num_diffusion_steps=current_step,
                mask_token_id=mask_token_id,
                temperature=temperature if not greedy else 1.0,
                top_k=top_k if top_k > 0 else None,
                top_p=top_p if top_p > 0 else None,
                greedy=greedy,
                eos_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else 151645,
                pad_token_id=tokenizer.pad_token_id,
                bos_token_id=tokenizer.bos_token_id,
            )

            last_status = f"Step {current_step}/{num_steps}"

            with torch.no_grad():
                output = model.generate(
                    inputs=input_ids,
                    generation_config=gen_config,
                    generator=generator,
                )

            # Decode only the generated suffix, not the prompt tokens.
            generated_tokens = output[0][len(input_ids[0]):]
            last_output = tokenizer.decode(
                generated_tokens.tolist(), skip_special_tokens=True
            )

            if current_step == num_steps:
                last_status = f"Complete ({num_steps} steps)"
            yield last_output, last_status
    else:
        last_status = f"Generating ({num_steps} steps)..."
        yield last_output, last_status

        gen_config = RND1GenerationConfig(
            max_new_tokens=max_new_tokens,
            num_diffusion_steps=num_steps,
            mask_token_id=mask_token_id,
            temperature=temperature if not greedy else 1.0,
            top_k=top_k if top_k > 0 else None,
            top_p=top_p if top_p > 0 else None,
            greedy=greedy,
            eos_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else 151645,
            pad_token_id=tokenizer.pad_token_id,
            bos_token_id=tokenizer.bos_token_id,
        )

        with torch.no_grad():
            output = model.generate(
                inputs=input_ids,
                generation_config=gen_config,
                generator=generator,
            )

        generated_tokens = output[0][len(input_ids[0]):]
        last_output = tokenizer.decode(
            generated_tokens.tolist(), skip_special_tokens=True
        )
        last_status = "Complete"
        yield last_output, last_status
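
# Usage sketch, not part of the app: exercising the generator above from a
# plain Python session after calling load_model(). This assumes a local CUDA
# device and relies on `spaces.GPU` being a no-op outside of Spaces;
# `smoke_test` is a hypothetical helper added for illustration.
def smoke_test(prompt: str = "Explain beam search.") -> str:
    """Run one non-streaming generation and return the final text."""
    final_text = ""
    for text, _status in generate_with_intermediate_steps(
        prompt=prompt,
        mode="task",
        num_steps=32,
        max_new_tokens=128,
        temperature=1.0,  # 1.0 takes the greedy path above
        top_k=0,          # 0 disables top-k (mapped to None above)
        top_p=0.0,        # 0 disables nucleus sampling
        seed=0,
        show_intermediate=False,
    ):
        final_text = text  # keep only the last yielded output
    return final_text
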
def generate_wrapper(
    prompt: str,
    mode: str,
    num_steps: int,
    max_new_tokens: int,
    temperature: float,
    top_k: int,
    top_p: float,
    seed: int,
    show_intermediate: bool,
):
    # Thin passthrough so Gradio gets a plain generator function.
    yield from generate_with_intermediate_steps(
        prompt, mode, num_steps, max_new_tokens,
        temperature, top_k, top_p, seed, show_intermediate,
    )


def create_interface():
    with gr.Blocks(title="RND1 Diffusion Language Model", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # RND1 Diffusion Language Model

            Generate text using a diffusion-based language model that refines all
            tokens simultaneously through iterative denoising steps.
            """
        )

        with gr.Row():
            with gr.Column(scale=1):
                prompt = gr.Textbox(
                    label="Prompt",
                    placeholder="Enter your prompt here...",
                    lines=4,
                    value="Write a Python function that finds the longest common subsequence of two strings.",
                )
                mode = gr.Radio(
                    choices=["task", "completion"],
                    value="task",
                    label="Generation Mode",
                    info="Task: Q&A format | Completion: Text continuation",
                )
                show_intermediate = gr.Checkbox(
                    label="Show Live Generation",
                    value=True,
                    info="Display output at each diffusion step (slower but shows the process)",
                )

                with gr.Accordion("Generation Settings", open=True):
                    num_steps = gr.Slider(
                        minimum=16, maximum=256, value=64, step=16,
                        label="Diffusion Steps",
                        info="More steps typically improve quality",
                    )
                    max_new_tokens = gr.Slider(
                        minimum=32, maximum=512, value=256, step=32,
                        label="Max New Tokens",
                    )

                with gr.Accordion("Sampling Parameters", open=False):
                    temperature = gr.Slider(
                        minimum=0.1, maximum=2.0, value=1.0, step=0.1,
                        label="Temperature",
                        info="1.0 = greedy/deterministic",
                    )
                    top_k = gr.Slider(
                        minimum=0, maximum=100, value=0, step=1,
                        label="Top-K",
                        info="0 to disable",
                    )
                    top_p = gr.Slider(
                        minimum=0.0, maximum=1.0, value=0.0, step=0.05,
                        label="Top-P (Nucleus)",
                        info="0 to disable",
                    )
                    seed = gr.Slider(
                        minimum=0, maximum=100000, value=12345, step=1,
                        label="Random Seed",
                    )

                generate_btn = gr.Button("Generate", variant="primary", size="lg")

            with gr.Column(scale=1):
                status_box = gr.Textbox(
                    label="Status",
                    value="Ready",
                    lines=1,
                    interactive=False,
                )
                output = gr.Textbox(
                    label="Generated Text",
                    lines=18,
                    show_copy_button=True,
                )

        gr.Markdown(
            """
            ### How it works

            Diffusion models generate text differently from standard language models:

            1. Initialize all tokens as noise/masks simultaneously
            2. Iteratively denoise and refine all tokens together
            3. After N steps, the output converges to coherent text

            With live generation enabled, you can watch the text improve step by step.
            """
        )
        # A toy illustration of this denoising loop follows create_interface() below.

        gr.Examples(
            examples=[
                ["Write a Python function that finds the longest common subsequence of two strings.",
                 "task", 64, 256, 1.0, 0, 0.0, 12345, True],
                ["Explain quantum entanglement to a 10-year-old.",
                 "task", 64, 200, 1.0, 0, 0.0, 42, True],
                ["The most important discovery in the history of science was",
                 "completion", 64, 256, 1.0, 0, 0.0, 9876, True],
                ["In a world where time flows backwards,",
                 "completion", 128, 300, 1.0, 0, 0.0, 7777, False],
            ],
            inputs=[prompt, mode, num_steps, max_new_tokens,
                    temperature, top_k, top_p, seed, show_intermediate],
            outputs=[output, status_box],
            fn=generate_wrapper,
            cache_examples=False,
        )

        generate_btn.click(
            fn=generate_wrapper,
            inputs=[prompt, mode, num_steps, max_new_tokens,
                    temperature, top_k, top_p, seed, show_intermediate],
            outputs=[output, status_box],
        )

    return demo
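
# A minimal toy sketch of the denoising loop the "How it works" panel
# describes. This is an assumption-level illustration, not RND1's actual
# sampler: every position starts as a mask, and each step commits a few more
# positions until none remain.
def toy_denoise(seq_len: int = 8, num_steps: int = 4) -> list[int]:
    MASK = -1
    tokens = [MASK] * seq_len  # step 0: the whole sequence is masked noise
    rng = random.Random(0)
    for step in range(num_steps):
        masked = [i for i, t in enumerate(tokens) if t == MASK]
        if not masked:
            break
        # Commit an even share of the remaining masked positions each step;
        # the final step commits everything that is left.
        k = max(1, len(masked) // (num_steps - step))
        for i in rng.sample(masked, k):
            tokens[i] = rng.randrange(100)  # stand-in for a model prediction
    return tokens
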
""") gr.Examples( examples=[ ["Write a Python function that finds the longest common subsequence of two strings.", "task", 64, 256, 1.0, 0, 0.0, 12345, True], ["Explain quantum entanglement to a 10-year-old.", "task", 64, 200, 1.0, 0, 0.0, 42, True], ["The most important discovery in the history of science was", "completion", 64, 256, 1.0, 0, 0.0, 9876, True], ["In a world where time flows backwards,", "completion", 128, 300, 1.0, 0, 0.0, 7777, False], ], inputs=[prompt, mode, num_steps, max_new_tokens, temperature, top_k, top_p, seed, show_intermediate], outputs=[output, status_box], fn=generate_wrapper, cache_examples=False, ) generate_btn.click( fn=generate_wrapper, inputs=[prompt, mode, num_steps, max_new_tokens, temperature, top_k, top_p, seed, show_intermediate], outputs=[output, status_box], ) return demo if __name__ == "__main__": load_model() demo = create_interface() demo.queue(max_size=10) demo.launch()