Mohaddz committed
Commit 278d275 · 1 Parent(s): 08b5ccb
demo_rnd_generation.py ADDED
@@ -0,0 +1,359 @@
#!/usr/bin/env python3
"""
Demo script for RND1 generation.
"""

import torch
import argparse
import os
import sys
import random
import numpy as np
from transformers import AutoTokenizer

# Add RND1 module to path for local testing
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))


def set_seed(seed: int):
    """Set random seed for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def demo_completion(
    model_path: str,
    checkpoint_path: str = None,
    device: str = "cuda:0",
    use_bfloat16: bool = True,
    show_visualization: bool = True,
    num_steps: int = 64,
    max_new_tokens: int = 256,
    custom_prompt: str = None,
    temperature: float = 1.0,
    top_k: int = None,
    top_p: float = None,
    mask_token_id: int = 151669,
    seed: int = 12345,
    moe_backend: str = "hf",
    mode: str = "task",
):
    """
    Demonstrate text completion using RND1.

    Args:
        model_path: Path to base model or HuggingFace model ID
        checkpoint_path: Path to custom checkpoint (if any)
        device: Device to run on (e.g., cuda:0, cpu)
        use_bfloat16: Whether to use bfloat16 precision
        show_visualization: Whether to show live visualization (requires rich)
        num_steps: Number of diffusion steps
        max_new_tokens: Maximum number of tokens to generate
        custom_prompt: Custom prompt to use instead of default examples
        temperature: Temperature for sampling (1.0 = greedy/deterministic)
        top_k: Top-k filtering for sampling (None = disabled)
        top_p: Top-p (nucleus) filtering for sampling (None = disabled)
        mask_token_id: Token ID for mask token
        seed: Random seed for reproducibility
        moe_backend: MoE backend to use ('hf', 'flashinfer', or 'sglang')
        mode: Generation mode ('task' for Q&A format, 'completion' for continuation)
    """
    set_seed(seed)

    from rnd.configuration_rnd import RND1Config
    from rnd.modeling_rnd import RND1LM

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

    dtype = torch.bfloat16 if use_bfloat16 else torch.float32
    print(f"Using dtype: {dtype}")

    if moe_backend == "hf":
        print("\n⚠️ Note: the HuggingFace backend is slower. Consider --moe_backend flashinfer or sglang for better performance.\n")

    # Load from the checkpoint if provided, otherwise from model_path
    load_path = checkpoint_path if checkpoint_path else model_path

    print(f"Loading model from {load_path}...")

    # Load config and set RND1-specific settings
    cfg = RND1Config.from_pretrained(load_path)
    cfg.model_type = "rnd1"
    cfg.attn_implementation = "sdpa"
    cfg.moe_backend = moe_backend

    # Load model with RND1LM
    model = RND1LM.from_pretrained(
        load_path,
        config=cfg,
        torch_dtype=dtype,
        device_map="auto" if device == "cuda:0" else device,
        trust_remote_code=True,
        use_safetensors=True,
        low_cpu_mem_usage=True,
    )
    print("Model loaded")
    model = model.eval()

    if custom_prompt:
        prompts = [custom_prompt]
    else:
        # Default prompts based on mode
        if mode == "task":
            prompts = ["Write a Python function that finds the longest common subsequence of two strings. Include comments explaining the algorithm."]
        else:
            prompts = ["The key to understanding quantum computing lies in"]

    greedy = (temperature == 1.0)

    generator = torch.Generator(device=device if device != "auto" else "cuda")
    generator.manual_seed(seed)

    for i, user_prompt in enumerate(prompts):
        print(f"\n{'='*60}")
        print(f"Mode: {mode.upper()}")
        print(f"Prompt {i+1}: {user_prompt[:100]}...")
        print(f"{'='*60}\n")

        if mode == "task":
            # Task mode: add a "Question: " prefix if not already present
            if not user_prompt.strip().startswith("Question:"):
                prompt = f"Question: {user_prompt}\n"
            else:
                prompt = user_prompt
        else:
            # Completion mode: use the prompt as-is for continuation
            prompt = user_prompt

        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs.input_ids.to(device if device != "auto" else "cuda")
        attention_mask = inputs.attention_mask.to(device if device != "auto" else "cuda") if 'attention_mask' in inputs else None

        print("Generation parameters:")
        print(f"  Prompt length: {input_ids.shape[1]} tokens")
        print(f"  Max new tokens: {max_new_tokens}")
        print(f"  Total sequence: {input_ids.shape[1] + max_new_tokens} tokens")
        print(f"  Diffusion steps: {num_steps}")
        print(f"  Temperature: {temperature}")
        print(f"  Greedy: {greedy}")
        if top_k:
            print(f"  Top-k: {top_k}")
        if top_p:
            print(f"  Top-p: {top_p}")
        print()

        # Create an explicit generation config that takes priority over model defaults
        from rnd.generation_config import RND1GenerationConfig
        gen_config = RND1GenerationConfig(
            max_new_tokens=max_new_tokens,
            num_diffusion_steps=num_steps,
            mask_token_id=mask_token_id,
            temperature=temperature if not greedy else 1.0,
            top_k=top_k,
            top_p=top_p,
            greedy=greedy,
            eos_token_id=tokenizer.eos_token_id if tokenizer.eos_token_id else 151645,
            pad_token_id=tokenizer.pad_token_id,
            bos_token_id=tokenizer.bos_token_id,
        )

        with torch.no_grad():
            if show_visualization and hasattr(model, 'generate_with_visualization'):
                # Use the method with visualization support (requires tokenizer)
                output = model.generate_with_visualization(
                    tokenizer=tokenizer,
                    inputs=input_ids,
                    generation_config=gen_config,
                    generator=generator,
                )
            else:
                # Use the standard generate method with explicit config
                output = model.generate(
                    inputs=input_ids,
                    generation_config=gen_config,
                    generator=generator,
                )

        generated_tokens = output[0][len(input_ids[0]):]
        generation = tokenizer.decode(
            generated_tokens.tolist(),
            skip_special_tokens=True,
        )

        print("\nGenerated response:")
        print(generation)

        print(f"\n(Generation completed in {num_steps} diffusion steps)")


def main():
    parser = argparse.ArgumentParser(
        description="RND1 diffusion model demo with live visualization",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Model configuration
    model_group = parser.add_argument_group('Model Configuration')
    model_group.add_argument(
        "--model_path",
        type=str,
        default="radicalnumerics/RND1-Base-0910",
        help="Path to model or HuggingFace model ID",
    )
    model_group.add_argument(
        "--checkpoint",
        type=str,
        default=None,
        help="Path to custom checkpoint file or directory",
    )
    model_group.add_argument(
        "--device",
        type=str,
        default="cuda:0",
        help="Device to run on (e.g., cuda:0, cpu)",
    )
    model_group.add_argument(
        "--fp32",
        action="store_true",
        help="Use FP32 precision instead of BF16",
    )

    # Generation configuration
    gen_group = parser.add_argument_group('Generation Settings')
    gen_group.add_argument(
        "--num_steps",
        type=int,
        default=256,
        help="Number of diffusion steps",
    )
    gen_group.add_argument(
        "--max_new_tokens",
        type=int,
        default=256,
        help="Maximum number of tokens to generate",
    )
    gen_group.add_argument(
        "--prompt",
        type=str,
        default=None,
        help="Custom prompt to use for generation",
    )
    gen_group.add_argument(
        "--mode",
        type=str,
        default="task",
        choices=["task", "completion"],
        help="Generation mode: 'task' (Q&A format for instructions) or 'completion' (text continuation)",
    )
    gen_group.add_argument(
        "--mask_token_id",
        type=int,
        default=151669,
        help="Token ID for mask token",
    )

    # Sampling configuration
    sampling_group = parser.add_argument_group('Sampling Parameters')
    sampling_group.add_argument(
        "--temperature",
        type=float,
        default=1.0,
        help="Temperature for sampling (1.0 = greedy/deterministic)",
    )
    sampling_group.add_argument(
        "--top_k",
        type=int,
        default=None,
        help="Top-k filtering: keep only the k most likely tokens",
    )
    sampling_group.add_argument(
        "--top_p",
        type=float,
        default=None,
        help="Top-p (nucleus) filtering: keep tokens with cumulative probability <= p",
    )

    # Visualization
    viz_group = parser.add_argument_group('Visualization')
    viz_group.add_argument(
        "--no_viz",
        action="store_true",
        help="Disable live visualization during generation (visualization requires the rich library)",
    )

    # Other settings
    other_group = parser.add_argument_group('Other Settings')
    other_group.add_argument(
        "--seed",
        type=int,
        default=12345,
        help="Random seed for reproducibility",
    )

    moe_backend_group = parser.add_argument_group('MoE Backend')
    moe_backend_group.add_argument(
        "--moe_backend",
        type=str,
        default="hf",
        choices=["hf", "flashinfer", "sglang"],
        help="MoE backend to use for sparse mixture-of-experts layers",
    )

    args = parser.parse_args()

    if args.temperature < 0:
        parser.error("Temperature must be non-negative")
    if args.top_k is not None and args.top_k <= 0:
        parser.error("Top-k must be positive")
    if args.top_p is not None and (args.top_p <= 0 or args.top_p > 1):
        parser.error("Top-p must be between 0 and 1")

    print("\n" + "="*60)
    print("RND1 Diffusion Language Model Demo")
    print("="*60)
    print("Configuration:")
    print(f"  Model: {args.model_path}")
    if args.checkpoint:
        print(f"  Checkpoint: {args.checkpoint}")
    print(f"  Device: {args.device}")
    print(f"  Precision: {'FP32' if args.fp32 else 'BF16'}")
    print(f"  Mode: {args.mode.upper()} ({'Q&A format for instructions' if args.mode == 'task' else 'Text continuation'})")
    print(f"  Random seed: {args.seed}")
    print(f"  Diffusion steps: {args.num_steps}")
    print(f"  Max new tokens: {args.max_new_tokens}")
    print("  Algorithm: Entropy-based selection")
    print(f"  Temperature: {args.temperature}")
    if args.top_k:
        print(f"  Top-k: {args.top_k}")
    if args.top_p:
        print(f"  Top-p: {args.top_p}")
    print(f"  MoE Backend: {args.moe_backend}")
    print(f"  Visualization: {'Enabled' if not args.no_viz else 'Disabled'}")
    print("="*60 + "\n")

    demo_completion(
        model_path=args.model_path,
        checkpoint_path=args.checkpoint,
        device=args.device,
        use_bfloat16=not args.fp32,
        show_visualization=not args.no_viz,
        num_steps=args.num_steps,
        max_new_tokens=args.max_new_tokens,
        custom_prompt=args.prompt,
        temperature=args.temperature,
        top_k=args.top_k,
        top_p=args.top_p,
        mask_token_id=args.mask_token_id,
        seed=args.seed,
        moe_backend=args.moe_backend,
        mode=args.mode,
    )


if __name__ == "__main__":
    main()
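For reference, a typical invocation of the demo above (every flag is defined in main(); the prompt text is illustrative):

    python demo_rnd_generation.py \
        --prompt "Explain masked diffusion decoding in two sentences." \
        --mode completion \
        --num_steps 64 \
        --max_new_tokens 128 \
        --no_viz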
pyproject.toml ADDED
@@ -0,0 +1,22 @@
[build-system]
requires = ["setuptools>=61", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "rnd"
version = "0.1.0"
dependencies = [
    "accelerate",
    "torch>=2.8",
    "transformers",
    "rich",
]

[project.optional-dependencies]
flashinfer = [
    "flashinfer-python",
]
sglang = ["sglang[all]"]

[tool.setuptools]
packages = ["rnd"]
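Given the extras declared above, an editable install from a checkout would look like this (standard pip extras syntax; comments are illustrative):

    pip install -e .                  # core: accelerate, torch>=2.8, transformers, rich
    pip install -e ".[flashinfer]"    # adds the FlashInfer fused-MoE backend
    pip install -e ".[sglang]"        # adds the SGLang fused-MoE backend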
rnd/__init__.py ADDED
@@ -0,0 +1,53 @@
# Copyright 2025 Radical Numerics Inc.
#
# This source code is licensed under the Apache License, Version 2.0, found in the
# LICENSE file in the root directory of this source tree.

"""
Radical Numerics Diffusion (RND1) - Diffusion-based Language Model.
"""

from .configuration_rnd import RND1Config
from .modeling_rnd import (
    RND1LM,
    RND1Model,
    RND1PreTrainedModel,
    RND1Attention,
    RND1DecoderLayer,
    RND1SparseMoeBlock,
)
from .generation_config import RND1GenerationConfig
from .generation_utils import RND1GenerationMixin
from .sampling import (
    diffusion_sample,
    apply_top_k_filtering,
    apply_top_p_filtering,
)
from .terminal_visualizer import TerminalVisualizer, SimpleProgressBar

__version__ = "0.1.0"

__all__ = [
    "RND1Config",
    "RND1GenerationConfig",
    "RND1LM",
    "RND1Model",
    "RND1PreTrainedModel",
    "RND1Attention",
    "RND1DecoderLayer",
    "RND1SparseMoeBlock",
    "RND1GenerationMixin",
    "TerminalVisualizer",
    "SimpleProgressBar",
]

# Register with HuggingFace Auto classes for local usage
try:
    from transformers import AutoConfig, AutoModel, AutoModelForMaskedLM

    AutoConfig.register("rnd1", RND1Config)
    AutoModel.register(RND1Config, RND1Model)
    AutoModelForMaskedLM.register(RND1Config, RND1LM)
except ImportError:
    # transformers not available or Auto classes not importable
    pass
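With this registration in place, importing the package is enough for the stock Auto classes to resolve RND1. A minimal sketch, assuming rnd and transformers are installed:

    import rnd  # runs the Auto-class registration at import time
    from transformers import AutoConfig, AutoModelForMaskedLM

    cfg = AutoConfig.for_model("rnd1")   # resolves to rnd.RND1Config
    # Loading actual weights works the same way, e.g.:
    # model = AutoModelForMaskedLM.from_pretrained("radicalnumerics/RND1-Base-0910", trust_remote_code=True)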
rnd/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (1.07 kB)
rnd/__pycache__/configuration_rnd.cpython-310.pyc ADDED
Binary file (3.49 kB)
rnd/__pycache__/generation_config.cpython-310.pyc ADDED
Binary file (2.34 kB)
rnd/__pycache__/generation_utils.cpython-310.pyc ADDED
Binary file (5.54 kB)
rnd/__pycache__/modeling_rnd.cpython-310.pyc ADDED
Binary file (16.2 kB)
rnd/__pycache__/sampling.cpython-310.pyc ADDED
Binary file (7.07 kB)
rnd/__pycache__/terminal_visualizer.cpython-310.pyc ADDED
Binary file (7.31 kB)
rnd/configuration_rnd.py ADDED
@@ -0,0 +1,123 @@
# Copyright 2025 Radical Numerics Inc.
#
# This source code is licensed under the Apache License, Version 2.0, found in the
# LICENSE file in the root directory of this source tree.

"""
RND1 Model Configuration.

This module defines the configuration class for RND1 models.
The default settings are derived from Qwen/Qwen3-30B-A3B and augmented
with RND1-specific parameters.
"""

from transformers.configuration_utils import PretrainedConfig

# Qwen3-30B-A3B / checkpoint defaults
CONFIG_DEFAULTS = {
    "attention_bias": False,
    "attention_dropout": 0.0,
    "bos_token_id": 151643,
    "decoder_sparse_step": 1,
    "eos_token_id": 151645,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 2048,
    "initializer_range": 0.02,
    "intermediate_size": 6144,
    "max_position_embeddings": 40960,
    "max_window_layers": 48,
    "mlp_only_layers": [],
    "moe_intermediate_size": 768,
    "norm_topk_prob": True,
    "num_attention_heads": 32,
    "num_experts": 128,
    "num_experts_per_tok": 8,
    "num_hidden_layers": 48,
    "num_key_value_heads": 4,
    "output_router_logits": False,
    "rms_norm_eps": 1e-06,
    "rope_scaling": False,
    "rope_theta": 1000000.0,
    "router_aux_loss_coef": 0.001,
    "sliding_window": False,
    "tie_word_embeddings": False,
    "torch_dtype": "bfloat16",
    "use_cache": False,
    "use_sliding_window": False,
    "vocab_size": 151936,
}


class RND1Config(PretrainedConfig):
    """
    Configuration class for RND1 models.

    This configuration mirrors Qwen3MoeConfig and adds parameters
    specific to the RND1 (Radical Numerics Diffusion v1) architecture.

    Args:
        moe_backend: Backend for MoE computation ("hf", "flashinfer", or "sglang")
        num_diffusion_steps: Default number of diffusion steps for generation
        mask_token_id: Token ID used for masking (default: 151669 for Qwen)
        **kwargs: Additional arguments passed to PretrainedConfig
    """

    model_type = "rnd1"

    def __init__(
        self,
        moe_backend: str = "hf",
        num_diffusion_steps: int = 256,
        mask_token_id: int = 151669,
        **kwargs,
    ):
        # Force non-causal attention and no KV caching for RND1
        kwargs["use_cache"] = False
        kwargs["is_causal"] = False

        super().__init__(**kwargs)

        # Set defaults after the pretrained init to prevent overrides
        self.set_config_defaults()

        # QoL: set the attention implementation directly from the config
        if "attn_implementation" in kwargs:
            self._attn_implementation = kwargs["attn_implementation"]

        # RND1-specific parameters
        self.moe_backend = moe_backend
        self.num_diffusion_steps = num_diffusion_steps
        self.mask_token_id = mask_token_id

        # Ensure bidirectional attention and no caching
        self.is_causal = False
        self.use_cache = False

    def set_config_defaults(self):
        """
        Ensure model defaults are set according to the final training checkpoint.

        Qwen3MoeConfig defaults don't match the Qwen/Qwen3-30B-A3B settings
        from which RND1 is derived.
        """
        for k, v in CONFIG_DEFAULTS.items():
            setattr(self, k, v)

    def to_dict(self):
        """
        Serializes the configuration to a dictionary with an auto_map for the Hub.

        The auto_map ensures that when users load from the HuggingFace Hub,
        the correct custom classes are automatically resolved.
        """
        data = super().to_dict()
        data.setdefault(
            "auto_map",
            {
                "AutoConfig": "configuration_rnd.RND1Config",
                "AutoModel": "modeling_rnd.RND1Model",
                "AutoModelForMaskedLM": "modeling_rnd.RND1LM",
            },
        )
        return data
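A quick sketch of the forced settings above (constructing the config is cheap and involves no weights):

    from rnd.configuration_rnd import RND1Config

    cfg = RND1Config(moe_backend="hf")
    print(cfg.is_causal, cfg.use_cache)              # False False, forced for diffusion
    print(cfg.num_experts, cfg.num_experts_per_tok)  # 128 8, Qwen3-30B-A3B defaults
    print("auto_map" in cfg.to_dict())               # True, injected for Hub loading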
rnd/generation_config.py ADDED
@@ -0,0 +1,77 @@
# Copyright 2025 Radical Numerics Inc.
#
# This source code is licensed under the Apache License, Version 2.0, found in the
# LICENSE file in the root directory of this source tree.

"""
RND1 Generation Configuration.

This module defines the generation configuration for RND1 models,
controlling the diffusion-based generation process.
"""

from typing import Optional
from transformers.generation.configuration_utils import GenerationConfig


class RND1GenerationConfig(GenerationConfig):
    """
    Configuration class for RND1 generation parameters.

    This class extends the base GenerationConfig to include parameters
    specific to diffusion-based language generation.

    Args:
        max_length: Maximum sequence length
        num_diffusion_steps: Number of denoising steps in the diffusion process
        mask_token_id: Token ID used for masking during diffusion
        temperature: Temperature for sampling (higher = more random)
        top_k: Optional top-k filtering
        top_p: Optional nucleus (top-p) filtering
        greedy: Whether to use greedy decoding (True) or stochastic sampling (False)
        **kwargs: Additional arguments passed to GenerationConfig
    """

    def __init__(
        self,
        max_length: int = 256,
        num_diffusion_steps: int = 256,
        mask_token_id: int = 151669,
        temperature: float = 1.0,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        greedy: bool = True,
        bos_token_id: Optional[int] = None,
        eos_token_id: Optional[int] = None,
        pad_token_id: Optional[int] = None,
        use_cache: bool = False,
        **kwargs,
    ):
        # use_cache is accepted for API compatibility but always forced to
        # False below: RND1 generation never uses a KV cache.
        kwargs.pop('use_cache', None)
        super().__init__(
            max_length=max_length,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            do_sample=not greedy,
            use_cache=False,
            **kwargs,
        )

        # RND-specific parameters
        self.num_diffusion_steps = num_diffusion_steps
        self.mask_token_id = mask_token_id
        self.greedy = greedy

    def to_dict(self):
        """Convert the configuration to a dictionary."""
        output = super().to_dict()
        output["num_diffusion_steps"] = self.num_diffusion_steps
        output["mask_token_id"] = self.mask_token_id
        output["greedy"] = self.greedy
        return output
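A minimal sketch of how the greedy flag maps onto the stock HF fields, assuming the rnd package is importable:

    from rnd.generation_config import RND1GenerationConfig

    gc = RND1GenerationConfig(
        max_new_tokens=128,       # forwarded to GenerationConfig via **kwargs
        num_diffusion_steps=64,
        greedy=False,             # stochastic sampling
        temperature=0.8,
        top_p=0.9,
    )
    print(gc.do_sample, gc.use_cache)            # True False
    print(gc.to_dict()["num_diffusion_steps"])   # 64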
rnd/generation_utils.py ADDED
@@ -0,0 +1,196 @@
# Copyright 2025 Radical Numerics Inc.
#
# This source code is licensed under the Apache License, Version 2.0, found in the
# LICENSE file in the root directory of this source tree.

"""
RND1 Generation Utilities.

This module provides generation utilities and mixins for RND1 models,
including the main GenerationMixin class that integrates with HuggingFace.
"""

import torch
import torch.nn as nn
from typing import Optional, Union, Dict, Any
from transformers import GenerationMixin as HFGenerationMixin
from transformers.generation import GenerationConfig

from .sampling import diffusion_sample, apply_top_k_filtering, apply_top_p_filtering


class RND1GenerationMixin(HFGenerationMixin):
    """
    Generation mixin for RND1 models.

    This mixin provides generation methods compatible with HuggingFace's
    generation API while using RND1's diffusion-based sampling internally.
    """

    def generate(
        self,
        inputs: Optional[torch.LongTensor] = None,
        generation_config: Optional[GenerationConfig] = None,
        # RND1-specific parameters
        prefix_ids: Optional[torch.LongTensor] = None,
        suffix_ids: Optional[torch.LongTensor] = None,
        infill_length: Optional[int] = None,
        return_dict_in_generate: Optional[bool] = None,
        **kwargs,  # Accept all kwargs to be compatible with pipelines
    ) -> Union[torch.LongTensor, Dict[str, Any]]:
        """
        Generate text using RND1's diffusion-based sampling.

        Follows HuggingFace's standard generate API, using diffusion sampling
        internally. Supports both standard generation and infilling.

        Args:
            inputs: Input token IDs to use as prefix (standard HF parameter)
            generation_config: Generation configuration object
            prefix_ids: Alternative to inputs for infilling tasks
            suffix_ids: Optional suffix for infilling tasks
            infill_length: Length of infill region (for infilling)
            return_dict_in_generate: Whether to return GenerateDecoderOnlyOutput
            **kwargs: Additional arguments (accepted for compatibility)

        Returns:
            Generated token IDs or GenerateDecoderOnlyOutput
        """
        if generation_config is not None:
            gen_config = generation_config
            model_kwargs = kwargs.copy()
        else:
            # Only prepare a config from kwargs if no config was provided
            gen_config, model_kwargs = self._prepare_generation_config(None, **kwargs)

        device = next(self.parameters()).device

        if inputs is not None:
            prefix_ids = inputs.to(device)
        elif prefix_ids is not None:
            prefix_ids = prefix_ids.to(device)
        else:
            prefix_ids = None

        if suffix_ids is not None:
            suffix_ids = suffix_ids.to(device)

        eos_token_id = gen_config.eos_token_id or getattr(self.config, "eos_token_id", 151645)
        pad_token_id = gen_config.pad_token_id or getattr(self.config, "pad_token_id", None)
        bos_token_id = gen_config.bos_token_id or getattr(self.config, "bos_token_id", None)
        mask_token_id = getattr(gen_config, "mask_token_id", getattr(self.config, "mask_token_id", 151669))

        if infill_length is not None and prefix_ids is not None:
            # Infilling mode: use the specified infill_length
            prefix_len = prefix_ids.shape[1] if prefix_ids is not None else 0
            suffix_len = suffix_ids.shape[1] if suffix_ids is not None else 0
            seq_len = prefix_len + infill_length + suffix_len
        else:
            # Standard generation mode
            if prefix_ids is not None:
                prefix_len = prefix_ids.shape[1]
                if gen_config.max_new_tokens is not None:
                    seq_len = prefix_len + gen_config.max_new_tokens
                else:
                    seq_len = gen_config.max_length or self.config.max_position_embeddings
            else:
                seq_len = gen_config.max_length or self.config.max_position_embeddings

        num_diffusion_steps = getattr(gen_config, "num_diffusion_steps",
                                      getattr(self.config, "num_diffusion_steps", 256))

        temperature = float(getattr(gen_config, "temperature", 1.0))
        top_k = getattr(gen_config, "top_k", None)
        top_p = getattr(gen_config, "top_p", None)

        greedy = getattr(gen_config, "greedy",
                         not bool(gen_config.do_sample) if hasattr(gen_config, "do_sample") else True)

        generator = model_kwargs.get("generator", None)
        if generator is None:
            seed = getattr(gen_config, 'seed', None)
            if seed is not None:
                generator = torch.Generator(device=device)
                generator.manual_seed(seed)

        with torch.inference_mode():
            sequences = diffusion_sample(
                model=self,
                seq_len=seq_len,
                num_steps=num_diffusion_steps,
                mask_token_id=mask_token_id,
                temperature=temperature,
                top_k=top_k,
                top_p=top_p,
                greedy=greedy,
                prefix_ids=prefix_ids,
                suffix_ids=suffix_ids,
                infill_length=infill_length,
                eos_token_id=eos_token_id,
                pad_token_id=pad_token_id,
                bos_token_id=bos_token_id,
                device=device,
                generator=generator,
                visualizer=model_kwargs.get("visualizer", None),  # Optional visualizer from kwargs
            )

        if return_dict_in_generate or getattr(gen_config, "return_dict_in_generate", False):
            from transformers.generation.utils import GenerateDecoderOnlyOutput
            return GenerateDecoderOnlyOutput(sequences=sequences)

        return sequences

    def generate_with_visualization(
        self,
        tokenizer,
        inputs: Optional[torch.LongTensor] = None,
        generation_config: Optional[GenerationConfig] = None,
        suffix_ids: Optional[torch.LongTensor] = None,
        infill_length: Optional[int] = None,
        generator: Optional[torch.Generator] = None,
        **kwargs,
    ) -> torch.LongTensor:
        """
        Generate with live visualization (for demos).

        This method requires a tokenizer to display the generation process.
        For production use, prefer `generate()`.

        Args:
            tokenizer: Tokenizer for decoding tokens to text
            inputs: Input token IDs to use as prefix
            generation_config: Generation configuration object
            suffix_ids: Optional suffix token IDs
            infill_length: Length of infill region
            generator: Random generator for reproducibility
            **kwargs: Additional arguments for backward compatibility

        Returns:
            Generated token IDs as LongTensor
        """
        from .terminal_visualizer import TerminalVisualizer
        visualizer = TerminalVisualizer(tokenizer, show_visualization=True)

        return self.generate(
            inputs=inputs,
            generation_config=generation_config,
            suffix_ids=suffix_ids,
            infill_length=infill_length,
            generator=generator,
            visualizer=visualizer,
            return_dict_in_generate=False,
            **kwargs,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        **kwargs,
    ) -> Dict[str, Any]:
        """
        Prepare inputs for generation (required by HuggingFace).

        RND1 does not use standard autoregressive generation,
        so this just returns the input_ids.
        """
        return {"input_ids": input_ids}
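Because generate() accepts suffix_ids and infill_length on top of the stock HF signature, infilling can be sketched as follows; model, tokenizer, and gen_config are assumed to be set up as in the demo script, and the literal strings are placeholders:

    prefix = tokenizer("def add(a, b):\n", return_tensors="pt").input_ids
    suffix = tokenizer("\n    return result\n", return_tensors="pt").input_ids

    out = model.generate(
        inputs=prefix,
        suffix_ids=suffix,
        infill_length=32,               # 32 masked positions between prefix and suffix
        generation_config=gen_config,
    )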
rnd/modeling_rnd.py ADDED
@@ -0,0 +1,534 @@
# Copyright 2025 Radical Numerics Inc.
#
# This source code is licensed under the Apache License, Version 2.0, found in the
# LICENSE file in the root directory of this source tree.

"""
RND1 model implementation.

This module implements the RND1 architecture with bidirectional attention for
diffusion-based language modeling. Includes support for Mixture of Experts (MoE)
with multiple backend options (HF, FlashInfer, SGLang).

Based on the Qwen3Moe architecture:
https://github.com/huggingface/transformers/blob/v4.57.0/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py
"""

from __future__ import annotations

import os
from typing import Optional, Tuple, List, Union

import torch
from torch import nn
import torch.nn.functional as F

from transformers.utils import logging
from transformers.cache_utils import Cache
from transformers.modeling_outputs import (
    MoeModelOutputWithPast,
    MaskedLMOutput,
)
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig
from transformers.generation import GenerationConfig

from .configuration_rnd import RND1Config
from .generation_utils import RND1GenerationMixin
from .generation_config import RND1GenerationConfig

from transformers.models.qwen3_moe.modeling_qwen3_moe import (
    Qwen3MoeConfig,
    Qwen3MoeRMSNorm,
    Qwen3MoeRotaryEmbedding,
    Qwen3MoeSparseMoeBlock,
    Qwen3MoeMLP,
    apply_rotary_pos_emb,
)

try:
    import flashinfer.fused_moe as fused_moe
except Exception:
    fused_moe = None

try:
    from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe as sglang_fused_moe
    from sglang.srt.layers.moe.topk import StandardTopKOutput
except Exception:
    sglang_fused_moe = None
    StandardTopKOutput = None

logger = logging.get_logger(__name__)


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Expand key/value heads to match query heads for grouped-query attention."""
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


class RND1Attention(nn.Module):
    """RND1 attention layer with bidirectional attention for diffusion modeling."""

    def __init__(self, config: RND1Config, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx

        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads

        self.scaling = self.head_dim ** -0.5
        self.attention_dropout = config.attention_dropout
        self.is_causal = False

        self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
        self.k_proj = nn.Linear(config.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.v_proj = nn.Linear(config.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=config.attention_bias)

        self.q_norm = Qwen3MoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = Qwen3MoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)

        self.sliding_window = getattr(config, "sliding_window", None)

        self.rotary_emb = Qwen3MoeRotaryEmbedding(config=config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, Tuple[torch.Tensor, torch.Tensor]]] = None,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        dual_cache: Optional[bool] = False,
        replace_position: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:

        bsz, q_len, _ = hidden_states.size()
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        use_sdpa = (getattr(self.config, "_attn_implementation", "eager") == "sdpa")

        if use_sdpa:
            if attention_mask is not None and isinstance(attention_mask, torch.Tensor):
                if attention_mask.dtype not in [torch.bool, torch.float32, torch.float16, torch.bfloat16]:
                    attention_mask = attention_mask.to(dtype=query_states.dtype)

            assert not self.is_causal, f"Attention layer {self.layer_idx} is causal"
            attn_out = torch.nn.functional.scaled_dot_product_attention(
                query_states, key_states, value_states,
                attn_mask=attention_mask if isinstance(attention_mask, torch.Tensor) else None,
                dropout_p=self.attention_dropout if self.training else 0.0,
                is_causal=self.is_causal,
            )
            attn_out = attn_out.transpose(1, 2).contiguous()
            attn_out = attn_out.view(bsz, q_len, self.num_heads * self.head_dim)
            attn_out = self.o_proj(attn_out)
            return attn_out, None

        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling

        if attention_mask is not None:
            attn_weights = attn_weights + attention_mask[:, :, :, : key_states.shape[-2]]

        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)

        attn_out = torch.matmul(attn_weights, value_states)
        attn_out = attn_out.transpose(1, 2).contiguous().view(bsz, q_len, -1)
        attn_out = self.o_proj(attn_out)

        return attn_out, None


class RND1DecoderLayer(nn.Module):
    """RND1 decoder layer with bidirectional attention for diffusion language modeling."""

    def __init__(self, config: RND1Config, layer_idx: int):
        super().__init__()
        self.self_attn = RND1Attention(config, layer_idx)
        self.mlp = RND1SparseMoeBlock(config)
        self.input_layernorm = Qwen3MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Qwen3MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        replace_position: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Tuple[torch.FloatTensor, Optional[torch.Tensor]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        attn_out, attn_weights = self.self_attn(
            hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            position_embeddings=position_embeddings,
            replace_position=replace_position,
        )
        hidden_states = residual + attn_out

        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        ff_out = self.mlp(hidden_states)
        if isinstance(ff_out, tuple):
            ff_out = ff_out[0]
        hidden_states = residual + ff_out

        return hidden_states, attn_weights


class RND1SparseMoeBlock(nn.Module):
    """RND1 Sparse MoE block with multiple backend support (HF, FlashInfer, SGLang)."""

    def __init__(self, config: RND1Config):
        super().__init__()
        self.config = config
        self.backend = getattr(config, "moe_backend", "hf")
        self.num_experts = config.num_experts
        self.top_k = config.num_experts_per_tok
        self.norm_topk_prob = config.norm_topk_prob
        self.hidden_size = config.hidden_size
        self.intermediate_size = getattr(config, "moe_intermediate_size", config.intermediate_size)

        self.gate = nn.Linear(self.hidden_size, self.num_experts, bias=False)
        self.experts = nn.ModuleList(
            [Qwen3MoeMLP(config, intermediate_size=self.intermediate_size) for _ in range(self.num_experts)]
        )

        # Cached weight tensors for optimized backends
        self._flashinfer_fc1_weights = None
        self._flashinfer_fc2_weights = None
        self._sglang_w1 = None
        self._sglang_w2 = None
        if self.backend == "sglang":
            if sglang_fused_moe is None or StandardTopKOutput is None:
                raise RuntimeError("sglang is not available, cannot use sglang backend")
        elif self.backend == "flashinfer":
            if fused_moe is None:
                raise RuntimeError("flashinfer is not available, cannot use flashinfer backend")

    def _initialize_flashinfer_weights(self):
        """Initialize FlashInfer-compatible weight format."""
        fc1_list = []
        fc2_list = []

        for expert in self.experts:
            gate_w = expert.gate_proj.weight  # [I, H]
            up_w = expert.up_proj.weight      # [I, H]
            down_w = expert.down_proj.weight  # [H, I]
            # FlashInfer expects [up; gate] ordering
            fc1_list.append(torch.cat([up_w, gate_w], dim=0))  # [2I, H]
            fc2_list.append(down_w)                            # [H, I]

        self._flashinfer_fc1_weights = torch.stack(fc1_list, dim=0).contiguous()
        self._flashinfer_fc2_weights = torch.stack(fc2_list, dim=0).contiguous()

    def _initialize_sglang_weights(self):
        """Initialize SGLang-compatible weight format."""
        w1_list = []
        w2_list = []

        for expert in self.experts:
            gate_w = expert.gate_proj.weight  # [I, H]
            up_w = expert.up_proj.weight      # [I, H]
            down_w = expert.down_proj.weight  # [H, I]
            w1 = torch.cat([gate_w, up_w], dim=0)  # [2I, H]
            w1_list.append(w1)
            w2_list.append(down_w)

        self._sglang_w1 = torch.stack(w1_list, dim=0).contiguous()
        self._sglang_w2 = torch.stack(w2_list, dim=0).contiguous()

    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Forward pass with expert routing and computation."""
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        x = hidden_states.view(-1, hidden_dim)

        # Expert routing
        router_logits = self.gate(x)
        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        if self.norm_topk_prob:
            routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)

        if self.backend == "hf":
            final_hidden_states = torch.zeros(
                (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
            )

            expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
            expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()

            for expert_idx in expert_hit:
                expert_layer = self.experts[expert_idx]
                idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
                current_state = x[top_x]
                current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
                final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
            out = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
            return out, router_logits.view(batch_size, sequence_length, -1)

        elif self.backend == "flashinfer":
            if self._flashinfer_fc1_weights is None or self._flashinfer_fc2_weights is None:
                self._initialize_flashinfer_weights()

            result = fused_moe.cutlass_fused_moe(
                input=x,
                token_selected_experts=selected_experts.to(torch.int),
                token_final_scales=routing_weights.to(torch.float32),
                fc1_expert_weights=self._flashinfer_fc1_weights,
                fc2_expert_weights=self._flashinfer_fc2_weights,
                output_dtype=x.dtype,
                quant_scales=None,
            )
            if isinstance(result, (list, tuple)):
                out_flat = result[0]
            else:
                out_flat = result
            out = out_flat.view(batch_size, sequence_length, hidden_dim)
            return out, router_logits.view(batch_size, sequence_length, -1)

        elif self.backend == "sglang":
            if self._sglang_w1 is None or self._sglang_w2 is None:
                self._initialize_sglang_weights()

            topk_output = StandardTopKOutput(
                topk_weights=routing_weights,
                topk_ids=selected_experts,
                router_logits=router_logits,
            )

            out_flat = sglang_fused_moe(
                hidden_states=x,
                w1=self._sglang_w1,
                w2=self._sglang_w2,
                topk_output=topk_output,
            )
            out = out_flat.view(batch_size, sequence_length, hidden_dim)
            return out, router_logits.view(batch_size, sequence_length, -1)

        else:
            raise ValueError(f"Invalid backend: {self.backend}")


class RND1PreTrainedModel(PreTrainedModel):
    """Base class for RND1 models with weight initialization and loading support."""
    config_class = RND1Config
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["RND1DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_sdpa = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True

    def _init_weights(self, module):
        """Initialize weights using a normal distribution."""
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
        *model_args,
        config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
        cache_dir: Optional[Union[str, os.PathLike]] = None,
        ignore_mismatched_sizes: bool = False,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Optional[Union[str, bool]] = None,
        revision: str = "main",
        use_safetensors: Optional[bool] = None,
        weights_only: bool = True,
        **kwargs,
    ):
        """Load a pretrained model together with its generation config."""
        _model = super().from_pretrained(
            pretrained_model_name_or_path,
            *model_args,
            config=config,
            cache_dir=cache_dir,
            ignore_mismatched_sizes=ignore_mismatched_sizes,
            force_download=force_download,
            local_files_only=local_files_only,
            token=token,
            revision=revision,
            use_safetensors=use_safetensors,
            weights_only=weights_only,
            **kwargs,
        )

        resume_download = kwargs.get("resume_download", None)
        proxies = kwargs.get("proxies", None)
        subfolder = kwargs.get("subfolder", "")
        from_auto_class = kwargs.get("_from_auto", False)
        from_pipeline = kwargs.get("_from_pipeline", None)

        _model.generation_config = GenerationConfig.from_pretrained(
            pretrained_model_name_or_path,
            cache_dir=cache_dir,
            force_download=force_download,
            resume_download=resume_download,
            proxies=proxies,
            local_files_only=local_files_only,
            token=token,
            revision=revision,
            subfolder=subfolder,
            _from_auto=from_auto_class,
            _from_pipeline=from_pipeline,
        )

        return _model


class RND1Model(RND1PreTrainedModel):
    """RND1 transformer model with bidirectional attention for diffusion language modeling."""

    def __init__(self, config: RND1Config):
        super().__init__(config)

        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList([RND1DecoderLayer(config, i) for i in range(config.num_hidden_layers)])
        self.norm = Qwen3MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.rotary_emb = Qwen3MoeRotaryEmbedding(config=config)

        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        **kwargs,
    ) -> MoeModelOutputWithPast:
        """Forward pass through the RND1 model."""

        if (input_ids is None) == (inputs_embeds is None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if position_ids is None:
            position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device).unsqueeze(0)

        position_embeddings = self.rotary_emb(inputs_embeds, position_ids)

        hidden_states = inputs_embeds

        for layer in self.layers:
            hidden_states, _ = layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                position_embeddings=position_embeddings,
            )

        hidden_states = self.norm(hidden_states)

        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            router_logits=None,
        )


class RND1LM(RND1PreTrainedModel, RND1GenerationMixin):
    """Radical Numerics Diffusion Language Model with bidirectional attention."""

    def __init__(self, config: RND1Config):
        super().__init__(config)
        self.model = RND1Model(config)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        """Get the input embeddings layer."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Set the input embeddings layer."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Get the output embeddings layer (lm_head)."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Set the output embeddings layer (lm_head)."""
        self.lm_head = new_embeddings

    @classmethod
    def can_generate(cls) -> bool:
        """Indicates this model can generate text."""
        return True

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> MaskedLMOutput:
        """Forward pass with optional loss computation."""
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            **kwargs,
        )
        logits = self.lm_head(outputs.last_hidden_state)

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))

        return MaskedLMOutput(
            loss=loss,
            logits=logits,
        )
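As a standalone illustration of the grouped-query expansion implemented by repeat_kv above (shapes follow the config defaults: 32 query heads over 4 KV heads, head_dim 128):

    import torch
    from rnd.modeling_rnd import repeat_kv

    kv = torch.randn(1, 4, 16, 128)   # [batch, num_kv_heads, seq_len, head_dim]
    q_like = repeat_kv(kv, n_rep=8)   # each KV head is shared by 8 query heads
    print(q_like.shape)               # torch.Size([1, 32, 16, 128])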
rnd/sampling.py ADDED
@@ -0,0 +1,259 @@
1
+ # Copyright 2025 Radical Numerics Inc.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0, found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ """
7
+ RND1 sampling module for masked diffusion generation.
8
+
9
+ This module implements entropy-based token selection for iterative denoising
10
+ in diffusion language models. Supports both greedy and stochastic sampling
11
+ with optional prefix/suffix constraints and infilling.
12
+ """
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+ from typing import Optional, Tuple, Union
18
+
19
+
20
+ def apply_top_k_filtering(logits: torch.Tensor, k: int) -> torch.Tensor:
21
+ """
22
+ Apply top-k filtering to logits: with non-top-k values set to -inf
23
+ """
24
+ top_k_values, top_k_indices = torch.topk(logits, min(k, logits.size(-1)), dim=-1)
25
+ filtered_logits = torch.full_like(logits, float('-inf'))
26
+ filtered_logits.scatter_(-1, top_k_indices, top_k_values)
27
+ return filtered_logits
28
+
29
+
30
+ def apply_top_p_filtering(logits: torch.Tensor, p: float) -> torch.Tensor:
31
+ """
32
+ Apply top-p (nucleus) filtering to logits: with tokens beyond threshold set to -inf
33
+ """
34
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
35
+ cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
36
+
37
+ # Remove tokens with cumulative probability above threshold
38
+ sorted_indices_to_remove = cumulative_probs > p
39
+ sorted_indices_to_remove[..., 0] = False # Keep at least one token
40
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
41
+
42
+ indices_to_remove = sorted_indices_to_remove.scatter(-1, sorted_indices, sorted_indices_to_remove)
43
+ return logits.masked_fill(indices_to_remove, float('-inf'))
44
+
45
+
46
+ @torch.no_grad()
47
+ def diffusion_sample(
48
+ model: nn.Module,
49
+ seq_len: int = 256,
50
+ num_steps: int = 256,
51
+ top_k: Optional[int] = None,
52
+ top_p: Optional[float] = None,
53
+ temperature: float = 1.0,
54
+ greedy: bool = True,
55
+ mask_token_id: int = 151669,
56
+ prefix_ids: Optional[torch.LongTensor] = None,
57
+ suffix_ids: Optional[torch.LongTensor] = None,
58
+ infill_length: Optional[int] = None,
59
+ eos_token_id: int = 151645,
60
+ pad_token_id: Optional[int] = None,
61
+ bos_token_id: Optional[int] = None,
62
+ device: Optional[Union[str, torch.device]] = None,
63
+ generator: Optional[torch.Generator] = None,
64
+ visualizer: Optional['TerminalVisualizer'] = None,
65
+ ) -> torch.LongTensor:
66
+ """
67
+ Perform masked diffusion sampling with entropy-based token selection.
68
+
69
+ Args:
70
+ model: The RND1 language model
71
+ seq_len: Target sequence length
72
+ num_steps: Number of denoising steps
73
+ top_k: Optional top-k filtering for sampling (None = no filtering)
74
+ top_p: Optional nucleus (top-p) filtering for sampling (None = no filtering)
75
+ When both top_k and top_p are set, top_k is applied first, then top_p
76
+ temperature: Temperature for sampling (higher = more random, lower = more deterministic)
77
+ Values close to 0 are clamped to 1e-8 to avoid division by zero
78
+ greedy: Whether to use greedy sampling (True) or stochastic (False)
79
+ mask_token_id: Token ID for masked positions (default: 151669)
80
+ prefix_ids: Optional prefix token IDs to preserve
81
+ suffix_ids: Optional suffix token IDs to preserve
82
+ infill_length: Length of infill region between prefix/suffix
83
+ eos_token_id: End of sequence token ID (default: 151645)
84
+ pad_token_id: Padding token ID (default: None, uses 0 if needed)
85
+ bos_token_id: Beginning of sequence token ID (default: None)
86
+ device: Device for computation (None = infer from model)
87
+ generator: Optional torch generator for reproducible sampling
88
+ visualizer: Optional TerminalVisualizer for live visualization
89
+
90
+ Returns:
91
+ Generated token IDs as LongTensor
92
+ """
+     model.eval()
+
+     if device is None:
+         device = next(model.parameters()).device
+     else:
+         device = torch.device(device)
+     dtype = next(model.parameters()).dtype
+
+     if pad_token_id is None:
+         pad_token_id = 0
+
+     # Build the initial masked sequence.
+     # When prefix_ids is provided, we create a sequence of length seq_len where:
+     #   - the prefix occupies the first pre_len positions
+     #   - the remaining (seq_len - pre_len) positions are filled with mask tokens to be generated
+     if prefix_ids is not None or suffix_ids is not None:
+         if prefix_ids is not None:
+             prefix_ids = prefix_ids.to(device) if isinstance(prefix_ids, torch.Tensor) else torch.tensor(prefix_ids, device=device)
+             pre_len = prefix_ids.shape[-1] if prefix_ids.dim() > 0 else 0
+         else:
+             pre_len = 0
+
+         if suffix_ids is not None:
+             suffix_ids = suffix_ids.to(device) if isinstance(suffix_ids, torch.Tensor) else torch.tensor(suffix_ids, device=device)
+             suf_len = suffix_ids.shape[-1] if suffix_ids.dim() > 0 else 0
+         else:
+             suf_len = 0
+
+         reserved = (1 if bos_token_id is not None else 0) + (1 if eos_token_id is not None else 0)
+         used = pre_len + suf_len + reserved
+
+         if used > seq_len:
+             raise ValueError(
+                 f"Combined length of prefix ({pre_len}), suffix ({suf_len}), "
+                 f"and special tokens ({reserved}) = {used} exceeds seq_len ({seq_len}). "
+                 f"Please increase seq_len or reduce input lengths."
+             )
+         elif used == seq_len:
+             raise ValueError(
+                 f"No space for generation: prefix ({pre_len}) + suffix ({suf_len}) "
+                 f"+ special tokens ({reserved}) = seq_len ({seq_len}). "
+                 f"Need at least 1 position for generation."
+             )
+
+         infill_length = min(infill_length or (seq_len - used), seq_len - used)
+
+         x = torch.full((1, seq_len), pad_token_id, dtype=torch.long, device=device)
+         pos = 0
+         if bos_token_id is not None:
+             x[0, pos] = bos_token_id
+             pos += 1
+         if pre_len > 0:
+             x[0, pos:pos + pre_len] = prefix_ids.flatten()[:pre_len]
+             pos += pre_len
+         fill_start, fill_end = pos, pos + infill_length
+         x[0, fill_start:fill_end] = mask_token_id
+         pos = fill_end
+         if suf_len > 0:
+             x[0, pos:pos + suf_len] = suffix_ids.flatten()[:suf_len]
+             pos += suf_len
+
+         init_maskable = torch.zeros_like(x, dtype=torch.bool)
+         init_maskable[0, fill_start:fill_end] = True
+     else:
+         x = torch.full((1, seq_len), mask_token_id, dtype=torch.long, device=device)
+         if bos_token_id is not None:
+             x[0, 0] = bos_token_id
+         if eos_token_id is not None:
+             x[0, -1] = eos_token_id
+         init_maskable = x.eq(mask_token_id)
+
+     # Special tokens are never candidates for remasking/unmasking.
+     if bos_token_id is not None:
+         init_maskable[:, 0] = False
+     if eos_token_id is not None:
+         init_maskable &= x.ne(eos_token_id)
+     init_maskable &= x.ne(pad_token_id)
+
+     maskable = init_maskable.clone()
+     xt = x.clone()
+
+     if visualizer:
+         visualizer.start_visualization(xt, maskable, num_steps)
+
+     def forward_scores(tokens):
+         """Compute predictions and entropy scores for next tokens."""
+         # Try the input_ids keyword first (standard HF models),
+         # falling back to a positional argument.
+         try:
+             model_output = model(input_ids=tokens)
+         except TypeError:
+             model_output = model(tokens)
+
+         # Apply temperature scaling (with safety for near-zero temperature).
+         safe_temperature = max(temperature, 1e-8)  # Prevent division by zero
+         logits = model_output.logits / safe_temperature
+
+         # Apply filtering strategies.
+         # Note: when both top_k and top_p are provided, they are applied sequentially:
+         # top_k first filters to k tokens, then top_p filters from those k tokens.
+         if top_k is not None and top_k > 0:
+             logits = apply_top_k_filtering(logits, top_k)
+
+         if top_p is not None and 0 < top_p < 1.0:
+             logits = apply_top_p_filtering(logits, top_p)
+
+         # Convert to log probabilities.
+         logp = torch.log_softmax(logits, dim=-1)
+
+         # Greedy or stochastic sampling.
+         if greedy:
+             pred_next = logp.argmax(-1)
+         else:
+             # torch.distributions.Categorical.sample() does not accept a generator,
+             # so sample via torch.multinomial to honor the reproducibility seed
+             # (cast to float32, since multinomial does not support all reduced-precision dtypes).
+             probs = logp.float().exp()
+             pred_next = torch.multinomial(
+                 probs.reshape(-1, probs.size(-1)), 1, generator=generator
+             ).reshape(probs.shape[:-1])
+
+         conf_next = torch.gather(logp, -1, pred_next.unsqueeze(-1)).squeeze(-1)
+
+         # Per-position entropy of the predictive distribution.
+         p = logp.exp()
+         ent_next = -(p * logp).sum(-1)
+
+         # Shift predictions: position i predicts token i+1.
+         pred_i = tokens.clone()
+         conf_i = torch.full_like(conf_next, torch.finfo(conf_next.dtype).min)
+         ent_i = torch.zeros_like(ent_next)
+
+         pred_i[:, 1:] = pred_next[:, :-1]
+         conf_i[:, 1:] = conf_next[:, :-1]
+         ent_i[:, 1:] = ent_next[:, :-1]
+
+         return pred_i, conf_i, ent_i
+
+     pred_i, conf_i, ent_i = forward_scores(xt)
+     total_masked = init_maskable.sum(1, keepdim=True)
+     finf = torch.finfo(conf_i.dtype)
+
+     for step in range(num_steps - 1, 0, -1):
+         # Fraction of the originally masked tokens that should remain masked
+         # after this step; it anneals linearly toward zero.
+         rate = step / num_steps
+         cutoff_len = (total_masked * rate).long().clamp(min=0)
+
+         # Choose HIGH-entropy positions to keep masked; everything else is unmasked.
+         sel_scores = ent_i.masked_fill(~maskable, finf.min)
+         B, L = sel_scores.shape
+         k_max = cutoff_len.max().item()
+         if k_max > 0:
+             _, idx = torch.topk(sel_scores, k_max, dim=-1, largest=True)
+             keep_mask = torch.zeros_like(sel_scores, dtype=torch.bool)
+             for b in range(B):
+                 k_b = int(cutoff_len[b].item())
+                 if k_b > 0:
+                     keep_mask[b, idx[b, :k_b]] = True
+         else:
+             keep_mask = torch.zeros_like(sel_scores, dtype=torch.bool)
+
+         # Commit predictions at the positions being unmasked this step.
+         to_unmask = maskable & ~keep_mask
+         if to_unmask.any():
+             xt[to_unmask] = pred_i[to_unmask]
+             maskable[to_unmask] = False
+
+         if visualizer:
+             visualizer.update_step(xt, maskable, num_steps - step, ent_i, conf_i)
+
+         if maskable.any():
+             pred_i, conf_i, ent_i = forward_scores(xt)
+
+     # Any positions still masked after the last step take their current predictions.
+     if maskable.any():
+         xt[maskable] = pred_i[maskable]
+
+     if visualizer:
+         visualizer.stop_visualization()
+
+     return xt
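To make the linear unmasking schedule in the loop above concrete, here is a self-contained sketch; the counts (32 initially masked positions, 8 denoising steps) are made up for illustration:

```python
import torch

total_masked = torch.tensor([[32]])  # made-up: 32 initially masked positions
num_steps = 8                        # made-up: 8 denoising steps

for step in range(num_steps - 1, 0, -1):
    rate = step / num_steps
    cutoff_len = (total_masked * rate).long().clamp(min=0)
    print(f"after step {num_steps - step}: {cutoff_len.item()} positions remain masked")

# after step 1: 28 positions remain masked
# after step 2: 24 positions remain masked
# ...
# after step 7: 4 positions remain masked
# (the final pass outside the loop commits predictions for whatever is left)
```

At each step the `cutoff_len` highest-entropy positions stay masked and every other masked position is committed, so the easiest (lowest-entropy) tokens are finalized first.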
rnd/terminal_visualizer.py ADDED
@@ -0,0 +1,251 @@
+ # Copyright 2025 Radical Numerics Inc.
+ #
+ # This source code is licensed under the Apache License, Version 2.0, found in the
+ # LICENSE file in the root directory of this source tree.
+
+ """
+ Terminal visualization for RND1 generation.
+
+ This module provides real-time visualization of the diffusion denoising process,
+ showing token evolution and generation progress in the terminal using rich
+ formatting when available.
+ """
+
+ import torch
+ from typing import Optional
+ from tqdm import tqdm
+
+ try:
+     from rich.console import Console
+     from rich.live import Live
+     from rich.text import Text
+     from rich.panel import Panel
+     from rich.progress import Progress, BarColumn, TextColumn, TimeRemainingColumn, MofNCompleteColumn
+     from rich.layout import Layout
+     RICH_AVAILABLE = True
+ except ImportError:
+     RICH_AVAILABLE = False
+
+
+ class TerminalVisualizer:
+     """
+     Rich-based visualization of the diffusion process with live updates.
+
+     Provides real-time visualization of the token denoising process during
+     diffusion-based language generation, with colored highlighting of masked
+     positions and progress tracking.
+     """
+
+     def __init__(self, tokenizer, show_visualization: bool = True):
+         """
+         Initialize the terminal visualizer.
+
+         Args:
+             tokenizer: The tokenizer for decoding tokens to text
+             show_visualization: Whether to show visualization (requires rich)
+         """
+         self.tokenizer = tokenizer
+         self.show_visualization = show_visualization and RICH_AVAILABLE
+         if not RICH_AVAILABLE and show_visualization:
+             print("Warning: Install 'rich' for better visualization. Falling back to simple progress bar.")
+             self.show_visualization = False
+
+         if self.show_visualization:
+             self.console = Console()
+             self.live = None
+             self.progress = None
+             self.layout = None
+         else:
+             self.pbar = None
+
+         self.current_tokens = None
+         self.mask_positions = None
+         self.total_steps = 0
+         self.current_step = 0
+
+     def start_visualization(self, initial_tokens: torch.LongTensor, mask_positions: torch.BoolTensor, total_steps: int):
+         """
+         Start the visualization.
+
+         Args:
+             initial_tokens: Initial token IDs (possibly masked)
+             mask_positions: Boolean mask indicating which positions are masked
+             total_steps: Total number of diffusion steps
+         """
+         if not self.show_visualization:
+             self.pbar = tqdm(total=total_steps, desc="Diffusion")
+             return
+
+         self.current_tokens = initial_tokens.clone()
+         self.mask_positions = mask_positions
+         self.total_steps = total_steps
+         self.current_step = 0
+
+         self.layout = Layout()
+         self.layout.split_column(
+             Layout(name="header", size=3),
+             Layout(name="text", ratio=1),
+             Layout(name="progress", size=3)
+         )
+
+         self.progress = Progress(
+             TextColumn("[bold blue]Diffusion"),
+             BarColumn(),
+             MofNCompleteColumn(),
+             TextColumn("•"),
+             TextColumn("[cyan]Masks: {task.fields[masks]}"),
+             TimeRemainingColumn(),
+         )
+         self.progress_task = self.progress.add_task(
+             "Generating",
+             total=total_steps,
+             masks=mask_positions.sum().item()
+         )
+
+         self.live = Live(self.layout, console=self.console, refresh_per_second=4)
+         self.live.start()
+         self._update_display()
+
+     def update_step(self, tokens: torch.LongTensor, maskable: Optional[torch.BoolTensor], step: int,
+                     entropy: Optional[torch.FloatTensor] = None, confidence: Optional[torch.FloatTensor] = None):
+         """
+         Update visualization for the current step.
+
+         Args:
+             tokens: Current token IDs
+             maskable: Boolean mask of remaining masked positions
+             step: Current step number
+             entropy: Optional entropy scores for each position
+             confidence: Optional confidence scores for each position
+         """
+         if not self.show_visualization:
+             if self.pbar:
+                 self.pbar.update(1)
+                 masks = maskable.sum().item() if maskable is not None else 0
+                 self.pbar.set_postfix({'masks': masks})
+             return
+
+         self.current_tokens = tokens.clone()
+         self.mask_positions = maskable
+         self.current_step = step
+
+         masks_remaining = maskable.sum().item() if maskable is not None else 0
+         self.progress.update(
+             self.progress_task,
+             advance=1,
+             masks=masks_remaining
+         )
+
+         self._update_display()
+
+     def _update_display(self):
+         """Update the live display."""
+         if not self.live:
+             return
+
+         header = Text("RND1-Base Generation", style="bold magenta", justify="center")
+         self.layout["header"].update(Panel(header, border_style="bright_blue"))
+
+         text_display = self._format_text_with_masks()
+         self.layout["text"].update(
+             Panel(
+                 text_display,
+                 title="[bold]Generated Text",
+                 subtitle=f"[dim]Step {self.current_step}/{self.total_steps}[/dim]",
+                 border_style="cyan"
+             )
+         )
+
+         self.layout["progress"].update(Panel(self.progress))
+
+     def _format_text_with_masks(self) -> Text:
+         """
+         Format text with colored masks.
+
+         Returns:
+             Rich Text object with formatted tokens
+         """
+         text = Text()
+
+         if self.current_tokens is None:
+             return text
+
+         token_ids = self.current_tokens[0] if self.current_tokens.dim() > 1 else self.current_tokens
+         mask_flags = self.mask_positions[0] if self.mask_positions is not None and self.mask_positions.dim() > 1 else self.mask_positions
+
+         for i, token_id in enumerate(token_ids):
+             if mask_flags is not None and i < len(mask_flags) and mask_flags[i]:
+                 # Alternate colors between steps for a visual effect
+                 text.append("[MASK]", style="bold red on yellow" if self.current_step % 2 == 0 else "bold yellow on red")
+             else:
+                 try:
+                     token_str = self.tokenizer.decode([token_id.item()], skip_special_tokens=False)
+                     # Skip special tokens in the display
+                     if token_str not in ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<s>", "</s>"]:
+                         # Color by position: first half green, second half cyan
+                         text.append(token_str, style="green" if i < len(token_ids) // 2 else "cyan")
+                 except Exception:
+                     continue
+
+         return text
+
+     def stop_visualization(self):
+         """Stop the visualization and display the final result."""
+         if not self.show_visualization:
+             if self.pbar:
+                 self.pbar.close()
+             print("\n✨ Generation complete!\n")
+             return
+
+         if self.live:
+             self.live.stop()
+
+         self.console.print("\n[bold green]✨ Generation complete![/bold green]\n")
+
+         # Display the final decoded text
+         if self.current_tokens is not None:
+             try:
+                 token_ids = self.current_tokens[0] if self.current_tokens.dim() > 1 else self.current_tokens
+                 final_text = self.tokenizer.decode(token_ids, skip_special_tokens=True)
+
+                 self.console.print(Panel(
+                     final_text,
+                     title="[bold]Final Generated Text",
+                     border_style="green",
+                     padding=(1, 2)
+                 ))
+             except Exception:
+                 pass
+
+
+ class SimpleProgressBar:
+     """
+     Simple progress bar fallback when rich is not available.
+
+     Provides basic progress tracking using tqdm when the rich library
+     is not installed.
+     """
+
+     def __init__(self, total_steps: int):
+         """
+         Initialize the simple progress bar.
+
+         Args:
+             total_steps: Total number of steps
+         """
+         self.pbar = tqdm(total=total_steps, desc="Diffusion")
+
+     def update(self, masks_remaining: int = 0):
+         """
+         Update the progress bar.
+
+         Args:
+             masks_remaining: Number of masks still remaining
+         """
+         self.pbar.update(1)
+         self.pbar.set_postfix({'masks': masks_remaining})
+
+     def close(self):
+         """Close the progress bar."""
+         self.pbar.close()
+         print("\n✨ Generation complete!\n")
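For completeness, a small smoke test for `TerminalVisualizer` is sketched below. This is illustrative only: the `DummyTokenizer` is a made-up stand-in, the import path assumes the repo root is on `sys.path`, and in actual use the visualizer is driven by `diffusion_sample` rather than called by hand:

```python
import torch
from rnd.terminal_visualizer import TerminalVisualizer  # assumed import path

class DummyTokenizer:
    """Made-up stand-in so the example runs without a real tokenizer."""
    def decode(self, ids, skip_special_tokens=False):
        return " ".join(f"<{int(i)}>" for i in ids)

tokens = torch.randint(0, 100, (1, 16))
masked = torch.zeros(1, 16, dtype=torch.bool)
masked[0, 4:12] = True  # pretend positions 4..11 start out masked

viz = TerminalVisualizer(DummyTokenizer(), show_visualization=True)
viz.start_visualization(tokens, masked, total_steps=8)
for step in range(1, 9):
    masked[0, 3 + step] = False  # unmask one more position per step
    viz.update_step(tokens, masked, step)
viz.stop_visualization()
```

If `rich` is not installed, the same calls fall back to the plain tqdm progress bar, so the snippet exercises both code paths.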