# Copyright 2025 Radical Numerics Inc.
#
# This source code is licensed under the Apache License, Version 2.0, found in the
# LICENSE file in the root directory of this source tree.
"""
RND1 Model Configuration.

This module defines the configuration class for RND1 models. The default
settings are derived from Qwen/Qwen3-30B-A3B and augmented with RND1-specific
parameters.
"""

from transformers.configuration_utils import PretrainedConfig

# Qwen3-30B-A3B / checkpoint defaults
CONFIG_DEFAULTS = {
    "attention_bias": False,
    "attention_dropout": 0.0,
    "decoder_sparse_step": 1,
    "eos_token_id": 151645,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 2048,
    "initializer_range": 0.02,
    "intermediate_size": 6144,
    "max_position_embeddings": 40960,
    "max_window_layers": 48,
    "mlp_only_layers": [],
    "moe_intermediate_size": 768,
    "norm_topk_prob": True,
    "num_attention_heads": 32,
    "num_experts": 128,
    "num_experts_per_tok": 8,
    "num_hidden_layers": 48,
    "num_key_value_heads": 4,
    "output_router_logits": False,
    "pad_token_id": 151643,
    "rms_norm_eps": 1e-06,
    "rope_scaling": False,
    "rope_theta": 1000000.0,
    "router_aux_loss_coef": 0.001,
    "sliding_window": False,
    "tie_word_embeddings": False,
    "torch_dtype": "bfloat16",
    "use_cache": False,
    "use_sliding_window": False,
    "vocab_size": 151936,
}


class RND1Config(PretrainedConfig):
    """
    Configuration class for RND1 models.

    Inherits from ``PretrainedConfig`` with defaults taken from the
    Qwen/Qwen3-30B-A3B checkpoint (see ``CONFIG_DEFAULTS``), augmented with
    parameters specific to the RND1 (Radical Numerics Diffusion v1)
    architecture. RND1 is non-causal and never uses a KV cache; those two
    settings are forced regardless of what the caller passes.

    Args:
        moe_backend: Backend for MoE computation ("hf", "flashinfer", or "sglang")
        num_diffusion_steps: Default number of diffusion steps for generation
        mask_token_id: Token ID used for masking (default: 151669 for Qwen)
        **kwargs: Additional arguments passed to PretrainedConfig. Any key that
            also appears in ``CONFIG_DEFAULTS`` takes precedence over the
            checkpoint default when passed explicitly.
    """

    model_type = "rnd1"

    def __init__(
        self,
        moe_backend: str = "hf",
        num_diffusion_steps: int = 256,
        mask_token_id: int = 151669,
        **kwargs,
    ):
        # RND1 is a diffusion LM with bidirectional attention: causal masking
        # and autoregressive KV caching never apply, so force them off before
        # the base class sees the kwargs.
        kwargs["use_cache"] = False
        kwargs["is_causal"] = False

        # Record which default-controlled keys the caller explicitly supplied
        # BEFORE calling super().__init__: PretrainedConfig pops several of
        # them (e.g. tie_word_embeddings, torch_dtype, pad/eos token ids) out
        # of kwargs, so this cannot be reconstructed afterwards. The same
        # applies to attn_implementation, which newer transformers versions
        # consume in the base __init__.
        explicitly_set = set(kwargs) & set(CONFIG_DEFAULTS)
        has_attn_impl = "attn_implementation" in kwargs
        attn_impl = kwargs.get("attn_implementation")

        super().__init__(**kwargs)

        # Apply checkpoint defaults for everything the caller did NOT specify.
        # Skipping explicitly-set keys (rather than clobbering them, as an
        # earlier revision did) keeps configs loaded from a customized
        # config.json round-trip-safe through from_pretrained()/from_dict().
        self.set_config_defaults(skip=explicitly_set)

        # QoL: allow setting the attention implementation directly via kwargs.
        if has_attn_impl:
            self._attn_implementation = attn_impl

        # RND1-specific parameters
        self.moe_backend = moe_backend
        self.num_diffusion_steps = num_diffusion_steps
        self.mask_token_id = mask_token_id

        # Re-assert bidirectional attention and no caching in case a default
        # or base-class pop touched them above.
        self.is_causal = False
        self.use_cache = False

    def set_config_defaults(self, skip=()):
        """
        Apply the final-training-checkpoint defaults to this config.

        Qwen3MoeConfig defaults don't match the Qwen/Qwen3-30B-A3B settings
        from which RND1 is derived, so every key in ``CONFIG_DEFAULTS`` is
        written onto the instance.

        Args:
            skip: Iterable of keys to leave untouched (used by ``__init__`` to
                preserve values the caller passed explicitly). The default —
                an empty tuple — overwrites everything, matching the legacy
                behavior of this method.
        """
        skip = set(skip)
        for key, value in CONFIG_DEFAULTS.items():
            if key not in skip:
                setattr(self, key, value)

    def to_dict(self):
        """
        Serialize the configuration to a dictionary with an ``auto_map`` entry.

        The auto_map ensures that when users load from the HuggingFace Hub,
        the correct custom classes are automatically resolved. ``setdefault``
        keeps any auto_map already present (e.g. from a loaded config) intact.
        """
        data = super().to_dict()
        data.setdefault(
            "auto_map",
            {
                "AutoConfig": "configuration_rnd.RND1Config",
                "AutoModel": "modeling_rnd.RND1Model",
                "AutoModelForMaskedLM": "modeling_rnd.RND1LM",
            },
        )
        return data