"""
RND1 Model Configuration.

This module defines the configuration class for RND1 models.
The default settings are derived from Qwen/Qwen3-30B-A3B and augmented
with RND1-specific parameters.
"""

from transformers.configuration_utils import PretrainedConfig


CONFIG_DEFAULTS = {
    "attention_bias": False,
    "attention_dropout": 0.0,
    "bos_token_id": 151643,
    "decoder_sparse_step": 1,
    "eos_token_id": 151645,
    "head_dim": 128,
    "hidden_act": "silu",
    "hidden_size": 2048,
    "initializer_range": 0.02,
    "intermediate_size": 6144,
    "max_position_embeddings": 40960,
    "max_window_layers": 48,
    "mlp_only_layers": [],
    "moe_intermediate_size": 768,
    "norm_topk_prob": True,
    "num_attention_heads": 32,
    "num_experts": 128,
    "num_experts_per_tok": 8,
    "num_hidden_layers": 48,
    "num_key_value_heads": 4,
    "output_router_logits": False,
    "rms_norm_eps": 1e-06,
    "rope_scaling": False,
    "rope_theta": 1000000.0,
    "router_aux_loss_coef": 0.001,
    "sliding_window": False,
    "tie_word_embeddings": False,
    "torch_dtype": "bfloat16",
    "use_cache": False,
    "use_sliding_window": False,
    "vocab_size": 151936,
}
					
						
class RND1Config(PretrainedConfig):
    """
    Configuration class for RND1 models.

    This configuration extends Qwen3MoeConfig with additional parameters
    specific to the RND1 (Radical Numerics Diffusion v1) architecture.

    Args:
        moe_backend: Backend for MoE computation ("hf", "flashinfer", or "sglang")
        num_diffusion_steps: Default number of diffusion steps for generation
        mask_token_id: Token ID used for masking (default: 151669 for Qwen)
        **kwargs: Additional arguments passed to Qwen3MoeConfig
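
    Example (a minimal usage sketch; the values shown simply echo the defaults
    defined above):

        >>> config = RND1Config(moe_backend="hf", num_diffusion_steps=256)
        >>> config.is_causal
        False
        >>> config.num_experts
        128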
    """

    model_type = "rnd1"
					
						
    def __init__(
        self,
        moe_backend: str = "hf",
        num_diffusion_steps: int = 256,
        mask_token_id: int = 151669,
        **kwargs,
    ):
        # Force non-causal attention and disable the KV cache, regardless of
        # what the caller passes.
        kwargs["use_cache"] = False
        kwargs["is_causal"] = False

        super().__init__(**kwargs)

        # Apply the Qwen/Qwen3-30B-A3B-derived defaults; this runs after
        # super().__init__, so it overrides any matching keys from kwargs.
        self.set_config_defaults()

        if "attn_implementation" in kwargs:
            self._attn_implementation = kwargs["attn_implementation"]

        self.moe_backend = moe_backend
        self.num_diffusion_steps = num_diffusion_steps
        self.mask_token_id = mask_token_id

        self.is_causal = False
        self.use_cache = False
					
						
    def set_config_defaults(self):
        """
        Ensure model defaults are set according to the final training checkpoint.

        The stock Qwen3MoeConfig defaults don't match the Qwen/Qwen3-30B-A3B
        settings from which RND1 is derived, so they are overwritten here.
        """
        for k, v in CONFIG_DEFAULTS.items():
            setattr(self, k, v)
					
						
    def to_dict(self):
        """
        Serializes the configuration to a dictionary, including an auto_map for the Hub.

        The auto_map ensures that when users load the model from the Hugging Face Hub,
        the correct custom classes are automatically resolved.
        """
        data = super().to_dict()
        data.setdefault(
            "auto_map",
            {
                "AutoConfig": "configuration_rnd.RND1Config",
                "AutoModel": "modeling_rnd.RND1Model",
                "AutoModelForMaskedLM": "modeling_rnd.RND1LM",
            },
        )
        return data
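

# A minimal usage sketch (assuming this module ships alongside modeling_rnd.py in
# a Hub repository, as the auto_map above implies): instantiate the config and
# check that serialization carries the auto_map needed to resolve the custom
# RND1 classes.
if __name__ == "__main__":
    config = RND1Config()
    serialized = config.to_dict()
    print(serialized["model_type"])  # rnd1
    print(serialized["num_diffusion_steps"])  # 256
    print(serialized["auto_map"]["AutoModelForMaskedLM"])  # modeling_rnd.RND1LM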