radicalnumerics
/

RND1-Base-0910

@@ -1,16 +1,55 @@
 """
 RND1 Model Configuration.
-This module defines the configuration class for RND1 models,
-extending Qwen3MoeConfig with RND1-specific parameters.
 """
-from typing import Optional
-from transformers.models.qwen3_moe.configuration_qwen3_moe import Qwen3MoeConfig
-from transformers import AutoConfig
-class RND1Config(Qwen3MoeConfig):
     """
     Configuration class for RND1 models.
@@ -31,22 +70,20 @@ class RND1Config(Qwen3MoeConfig):
         moe_backend: str = "hf",
         num_diffusion_steps: int = 256,
         mask_token_id: int = 151669,
-        use_cache: bool = False,
         **kwargs,
     ):
         # Force non-causal and no caching for RND1
         kwargs["use_cache"] = False
         kwargs["is_causal"] = False
         super().__init__(**kwargs)
-        # `head_dim` needs to be 128 for Qwen3MoE
-        # need to ensure that the config has this attr if directly passing config to RND1LM at instantiation
-        if not hasattr(self, "head_dim"):
-            self.head_dim = 128
-        # Note that in transformers 4.57.0 there is an error in the config
-        # num_hidden_layers is defaulted to 24
-        self.num_hidden_layers = 48
         # RND1-specific parameters
         self.moe_backend = moe_backend
@@ -57,6 +94,16 @@ class RND1Config(Qwen3MoeConfig):
         self.is_causal = False
         self.use_cache = False
     def to_dict(self):
         """
         Serializes configuration to dictionary with auto_map for Hub.

+# Copyright 2025 Radical Numerics Inc.
+#
+# This source code is licensed under the Apache License, Version 2.0, found in the
+# LICENSE file in the root directory of this source tree.
 """
 RND1 Model Configuration.
+This module defines the configuration class for RND1 models.
+The default settings are derived from Qwen/Qwen3-30B-A3B and augmented
+with RND1-specific parameters.
 """
+from transformers.configuration_utils import PretrainedConfig
+# Qwen3-30B-A3B / checkpoint defaults
+CONFIG_DEFAULTS = {
+    "attention_bias": False,
+    "attention_dropout": 0.0,
+    "bos_token_id": 151643,
+    "decoder_sparse_step": 1,
+    "eos_token_id": 151645,
+    "head_dim": 128,
+    "hidden_act": "silu",
+    "hidden_size": 2048,
+    "initializer_range": 0.02,
+    "intermediate_size": 6144,
+    "max_position_embeddings": 40960,
+    "max_window_layers": 48,
+    "mlp_only_layers": [],
+    "moe_intermediate_size": 768,
+    "norm_topk_prob": True,
+    "num_attention_heads": 32,
+    "num_experts": 128,
+    "num_experts_per_tok": 8,
+    "num_hidden_layers": 48,
+    "num_key_value_heads": 4,
+    "output_router_logits": False,
+    "rms_norm_eps": 1e-06,
+    "rope_scaling": False,
+    "rope_theta": 1000000.0,
+    "router_aux_loss_coef": 0.001,
+    "sliding_window": False,
+    "tie_word_embeddings": False,
+    "torch_dtype": "bfloat16",
+    "use_cache": False,
+    "use_sliding_window": False,
+    "vocab_size": 151936,
+}
+class RND1Config(PretrainedConfig):
     """
     Configuration class for RND1 models.
         moe_backend: str = "hf",
         num_diffusion_steps: int = 256,
         mask_token_id: int = 151669,
         **kwargs,
     ):
         # Force non-causal and no caching for RND1
         kwargs["use_cache"] = False
         kwargs["is_causal"] = False
         super().__init__(**kwargs)
+        # Set defaults after pretrained init to prevent overrides
+        self.set_config_defaults()
+        # QoL: set attn impl directly from config
+        if "attn_implementation" in kwargs:
+            self._attn_implementation = kwargs["attn_implementation"]
         # RND1-specific parameters
         self.moe_backend = moe_backend
         self.is_causal = False
         self.use_cache = False
+    def set_config_defaults(self):
+        """
+        Ensure model defaults are set according to final training checkpoint
+        Qwen3MoeConfig defaults don't match Qwen/Qwen3-30B-A3B settings from which
+        RND1 is derived.
+        """
+        for k, v in CONFIG_DEFAULTS.items():
+            setattr(self, k, v)
     def to_dict(self):
         """
         Serializes configuration to dictionary with auto_map for Hub.