jeromeku committed on
Commit
a22884d
·
verified ·
1 Parent(s): 939a16f

RND1 config fix

Browse files
Files changed (1) hide show
  1. configuration_rnd.py +61 -14
configuration_rnd.py CHANGED
@@ -1,16 +1,55 @@
 
 
 
 
 
1
  """
2
  RND1 Model Configuration.
3
 
4
- This module defines the configuration class for RND1 models,
5
- extending Qwen3MoeConfig with RND1-specific parameters.
 
6
  """
7
 
8
- from typing import Optional
9
- from transformers.models.qwen3_moe.configuration_qwen3_moe import Qwen3MoeConfig
10
- from transformers import AutoConfig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
 
13
- class RND1Config(Qwen3MoeConfig):
14
  """
15
  Configuration class for RND1 models.
16
 
@@ -31,22 +70,20 @@ class RND1Config(Qwen3MoeConfig):
31
  moe_backend: str = "hf",
32
  num_diffusion_steps: int = 256,
33
  mask_token_id: int = 151669,
34
- use_cache: bool = False,
35
  **kwargs,
36
  ):
37
  # Force non-causal and no caching for RND1
38
  kwargs["use_cache"] = False
39
  kwargs["is_causal"] = False
 
40
  super().__init__(**kwargs)
41
 
42
- # `head_dim` needs to be 128 for Qwen3MoE
43
- # need to ensure that the config has this attr if directly passing config to RND1LM at instantiation
44
- if not hasattr(self, "head_dim"):
45
- self.head_dim = 128
46
 
47
- # Note that in transformers 4.57.0 there is an error in the config
48
- # num_hidden_layers is defaulted to 24
49
- self.num_hidden_layers = 48
50
 
51
  # RND1-specific parameters
52
  self.moe_backend = moe_backend
@@ -57,6 +94,16 @@ class RND1Config(Qwen3MoeConfig):
57
  self.is_causal = False
58
  self.use_cache = False
59
 
 
 
 
 
 
 
 
 
 
 
60
  def to_dict(self):
61
  """
62
  Serializes configuration to dictionary with auto_map for Hub.
 
1
+ # Copyright 2025 Radical Numerics Inc.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0, found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
  """
7
  RND1 Model Configuration.
8
 
9
+ This module defines the configuration class for RND1 models.
10
+ The default settings are derived from Qwen/Qwen3-30B-A3B and augmented
11
+ with RND1-specific parameters.
12
  """
13
 
14
+ from transformers.configuration_utils import PretrainedConfig
15
+
16
+ # Qwen3-30B-A3B / checkpoint defaults
17
+ CONFIG_DEFAULTS = {
18
+ "attention_bias": False,
19
+ "attention_dropout": 0.0,
20
+ "bos_token_id": 151643,
21
+ "decoder_sparse_step": 1,
22
+ "eos_token_id": 151645,
23
+ "head_dim": 128,
24
+ "hidden_act": "silu",
25
+ "hidden_size": 2048,
26
+ "initializer_range": 0.02,
27
+ "intermediate_size": 6144,
28
+ "max_position_embeddings": 40960,
29
+ "max_window_layers": 48,
30
+ "mlp_only_layers": [],
31
+ "moe_intermediate_size": 768,
32
+ "norm_topk_prob": True,
33
+ "num_attention_heads": 32,
34
+ "num_experts": 128,
35
+ "num_experts_per_tok": 8,
36
+ "num_hidden_layers": 48,
37
+ "num_key_value_heads": 4,
38
+ "output_router_logits": False,
39
+ "rms_norm_eps": 1e-06,
40
+ "rope_scaling": False,
41
+ "rope_theta": 1000000.0,
42
+ "router_aux_loss_coef": 0.001,
43
+ "sliding_window": False,
44
+ "tie_word_embeddings": False,
45
+ "torch_dtype": "bfloat16",
46
+ "use_cache": False,
47
+ "use_sliding_window": False,
48
+ "vocab_size": 151936,
49
+ }
50
 
51
 
52
+ class RND1Config(PretrainedConfig):
53
  """
54
  Configuration class for RND1 models.
55
 
 
70
  moe_backend: str = "hf",
71
  num_diffusion_steps: int = 256,
72
  mask_token_id: int = 151669,
 
73
  **kwargs,
74
  ):
75
  # Force non-causal and no caching for RND1
76
  kwargs["use_cache"] = False
77
  kwargs["is_causal"] = False
78
+
79
  super().__init__(**kwargs)
80
 
81
+ # Set defaults after pretrained init to prevent overrides
82
+ self.set_config_defaults()
 
 
83
 
84
+ # QoL: set attn impl directly from config
85
+ if "attn_implementation" in kwargs:
86
+ self._attn_implementation = kwargs["attn_implementation"]
87
 
88
  # RND1-specific parameters
89
  self.moe_backend = moe_backend
 
94
  self.is_causal = False
95
  self.use_cache = False
96
 
97
+ def set_config_defaults(self):
98
+ """
99
+ Ensure model defaults are set according to final training checkpoint
100
+
101
+ Qwen3MoeConfig defaults don't match Qwen/Qwen3-30B-A3B settings from which
102
+ RND1 is derived.
103
+ """
104
+ for k, v in CONFIG_DEFAULTS.items():
105
+ setattr(self, k, v)
106
+
107
  def to_dict(self):
108
  """
109
  Serializes configuration to dictionary with auto_map for Hub.