Upload configuration_longcat_flash.py
configuration_longcat_flash.py (CHANGED)

```diff
@@ -53,7 +53,7 @@ class LongcatFlashConfig(PretrainedConfig):
             Dimension of the value heads.
         qk_nope_head_dim (`int`, *optional*, defaults to 128):
             Dimension of the query/key heads that don't use rotary position embeddings.
-        norm_topk_prob (`bool`, *optional*, defaults to `
+        norm_topk_prob (`bool`, *optional*, defaults to `False`):
             Whether to normalize the weights of the routed experts.
         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
             The non-linear activation function (function or string) in the decoder.
```
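The flag documented here gates how routed-expert weights are combined. As a minimal sketch of the usual top-k MoE router convention (the `route_tokens` helper and its signature are hypothetical, not taken from the model's code): with `norm_topk_prob=False`, the documented default, the raw softmax probabilities of the selected experts are kept as-is; with `True`, they are renormalized to sum to 1 per token.

```python
import torch

# Minimal sketch of what a `norm_topk_prob`-style flag typically gates in a
# top-k MoE router. `route_tokens` is a hypothetical helper, not the model's
# actual routing code.
def route_tokens(router_logits: torch.Tensor, top_k: int,
                 norm_topk_prob: bool) -> tuple[torch.Tensor, torch.Tensor]:
    """Select top-k experts per token and return their mixing weights."""
    probs = torch.softmax(router_logits, dim=-1)       # (tokens, num_experts)
    topk_probs, topk_idx = probs.topk(top_k, dim=-1)   # (tokens, top_k)
    if norm_topk_prob:
        # Renormalize so the selected experts' weights sum to 1 per token.
        topk_probs = topk_probs / topk_probs.sum(dim=-1, keepdim=True)
    return topk_probs, topk_idx

logits = torch.randn(4, 8)  # 4 tokens routed over 8 experts
# With the documented default (`False`), raw softmax weights are kept as-is.
weights, experts = route_tokens(logits, top_k=2, norm_topk_prob=False)
```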