File size: 2,049 Bytes
3e7a3bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""
RND1 Model Configuration.

This module defines the configuration class for RND1 models,
extending Qwen3MoeConfig with RND1-specific parameters.
"""

from typing import Optional
from transformers.models.qwen3_moe.configuration_qwen3_moe import Qwen3MoeConfig


class RND1Config(Qwen3MoeConfig):
    """
    Configuration for RND1 (Radical Numerics Diffusion v1) models.

    Builds on Qwen3MoeConfig, adding the RND1-specific settings below and
    pinning ``is_causal`` and ``use_cache`` to ``False`` on every instance.

    Args:
        moe_backend: Backend for MoE computation ("hf", "flashinfer", or "sglang")
        num_diffusion_steps: Default number of diffusion steps for generation
        mask_token_id: Token ID used for masking (default: 151669 for Qwen)
        use_cache: Accepted so callers may pass it, but the value is ignored;
            caching is always disabled
        **kwargs: Additional arguments passed to Qwen3MoeConfig
    """

    model_type = "rnd1"

    def __init__(
        self,
        moe_backend: str = "hf",
        num_diffusion_steps: int = 256,
        mask_token_id: int = 151669,  # Default for Qwen-based RND1 models
        use_cache: bool = False,
        **kwargs,
    ):
        # Whatever the caller supplied, the parent config is always built
        # with caching off and non-causal attention.
        kwargs.update(use_cache=False, is_causal=False)
        super().__init__(**kwargs)

        # Settings that exist only on RND1 configs.
        self.moe_backend = moe_backend
        self.num_diffusion_steps = num_diffusion_steps
        self.mask_token_id = mask_token_id

        # Re-assert both flags after the parent constructor has run, so the
        # final attribute values are False regardless of what it stored.
        self.is_causal = False
        self.use_cache = False

    def to_dict(self):
        """
        Return the configuration as a dictionary, including an ``auto_map``.

        The parent's dictionary is used as-is; an ``auto_map`` entry naming
        the RND1 config and model classes is added only when the dictionary
        does not already contain one, so an existing mapping is preserved.
        """
        serialized = super().to_dict()
        if "auto_map" not in serialized:
            serialized["auto_map"] = {
                "AutoConfig": "configuration_rnd.RND1Config",
                "AutoModel": "modeling_rnd.RND1Model",
                "AutoModelForMaskedLM": "modeling_rnd.RND1LM",
            }
        return serialized