athms committed
Commit 3e7a3bf · verified · 1 Parent(s): 1e53c59

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "</think>": 151668,
+   "</tool_call>": 151658,
+   "</tool_response>": 151666,
+   "<think>": 151667,
+   "<tool_call>": 151657,
+   "<tool_response>": 151665,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
config.json ADDED
@@ -0,0 +1,102 @@
+ {
+   "vocab_size": 151936,
+   "max_position_embeddings": 40960,
+   "hidden_size": 2048,
+   "intermediate_size": 6144,
+   "num_hidden_layers": 48,
+   "num_attention_heads": 32,
+   "use_sliding_window": false,
+   "sliding_window": null,
+   "num_key_value_heads": 4,
+   "hidden_act": "silu",
+   "initializer_range": 0.02,
+   "rms_norm_eps": 1e-06,
+   "use_cache": false,
+   "rope_theta": 1000000.0,
+   "rope_scaling": null,
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "decoder_sparse_step": 1,
+   "moe_intermediate_size": 768,
+   "num_experts_per_tok": 8,
+   "num_experts": 128,
+   "norm_topk_prob": true,
+   "output_router_logits": false,
+   "router_aux_loss_coef": 0.001,
+   "mlp_only_layers": [],
+   "return_dict": true,
+   "output_hidden_states": false,
+   "torchscript": false,
+   "dtype": "bfloat16",
+   "pruned_heads": {},
+   "tie_word_embeddings": false,
+   "chunk_size_feed_forward": 0,
+   "is_encoder_decoder": false,
+   "is_decoder": false,
+   "cross_attention_hidden_size": null,
+   "add_cross_attention": false,
+   "tie_encoder_decoder": false,
+   "architectures": [
+     "Qwen3MoeForCausalLM"
+   ],
+   "finetuning_task": null,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1"
+   },
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1
+   },
+   "task_specific_params": null,
+   "problem_type": null,
+   "tokenizer_class": null,
+   "prefix": null,
+   "bos_token_id": 151643,
+   "pad_token_id": null,
+   "eos_token_id": 151645,
+   "sep_token_id": null,
+   "decoder_start_token_id": null,
+   "max_length": 20,
+   "min_length": 0,
+   "do_sample": false,
+   "early_stopping": false,
+   "num_beams": 1,
+   "num_beam_groups": 1,
+   "diversity_penalty": 0.0,
+   "temperature": 1.0,
+   "top_k": 50,
+   "top_p": 1.0,
+   "typical_p": 1.0,
+   "repetition_penalty": 1.0,
+   "length_penalty": 1.0,
+   "no_repeat_ngram_size": 0,
+   "encoder_no_repeat_ngram_size": 0,
+   "bad_words_ids": null,
+   "num_return_sequences": 1,
+   "output_scores": false,
+   "return_dict_in_generate": false,
+   "forced_bos_token_id": null,
+   "forced_eos_token_id": null,
+   "remove_invalid_values": false,
+   "exponential_decay_length_penalty": null,
+   "suppress_tokens": null,
+   "begin_suppress_tokens": null,
+   "_name_or_path": "",
+   "transformers_version": "4.56.1",
+   "head_dim": 128,
+   "max_window_layers": 48,
+   "model_type": "rnd1",
+   "is_causal": false,
+   "tf_legacy_loss": false,
+   "use_bfloat16": false,
+   "moe_backend": "hf",
+   "num_diffusion_steps": 256,
+   "mask_token_id": 151669,
+   "output_attentions": false,
+   "auto_map": {
+     "AutoConfig": "configuration_rnd.RND1Config",
+     "AutoModel": "modeling_rnd.RND1Model",
+     "AutoModelForMaskedLM": "modeling_rnd.RND1LM"
+   }
+ }
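
Because auto_map above points AutoModelForMaskedLM at modeling_rnd.RND1LM, loading this checkpoint goes through the remote-code path. A minimal loading sketch; the Hub id "user/rnd1-repo" is a placeholder, and torch_dtype mirrors the "dtype" field above:

import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

repo = "user/rnd1-repo"  # placeholder: substitute the actual Hub id
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForMaskedLM.from_pretrained(
    repo,
    trust_remote_code=True,       # resolves configuration_rnd / modeling_rnd via auto_map
    torch_dtype=torch.bfloat16,   # matches "dtype": "bfloat16" in config.json
)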
configuration_rnd.py ADDED
@@ -0,0 +1,63 @@
+ """
+ RND1 Model Configuration.
+
+ This module defines the configuration class for RND1 models,
+ extending Qwen3MoeConfig with RND1-specific parameters.
+ """
+
+ from typing import Optional
+ from transformers.models.qwen3_moe.configuration_qwen3_moe import Qwen3MoeConfig
+
+
+ class RND1Config(Qwen3MoeConfig):
+     """
+     Configuration class for RND1 models.
+
+     This configuration extends Qwen3MoeConfig with additional parameters
+     specific to the RND1 (Radical Numerics Diffusion v1) architecture.
+
+     Args:
+         moe_backend: Backend for MoE computation ("hf", "flashinfer", or "sglang")
+         num_diffusion_steps: Default number of diffusion steps for generation
+         mask_token_id: Token ID used for masking (default: 151669 for Qwen)
+         **kwargs: Additional arguments passed to Qwen3MoeConfig
+     """
+
+     model_type = "rnd1"
+
+     def __init__(
+         self,
+         moe_backend: str = "hf",
+         num_diffusion_steps: int = 256,
+         mask_token_id: int = 151669,  # Default for Qwen-based RND1 models
+         use_cache: bool = False,
+         **kwargs,
+     ):
+         # Force non-causal and no caching for RND1
+         kwargs['use_cache'] = False
+         kwargs['is_causal'] = False
+         super().__init__(**kwargs)
+
+         # RND1-specific parameters
+         self.moe_backend = moe_backend
+         self.num_diffusion_steps = num_diffusion_steps
+         self.mask_token_id = mask_token_id
+
+         # Ensure bidirectional attention and no caching
+         self.is_causal = False
+         self.use_cache = False
+
+     def to_dict(self):
+         """
+         Serializes configuration to dictionary with auto_map for Hub.
+
+         The auto_map ensures that when users load from HuggingFace Hub,
+         the correct custom classes are automatically resolved.
+         """
+         data = super().to_dict()
+         data.setdefault("auto_map", {
+             "AutoConfig": "configuration_rnd.RND1Config",
+             "AutoModel": "modeling_rnd.RND1Model",
+             "AutoModelForMaskedLM": "modeling_rnd.RND1LM",
+         })
+         return data
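
For reference, a minimal sketch of how the forced flags behave; the import assumes the module sits next to this file on sys.path:

from configuration_rnd import RND1Config

cfg = RND1Config(moe_backend="hf", num_diffusion_steps=128, use_cache=True)
assert cfg.use_cache is False and cfg.is_causal is False  # forced off regardless of caller input
assert cfg.to_dict()["auto_map"]["AutoModelForMaskedLM"] == "modeling_rnd.RND1LM"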
generation_config.json ADDED
@@ -0,0 +1,17 @@
+ {
+   "bos_token_id": 151643,
+   "eos_token_id": 151645,
+   "pad_token_id": 151643,
+   "mask_token_id": 151669,
+   "max_length": 256,
+   "max_new_tokens": 256,
+   "num_diffusion_steps": 256,
+   "temperature": 1.0,
+   "top_k": null,
+   "top_p": null,
+   "do_sample": true,
+   "greedy": true,
+   "use_cache": false,
+   "_from_model_config": true,
+   "transformers_version": "4.45.2"
+ }
generation_config.py ADDED
@@ -0,0 +1,77 @@
+ """
+ RND1 Generation Configuration.
+
+ This module defines the generation configuration for RND1 models,
+ controlling the diffusion-based generation process.
+ """
+
+ from typing import Optional
+ from transformers.generation.configuration_utils import GenerationConfig
+
+
+ class RND1GenerationConfig(GenerationConfig):
+     """
+     Configuration class for RND1 generation parameters.
+
+     This class extends the base GenerationConfig to include parameters
+     specific to diffusion-based language generation.
+
+     Args:
+         max_length: Maximum sequence length
+         num_diffusion_steps: Number of denoising steps in the diffusion process
+         mask_token_id: Token ID used for masking during diffusion
+         temperature: Temperature for sampling (higher = more random)
+         top_k: Optional top-k filtering
+         top_p: Optional nucleus (top-p) filtering
+         greedy: Whether to use greedy decoding (True) or stochastic sampling (False)
+         **kwargs: Additional arguments passed to GenerationConfig
+     """
+
+     def __init__(
+         self,
+         max_length: int = 256,
+         num_diffusion_steps: int = 256,
+         mask_token_id: int = 151669,  # Default for Qwen-based RND1 models
+         temperature: float = 1.0,
+         top_k: Optional[int] = None,
+         top_p: Optional[float] = None,
+         greedy: bool = True,
+         seed: Optional[int] = None,  # For reproducible generation
+         bos_token_id: Optional[int] = None,
+         eos_token_id: Optional[int] = None,
+         pad_token_id: Optional[int] = None,
+         use_cache: bool = False,
+         **kwargs,
+     ):
+         # Force no caching for RND1 generation - remove from kwargs if present
+         kwargs.pop('use_cache', None)
+
+         super().__init__(
+             max_length=max_length,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             pad_token_id=pad_token_id,
+             temperature=temperature,
+             top_k=top_k,
+             top_p=top_p,
+             do_sample=not greedy,
+             use_cache=False,  # Always False for RND1
+             **kwargs,
+         )
+
+         # RND1-specific parameters
+         self.num_diffusion_steps = num_diffusion_steps
+         self.mask_token_id = mask_token_id
+         self.greedy = greedy
+         self.temperature = float(temperature)  # Ensure it's a float
+         self.seed = seed
+
+     def to_dict(self):
+         """Convert configuration to dictionary."""
+         output = super().to_dict()
+         output["num_diffusion_steps"] = self.num_diffusion_steps
+         output["mask_token_id"] = self.mask_token_id
+         output["greedy"] = self.greedy
+         if self.seed is not None:
+             output["seed"] = self.seed
+         return output
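
A small sketch of how this class behaves: greedy maps onto do_sample on the base class, and the diffusion-specific fields survive round-tripping through to_dict():

from generation_config import RND1GenerationConfig  # local module from this commit

gc = RND1GenerationConfig(max_length=512, num_diffusion_steps=64, greedy=False, seed=0)
assert gc.do_sample is True and gc.use_cache is False
d = gc.to_dict()
assert d["num_diffusion_steps"] == 64 and d["seed"] == 0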
generation_utils.py ADDED
@@ -0,0 +1,149 @@
+ """
+ RND1 Generation Utilities.
+
+ This module provides generation utilities and mixins for RND1 models,
+ including the main GenerationMixin class that integrates with HuggingFace.
+ """
+
+ import torch
+ import torch.nn as nn
+ from typing import Optional, Union, Dict, Any
+ from transformers import GenerationMixin as HFGenerationMixin
+ from transformers.generation import GenerationConfig
+
+ from .sampling import diffusion_sample, apply_top_k_filtering, apply_top_p_filtering
+
+
+ class RND1GenerationMixin(HFGenerationMixin):
+     """
+     Generation mixin for RND1 models.
+
+     This mixin provides generation methods compatible with HuggingFace's
+     generation API while using RND1's diffusion-based sampling internally.
+     """
+
+     def generate(
+         self,
+         inputs: Optional[torch.LongTensor] = None,
+         generation_config: Optional[GenerationConfig] = None,
+         # RND1-specific parameters
+         prefix_ids: Optional[torch.LongTensor] = None,
+         suffix_ids: Optional[torch.LongTensor] = None,
+         infill_length: Optional[int] = None,
+         return_dict_in_generate: Optional[bool] = None,
+         **kwargs,  # Accept all kwargs to be compatible with pipelines
+     ) -> Union[torch.LongTensor, Dict[str, Any]]:
+         """
+         Generate text using RND1's diffusion-based sampling.
+
+         Follows HuggingFace's standard generate API, using diffusion sampling
+         internally. Supports both standard generation and infilling.
+
+         Args:
+             inputs: Input token IDs to use as prefix (standard HF parameter)
+             generation_config: Generation configuration object
+             prefix_ids: Alternative to inputs for infilling tasks
+             suffix_ids: Optional suffix for infilling tasks
+             infill_length: Length of infill region (for infilling)
+             return_dict_in_generate: Whether to return GenerateDecoderOnlyOutput
+             **kwargs: Additional arguments (accepted for compatibility)
+
+         Returns:
+             Generated token IDs or GenerateDecoderOnlyOutput
+         """
+         if generation_config is not None:
+             gen_config = generation_config
+             model_kwargs = kwargs.copy()
+         else:
+             # Only prepare config from kwargs if no config was provided
+             gen_config, model_kwargs = self._prepare_generation_config(None, **kwargs)
+
+         device = next(self.parameters()).device
+
+         if inputs is not None:
+             prefix_ids = inputs.to(device)
+         elif prefix_ids is not None:
+             prefix_ids = prefix_ids.to(device)
+         else:
+             prefix_ids = None
+
+         if suffix_ids is not None:
+             suffix_ids = suffix_ids.to(device)
+
+         eos_token_id = gen_config.eos_token_id or getattr(self.config, "eos_token_id", 151645)
+         pad_token_id = gen_config.pad_token_id or getattr(self.config, "pad_token_id", None)
+         bos_token_id = gen_config.bos_token_id or getattr(self.config, "bos_token_id", None)
+         mask_token_id = getattr(gen_config, "mask_token_id", getattr(self.config, "mask_token_id", 151669))
+
+         if infill_length is not None and prefix_ids is not None:
+             # Infilling mode: use specified infill_length
+             prefix_len = prefix_ids.shape[1] if prefix_ids is not None else 0
+             suffix_len = suffix_ids.shape[1] if suffix_ids is not None else 0
+             seq_len = prefix_len + infill_length + suffix_len
+         else:
+             # Standard generation mode
+             if prefix_ids is not None:
+                 prefix_len = prefix_ids.shape[1]
+                 if gen_config.max_new_tokens is not None:
+                     seq_len = prefix_len + gen_config.max_new_tokens
+                 else:
+                     seq_len = gen_config.max_length or self.config.max_position_embeddings
+             else:
+                 seq_len = gen_config.max_length or self.config.max_position_embeddings
+
+         num_diffusion_steps = getattr(gen_config, "num_diffusion_steps",
+                                       getattr(self.config, "num_diffusion_steps", 256))
+
+         temperature = float(getattr(gen_config, "temperature", 1.0))
+         top_k = getattr(gen_config, "top_k", None)
+         top_p = getattr(gen_config, "top_p", None)
+
+         greedy = getattr(gen_config, "greedy",
+                          not bool(gen_config.do_sample) if hasattr(gen_config, "do_sample") else True)
+
+         generator = model_kwargs.get("generator", None)
+         if generator is None:
+             seed = getattr(gen_config, 'seed', None)
+             if seed is not None:
+                 generator = torch.Generator(device=device)
+                 generator.manual_seed(seed)
+
+         with torch.inference_mode():
+             sequences = diffusion_sample(
+                 model=self,
+                 seq_len=seq_len,
+                 num_steps=num_diffusion_steps,
+                 mask_token_id=mask_token_id,
+                 temperature=temperature,
+                 top_k=top_k,
+                 top_p=top_p,
+                 greedy=greedy,
+                 prefix_ids=prefix_ids,
+                 suffix_ids=suffix_ids,
+                 infill_length=infill_length,
+                 eos_token_id=eos_token_id,
+                 pad_token_id=pad_token_id,
+                 bos_token_id=bos_token_id,
+                 device=device,
+                 generator=generator,
+                 visualizer=model_kwargs.get("visualizer", None),  # Optional visualizer from kwargs
+             )
+
+         if return_dict_in_generate or getattr(gen_config, "return_dict_in_generate", False):
+             from transformers.generation.utils import GenerateDecoderOnlyOutput
+             return GenerateDecoderOnlyOutput(sequences=sequences)
+
+         return sequences
+
+     def prepare_inputs_for_generation(
+         self,
+         input_ids: torch.LongTensor,
+         **kwargs,
+     ) -> Dict[str, Any]:
+         """
+         Prepare inputs for generation (required by HuggingFace).
+
+         For RND1, we don't use the standard autoregressive generation,
+         so this just returns the input_ids.
+         """
+         return {"input_ids": input_ids}
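
A usage sketch for generate(), assuming model and tokenizer were loaded as in the earlier sketch; exact kwarg plumbing through _prepare_generation_config varies with the installed transformers version:

prompt = tokenizer("Write a haiku about entropy:", return_tensors="pt").input_ids

out = model.generate(inputs=prompt, max_new_tokens=64)  # seq_len = prefix_len + 64
print(tokenizer.decode(out[0], skip_special_tokens=True))

# Infilling: denoise 8 masked positions between a prefix and a suffix.
suffix = tokenizer(" The end.", return_tensors="pt").input_ids
out = model.generate(prefix_ids=prompt, suffix_ids=suffix, infill_length=8)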
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modeling_rnd.py ADDED
@@ -0,0 +1,529 @@
+ """
+ RND1 model implementation.
+
+ This module implements the RND1 architecture with bidirectional attention for
+ diffusion-based language modeling. Includes support for Mixture of Experts (MoE)
+ with multiple backend options (HF, FlashInfer, SGLang).
+
+ Based on the Qwen3Moe architecture:
+ https://github.com/huggingface/transformers/blob/v4.57.0/src/transformers/models/qwen3_moe/modeling_qwen3_moe.py
+ """
+
+ from __future__ import annotations
+
+ import os
+ from typing import Optional, Tuple, List, Union
+
+ import torch
+ from torch import nn
+
+ from transformers.utils import logging
+ from transformers.cache_utils import Cache
+ from transformers.modeling_outputs import (
+     MoeModelOutputWithPast,
+     MaskedLMOutput,
+ )
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.generation import GenerationConfig
+
+ from .configuration_rnd import RND1Config
+ from .generation_utils import RND1GenerationMixin
+ from .generation_config import RND1GenerationConfig
+
+ from transformers.models.qwen3_moe.modeling_qwen3_moe import (
+     Qwen3MoeConfig,
+     Qwen3MoeRMSNorm,
+     Qwen3MoeRotaryEmbedding,
+     Qwen3MoeSparseMoeBlock,
+     Qwen3MoeMLP,
+     apply_rotary_pos_emb
+ )
+ import torch.nn.functional as F
+
+ try:
+     import flashinfer.fused_moe as fused_moe
+ except Exception:
+     fused_moe = None
+
+ try:
+     from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe as sglang_fused_moe
+     from sglang.srt.layers.moe.topk import StandardTopKOutput
+ except Exception:
+     sglang_fused_moe = None
+     StandardTopKOutput = None
+
+ logger = logging.get_logger(__name__)
+
+
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+     """Expand key/value heads to match query heads for grouped-query attention."""
+     batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+     if n_rep == 1:
+         return hidden_states
+     hidden_states = hidden_states[:, :, None, :, :].expand(
+         batch, num_key_value_heads, n_rep, slen, head_dim
+     )
+     return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+ class RND1Attention(nn.Module):
+     """RND1 attention layer with bidirectional attention for diffusion modeling."""
+
+     def __init__(self, config: RND1Config, layer_idx: int):
+         super().__init__()
+         self.config = config
+         self.layer_idx = layer_idx
+
+         self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+         self.num_heads = config.num_attention_heads
+         self.num_key_value_heads = config.num_key_value_heads
+         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+
+         self.scaling = self.head_dim ** -0.5
+         self.attention_dropout = config.attention_dropout
+         self.is_causal = False
+
+         self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+         self.k_proj = nn.Linear(config.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+         self.v_proj = nn.Linear(config.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+         self.o_proj = nn.Linear(self.num_heads * self.head_dim, config.hidden_size, bias=config.attention_bias)
+
+         self.q_norm = Qwen3MoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+         self.k_norm = Qwen3MoeRMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+         self.sliding_window = getattr(config, "sliding_window", None)
+
+         self.rotary_emb = Qwen3MoeRotaryEmbedding(config=config)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[Union[Cache, Tuple[torch.Tensor, torch.Tensor]]] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+         dual_cache: Optional[bool] = False,
+         replace_position: Optional[torch.Tensor] = None,
+         **kwargs,
+     ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
+
+         bsz, q_len, _ = hidden_states.size()
+         input_shape = hidden_states.shape[:-1]
+         hidden_shape = (*input_shape, -1, self.head_dim)
+
+         query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+         key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
+         value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+         cos, sin = position_embeddings
+         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+         key_states = repeat_kv(key_states, self.num_key_value_groups)
+         value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+         use_sdpa = (getattr(self.config, "_attn_implementation", "eager") == "sdpa")
+
+         if use_sdpa:
+             if attention_mask is not None and isinstance(attention_mask, torch.Tensor):
+                 if attention_mask.dtype not in [torch.bool, torch.float32, torch.float16, torch.bfloat16]:
+                     attention_mask = attention_mask.to(dtype=query_states.dtype)
+
+             assert not self.is_causal, f"Attention layer {self.layer_idx} is causal"
+             attn_out = torch.nn.functional.scaled_dot_product_attention(
+                 query_states, key_states, value_states,
+                 attn_mask=attention_mask if isinstance(attention_mask, torch.Tensor) else None,
+                 dropout_p=self.attention_dropout if self.training else 0.0,
+                 is_causal=self.is_causal,
+             )
+             attn_out = attn_out.transpose(1, 2).contiguous()
+             attn_out = attn_out.view(bsz, q_len, self.num_heads * self.head_dim)
+             attn_out = self.o_proj(attn_out)
+             return attn_out, None
+
+         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
+
+         if attention_mask is not None:
+             attn_weights = attn_weights + attention_mask[:, :, :, : key_states.shape[-2]]
+
+         attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+         attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+
+         attn_out = torch.matmul(attn_weights, value_states)
+         attn_out = attn_out.transpose(1, 2).contiguous().view(hidden_states.size(0), hidden_states.size(1), -1)
+         attn_out = self.o_proj(attn_out)
+
+         return attn_out, None
+
+
+ class RND1DecoderLayer(nn.Module):
+     """RND1 decoder layer with bidirectional attention for diffusion language modeling."""
+
+     def __init__(self, config: RND1Config, layer_idx: int):
+         super().__init__()
+         self.self_attn = RND1Attention(config, layer_idx)
+         self.mlp = RND1SparseMoeBlock(config)
+         self.input_layernorm = Qwen3MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.post_attention_layernorm = Qwen3MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+     def forward(
+         self,
+         hidden_states: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+         replace_position: Optional[torch.Tensor] = None,
+         **kwargs,
+     ) -> Tuple[torch.FloatTensor, Optional[torch.Tensor]]:
+         residual = hidden_states
+         hidden_states = self.input_layernorm(hidden_states)
+
+         attn_out, attn_weights = self.self_attn(
+             hidden_states,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             position_embeddings=position_embeddings,
+             replace_position=replace_position,
+         )
+         hidden_states = residual + attn_out
+
+         residual = hidden_states
+         hidden_states = self.post_attention_layernorm(hidden_states)
+         ff_out = self.mlp(hidden_states)
+         if isinstance(ff_out, tuple):
+             ff_out = ff_out[0]
+         hidden_states = residual + ff_out
+
+         return hidden_states, attn_weights
+
+
+ class RND1SparseMoeBlock(nn.Module):
+     """RND1 Sparse MoE block with multiple backend support (HF, FlashInfer, SGLang)."""
+
+     def __init__(self, config: RND1Config):
+         super().__init__()
+         self.config = config
+         self.backend = getattr(config, "moe_backend", "hf")
+         self.num_experts = config.num_experts
+         self.top_k = config.num_experts_per_tok
+         self.norm_topk_prob = config.norm_topk_prob
+         self.hidden_size = config.hidden_size
+         self.intermediate_size = getattr(config, "moe_intermediate_size", config.intermediate_size)
+
+         self.gate = nn.Linear(self.hidden_size, self.num_experts, bias=False)
+         self.experts = nn.ModuleList(
+             [Qwen3MoeMLP(config, intermediate_size=self.intermediate_size) for _ in range(self.num_experts)]
+         )
+
+         # Cached weight tensors for optimized backends
+         self._flashinfer_fc1_weights = None
+         self._flashinfer_fc2_weights = None
+         self._sglang_w1 = None
+         self._sglang_w2 = None
+         if self.backend == "sglang":
+             if sglang_fused_moe is None or StandardTopKOutput is None:
+                 raise RuntimeError("sglang is not available, cannot use sglang backend")
+         elif self.backend == "flashinfer":
+             if fused_moe is None:
+                 raise RuntimeError("flashinfer is not available, cannot use flashinfer backend")
+
+     def _initialize_flashinfer_weights(self):
+         """Initialize FlashInfer-compatible weight format."""
+         fc1_list = []
+         fc2_list = []
+
+         for expert in self.experts:
+             gate_w = expert.gate_proj.weight  # [I, H]
+             up_w = expert.up_proj.weight      # [I, H]
+             down_w = expert.down_proj.weight  # [H, I]
+             # FlashInfer expects [up; gate] ordering
+             fc1_list.append(torch.cat([up_w, gate_w], dim=0))  # [2I, H]
+             fc2_list.append(down_w)  # [H, I]
+
+         self._flashinfer_fc1_weights = torch.stack(fc1_list, dim=0).contiguous()
+         self._flashinfer_fc2_weights = torch.stack(fc2_list, dim=0).contiguous()
+
+     def _initialize_sglang_weights(self):
+         """Initialize SGLang-compatible weight format."""
+         w1_list = []
+         w2_list = []
+
+         for expert in self.experts:
+             gate_w = expert.gate_proj.weight  # [I, H]
+             up_w = expert.up_proj.weight      # [I, H]
+             down_w = expert.down_proj.weight  # [H, I]
+             w1 = torch.cat([gate_w, up_w], dim=0)  # [2I, H]
+             w1_list.append(w1)
+             w2_list.append(down_w)
+
+         self._sglang_w1 = torch.stack(w1_list, dim=0).contiguous()
+         self._sglang_w2 = torch.stack(w2_list, dim=0).contiguous()
+
+     def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         """Forward pass with expert routing and computation."""
+         batch_size, sequence_length, hidden_dim = hidden_states.shape
+         x = hidden_states.view(-1, hidden_dim)
+
+         # Expert routing
+         router_logits = self.gate(x)
+         routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+         routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
+         if self.norm_topk_prob:
+             routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
+
+         if self.backend == "hf":
+             final_hidden_states = torch.zeros(
+                 (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
+             )
+
+             expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
+             expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+
+             for expert_idx in expert_hit:
+                 expert_layer = self.experts[expert_idx]
+                 idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
+                 current_state = x[top_x]
+                 current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
+                 final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
+             out = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+             return out, router_logits.view(batch_size, sequence_length, -1)
+
+         elif self.backend == "flashinfer":
+             if self._flashinfer_fc1_weights is None or self._flashinfer_fc2_weights is None:
+                 self._initialize_flashinfer_weights()
+
+             result = fused_moe.cutlass_fused_moe(
+                 input=x,
+                 token_selected_experts=selected_experts.to(torch.int),
+                 token_final_scales=routing_weights.to(torch.float32),
+                 fc1_expert_weights=self._flashinfer_fc1_weights,
+                 fc2_expert_weights=self._flashinfer_fc2_weights,
+                 output_dtype=x.dtype,
+                 quant_scales=None,
+             )
+             if isinstance(result, (list, tuple)):
+                 out_flat = result[0]
+             else:
+                 out_flat = result
+             out = out_flat.view(batch_size, sequence_length, hidden_dim)
+             return out, router_logits.view(batch_size, sequence_length, -1)
+
+         elif self.backend == "sglang":
+             if self._sglang_w1 is None or self._sglang_w2 is None:
+                 self._initialize_sglang_weights()
+
+             topk_output = StandardTopKOutput(
+                 topk_weights=routing_weights,
+                 topk_ids=selected_experts,
+                 router_logits=router_logits,
+             )
+
+             out_flat = sglang_fused_moe(
+                 hidden_states=x,
+                 w1=self._sglang_w1,
+                 w2=self._sglang_w2,
+                 topk_output=topk_output,
+             )
+             out = out_flat.view(batch_size, sequence_length, hidden_dim)
+             return out, router_logits.view(batch_size, sequence_length, -1)
+
+         else:
+             raise ValueError(f"Invalid backend: {self.backend}")
+
+
+ class RND1PreTrainedModel(PreTrainedModel):
+     """Base class for RND1 models with weight initialization and loading support."""
+     config_class = RND1Config
+     base_model_prefix = "model"
+     supports_gradient_checkpointing = True
+     _no_split_modules = ["RND1DecoderLayer"]
+     _skip_keys_device_placement = "past_key_values"
+     _supports_flash_attn_2 = True
+     _supports_sdpa = True
+     _supports_cache_class = True
+     _supports_quantized_cache = True
+     _supports_static_cache = True
+
+     def _init_weights(self, module):
+         """Initialize weights using normal distribution."""
+         std = self.config.initializer_range
+         if isinstance(module, nn.Linear):
+             module.weight.data.normal_(mean=0.0, std=std)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, nn.Embedding):
+             module.weight.data.normal_(mean=0.0, std=std)
+             if module.padding_idx is not None:
+                 module.weight.data[module.padding_idx].zero_()
+
+     @classmethod
+     def from_pretrained(
+         cls,
+         pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
+         *model_args,
+         config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None,
+         cache_dir: Optional[Union[str, os.PathLike]] = None,
+         ignore_mismatched_sizes: bool = False,
+         force_download: bool = False,
+         local_files_only: bool = False,
+         token: Optional[Union[str, bool]] = None,
+         revision: str = "main",
+         use_safetensors: Optional[bool] = None,
+         weights_only: bool = True,
+         **kwargs,
+     ):
+         """Load pretrained model with generation config."""
+         _model = super().from_pretrained(
+             pretrained_model_name_or_path,
+             *model_args,
+             config=config,
+             cache_dir=cache_dir,
+             ignore_mismatched_sizes=ignore_mismatched_sizes,
+             force_download=force_download,
+             local_files_only=local_files_only,
+             token=token,
+             revision=revision,
+             use_safetensors=use_safetensors,
+             weights_only=weights_only,
+             **kwargs,
+         )
+
+         resume_download = kwargs.get("resume_download", None)
+         proxies = kwargs.get("proxies", None)
+         subfolder = kwargs.get("subfolder", "")
+         from_auto_class = kwargs.get("_from_auto", False)
+         from_pipeline = kwargs.get("_from_pipeline", None)
+
+         _model.generation_config = GenerationConfig.from_pretrained(
+             pretrained_model_name_or_path,
+             cache_dir=cache_dir,
+             force_download=force_download,
+             resume_download=resume_download,
+             proxies=proxies,
+             local_files_only=local_files_only,
+             token=token,
+             revision=revision,
+             subfolder=subfolder,
+             _from_auto=from_auto_class,
+             _from_pipeline=from_pipeline,
+         )
+
+         return _model
+
+
+ class RND1Model(RND1PreTrainedModel):
+     """RND1 transformer model with bidirectional attention for diffusion language modeling."""
+
+     def __init__(self, config: RND1Config):
+         super().__init__(config)
+
+         self.padding_idx = config.pad_token_id
+         self.vocab_size = config.vocab_size
+
+         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+         self.layers = nn.ModuleList([RND1DecoderLayer(config, i) for i in range(config.num_hidden_layers)])
+         self.norm = Qwen3MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+         self.rotary_emb = Qwen3MoeRotaryEmbedding(config=config)
+
+         self.post_init()
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         **kwargs,
+     ) -> MoeModelOutputWithPast:
+         """Forward pass through the RND1 model."""
+
+         if (input_ids is None) == (inputs_embeds is None):
+             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+         if inputs_embeds is None:
+             inputs_embeds = self.embed_tokens(input_ids)
+
+         if position_ids is None:
+             position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device).unsqueeze(0)
+
+         position_embeddings = self.rotary_emb(inputs_embeds, position_ids)
+
+         hidden_states = inputs_embeds
+
+         for layer in self.layers:
+             hidden_states, _ = layer(
+                 hidden_states,
+                 attention_mask=attention_mask,
+                 position_ids=position_ids,
+                 position_embeddings=position_embeddings,
+             )
+
+         hidden_states = self.norm(hidden_states)
+
+         return MoeModelOutputWithPast(
+             last_hidden_state=hidden_states,
+             router_logits=None,
+         )
+
+
+ class RND1LM(RND1PreTrainedModel, RND1GenerationMixin):
+     """Radical Numerics Diffusion Language Model with bidirectional attention."""
+
+     def __init__(self, config: RND1Config):
+         super().__init__(config)
+         self.model = RND1Model(config)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+         self.post_init()
+
+     def get_input_embeddings(self):
+         """Get the input embeddings layer."""
+         return self.model.embed_tokens
+
+     def set_input_embeddings(self, value):
+         """Set the input embeddings layer."""
+         self.model.embed_tokens = value
+
+     def get_output_embeddings(self):
+         """Get the output embeddings layer (lm_head)."""
+         return self.lm_head
+
+     def set_output_embeddings(self, new_embeddings):
+         """Set the output embeddings layer (lm_head)."""
+         self.lm_head = new_embeddings
+
+     @classmethod
+     def can_generate(cls) -> bool:
+         """Indicates this model can generate text."""
+         return True
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         **kwargs,
+     ) -> MaskedLMOutput:
+         """Forward pass with optional loss computation."""
+         outputs = self.model(
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             inputs_embeds=inputs_embeds,
+             **kwargs,
+         )
+         logits = self.lm_head(outputs.last_hidden_state)
+
+         loss = None
+         if labels is not None:
+             loss_fct = nn.CrossEntropyLoss()
+             loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
+
+         return MaskedLMOutput(
+             loss=loss,
+             logits=logits,
+         )
+ 
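
A scoring sketch under the same model/tokenizer assumptions as above. Note that, per the shift applied in sampling.py ("pos i predicts token i+1"), the logits at position i-1 score the token at position i, so a masked slot is predicted from the preceding position; the prompt is illustrative:

import torch

ids = tokenizer("Paris is the<|mask|> capital", return_tensors="pt").input_ids
mask_pos = (ids[0] == 151669).nonzero().item()  # 151669 = <|mask|>
with torch.inference_mode():
    logits = model(input_ids=ids).logits        # [1, seq_len, 151936]
pred = logits[0, mask_pos - 1].argmax()         # position before the mask predicts it
print(tokenizer.decode(pred))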
sampling.py ADDED
@@ -0,0 +1,271 @@
+ """
+ RND1 sampling module for masked diffusion generation.
+
+ This module implements entropy-based token selection for iterative denoising
+ in diffusion language models. Supports both greedy and stochastic sampling
+ with optional prefix/suffix constraints and infilling.
+ """
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from typing import Optional, Tuple, Union
+
+
+ def apply_top_k_filtering(logits: torch.Tensor, k: int) -> torch.Tensor:
+     """
+     Apply top-k filtering to logits, setting non-top-k values to -inf.
+     """
+     top_k_values, top_k_indices = torch.topk(logits, min(k, logits.size(-1)), dim=-1)
+     filtered_logits = torch.full_like(logits, float('-inf'))
+     filtered_logits.scatter_(-1, top_k_indices, top_k_values)
+     return filtered_logits
+
+
+ def apply_top_p_filtering(logits: torch.Tensor, p: float) -> torch.Tensor:
+     """
+     Apply top-p (nucleus) filtering to logits, setting tokens beyond the cumulative threshold to -inf.
+     """
+     sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
+     cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+
+     # Remove tokens with cumulative probability above the threshold,
+     # shifting right so the first token that crosses it is still kept
+     sorted_indices_to_remove = cumulative_probs > p
+     sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+     sorted_indices_to_remove[..., 0] = False  # Keep at least one token
+
+     indices_to_remove = sorted_indices_to_remove.scatter(-1, sorted_indices, sorted_indices_to_remove)
+     return logits.masked_fill(indices_to_remove, float('-inf'))
+
+
+ @torch.no_grad()
+ def diffusion_sample(
+     model: nn.Module,
+     seq_len: int = 256,
+     num_steps: int = 256,
+     top_k: Optional[int] = None,
+     top_p: Optional[float] = None,
+     temperature: float = 1.0,
+     greedy: bool = True,
+     mask_token_id: int = 151669,
+     prefix_ids: Optional[torch.LongTensor] = None,
+     suffix_ids: Optional[torch.LongTensor] = None,
+     infill_length: Optional[int] = None,
+     eos_token_id: int = 151645,
+     pad_token_id: Optional[int] = None,
+     bos_token_id: Optional[int] = None,
+     device: Optional[Union[str, torch.device]] = None,
+     generator: Optional[torch.Generator] = None,
+     visualizer: Optional['TerminalVisualizer'] = None,
+ ) -> torch.LongTensor:
+     """
+     Perform masked diffusion sampling with entropy-based token selection.
+
+     Args:
+         model: The RND1 language model
+         seq_len: Target sequence length
+         num_steps: Number of denoising steps
+         top_k: Optional top-k filtering for sampling (None = no filtering)
+         top_p: Optional nucleus (top-p) filtering for sampling (None = no filtering)
+             When both top_k and top_p are set, top_k is applied first, then top_p
+         temperature: Temperature for sampling (higher = more random, lower = more deterministic)
+             Values close to 0 are clamped to 1e-8 to avoid division by zero
+         greedy: Whether to use greedy sampling (True) or stochastic (False)
+         mask_token_id: Token ID for masked positions (default: 151669)
+         prefix_ids: Optional prefix token IDs to preserve
+         suffix_ids: Optional suffix token IDs to preserve
+         infill_length: Length of infill region between prefix/suffix
+         eos_token_id: End of sequence token ID (default: 151645)
+         pad_token_id: Padding token ID (default: None, uses 0 if needed)
+         bos_token_id: Beginning of sequence token ID (default: None)
+         device: Device for computation (None = infer from model)
+         generator: Optional torch generator for reproducible sampling
+         visualizer: Optional TerminalVisualizer for live visualization
+
+     Returns:
+         Generated token IDs as LongTensor
+     """
+     model.eval()
+
+     if device is None:
+         device = next(model.parameters()).device
+     else:
+         device = torch.device(device)
+     dtype = next(model.parameters()).dtype
+
+     if pad_token_id is None:
+         pad_token_id = 0
+
+     # Build initial masked sequence
+     # When prefix_ids is provided, we create a sequence of length seq_len where:
+     #   - The prefix occupies the first pre_len positions
+     #   - The remaining (seq_len - pre_len) positions are filled with mask tokens to be generated
+     if prefix_ids is not None or suffix_ids is not None:
+         if prefix_ids is not None:
+             prefix_ids = prefix_ids.to(device) if isinstance(prefix_ids, torch.Tensor) else torch.tensor(prefix_ids, device=device)
+             pre_len = prefix_ids.shape[-1] if prefix_ids.dim() > 0 else 0
+         else:
+             pre_len = 0
+
+         if suffix_ids is not None:
+             suffix_ids = suffix_ids.to(device) if isinstance(suffix_ids, torch.Tensor) else torch.tensor(suffix_ids, device=device)
+             suf_len = suffix_ids.shape[-1] if suffix_ids.dim() > 0 else 0
+         else:
+             suf_len = 0
+
+         reserved = (1 if bos_token_id is not None else 0) + (1 if eos_token_id is not None else 0)
+         used = pre_len + suf_len + reserved
+
+         if used > seq_len:
+             raise ValueError(
+                 f"Combined length of prefix ({pre_len}), suffix ({suf_len}), "
+                 f"and special tokens ({reserved}) = {used} exceeds seq_len ({seq_len}). "
+                 f"Please increase seq_len or reduce input lengths."
+             )
+         elif used == seq_len:
+             raise ValueError(
+                 f"No space for generation: prefix ({pre_len}) + suffix ({suf_len}) "
+                 f"+ special tokens ({reserved}) = seq_len ({seq_len}). "
+                 f"Need at least 1 position for generation."
+             )
+
+         infill_length = min(infill_length or (seq_len - used), seq_len - used)
+
+         x = torch.full((1, seq_len), pad_token_id, dtype=torch.long, device=device)
+         pos = 0
+         if bos_token_id is not None:
+             x[0, pos] = bos_token_id; pos += 1
+         if pre_len > 0:
+             x[0, pos:pos+pre_len] = prefix_ids.flatten()[:pre_len]; pos += pre_len
+         fill_start, fill_end = pos, pos + infill_length
+         x[0, fill_start:fill_end] = mask_token_id
+         pos = fill_end
+         if suf_len > 0:
+             x[0, pos:pos+suf_len] = suffix_ids.flatten()[:suf_len]; pos += suf_len
+
+         if eos_token_id is not None and pos < seq_len:
+             if isinstance(eos_token_id, (list, tuple)):
+                 x[0, pos] = eos_token_id[0]
+             else:
+                 x[0, pos] = eos_token_id
+
+         init_maskable = torch.zeros_like(x, dtype=torch.bool)
+         init_maskable[0, fill_start:fill_end] = True
+     else:
+         x = torch.full((1, seq_len), mask_token_id, dtype=torch.long, device=device)
+         if bos_token_id is not None:
+             x[0, 0] = bos_token_id
+         if eos_token_id is not None:
+             # If eos_token_id is a list, use the first one
+             if isinstance(eos_token_id, (list, tuple)):
+                 x[0, -1] = eos_token_id[0]
+             else:
+                 x[0, -1] = eos_token_id
+         init_maskable = x.eq(mask_token_id)
+
+     if bos_token_id is not None:
+         init_maskable[:, 0] = False
+     if eos_token_id is not None:
+         # Handle both single token and list of tokens
+         if isinstance(eos_token_id, (list, tuple)):
+             for eos_id in eos_token_id:
+                 init_maskable &= x.ne(eos_id)
+         else:
+             init_maskable &= x.ne(eos_token_id)
+     init_maskable &= x.ne(pad_token_id)
+
+     maskable = init_maskable.clone()
+     xt = x.clone()
+
+     if visualizer:
+         visualizer.start_visualization(xt, maskable, num_steps)
+
+     def forward_scores(tokens):
+         """Compute predictions and entropy scores for next tokens."""
+         # Try with input_ids parameter first (standard HF models)
+         try:
+             model_output = model(input_ids=tokens)
+         except TypeError:
+             # Fall back to positional argument
+             model_output = model(tokens)
+
+         safe_temperature = max(temperature, 1e-8)  # Prevent division by zero
+         logits = model_output.logits / safe_temperature
+
+         # Note: When both top_k and top_p are provided, they are applied sequentially:
+         # first top_k filters to k tokens, then top_p filters from those k tokens
+         if top_k is not None and top_k > 0:
+             logits = apply_top_k_filtering(logits, top_k)
+
+         if top_p is not None and 0 < top_p < 1.0:
+             logits = apply_top_p_filtering(logits, top_p)
+
+         logp = torch.log_softmax(logits, dim=-1)
+
+         if greedy:
+             pred_next = logp.argmax(-1)
+         else:
+             # Sample from categorical distribution with proper RNG handling
+             if generator is not None:
+                 # Use multinomial with generator for reproducible sampling
+                 probs = logp.exp()
+                 pred_next = torch.multinomial(probs.view(-1, probs.size(-1)), 1, generator=generator).squeeze(-1).view(probs.shape[:-1])
+             else:
+                 pred_next = torch.distributions.Categorical(logits=logp).sample()
+
+         conf_next = torch.gather(logp, -1, pred_next.unsqueeze(-1)).squeeze(-1)
+
+         p = logp.exp()
+         ent_next = -(p * logp).sum(-1)
+
+         # Shift predictions: pos i predicts token i+1
+         pred_i = tokens.clone()
+         conf_i = torch.full_like(conf_next, torch.finfo(conf_next.dtype).min)
+         ent_i = torch.zeros_like(ent_next)
+
+         pred_i[:, 1:] = pred_next[:, :-1]
+         conf_i[:, 1:] = conf_next[:, :-1]
+         ent_i[:, 1:] = ent_next[:, :-1]
+
+         return pred_i, conf_i, ent_i
+
+     pred_i, conf_i, ent_i = forward_scores(xt)
+     total_masked = init_maskable.sum(1, keepdim=True)
+     finf = torch.finfo(conf_i.dtype)
+
+     for step in range(num_steps - 1, 0, -1):
+         rate = step / num_steps
+         cutoff_len = (total_masked * rate).long().clamp(min=0)
+
+         # Choose HIGH-entropy tokens to keep masked
+         sel_scores = ent_i.masked_fill(~maskable, -finf.max)
+         B, L = sel_scores.shape
+         k_max = cutoff_len.max().item()
+         if k_max > 0:
+             _, idx = torch.topk(sel_scores, k_max, dim=-1, largest=True)
+             keep_mask = torch.zeros_like(sel_scores, dtype=torch.bool)
+             for b in range(B):
+                 k_b = int(cutoff_len[b].item())
+                 if k_b > 0:
+                     keep_mask[b, idx[b, :k_b]] = True
+         else:
+             keep_mask = torch.zeros_like(sel_scores, dtype=torch.bool)
+
+         to_unmask = maskable & ~keep_mask
+         if to_unmask.any():
+             xt[to_unmask] = pred_i[to_unmask]
+             maskable[to_unmask] = False
+
+         if visualizer:
+             visualizer.update_step(xt, maskable, num_steps - step, ent_i, conf_i)
+
+         if maskable.any():
+             pred_i, conf_i, ent_i = forward_scores(xt)
+
+     if maskable.any():
+         xt[maskable] = pred_i[maskable]
+
+     if visualizer:
+         visualizer.stop_visualization()
+
+     return xt
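
The sampler can also be called directly, bypassing generate(); a sketch under the same model/tokenizer assumptions, with an illustrative prefix:

import torch
from sampling import diffusion_sample  # local module from this commit

prefix = tokenizer("def fib(n):", return_tensors="pt").input_ids
seq = diffusion_sample(
    model=model,
    seq_len=128,       # prefix + masked region (+ reserved BOS/EOS slots)
    num_steps=64,      # fewer steps: faster, coarser denoising schedule
    greedy=False,      # stochastic unmasking
    temperature=0.8,
    top_p=0.9,         # applied after top_k when both are given
    prefix_ids=prefix,
    generator=torch.Generator(device=model.device).manual_seed(0),  # reproducible
)
print(tokenizer.decode(seq[0]))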
special_tokens_map.json ADDED
@@ -0,0 +1,38 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<|mask|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
+ size 11422654
tokenizer_config.json ADDED
@@ -0,0 +1,249 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151665": {
+       "content": "<tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151666": {
+       "content": "</tool_response>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151667": {
+       "content": "<think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151668": {
+       "content": "</think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151669": {
+       "content": "<|mask|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": null,
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "model_max_length": 131072,
+   "pad_token": "<|endoftext|>",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null,
+   "mask_token": "<|mask|>",
+   "mask_token_id": 151669
+ }
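
A quick consistency sketch: the mask wiring here should agree with mask_token_id (151669) in config.json and generation_config.json; the Hub id remains a placeholder:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("user/rnd1-repo")  # placeholder id
assert tok.mask_token == "<|mask|>"
assert tok.convert_tokens_to_ids("<|mask|>") == 151669
assert tok.eos_token == "<|im_end|>" and tok.pad_token == "<|endoftext|>"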
vocab.json ADDED
The diff for this file is too large to render. See raw diff