PyTorch · English · nanogpt · custom_code · Eval Results
burtenshaw committed · Commit 788c379 · verified · 1 Parent(s): da16226

Upload folder using huggingface_hub

__init__.py ADDED
@@ -0,0 +1,5 @@
+ from .configuration_nanogpt import NanoGPTConfig
+ from .modeling_nanogpt import NanoGPTModel, NanoGPTChat
+ from .tokenizer_nanogpt import NanoGPTTokenizer, NanoGPTChatTokenizer
+
+
config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "model_type": "nanogpt",
+   "architectures": [
+     "NanoGPTChat"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_nanogpt.NanoGPTConfig",
+     "AutoModel": "modeling_nanogpt.NanoGPTChat",
+     "AutoModelForCausalLM": "modeling_nanogpt.NanoGPTChat",
+     "AutoTokenizer": "tokenizer_nanogpt.NanoGPTChatTokenizer"
+   },
+   "bos_token": "<|bos|>",
+   "eos_token": "<|assistant_end|>",
+   "pad_token": "<|assistant_end|>",
+   "sequence_len": 2048,
+   "vocab_size": 65536,
+   "n_layer": 20,
+   "n_head": 10,
+   "n_kv_head": 10,
+   "n_embd": 1280,
+   "chat_template": "{% if messages[0]['role'] == 'system' %}<|bos|><|user_start|>{{ messages[0]['content'] }}\n\n{{ messages[1]['content'] }}<|user_end|>{% set messages = messages[2:] %}{% else %}<|bos|>{% endif %}{% for message in messages %}{% if loop.index0 % 2 == 0 %}<|user_start|>{{ message['content'] }}<|user_end|>{% else %}<|assistant_start|>{{ message['content'] }}<|assistant_end|>{% endif %}{% endfor %}"
+ }
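
Note: the auto_map above routes AutoConfig, AutoModelForCausalLM and AutoTokenizer to the custom classes shipped in this commit, so loading requires trust_remote_code=True. A minimal loading sketch; the repo id below is a placeholder and is not part of this commit:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    repo_id = "<namespace>/<nanogpt-repo>"  # placeholder: substitute the actual Hub repo id
    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)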
configuration_nanogpt.py ADDED
@@ -0,0 +1,34 @@
+ from transformers import PretrainedConfig
+
+
+ class NanoGPTConfig(PretrainedConfig):
+     model_type = "nanogpt"
+
+     def __init__(
+         self,
+         sequence_len: int = 1024,
+         vocab_size: int = 50304,
+         n_layer: int = 12,
+         n_head: int = 6,
+         n_kv_head: int = 6,
+         n_embd: int = 768,
+         bos_token_id: int = 0,
+         eos_token_id: int = 1,
+         pad_token_id: int = 1,
+         **kwargs,
+     ):
+         self.sequence_len = sequence_len
+         self.vocab_size = vocab_size
+         self.n_layer = n_layer
+         self.n_head = n_head
+         self.n_kv_head = n_kv_head
+         self.n_embd = n_embd
+         super().__init__(
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             pad_token_id=pad_token_id,
+             **kwargs,
+         )
+
+
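
For reference, a small sketch of how this config is instantiated for the d20 checkpoint in this commit; the keyword values are copied from config.json above (the class defaults correspond to a smaller model), everything else is assumed:

    from configuration_nanogpt import NanoGPTConfig

    config = NanoGPTConfig(
        sequence_len=2048,
        vocab_size=65536,
        n_layer=20,
        n_head=10,
        n_kv_head=10,
        n_embd=1280,
    )
    assert config.n_embd % config.n_head == 0  # per-head dimension is 128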
d20/meta_000060.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "model_config": {
+     "sequence_len": 2048,
+     "vocab_size": 65536,
+     "n_layer": 20,
+     "n_head": 10,
+     "n_kv_head": 10,
+     "n_embd": 1280
+   }
+ }
d20/meta_000120.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "model_config": {
+     "sequence_len": 2048,
+     "vocab_size": 65536,
+     "n_layer": 20,
+     "n_head": 10,
+     "n_kv_head": 10,
+     "n_embd": 1280
+   }
+ }
d20/meta_000180.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "model_config": {
+     "sequence_len": 2048,
+     "vocab_size": 65536,
+     "n_layer": 20,
+     "n_head": 10,
+     "n_kv_head": 10,
+     "n_embd": 1280
+   }
+ }
d20/meta_000240.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "model_config": {
+     "sequence_len": 2048,
+     "vocab_size": 65536,
+     "n_layer": 20,
+     "n_head": 10,
+     "n_kv_head": 10,
+     "n_embd": 1280
+   }
+ }
d20/meta_000300.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "model_config": {
+     "sequence_len": 2048,
+     "vocab_size": 65536,
+     "n_layer": 20,
+     "n_head": 10,
+     "n_kv_head": 10,
+     "n_embd": 1280
+   }
+ }
d20/meta_000360.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "model_config": {
+     "sequence_len": 2048,
+     "vocab_size": 65536,
+     "n_layer": 20,
+     "n_head": 10,
+     "n_kv_head": 10,
+     "n_embd": 1280
+   }
+ }
d20/meta_000420.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "model_config": {
+     "sequence_len": 2048,
+     "vocab_size": 65536,
+     "n_layer": 20,
+     "n_head": 10,
+     "n_kv_head": 10,
+     "n_embd": 1280
+   }
+ }
d20/meta_000466.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "model_config": {
+     "sequence_len": 2048,
+     "vocab_size": 65536,
+     "n_layer": 20,
+     "n_head": 10,
+     "n_kv_head": 10,
+     "n_embd": 1280
+   }
+ }
d20/model_000060.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba1199debccab5e267bceaf87e4dfc0ecc479ae920f8867b4700bbbd52200bd6
+ size 2076230219
d20/model_000120.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:809d694a727c414d173dca22ee02333b2eb2fee522fe0d1dabec21518224e2cc
+ size 2076230219
d20/model_000180.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:703f5049b2e3804a3e2cc55a3af444820ada278a42771d4ef5e679da80fa8a88
+ size 2076230219
d20/model_000240.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e58a748174610eaa914aaea9372a47f5c501af757bd7c306d28ac795539d7a68
+ size 2076230219
d20/model_000300.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8a03c2d2143f33c412e502a75dad2e505da01d04aa61ee5d4bab2c2c6a99669d
+ size 2076230219
d20/model_000360.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0dd3c35b34043ef1571ef81ca644d41e3e4f8aa722fffe47f4dd6a9eee9a5684
+ size 2076230219
d20/model_000420.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:03a9c4557759a64c5ce1322107b3d57a295092d6ee62f994d1aec61fcb08d4e7
+ size 2076230219
d20/model_000466.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3a7abb8f892b7aa004f3a54ac54988871e12d099996c371806708f5e9a0bea3c
+ size 2076230219
meta_000650.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "step": 650,
+   "val_loss": 1.0664211511611938,
+   "mmlu_acc": 0.3623046875,
+   "arc_easy_acc": 0.419921875,
+   "gsm8k_acc": 0.03125,
+   "humaneval_acc": 0.03125,
+   "model_config": {
+     "sequence_len": 2048,
+     "vocab_size": 65536,
+     "n_layer": 20,
+     "n_head": 10,
+     "n_kv_head": 10,
+     "n_embd": 1280,
+     "bos_token": "<|bos|>",
+     "eos_token": "<|assistant_end|>",
+     "pad_token": "<|assistant_end|>",
+     "chat_template": "{% if messages[0]['role'] == 'system' %}<|bos|><|user_start|>{{ messages[0]['content'] }}\n\n{{ messages[1]['content'] }}<|user_end|>{% set messages = messages[2:] %}{% else %}<|bos|>{% endif %}{% for message in messages %}{% if loop.index0 % 2 == 0 %}<|user_start|>{{ message['content'] }}<|user_end|>{% else %}<|assistant_start|>{{ message['content'] }}<|assistant_end|>{% endif %}{% endfor %}"
+   }
+ }
model_000650.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff2eee182e2aa396615d3b481ddf17884a7dbabb3caa47f66eede343135accff
+ size 2076230219
modeling_nanogpt.py ADDED
@@ -0,0 +1,386 @@
+ import glob
+ import math
+ import os
+ import shutil
+ from typing import Dict, List, Optional, Tuple
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from huggingface_hub import snapshot_download
+ from transformers import PreTrainedModel
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+ from .configuration_nanogpt import NanoGPTConfig
+
+
+ def _rms_norm(x: torch.Tensor) -> torch.Tensor:
+     return F.rms_norm(x, (x.size(-1),))
+
+
+ def _apply_rotary_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
+     assert x.ndim == 4
+     d = x.shape[3] // 2
+     x1, x2 = x[..., :d], x[..., d:]
+     y1 = x1 * cos + x2 * sin
+     y2 = x1 * (-sin) + x2 * cos
+     out = torch.cat([y1, y2], 3)
+     return out.to(x.dtype)
+
+
+ def _repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
+     if n_rep == 1:
+         return x
+     bs, n_kv_heads, slen, head_dim = x.shape
+     return (
+         x[:, :, None, :, :]
+         .expand(bs, n_kv_heads, n_rep, slen, head_dim)
+         .reshape(bs, n_kv_heads * n_rep, slen, head_dim)
+     )
+
+
+ class CausalSelfAttention(nn.Module):
+     def __init__(self, config: NanoGPTConfig, layer_idx: int):
+         super().__init__()
+         self.layer_idx = layer_idx
+         self.n_head = config.n_head
+         self.n_kv_head = config.n_kv_head
+         self.n_embd = config.n_embd
+         self.head_dim = self.n_embd // self.n_head
+         assert self.n_embd % self.n_head == 0
+         assert self.n_kv_head <= self.n_head and self.n_head % self.n_kv_head == 0
+         self.c_q = nn.Linear(self.n_embd, self.n_head * self.head_dim, bias=False)
+         self.c_k = nn.Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False)
+         self.c_v = nn.Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False)
+         self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=False)
+
+     def forward(self, x: torch.Tensor, cos_sin, kv_cache=None) -> torch.Tensor:
+         B, T, C = x.size()
+         q = self.c_q(x).view(B, T, self.n_head, self.head_dim)
+         k = self.c_k(x).view(B, T, self.n_kv_head, self.head_dim)
+         v = self.c_v(x).view(B, T, self.n_kv_head, self.head_dim)
+         cos, sin = cos_sin
+         q, k = _apply_rotary_emb(q, cos, sin), _apply_rotary_emb(k, cos, sin)
+         q, k = _rms_norm(q), _rms_norm(k)
+         q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
+         Tq = q.size(2)
+         Tk = k.size(2)
+         nrep = self.n_head // self.n_kv_head
+         k, v = _repeat_kv(k, nrep), _repeat_kv(v, nrep)
+         if Tq == Tk:
+             y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
+         elif Tq == 1:
+             y = F.scaled_dot_product_attention(q, k, v, is_causal=False)
+         else:
+             attn_mask = torch.zeros((Tq, Tk), dtype=torch.bool, device=q.device)
+             prefix_len = Tk - Tq
+             if prefix_len > 0:
+                 attn_mask[:, :prefix_len] = True
+             attn_mask[:, prefix_len:] = torch.tril(torch.ones((Tq, Tq), dtype=torch.bool, device=q.device))
+             y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
+         y = y.transpose(1, 2).contiguous().view(B, T, -1)
+         y = self.c_proj(y)
+         return y
+
+     def forward_with_cache(
+         self,
+         x: torch.Tensor,
+         cos_sin,
+         past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         use_cache: bool = False,
+     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
+         B, T, _ = x.size()
+         q = self.c_q(x).view(B, T, self.n_head, self.head_dim)
+         k = self.c_k(x).view(B, T, self.n_kv_head, self.head_dim)
+         v = self.c_v(x).view(B, T, self.n_kv_head, self.head_dim)
+         cos, sin = cos_sin
+         q, k = _apply_rotary_emb(q, cos, sin), _apply_rotary_emb(k, cos, sin)
+         q, k = _rms_norm(q), _rms_norm(k)
+         q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
+
+         if past_key_value is not None:
+             past_k, past_v = past_key_value
+             if past_k is not None and past_v is not None:
+                 k = torch.cat([past_k, k], dim=2)
+                 v = torch.cat([past_v, v], dim=2)
+
+         present = (k, v) if use_cache else None
+
+         Tq = q.size(2)
+         Tk = k.size(2)
+         nrep = self.n_head // self.n_kv_head
+         k_rep = _repeat_kv(k, nrep)
+         v_rep = _repeat_kv(v, nrep)
+
+         attn_mask = None
+         if attention_mask is not None:
+             attn_mask = attention_mask.to(dtype=torch.bool, device=q.device)
+             if attn_mask.dim() == 2:
+                 attn_mask = attn_mask[:, None, None, :]
+             elif attn_mask.dim() == 4:
+                 pass
+             else:
+                 raise ValueError("Unsupported attention_mask dimensions")
+             if attn_mask.size(-1) != Tk:
+                 attn_mask = torch.nn.functional.pad(attn_mask, (Tk - attn_mask.size(-1), 0))
+             attn_mask = (~attn_mask).to(dtype=q.dtype) * -1e4
+
+         if Tq == Tk:
+             y = F.scaled_dot_product_attention(q, k_rep, v_rep, attn_mask=attn_mask, is_causal=True)
+         else:
+             y = F.scaled_dot_product_attention(q, k_rep, v_rep, attn_mask=attn_mask, is_causal=False)
+
+         y = y.transpose(1, 2).contiguous().view(B, T, -1)
+         y = self.c_proj(y)
+         return y, present
+
+
+ class MLP(nn.Module):
+     def __init__(self, config: NanoGPTConfig):
+         super().__init__()
+         self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=False)
+         self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=False)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.c_fc(x)
+         x = F.relu(x).square()
+         x = self.c_proj(x)
+         return x
+
+
+ class Block(nn.Module):
+     def __init__(self, config: NanoGPTConfig, layer_idx: int):
+         super().__init__()
+         self.attn = CausalSelfAttention(config, layer_idx)
+         self.mlp = MLP(config)
+
+     def forward(self, x: torch.Tensor, cos_sin, kv_cache=None) -> torch.Tensor:
+         x = x + self.attn(_rms_norm(x), cos_sin, kv_cache)
+         x = x + self.mlp(_rms_norm(x))
+         return x
+
+
+ class NanoGPTModel(PreTrainedModel):
+     config_class = NanoGPTConfig
+
+     _CANONICAL_WEIGHT_NAMES = (
+         "pytorch_model.bin",
+         "model.safetensors",
+         "model.ckpt.index",
+         "tf_model.h5",
+         "flax_model.msgpack",
+     )
+     _PT_PATTERN = "model_*.pt"
+
+     @classmethod
+     def _snapshot_kwargs(cls, source_kwargs: Dict) -> Dict:
+         keys = {
+             "cache_dir",
+             "force_download",
+             "local_files_only",
+             "proxies",
+             "resume_download",
+             "revision",
+             "token",
+             "use_auth_token",
+         }
+         return {k: source_kwargs[k] for k in keys if k in source_kwargs}
+
+     @classmethod
+     def _resolve_checkpoint_dir(cls, pretrained_model_name_or_path, subfolder=None, **kwargs):
+         if os.path.isdir(pretrained_model_name_or_path):
+             base_dir = pretrained_model_name_or_path
+         else:
+             snapshot_params = cls._snapshot_kwargs(kwargs)
+             token = snapshot_params.pop("token", None)
+             if token is None:
+                 token = snapshot_params.pop("use_auth_token", None)
+             if token is not None:
+                 snapshot_params["token"] = token
+             base_dir = snapshot_download(pretrained_model_name_or_path, **snapshot_params)
+         if subfolder:
+             base_dir = os.path.join(base_dir, subfolder)
+         cls._ensure_canonical_weights(base_dir)
+         return base_dir
+
+     @classmethod
+     def _ensure_canonical_weights(cls, checkpoint_dir):
+         for name in cls._CANONICAL_WEIGHT_NAMES:
+             candidate = os.path.join(checkpoint_dir, name)
+             if os.path.isfile(candidate):
+                 return candidate
+         pt_candidates = sorted(
+             glob.glob(os.path.join(checkpoint_dir, cls._PT_PATTERN)),
+             reverse=True,
+         )
+         if not pt_candidates:
+             raise FileNotFoundError(
+                 f"No checkpoint weights found in {checkpoint_dir}. Expected one of {cls._CANONICAL_WEIGHT_NAMES} "
+                 f"or files matching {cls._PT_PATTERN}."
+             )
+         source_path = pt_candidates[0]
+         target_path = os.path.join(checkpoint_dir, "pytorch_model.bin")
+         if (
+             not os.path.isfile(target_path)
+             or os.path.getmtime(source_path) > os.path.getmtime(target_path)
+         ):
+             shutil.copyfile(source_path, target_path)
+         return target_path
+
+     def __init__(self, config: NanoGPTConfig):
+         super().__init__(config)
+         config.use_cache = getattr(config, "use_cache", True)
+         config.num_hidden_layers = config.n_layer
+         config.num_attention_heads = config.n_head
+         config.hidden_size = config.n_embd
+         self.transformer = nn.ModuleDict({
+             "wte": nn.Embedding(config.vocab_size, config.n_embd),
+             "h": nn.ModuleList([Block(config, layer_idx) for layer_idx in range(config.n_layer)]),
+         })
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+         self.rotary_seq_len = config.sequence_len * 10
+         head_dim = config.n_embd // config.n_head
+         cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim)
+         self.register_buffer("cos", cos, persistent=False)
+         self.register_buffer("sin", sin, persistent=False)
+         # keep the token embedding in bfloat16; activations are upcast to fp32 in forward
+         self.transformer.wte.to(dtype=torch.bfloat16)
+
+         # following HF API expectations
+         self.post_init()
+
+     def _init_weights(self, module: nn.Module):
+         if isinstance(module, nn.Linear):
+             fan_out = module.weight.size(0)
+             fan_in = module.weight.size(1)
+             std = 1.0 / math.sqrt(fan_in) * min(1.0, math.sqrt(fan_out / fan_in))
+             torch.nn.init.normal_(module.weight, mean=0.0, std=std)
+             if module.bias is not None:
+                 torch.nn.init.zeros_(module.bias)
+         elif isinstance(module, nn.Embedding):
+             torch.nn.init.normal_(module.weight, mean=0.0, std=1.0)
+
+     def _precompute_rotary_embeddings(self, seq_len: int, head_dim: int, base: int = 10000, device=None):
+         if device is None:
+             device = self.transformer.wte.weight.device
+         # Handle meta device case - use CPU as fallback
+         if device.type == 'meta':
+             device = torch.device('cpu')
+         channel_range = torch.arange(0, head_dim, 2, dtype=torch.float32, device=device)
+         inv_freq = 1.0 / (base ** (channel_range / head_dim))
+         t = torch.arange(seq_len, dtype=torch.float32, device=device)
+         freqs = torch.outer(t, inv_freq)
+         cos, sin = freqs.cos(), freqs.sin()
+         cos, sin = cos.bfloat16(), sin.bfloat16()
+         cos, sin = cos[None, :, None, :], sin[None, :, None, :]
+         return cos, sin
+
+     def _apply_softcap(self, logits: torch.Tensor) -> torch.Tensor:
+         softcap = 15
+         return softcap * torch.tanh(logits / softcap)
+
+     def _forward_impl(self, idx: torch.Tensor, cos_sin, kv_cache=None) -> torch.Tensor:
+         x = self.transformer.wte(idx)
+         x = x.float()
+         x = _rms_norm(x)
+         for block in self.transformer.h:
+             x = block(x, cos_sin, kv_cache)
+         x = _rms_norm(x)
+         logits = self.lm_head(x)
+         return self._apply_softcap(logits)
+
+     def forward(self, input_ids: torch.Tensor, labels=None, loss_reduction: str = 'mean', **kwargs):
+         idx = input_ids
+         B, T = idx.size()
+         T0 = 0
+         cos_sin = self.cos[:, T0:T0+T], self.sin[:, T0:T0+T]
+         logits = self._forward_impl(idx, cos_sin, kv_cache=None)
+         loss = None
+         if labels is not None:
+             loss = F.cross_entropy(
+                 logits.view(-1, logits.size(-1)),
+                 labels.view(-1),
+                 ignore_index=-1,
+                 reduction=loss_reduction,
+             )
+         return {"loss": loss, "logits": logits}
+
+
+ class NanoGPTChat(NanoGPTModel):
+     """Chat-optimized variant with HF-friendly generate and support for KV cache."""
+
+     def __init__(self, config: NanoGPTConfig):
+         super().__init__(config)
+         self.use_cache = getattr(config, "use_cache", True)
+
+     def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs):
+         if past_key_values is not None:
+             input_ids = input_ids[:, -1:]
+         return {"input_ids": input_ids, "past_key_values": past_key_values, **kwargs}
+
+     def _expand_past_length(self, past_key_values):
+         if not past_key_values:
+             return 0
+         past_k, _ = past_key_values[0]
+         if past_k is None:
+             return 0
+         return past_k.size(2)
+
+     def forward(
+         self,
+         input_ids: torch.Tensor,
+         attention_mask: Optional[torch.Tensor] = None,
+         past_key_values: Optional[List[Tuple[torch.Tensor, torch.Tensor]]] = None,
+         use_cache: Optional[bool] = None,
+         labels: Optional[torch.Tensor] = None,
+         loss_reduction: str = "mean",
+         **kwargs,
+     ) -> CausalLMOutputWithPast:
+         idx = input_ids
+         B, T = idx.size()
+         use_cache = self.use_cache if use_cache is None else use_cache
+         past_length = self._expand_past_length(past_key_values)
+         T0 = past_length
+         cos_sin = self.cos[:, T0:T0+T], self.sin[:, T0:T0+T]
+
+         x = self.transformer.wte(idx)
+         x = x.float()
+         x = _rms_norm(x)
+
+         presents = [] if use_cache else None
+         for layer_idx, block in enumerate(self.transformer.h):
+             past = None
+             if past_key_values is not None and past_key_values[layer_idx] is not None:
+                 past = past_key_values[layer_idx]
+             attn_output, present = block.attn.forward_with_cache(
+                 _rms_norm(x),
+                 cos_sin,
+                 past_key_value=past,
+                 attention_mask=attention_mask,
+                 use_cache=use_cache,
+             )
+             x = x + attn_output
+             x = x + block.mlp(_rms_norm(x))
+             if use_cache:
+                 presents.append(present)
+
+         x = _rms_norm(x)
+         logits = self.lm_head(x)
+         loss = None
+         if labels is not None:
+             loss = F.cross_entropy(
+                 logits.view(-1, logits.size(-1)),
+                 labels.view(-1),
+                 ignore_index=-1,
+                 reduction=loss_reduction,
+             )
+
+         return CausalLMOutputWithPast(
+             loss=loss,
+             logits=logits,
+             past_key_values=presents,
+         )
+
+
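
A hedged generation sketch against the classes above: NanoGPTChat returns CausalLMOutputWithPast and implements prepare_inputs_for_generation, so the stock generate loop should be able to drive it. The repo id and sampling settings are placeholders, not taken from this commit:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    repo_id = "<namespace>/<nanogpt-repo>"  # placeholder
    tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
    model.eval()

    messages = [{"role": "user", "content": "Write a haiku about attention."}]
    # apply_chat_template(tokenize=True) returns a plain list of token ids (see tokenizer_nanogpt.py below)
    ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)
    input_ids = torch.tensor([ids])

    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_new_tokens=128,
            do_sample=True,
            temperature=0.8,
            eos_token_id=tokenizer.eos_token_id,
        )
    print(tokenizer.decode(output_ids[0].tolist()[input_ids.shape[1]:]))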
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff2eee182e2aa396615d3b481ddf17884a7dbabb3caa47f66eede343135accff
+ size 2076230219
token_bytes.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e1b6cdee5d02fe1018b2b1d2ae5b736be665f9c0e7d10c81dcf935e7efaf8cb5
+ size 263721
tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8467414b90511a50c4dac438af25c075817e9d62d799a5ef613b186c977f5d1b
+ size 846518
tokenizer_config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenizer_nanogpt.NanoGPTChatTokenizer",
+       null
+     ]
+   },
+   "tokenizer_class": "NanoGPTChatTokenizer",
+   "chat_template": "{% if messages[0]['role'] == 'system' %}<|bos|><|user_start|>{{ messages[0]['content'] }}\n\n{{ messages[1]['content'] }}<|user_end|>{% set messages = messages[2:] %}{% else %}<|bos|>{% endif %}{% for message in messages %}{% if loop.index0 % 2 == 0 %}<|user_start|>{{ message['content'] }}<|user_end|>{% else %}<|assistant_start|>{{ message['content'] }}<|assistant_end|>{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant_start|>{% endif %}"
+ }
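
For reference, what the chat_template above renders for a system plus user turn with add_generation_prompt=True; this is derived by hand from the template string (the system message is folded into the first user turn), not from running the repo:

    messages = [
        {"role": "system", "content": "You are terse."},
        {"role": "user", "content": "Hi"},
    ]
    # rendered prompt (newlines shown escaped):
    # <|bos|><|user_start|>You are terse.\n\nHi<|user_end|><|assistant_start|>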
tokenizer_nanogpt.py ADDED
@@ -0,0 +1,362 @@
+ import os
+ import pickle
+ import shutil
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple
+
+ from huggingface_hub import hf_hub_download, snapshot_download
+ from huggingface_hub.utils import HfHubHTTPError
+ from transformers import PreTrainedTokenizer
+
+
+ class _BaseNanoGPTTokenizer:
+     """Lightweight wrapper used by the base (non-chat) checkpoints."""
+
+     special_tokens = {
+         "bos": "<|bos|>",
+         "user_start": "<|user_start|>",
+         "user_end": "<|user_end|>",
+         "assistant_start": "<|assistant_start|>",
+         "assistant_end": "<|assistant_end|>",
+         "python_start": "<|python_start|>",
+         "python_end": "<|python_end|>",
+         "output_start": "<|output_start|>",
+         "output_end": "<|output_end|>",
+     }
+
+     def __init__(self, enc):
+         self.enc = enc
+         self.bos_token_id = enc.encode_single_token(self.special_tokens["bos"])
+
+     @classmethod
+     def register_for_auto_class(cls, auto_class="AutoTokenizer"):
+         pass
+
+     @classmethod
+     def _load_encoding(cls, pretrained_model_name_or_path, **kwargs):
+         subfolder = kwargs.get("subfolder")
+         base_path = (
+             os.path.join(pretrained_model_name_or_path, subfolder)
+             if subfolder
+             else pretrained_model_name_or_path
+         )
+         local_tok_path = os.path.join(base_path, "tokenizer.pkl")
+         if os.path.isfile(local_tok_path):
+             with open(local_tok_path, "rb") as f:
+                 return pickle.load(f)
+
+         snapshot_kwargs = {k: kwargs[k] for k in kwargs if k in {
+             "cache_dir",
+             "force_download",
+             "local_files_only",
+             "proxies",
+             "resume_download",
+             "revision",
+             "token",
+             "use_auth_token",
+         }}
+         token = snapshot_kwargs.pop("token", None)
+         if token is None:
+             token = snapshot_kwargs.pop("use_auth_token", None)
+         if token is not None:
+             snapshot_kwargs["token"] = token
+
+         snapshot_dir = snapshot_download(pretrained_model_name_or_path, **snapshot_kwargs)
+         tok_path = os.path.join(snapshot_dir, subfolder, "tokenizer.pkl") if subfolder else os.path.join(snapshot_dir, "tokenizer.pkl")
+         if not os.path.isfile(tok_path):
+             try:
+                 tok_path = hf_hub_download(
+                     repo_id=pretrained_model_name_or_path,
+                     filename="tokenizer.pkl",
+                     subfolder=subfolder,
+                     **snapshot_kwargs,
+                 )
+             except (HfHubHTTPError, OSError) as e:
+                 raise ValueError(
+                     f"Could not load tokenizer.pkl from {pretrained_model_name_or_path}. "
+                     f"Make sure the path exists or the repo is accessible on the Hub."
+                 ) from e
+         with open(tok_path, "rb") as f:
+             return pickle.load(f)
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+         enc = cls._load_encoding(pretrained_model_name_or_path, **kwargs)
+         return cls(enc)
+
+     def encode(self, text, prepend=None):
+         ids = self.enc.encode_ordinary(text)
+         if prepend is not None:
+             prepend_id = prepend if isinstance(prepend, int) else self.enc.encode_single_token(prepend)
+             ids.insert(0, prepend_id)
+         return ids
+
+     def decode(self, ids):
+         return self.enc.decode(ids)
+
+     def get_bos_token_id(self):
+         return self.bos_token_id
+
+     def encode_special(self, token):
+         return self.enc.encode_single_token(token)
+
+
+ class NanoGPTTokenizer(_BaseNanoGPTTokenizer):
+     pass
+
+
+ class NanoGPTChatTokenizer(PreTrainedTokenizer):
+     """Transformers-compatible tokenizer with chat helpers."""
+
+     vocab_files_names = {"vocab_file": "tokenizer.pkl"}
+     model_input_names = ["input_ids"]
+
+     _special_tokens = {
+         "bos": "<|bos|>",
+         "user_start": "<|user_start|>",
+         "user_end": "<|user_end|>",
+         "assistant_start": "<|assistant_start|>",
+         "assistant_end": "<|assistant_end|>",
+         "python_start": "<|python_start|>",
+         "python_end": "<|python_end|>",
+         "output_start": "<|output_start|>",
+         "output_end": "<|output_end|>",
+     }
+
+     def __init__(
+         self,
+         vocab_file: str,
+         bos_token: str = "<|bos|>",
+         eos_token: str = "<|assistant_end|>",
+         pad_token: Optional[str] = None,
+         **kwargs,
+     ) -> None:
+         # Load encoding and build vocab mappings before parent init
+         with open(vocab_file, "rb") as f:
+             self.enc = pickle.load(f)
+         self.vocab_file = vocab_file
+
+         self.special_token_ids: Dict[str, int] = {
+             name: self.enc.encode_single_token(token)
+             for name, token in self._special_tokens.items()
+         }
+         self.bos_token_id = self.special_token_ids["bos"]
+         self.eos_token_id = self.special_token_ids["assistant_end"]
+         pad_token = pad_token or eos_token
+         self.pad_token_id = self.special_token_ids["assistant_end"]
+
+         self._build_vocabulary()
+
+         super().__init__(
+             bos_token=bos_token,
+             eos_token=eos_token,
+             pad_token=pad_token,
+             **kwargs,
+         )
+
+         additional_special_tokens = [
+             token
+             for key, token in self._special_tokens.items()
+             if token not in {bos_token, eos_token, pad_token}
+         ]
+         if additional_special_tokens:
+             self.add_special_tokens({"additional_special_tokens": additional_special_tokens})
+         self.chat_template = kwargs.get("chat_template", getattr(self, "chat_template", None))
+
+     # ------------------------------------------------------------------
+     # Core tokenizer API
+     # ------------------------------------------------------------------
+     def _build_vocabulary(self) -> None:
+         id_to_token: Dict[int, str] = {}
+         token_to_id: Dict[str, int] = {}
+         for idx in range(self.enc.n_vocab):
+             token_bytes = self.enc.decode_single_token_bytes(idx)
+             token_str = token_bytes.decode("utf-8", errors="replace")
+             id_to_token[idx] = token_str
+             token_to_id[token_str] = idx
+         self._id_to_token = id_to_token
+         self._token_to_id = token_to_id
+
+     def get_vocab(self) -> Dict[str, int]:
+         return dict(self._token_to_id)
+
+     @property
+     def vocab_size(self) -> int:  # type: ignore[override]
+         return self.enc.n_vocab
+
+     def _tokenize(self, text: str, **kwargs) -> List[str]:
+         ids = self.enc.encode_ordinary(text)
+         return [self._id_to_token[i] for i in ids]
+
+     def _convert_token_to_id(self, token: str) -> int:
+         if token in self._token_to_id:
+             return self._token_to_id[token]
+         raise KeyError(f"Token not found in vocabulary: {token}")
+
+     def _convert_id_to_token(self, index: int) -> str:
+         return self._id_to_token[index]
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:  # type: ignore[override]
+         ids = [self._token_to_id[token] for token in tokens]
+         return self.enc.decode(ids)
+
+     def build_inputs_with_special_tokens(  # type: ignore[override]
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None,
+     ) -> List[int]:
+         if token_ids_1 is not None:
+             return token_ids_0 + token_ids_1
+         return token_ids_0
+
+     def get_special_tokens_mask(  # type: ignore[override]
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None,
+     ) -> List[int]:
+         all_ids = token_ids_0 if token_ids_1 is None else token_ids_0 + token_ids_1
+         # compare against the special-token ids, not the name keys of the mapping
+         special_ids = set(self.special_token_ids.values())
+         return [1 if token_id in special_ids else 0 for token_id in all_ids]
+
+     def num_special_tokens_to_add(self, pair: bool = False) -> int:  # type: ignore[override]
+         return 0
+
+     def save_vocabulary(
+         self,
+         save_directory: str,
+         filename_prefix: Optional[str] = None,
+     ) -> Tuple[str]:  # type: ignore[override]
+         os.makedirs(save_directory, exist_ok=True)
+         filename = "tokenizer.pkl"
+         if filename_prefix is not None:
+             filename = f"{filename_prefix}-{filename}"
+         save_path = os.path.join(save_directory, filename)
+         shutil.copyfile(self.vocab_file, save_path)
+         return (save_path,)
+
+     # ------------------------------------------------------------------
+     # Chat helpers
+     # ------------------------------------------------------------------
+     def encode_special(self, token: str) -> int:
+         if token in self.special_token_ids:
+             return self.special_token_ids[token]
+         return self._token_to_id[token]
+
+     def _encode_text(self, text: str) -> List[int]:
+         return self.enc.encode_ordinary(text)
+
+     def _encode_python_block(self, token_id: int, content: str) -> List[int]:
+         tokens = [token_id]
+         tokens.extend(self._encode_text(content))
+         closing = {
+             self.special_token_ids["python_start"]: self.special_token_ids["python_end"],
+             self.special_token_ids["output_start"]: self.special_token_ids["output_end"],
+         }[token_id]
+         tokens.append(closing)
+         return tokens
+
+     def _encode_assistant_content(self, content) -> List[int]:
+         if isinstance(content, str):
+             return self._encode_text(content)
+         if isinstance(content, list):
+             tokens: List[int] = []
+             for part in content:
+                 part_type = part.get("type", "text")
+                 text = part.get("text", "")
+                 if part_type == "text":
+                     tokens.extend(self._encode_text(text))
+                 elif part_type == "python":
+                     tokens.extend(
+                         self._encode_python_block(
+                             self.special_token_ids["python_start"],
+                             text,
+                         )
+                     )
+                 elif part_type == "python_output":
+                     tokens.extend(
+                         self._encode_python_block(
+                             self.special_token_ids["output_start"],
+                             text,
+                         )
+                     )
+                 else:
+                     raise ValueError(f"Unknown assistant content part: {part_type}")
+             return tokens
+         raise ValueError(f"Unsupported assistant content type: {type(content)}")
+
+     def _render_conversation_ids(self, conversation: Sequence[Dict[str, object]]) -> List[int]:
+         if not conversation:
+             raise ValueError("Conversation must contain at least one message")
+         messages = list(conversation)
+         if messages[0]["role"] == "system":
+             if len(messages) < 2 or messages[1]["role"] != "user":
+                 raise ValueError("System message must be followed by a user message")
+             merged = dict(messages[1])
+             merged["content"] = f"{messages[0]['content']}\n\n{messages[1]['content']}"
+             messages = [merged] + messages[2:]
+         ids: List[int] = [self.bos_token_id]
+         for idx, message in enumerate(messages):
+             expected_role = "user" if idx % 2 == 0 else "assistant"
+             role = message.get("role")
+             if role != expected_role:
+                 raise ValueError(f"Expected role {expected_role}, received {role} at index {idx}")
+             content = message.get("content")
+             if expected_role == "user":
+                 start = self.special_token_ids["user_start"]
+                 end = self.special_token_ids["user_end"]
+                 if not isinstance(content, str):
+                     raise ValueError("User messages must contain string content")
+                 ids.append(start)
+                 ids.extend(self._encode_text(content))
+                 ids.append(end)
+             else:
+                 start = self.special_token_ids["assistant_start"]
+                 end = self.special_token_ids["assistant_end"]
+                 ids.append(start)
+                 ids.extend(self._encode_assistant_content(content))
+                 ids.append(end)
+         return ids
+
+     def apply_chat_template(  # type: ignore[override]
+         self,
+         conversation,
+         tokenize: bool = False,
+         add_generation_prompt: bool = False,
+         return_tensors: Optional[str] = None,
+         padding: bool = False,
+         truncation: bool = False,
+         max_length: Optional[int] = None,
+         **kwargs,
+     ):
+         if isinstance(conversation, dict) and "messages" in conversation:
+             messages = conversation["messages"]
+         else:
+             messages = conversation
+         token_ids = self._render_conversation_ids(messages)
+         if add_generation_prompt:
+             token_ids.append(self.special_token_ids["assistant_start"])
+         if tokenize:
+             if return_tensors is not None:
+                 return self(
+                     [token_ids],
+                     add_special_tokens=False,
+                     return_tensors=return_tensors,
+                     padding=padding,
+                     truncation=truncation,
+                     max_length=max_length,
+                     **kwargs,
+                 )
+             return token_ids
+         return self.decode(token_ids, skip_special_tokens=False)
+
+     def encode_chat_message(self, role: str, content: str) -> List[int]:
+         rendered = self.apply_chat_template(
+             [
+                 {"role": role, "content": content},
+             ],
+             tokenize=True,
+             add_generation_prompt=False,
+         )
+         return rendered
+
+