# nanochat-base-d20-test / tokenizer_nanogpt.py
# loocorez's picture
# Upload tokenizer_nanogpt.py with huggingface_hub
# 210c84c verified
import os
import pickle
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import HfHubHTTPError
class NanoGPTTokenizer:
    """Lightweight wrapper over a tiktoken Encoding stored in tokenizer.pkl.

    Provides the minimal encode/decode needed for inference and a
    ``from_pretrained`` constructor so it can be loaded via AutoTokenizer
    with ``trust_remote_code``.
    """

    # Name of the pickled encoding file, both in a local directory and on the Hub.
    _TOKENIZER_FILENAME = "tokenizer.pkl"

    # ``from_pretrained`` keyword arguments that are passed on to
    # ``hf_hub_download`` when the caller supplies them. Everything else
    # (e.g. ``trust_remote_code``) is accepted and ignored.
    _HUB_DOWNLOAD_KWARGS = (
        "revision",
        "cache_dir",
        "token",
        "force_download",
        "local_files_only",
    )

    def __init__(self, enc):
        """Wrap an already-loaded encoding object.

        Args:
            enc: Encoding object. It must provide ``encode_single_token``,
                ``encode_ordinary`` and ``decode``, and must know the
                ``"<|bos|>"`` special token (it is looked up here).
        """
        self.enc = enc
        # Resolved once so get_bos_token_id() is a plain attribute read.
        self.bos_token_id = enc.encode_single_token("<|bos|>")

    @classmethod
    def register_for_auto_class(cls, auto_class="AutoTokenizer"):
        """Required for AutoTokenizer registration.

        Intentionally a no-op: the argument is accepted and ignored.
        """

    @staticmethod
    def _load_encoding(path):
        """Unpickle and return the encoding object stored at ``path``."""
        # SECURITY: pickle.load can execute arbitrary code embedded in the
        # file. Only load tokenizer.pkl from directories or Hub repos you trust.
        with open(path, "rb") as f:
            return pickle.load(f)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        """Load the tokenizer from a local directory or a Hugging Face Hub repo.

        Resolution order:
          1. If ``<pretrained_model_name_or_path>/tokenizer.pkl`` is an existing
             file, it is loaded directly.
          2. Otherwise the argument is treated as a Hub repo ID and
             ``tokenizer.pkl`` is fetched with ``hf_hub_download`` (which
             handles caching and returns the cached file path).

        Args:
            pretrained_model_name_or_path: Local directory or Hub repo ID.
            *args: Accepted for call compatibility; ignored.
            **kwargs: ``revision``, ``cache_dir``, ``token``, ``force_download``
                and ``local_files_only`` are forwarded to ``hf_hub_download``
                when given and not ``None``. All other keywords are ignored.

        Returns:
            A new instance of ``cls`` wrapping the unpickled encoding.

        Raises:
            ValueError: If the Hub download or reading the downloaded file
                fails with ``HfHubHTTPError`` or ``OSError``.
        """
        # First, try to load from a local path.
        local_tok_path = os.path.join(
            pretrained_model_name_or_path, cls._TOKENIZER_FILENAME
        )
        if os.path.isfile(local_tok_path):
            return cls(cls._load_encoding(local_tok_path))

        # Not a local directory: fall back to the Hub. Forward only the
        # download options the caller actually set, so that e.g. a pinned
        # revision or an auth token is honoured instead of silently dropped.
        hub_kwargs = {
            name: kwargs[name]
            for name in cls._HUB_DOWNLOAD_KWARGS
            if kwargs.get(name) is not None
        }
        try:
            tok_path = hf_hub_download(
                repo_id=pretrained_model_name_or_path,
                filename=cls._TOKENIZER_FILENAME,
                **hub_kwargs,
            )
            enc = cls._load_encoding(tok_path)
        except (HfHubHTTPError, OSError) as e:
            raise ValueError(
                f"Could not load tokenizer.pkl from {pretrained_model_name_or_path}. "
                "Make sure the path exists or the repo is accessible on the Hub."
            ) from e
        return cls(enc)

    def encode(self, text, prepend=None):
        """Encode ``text`` to token ids, optionally with one token in front.

        Args:
            text: Text passed to the encoding's ``encode_ordinary``.
            prepend: ``None`` for no prefix, an ``int`` used as a token id
                as-is, or any other value (expected: a special-token string
                such as ``"<|bos|>"``) resolved via ``encode_single_token``.

        Returns:
            The list returned by ``encode_ordinary``, with the prefix id
            inserted at index 0 when ``prepend`` is given.
        """
        ids = self.enc.encode_ordinary(text)
        if prepend is not None:
            if isinstance(prepend, int):
                prepend_id = prepend
            else:
                prepend_id = self.enc.encode_single_token(prepend)
            ids.insert(0, prepend_id)
        return ids

    def decode(self, ids):
        """Decode token ids by delegating to the wrapped encoding's ``decode``."""
        return self.enc.decode(ids)

    def get_bos_token_id(self):
        """Return the id of the ``"<|bos|>"`` token resolved at construction."""
        return self.bos_token_id