# tokenization_shivik_m1.py
# Minimal slow tokenizer wrapper for Shivik-M1 (uses vocab.json).

import json
import os
from typing import Optional

from transformers import PreTrainedTokenizer


class ShivikM1Tokenizer(PreTrainedTokenizer):
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file="vocab.json",
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        **kwargs,
    ):
        # Note: the special-token defaults above are conventional placeholders;
        # adjust them to match the entries actually present in vocab.json.
        # Load the vocabulary before calling super().__init__(), since the base
        # class may query vocab_size / get_vocab() during initialization.
        self.vocab_file = vocab_file
        with open(vocab_file, "r", encoding="utf-8") as fh:
            self.encoder = json.load(fh)
        # Reverse mapping (id -> token); ids may be stored as ints or numeric strings.
        self.decoder = {int(v): k for k, v in self.encoder.items()}
        super().__init__(
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder)

    def _tokenize(self, text):
        # Very simple whitespace tokenizer -- replace with BPE if desired.
        return text.split()

    def _convert_token_to_id(self, token):
        return self.encoder.get(token, self.encoder.get(self.unk_token, 0))

    def _convert_id_to_token(self, index):
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        return " ".join(tokens)

    def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None):
        # Write the vocabulary back out so save_pretrained() round-trips.
        prefix = f"{filename_prefix}-" if filename_prefix else ""
        dest = os.path.join(save_directory, prefix + "vocab.json")
        with open(dest, "w", encoding="utf-8") as fh:
            json.dump(self.encoder, fh, ensure_ascii=False)
        return (dest,)
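

# Usage sketch (illustrative, not part of the shipped module): assumes a
# vocab.json mapping tokens to integer ids exists in the working directory,
# e.g. {"<unk>": 0, "<s>": 1, "</s>": 2, "<pad>": 3, "hello": 4, "world": 5}.
# encode() / decode() come from the PreTrainedTokenizer base class and route
# through the _tokenize / _convert_* hooks defined above.
if __name__ == "__main__":
    tokenizer = ShivikM1Tokenizer(vocab_file="vocab.json")
    ids = tokenizer.encode("hello world", add_special_tokens=False)
    print(ids)                    # whitespace tokens mapped to their ids
    print(tokenizer.decode(ids))  # joined back with spaces: "hello world"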