# i3-12m / tokenization_i3.py
# Author: FlameF0X — "Create tokenization_i3.py" (commit d655584, verified)
import json
import os

from transformers import PreTrainedTokenizer
class I3Tokenizer(PreTrainedTokenizer):
    """Character-bigram ("chunk") tokenizer backed by a JSON vocab file.

    The vocab file must be a JSON object with ``chunk_to_idx``,
    ``idx_to_chunk`` and ``vocab_size`` keys, as written by
    :meth:`save_vocabulary`.
    """

    def __init__(self, vocab_file, **kwargs):
        # Load the vocabulary BEFORE calling super().__init__():
        # PreTrainedTokenizer.__init__ invokes get_vocab()/vocab_size while
        # registering special tokens, which needs these maps to exist.
        with open(vocab_file, "r", encoding="utf-8") as f:
            vocab_data = json.load(f)
        self.chunk_to_idx = vocab_data["chunk_to_idx"]
        # JSON object keys are always strings; restore the integer ids.
        self.idx_to_chunk = {int(k): v for k, v in vocab_data["idx_to_chunk"].items()}
        # NOTE: deliberately NOT assigning self.vocab_size here — vocab_size
        # is a read-only property below, and assigning to it would raise
        # AttributeError at construction time.
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        """Number of base (non-added) tokens in the vocabulary."""
        return len(self.chunk_to_idx)

    def get_vocab(self):
        """Return the full token -> id mapping (required by the base class)."""
        vocab = dict(self.chunk_to_idx)
        # Include any tokens added after construction (base-class bookkeeping).
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Greedy bigram chunking: emit known 2-char chunks, skip unknowns.

        Replicates the original ChunkTokenizer.encode logic: lowercase the
        text, then scan left to right taking 2-character chunks that exist
        in the vocab; on a miss, advance one character (that char is dropped).
        """
        text = text.lower()
        pos = 0
        tokens = []
        while pos < len(text):
            chunk = text[pos:pos + 2]
            if chunk in self.chunk_to_idx:
                tokens.append(chunk)
                pos += 2
            else:
                # Unknown bigram: slide forward one char and retry.
                pos += 1
        return tokens

    def _convert_token_to_id(self, token):
        # Unknown tokens fall back to id 0.
        return self.chunk_to_idx.get(token, 0)

    def _convert_id_to_token(self, index):
        # Unknown ids fall back to the empty string.
        return self.idx_to_chunk.get(index, "")

    def convert_tokens_to_string(self, tokens):
        """Chunks are raw substrings of the input, so joining restores text."""
        return "".join(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocabulary JSON into *save_directory*.

        Accepts the ``filename_prefix`` argument that recent versions of
        transformers pass from ``save_pretrained`` (default keeps the old
        behavior). Returns a 1-tuple with the written file path.
        """
        name = (filename_prefix + "-" if filename_prefix else "") + "tokenizer.json"
        vocab_file = os.path.join(save_directory, name)
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump({
                "chunk_to_idx": self.chunk_to_idx,
                "idx_to_chunk": self.idx_to_chunk,
                "vocab_size": self.vocab_size,
            }, f)
        return (vocab_file,)