import json
import os

from transformers import PreTrainedTokenizer


class I3Tokenizer(PreTrainedTokenizer):
    """Tokenizer that splits text into fixed two-character chunks from a JSON vocabulary."""

    def __init__(self, vocab_file, **kwargs):
        with open(vocab_file, "r", encoding="utf-8") as f:
            vocab_data = json.load(f)

        self.chunk_to_idx = vocab_data["chunk_to_idx"]
        # JSON keys are strings, so convert them back to integer ids.
        self.idx_to_chunk = {int(k): v for k, v in vocab_data["idx_to_chunk"].items()}

        # Load the vocabulary before calling the parent constructor, which may
        # query it while registering special tokens. vocab_size is exposed as a
        # read-only property below, so it is not assigned as an attribute here.
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        # Size of the base vocabulary (excluding any added tokens).
        return len(self.chunk_to_idx)

    def get_vocab(self):
        # Required by PreTrainedTokenizer; return a copy so callers cannot
        # mutate the internal mapping.
        return dict(self.chunk_to_idx)

    def _tokenize(self, text):
        # Greedily scan the text in two-character chunks; characters that do
        # not start a known chunk are skipped.
        text = text.lower()
        pos = 0
        tokens = []
        while pos < len(text):
            chunk = text[pos:pos + 2]
            if chunk in self.chunk_to_idx:
                tokens.append(chunk)
                pos += 2
            else:
                pos += 1
        return tokens

    def _convert_token_to_id(self, token):
        # Unknown tokens fall back to id 0.
        return self.chunk_to_idx.get(token, 0)

    def _convert_id_to_token(self, index):
        # Unknown ids decode to the empty string.
        return self.idx_to_chunk.get(index, "")

    def convert_tokens_to_string(self, tokens):
        # Chunks are raw character spans, so decoding is plain concatenation.
        return "".join(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        # Match the PreTrainedTokenizer API, which passes an optional
        # filename prefix when saving.
        filename = f"{filename_prefix}-tokenizer.json" if filename_prefix else "tokenizer.json"
        vocab_file = os.path.join(save_directory, filename)
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump({
                "chunk_to_idx": self.chunk_to_idx,
                "idx_to_chunk": self.idx_to_chunk,
                "vocab_size": self.vocab_size,
            }, f)
        return (vocab_file,)
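

# Minimal usage sketch, not part of the tokenizer itself. It assumes a
# vocabulary file with the same structure that save_vocabulary writes
# ("chunk_to_idx", "idx_to_chunk", "vocab_size"); the sample vocabulary and
# the "tokenizer.json" path below are purely illustrative.
if __name__ == "__main__":
    sample_vocab = {
        "chunk_to_idx": {"he": 0, "ll": 1, "o ": 2, "wo": 3, "rl": 4},
        "idx_to_chunk": {"0": "he", "1": "ll", "2": "o ", "3": "wo", "4": "rl"},
        "vocab_size": 5,
    }
    with open("tokenizer.json", "w", encoding="utf-8") as f:
        json.dump(sample_vocab, f)

    tokenizer = I3Tokenizer("tokenizer.json")
    tokens = tokenizer.tokenize("hello world")      # ['he', 'll', 'o ', 'wo', 'rl']
    ids = tokenizer.convert_tokens_to_ids(tokens)   # [0, 1, 2, 3, 4]
    print(tokens, ids)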