import json
import os

from transformers import PreTrainedTokenizer


class I3Tokenizer(PreTrainedTokenizer):
    """Tokenizer that splits text into fixed two-character chunks from a JSON vocabulary."""

    def __init__(self, vocab_file, **kwargs):
        with open(vocab_file, "r", encoding="utf-8") as f:
            vocab_data = json.load(f)

        self.chunk_to_idx = vocab_data["chunk_to_idx"]
        # JSON keys are strings, so convert them back to integer ids.
        self.idx_to_chunk = {int(k): v for k, v in vocab_data["idx_to_chunk"].items()}

        # Load the vocabulary before calling the parent constructor, which may
        # query it while registering special tokens. vocab_size is exposed as a
        # read-only property below, so it is not assigned as an attribute here.
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        # Size of the base vocabulary (excluding any added tokens).
        return len(self.chunk_to_idx)

    def get_vocab(self):
        # Required by PreTrainedTokenizer; return a copy so callers cannot
        # mutate the internal mapping.
        return dict(self.chunk_to_idx)

    def _tokenize(self, text):
        # Greedily scan the text in two-character chunks; characters that do
        # not start a known chunk are skipped.
        text = text.lower()
        pos = 0
        tokens = []
        while pos < len(text):
            chunk = text[pos:pos + 2]
            if chunk in self.chunk_to_idx:
                tokens.append(chunk)
                pos += 2
            else:
                pos += 1
        return tokens

    def _convert_token_to_id(self, token):
        # Unknown tokens fall back to id 0.
        return self.chunk_to_idx.get(token, 0)

    def _convert_id_to_token(self, index):
        # Unknown ids decode to the empty string.
        return self.idx_to_chunk.get(index, "")

    def convert_tokens_to_string(self, tokens):
        # Chunks are raw character spans, so decoding is plain concatenation.
        return "".join(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        # Match the PreTrainedTokenizer API, which passes an optional
        # filename prefix when saving.
        filename = f"{filename_prefix}-tokenizer.json" if filename_prefix else "tokenizer.json"
        vocab_file = os.path.join(save_directory, filename)
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump({
                "chunk_to_idx": self.chunk_to_idx,
                "idx_to_chunk": self.idx_to_chunk,
                "vocab_size": self.vocab_size,
            }, f)
        return (vocab_file,)
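

# Minimal usage sketch, not part of the tokenizer itself. It assumes a
# vocabulary file with the same structure that save_vocabulary writes
# ("chunk_to_idx", "idx_to_chunk", "vocab_size"); the sample vocabulary and
# the "tokenizer.json" path below are purely illustrative.
if __name__ == "__main__":
    sample_vocab = {
        "chunk_to_idx": {"he": 0, "ll": 1, "o ": 2, "wo": 3, "rl": 4},
        "idx_to_chunk": {"0": "he", "1": "ll", "2": "o ", "3": "wo", "4": "rl"},
        "vocab_size": 5,
    }
    with open("tokenizer.json", "w", encoding="utf-8") as f:
        json.dump(sample_vocab, f)

    tokenizer = I3Tokenizer("tokenizer.json")
    tokens = tokenizer.tokenize("hello world")      # ['he', 'll', 'o ', 'wo', 'rl']
    ids = tokenizer.convert_tokens_to_ids(tokens)   # [0, 1, 2, 3, 4]
    print(tokens, ids)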