leafspark committed on
Commit 4ad3970 · verified · 1 Parent(s): c8fbeeb

add tokenizer and script to run

Files changed (2)
  1. claude_tokenizer.py +87 -0
  2. tokenizer_config.json +0 -0
claude_tokenizer.py ADDED
@@ -0,0 +1,87 @@
import re
import json
from typing import List, Dict


class ClaudeTokenizer:
    def __init__(self, config_file: str):
        with open(config_file, "r") as f:
            config = json.load(f)

        self.vocab = config["vocab"]
        self.vocab_size = config["n_vocab_size"]
        self.pat_str = config["pat_str"]
        self.special_tokens = config["special_tokens"]

        self.token_to_id = {token: i for i, token in enumerate(self.vocab)}
        self.id_to_token = {i: token for token, i in self.token_to_id.items()}

        # Special tokens carry fixed ids from the config and take
        # precedence over any colliding vocab entries.
        for token, token_id in self.special_tokens.items():
            self.token_to_id[token] = token_id
            self.id_to_token[token_id] = token

        self.pat = re.compile(self.pat_str)
        self.vocab_trie = self._build_trie(self.vocab)

    def _build_trie(self, vocab: List[str]) -> Dict:
        # Character trie: each node maps a character to a child node, and
        # a "*" key stores the complete token ending at that node. The
        # isinstance guards skip insertion when a vocab entry containing
        # a literal "*" collides with that sentinel.
        trie = {}
        for token in vocab:
            current = trie
            for char in token:
                if isinstance(current, str):
                    break
                if char not in current:
                    current[char] = {}
                current = current[char]
            if isinstance(current, dict):
                current["*"] = token
        return trie

    def tokenize(self, text: str) -> List[str]:
        # Pre-tokenize with the config regex, then greedily match each part.
        tokens = []
        for part in self.pat.findall(text):
            tokens.extend(self._tokenize_part(part))
        return tokens

    def encode(self, text: str) -> List[int]:
        tokens = self.tokenize(text)
        # Tokens missing from the vocab fall back to the <META> special id.
        return [
            self.token_to_id.get(token, self.special_tokens["<META>"])
            for token in tokens
        ]

    def decode(self, ids: List[int]) -> str:
        # Unknown ids decode to the empty string.
        return "".join(self.id_to_token.get(token_id, "") for token_id in ids)

    def _tokenize_part(self, text: str) -> List[str]:
        # Greedy longest-match: walk the trie as far as the text allows,
        # remembering the last complete token seen; if nothing matches,
        # emit a single character and continue. The isinstance guard
        # mirrors _build_trie and stops the walk if a "*" sentinel value
        # (a string) is reached.
        tokens = []
        while text:
            current = self.vocab_trie
            longest_match = ""
            for char in text:
                if not isinstance(current, dict) or char not in current:
                    break
                current = current[char]
                if "*" in current:
                    longest_match = current["*"]
            if longest_match:
                tokens.append(longest_match)
                text = text[len(longest_match):]
            else:
                tokens.append(text[0])
                text = text[1:]
        return tokens


# Usage example
if __name__ == "__main__":
    tokenizer = ClaudeTokenizer("tokenizer_config.json")

    test_text = (
        "Hello! It's nice to meet you. How can I assist you today? "
        "I'm here to help with any questions you might have or tasks "
        "you need help with."
    )

    tokens = tokenizer.tokenize(test_text)
    print("Tokens:", tokens)

    encoded = tokenizer.encode(test_text)
    print("Encoded:", encoded)

    decoded = tokenizer.decode(encoded)
    print("Decoded:", decoded)
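For reference, _build_trie produces a nested-dict trie in which each character maps to a child node and a "*" key marks a complete token; _tokenize_part walks it greedily. Here is a minimal self-contained sketch of the same scheme with an invented toy vocab (the vocab, the greedy_tokenize name, and the printed output are illustrative only, not part of this commit):

# Toy demonstration of the trie + greedy longest-match scheme above.
# The vocab is invented; the real tokenizer loads its vocab from
# tokenizer_config.json.
toy_vocab = ["he", "hell", "hello", " world"]

trie = {}
for token in toy_vocab:
    node = trie
    for char in token:
        node = node.setdefault(char, {})
    node["*"] = token  # sentinel: a complete token ends here

def greedy_tokenize(text):
    tokens = []
    while text:
        node, match = trie, ""
        for char in text:
            if char not in node:
                break
            node = node[char]
            if "*" in node:
                match = node["*"]  # longest complete token seen so far
        tokens.append(match or text[0])  # fall back to a single character
        text = text[len(match) or 1:]
    return tokens

print(greedy_tokenize("hello world"))  # ['hello', ' world']

Note that greedy longest-match is not guaranteed to reproduce the segmentation of a merge-based BPE encoder, so token counts from this script should be treated as approximate.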
tokenizer_config.json ADDED
The diff for this file is too large to render.
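Although the config itself is not rendered, the script reads exactly four keys from it: vocab (a token list whose indices become ids), n_vocab_size, pat_str (the pre-tokenization regex), and special_tokens, which must include <META> because encode uses it as the unknown-token fallback. To exercise the script without the full config, a minimal stand-in along these lines should work (every value below is an invented placeholder; the real vocab, ids, and regex come from the file in this commit):

# Writes a tiny stand-in tokenizer_config.json with the keys the
# script expects. All values are placeholders, not Anthropic's.
import json

stand_in = {
    "vocab": ["Hello", "!", " It", "'", "s", " nice", " to", " meet", " you", "."],
    "n_vocab_size": 11,
    # Toy pre-tokenization: optional-space words, whitespace, or punctuation.
    "pat_str": r" ?\w+|\s|[^\w\s]",
    "special_tokens": {"<META>": 10},  # id 10 sits just past the 10-entry vocab
}

with open("tokenizer_config.json", "w") as f:
    json.dump(stand_in, f)

With this stand-in in place, python claude_tokenizer.py runs end-to-end, although most words in the longer test sentence fall back to <META> under such a small vocab.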