leafspark committed on
Commit 4ad3970 · verified · 1 Parent(s): c8fbeeb

add tokenizer and script to run

Files changed (2)
  1. claude_tokenizer.py +87 -0
  2. tokenizer_config.json +0 -0
claude_tokenizer.py ADDED
@@ -0,0 +1,87 @@
import re
import json
from typing import List, Dict


class ClaudeTokenizer:
    def __init__(self, config_file: str):
        with open(config_file, "r") as f:
            config = json.load(f)

        self.vocab = config["vocab"]
        self.vocab_size = config["n_vocab_size"]
        self.pat_str = config["pat_str"]
        self.special_tokens = config["special_tokens"]

        self.token_to_id = {token: i for i, token in enumerate(self.vocab)}
        self.id_to_token = {i: token for token, i in self.token_to_id.items()}

        # Special tokens carry fixed ids from the config and take
        # precedence over any colliding vocab entries.
        for token, token_id in self.special_tokens.items():
            self.token_to_id[token] = token_id
            self.id_to_token[token_id] = token

        self.pat = re.compile(self.pat_str)
        self.vocab_trie = self._build_trie(self.vocab)

    def _build_trie(self, vocab: List[str]) -> Dict:
        # Character trie: each node maps a character to a child node, and
        # a "*" key stores the complete token ending at that node. The
        # isinstance guards skip insertion when a vocab entry containing
        # a literal "*" collides with that sentinel.
        trie = {}
        for token in vocab:
            current = trie
            for char in token:
                if isinstance(current, str):
                    break
                if char not in current:
                    current[char] = {}
                current = current[char]
            if isinstance(current, dict):
                current["*"] = token
        return trie

    def tokenize(self, text: str) -> List[str]:
        # Pre-tokenize with the config regex, then greedily match each part.
        tokens = []
        for part in self.pat.findall(text):
            tokens.extend(self._tokenize_part(part))
        return tokens

    def encode(self, text: str) -> List[int]:
        tokens = self.tokenize(text)
        # Tokens missing from the vocab fall back to the <META> special id.
        return [
            self.token_to_id.get(token, self.special_tokens["<META>"])
            for token in tokens
        ]

    def decode(self, ids: List[int]) -> str:
        # Unknown ids decode to the empty string.
        return "".join(self.id_to_token.get(token_id, "") for token_id in ids)

    def _tokenize_part(self, text: str) -> List[str]:
        # Greedy longest-match: walk the trie as far as the text allows,
        # remembering the last complete token seen; if nothing matches,
        # emit a single character and continue. The isinstance guard
        # mirrors _build_trie and stops the walk if a "*" sentinel value
        # (a string) is reached.
        tokens = []
        while text:
            current = self.vocab_trie
            longest_match = ""
            for char in text:
                if not isinstance(current, dict) or char not in current:
                    break
                current = current[char]
                if "*" in current:
                    longest_match = current["*"]
            if longest_match:
                tokens.append(longest_match)
                text = text[len(longest_match):]
            else:
                tokens.append(text[0])
                text = text[1:]
        return tokens


# Usage example
if __name__ == "__main__":
    tokenizer = ClaudeTokenizer("tokenizer_config.json")

    test_text = (
        "Hello! It's nice to meet you. How can I assist you today? "
        "I'm here to help with any questions you might have or tasks "
        "you need help with."
    )

    tokens = tokenizer.tokenize(test_text)
    print("Tokens:", tokens)

    encoded = tokenizer.encode(test_text)
    print("Encoded:", encoded)

    decoded = tokenizer.decode(encoded)
    print("Decoded:", decoded)
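For reference, _build_trie produces a nested-dict trie in which each character maps to a child node and a "*" key marks a complete token; _tokenize_part walks it greedily. Here is a minimal self-contained sketch of the same scheme with an invented toy vocab (the vocab, the greedy_tokenize name, and the printed output are illustrative only, not part of this commit):

# Toy demonstration of the trie + greedy longest-match scheme above.
# The vocab is invented; the real tokenizer loads its vocab from
# tokenizer_config.json.
toy_vocab = ["he", "hell", "hello", " world"]

trie = {}
for token in toy_vocab:
    node = trie
    for char in token:
        node = node.setdefault(char, {})
    node["*"] = token  # sentinel: a complete token ends here

def greedy_tokenize(text):
    tokens = []
    while text:
        node, match = trie, ""
        for char in text:
            if char not in node:
                break
            node = node[char]
            if "*" in node:
                match = node["*"]  # longest complete token seen so far
        tokens.append(match or text[0])  # fall back to a single character
        text = text[len(match) or 1:]
    return tokens

print(greedy_tokenize("hello world"))  # ['hello', ' world']

Note that greedy longest-match is not guaranteed to reproduce the segmentation of a merge-based BPE encoder, so token counts from this script should be treated as approximate.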
tokenizer_config.json ADDED
The diff for this file is too large to render.
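Although the config itself is not rendered, the script reads exactly four keys from it: vocab (a token list whose indices become ids), n_vocab_size, pat_str (the pre-tokenization regex), and special_tokens, which must include <META> because encode uses it as the unknown-token fallback. To exercise the script without the full config, a minimal stand-in along these lines should work (every value below is an invented placeholder; the real vocab, ids, and regex come from the file in this commit):

# Writes a tiny stand-in tokenizer_config.json with the keys the
# script expects. All values are placeholders, not Anthropic's.
import json

stand_in = {
    "vocab": ["Hello", "!", " It", "'", "s", " nice", " to", " meet", " you", "."],
    "n_vocab_size": 11,
    # Toy pre-tokenization: optional-space words, whitespace, or punctuation.
    "pat_str": r" ?\w+|\s|[^\w\s]",
    "special_tokens": {"<META>": 10},  # id 10 sits just past the 10-entry vocab
}

with open("tokenizer_config.json", "w") as f:
    json.dump(stand_in, f)

With this stand-in in place, python claude_tokenizer.py runs end-to-end, although most words in the longer test sentence fall back to <META> under such a small vocab.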