pasha committed
Commit: b4d132c
Parent(s): 236bb7f

Version updated

Files changed:
- tokenizer.json (+0 -0)
- tokenizer.py (+40 -20)
- tokenizer_config.json (+5 -1)
tokenizer.json
CHANGED

The diff for this file is too large to render. See raw diff.
tokenizer.py
CHANGED

@@ -1,6 +1,7 @@
 import os
 import json
 import re
+import string
 from typing import List
 
 from tokenizers import pre_tokenizers, decoders, NormalizedString, PreTokenizedString, AddedToken
@@ -12,21 +13,23 @@ DEFAULT_MODEL_NAME = "evilfreelancer/ruMorpheme-v0.2"
 
 END, BEGIN, PAD, UNKNOWN, CAP, ALL_CAPS = 0, 1, 2, 3, 4, 5
 SYSTEM, USER, ASSISTANT, FUNCTION_CALL, FUNCTION_RESPONSE = 6, 7, 8, 9, 10
-SPACE = 11
+SPACE, NEWLINE, TAB = 11, 12, 13
 
 AUXILIARY = [
     "</s>", "<s>", "<pad>", "<unk>", "<cap>", "<all_caps>",
     "system", "user", "assistant", "function_call", "function_response",
-    " ",
+    " ", "\n", "\t"
 ]
 
 NUMBERS = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
+LETTERS_CYRILLIC = list(map(chr, range(ord('а'), ord('я') + 1)))
+LETTERS_LATIN = list(string.ascii_lowercase)
 
 
 class RuMorphemePreTokenizer:
     """
     Pre-tokenizer for RuMorpheme model.
-    Splits on spaces and
+    Splits on spaces, newlines, and tabs, including these as tokens.
     Then, applies morpheme splitting to non-space tokens.
     """
@@ -35,35 +38,47 @@ class RuMorphemePreTokenizer:
         self.model.eval()
 
     def pre_tokenize(self, pretok: PreTokenizedString):
-        # First, split on spaces and
+        # First, split on spaces (including newlines and tabs) and add them as tokens
         pretok.split(self.split_on_spaces)
-
-
+
+        # Apply morpheme or character-level splitting to non-space tokens
+        pretok.split(self.morpheme_or_char_split)
 
     def split_on_spaces(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
         """
-        Splits on spaces and
-        TODO: Need to make performance tests on this function.
+        Splits on spaces, newlines, and tabs, including these as tokens.
         """
         text = str(normalized_string)
-        splits = [
+        splits = [
+            NormalizedString(match.group())
+            for match in re.finditer(r'\s+|\S+', text)
+        ]
+
+        # Convert newlines and tabs to tokens
+        for idx, split in enumerate(splits):
+            if split == "\n":
+                splits[idx] = NormalizedString(AUXILIARY[NEWLINE])
+            elif split == "\t":
+                splits[idx] = NormalizedString(AUXILIARY[TAB])
+
         return splits
 
-    def
+    def morpheme_or_char_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
         """
-
+        Attempts to split the token into morphemes. If the token starts with "UNKNOWN/",
+        splits it into individual characters.
         """
         word = str(normalized_string)
 
-        # If
+        # If the token is whitespace or digits, return as is
         if word.isspace() or word.isdigit():
             return [normalized_string]
 
-        # Ignore
+        # Ignore tokens that are only punctuation or non-alphabetical
         if not any(c.isalpha() for c in word):
             return [normalized_string]
 
-        # Detect capitalization
+        # Detect capitalization and add relevant token if necessary
         cap_token = None
         if word[0].isupper():
             cap_token = NormalizedString(AUXILIARY[CAP])
@@ -73,15 +88,20 @@ class RuMorphemePreTokenizer:
         # Convert word to lowercase for morpheme splitting
         word_lower = word.lower()
 
-        # Make predictions
+        # Make predictions to get morphemes
         all_predictions, all_log_probs = self.model.predict([word_lower])
         morphs, morph_types, _ = labels_to_morphemes(word_lower, all_predictions[0], all_log_probs[0])
 
-        #
-        morpheme_tokens = [
-            NormalizedString(f"{morph_type}/{morph}")
-            for morph, morph_type in zip(morphs, morph_types)
-        ]
+        # Handle unknown tokens by splitting into characters
+        morpheme_tokens = []
+        for morph, morph_type in zip(morphs, morph_types):
+            if morph_type == "UNKNOWN":
+                # Split unknown morpheme into characters
+                char_tokens = [NormalizedString(char) for char in morph]
+                morpheme_tokens.extend(char_tokens)
+            else:
+                # Add as a single morpheme token
+                morpheme_tokens.append(NormalizedString(f"{morph_type}/{morph}"))
 
         # Insert capitalization token if needed
         if cap_token:
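A quick way to sanity-check the new splitting logic outside the tokenizer: the sketch below mirrors the two stages this commit introduces, using plain Python strings in place of tokenizers.NormalizedString, and made-up morpheme predictions in place of the model's output (the "PREF" label and the sample words are illustrative assumptions, not values taken from this repo).

import re

# Stage 1: mirror the r'\s+|\S+' pattern added to split_on_spaces.
# Whitespace runs are kept as tokens rather than discarded.
def split_on_spaces(text: str) -> list[str]:
    return [m.group() for m in re.finditer(r'\s+|\S+', text)]

# Stage 2: mirror the new fallback in morpheme_or_char_split.
# UNKNOWN morphemes are split into single characters; anything else
# becomes one "TYPE/morph" token.
def morpheme_or_char_split(morphs: list[str], morph_types: list[str]) -> list[str]:
    tokens: list[str] = []
    for morph, morph_type in zip(morphs, morph_types):
        if morph_type == "UNKNOWN":
            tokens.extend(list(morph))
        else:
            tokens.append(f"{morph_type}/{morph}")
    return tokens

print(split_on_spaces("Привет\nмир\t42"))
# ['Привет', '\n', 'мир', '\t', '42']
print(morpheme_or_char_split(["при", "вет"], ["PREF", "UNKNOWN"]))
# ['PREF/при', 'в', 'е', 'т']

Since \s+ matches greedily, a run like "\n\n" comes back as one two-character token, so the == "\n" and == "\t" checks in the committed code only remap isolated newlines and tabs to the NEWLINE and TAB auxiliary tokens.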
tokenizer_config.json
CHANGED

@@ -37,11 +37,15 @@
     "clean_up_tokenization_spaces": false,
     "eos_token": "</s>",
     "model_max_length": 1000000000000000019884624838656,
+    "model_name": "./model",
     "pad_token": "<pad>",
     "tokenizer_class": "RuMorphemeTokenizerFast",
     "unk_token": "<unk>",
     "use_fast": true,
     "auto_map": {
-        "AutoTokenizer": [
+        "AutoTokenizer": [
+            "",
+            "tokenizer.RuMorphemeTokenizerFast"
+        ]
     }
 }
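With the new auto_map entry, transformers resolves AutoTokenizer to the RuMorphemeTokenizerFast class shipped in the repo's tokenizer.py; because that class is loaded as remote code, trust_remote_code=True must be passed. A minimal loading sketch (the repo id comes from DEFAULT_MODEL_NAME in tokenizer.py; the sample text is illustrative):

from transformers import AutoTokenizer

# auto_map["AutoTokenizer"] = ["", "tokenizer.RuMorphemeTokenizerFast"]
# points at a fast tokenizer class defined in tokenizer.py inside the
# repository, so loading it requires explicitly allowing remote code.
tokenizer = AutoTokenizer.from_pretrained(
    "evilfreelancer/ruMorpheme-v0.2",
    trust_remote_code=True,
)

print(tokenizer.tokenize("Привет, мир!"))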