import logging
import unicodedata
from abc import ABC, abstractmethod
from typing import List


def normalize_unicode_text(text: str) -> str:
    """Apply NFC normalization to the input text, skipping strings that are already normalized."""
    if not unicodedata.is_normalized("NFC", text):
        text = unicodedata.normalize("NFC", text)
    return text
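
# Illustrative check (an added example, not in the original file): NFC folds a
# base letter plus a combining diaeresis into the single precomposed code point,
# so both spellings of "ü" compare equal after normalization:
#     normalize_unicode_text("u\u0308ber") == "\u00fcber"  # True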


def any_locale_text_preprocessing(text: str) -> str:
    """Normalize the text to NFC and replace the typographic apostrophe with its ASCII equivalent."""
    res = []
    for c in normalize_unicode_text(text):
        if c == '’':
            res.append("'")
        else:
            res.append(c)
    return ''.join(res)
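
# Illustrative example (added, not in the original file): the typographic
# apostrophe U+2019 is folded to ASCII, so "don’t" and "don't" produce the
# same token sequence downstream:
#     any_locale_text_preprocessing("don\u2019t") == "don't"  # True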


class BaseTokenizer(ABC):
    PAD, BLANK, OOV = '<pad>', '<blank>', '<oov>'

    def __init__(self, tokens, *, pad=PAD, blank=BLANK, oov=OOV, sep='', add_blank_at=None):
        """Abstract class for creating an arbitrary tokenizer to convert string to list of int tokens.

        Args:
            tokens: List of tokens.
            pad: Pad token as string.
            blank: Blank token as string.
            oov: OOV token as string.
            sep: Separation token as string.
            add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any other
                non-None value); if None, no blank is added to the labels.
        """
        super().__init__()
        tokens = list(tokens)
        # TODO @xueyang: in general, the IDs of pad, sil, blank, and oov should be reserved up front rather
        #   than assigned dynamically from the current number of tokens. The downside of dynamic assignment
        #   is that these utility tokens end up with different IDs for each vocabulary.
        self.pad, tokens = len(tokens), tokens + [pad]  # Padding
        if add_blank_at is not None:
            self.blank, tokens = len(tokens), tokens + [blank]  # Reserved for blank from asr-model
        else:
            # use add_blank_at=None only for ASR where blank is added automatically, disable blank here
            self.blank = None
        self.oov, tokens = len(tokens), tokens + [oov]  # Out Of Vocabulary
        if add_blank_at == "last":
            tokens[-1], tokens[-2] = tokens[-2], tokens[-1]
            self.oov, self.blank = self.blank, self.oov
        self.tokens = tokens
        self.sep = sep
        self._util_ids = {self.pad, self.blank, self.oov}
        self._token2id = {token: i for i, token in enumerate(tokens)}
        self._id2token = tokens
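
    # Worked example of the resulting layout (an illustration, not in the original
    # file): for tokens=['a', 'b'] and add_blank_at="last", __init__ yields
    #     self.tokens == ['a', 'b', '<pad>', '<oov>', '<blank>']
    #     self.pad == 2, self.oov == 3, self.blank == 4
    # while add_blank_at=None yields ['a', 'b', '<pad>', '<oov>'] with blank=None.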

    def __call__(self, text: str) -> List[int]:
        return self.encode(text)

    @abstractmethod
    def encode(self, text: str) -> List[int]:
        """Turns str text into int tokens."""
        pass

    def decode(self, tokens: List[int]) -> str:
        """Turns int tokens into str text, dropping the pad/blank/oov utility tokens."""
        return self.sep.join(self._id2token[t] for t in tokens if t not in self._util_ids)
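
# Decode contract sketch (an illustration, not in the original file): utility IDs
# are filtered out, so for a subclass with the add_blank_at="last" layout above,
# decode([2, 0, 1, 4]) returns 'ab': the pad (2) and blank (4) entries are
# dropped and only 'a' and 'b' remain.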


class GermanCharsTokenizer(BaseTokenizer):
    _PUNCT_LIST = ['!', '"', '(', ')', ',', '-', '.', '/', ':', ';', '?', '[', ']', '{', '}', '«', '»', '‒', '–', '—', '‘', '‚', '“', '„', '‹', '›']
    _CHARSET_STR = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜẞabcdefghijklmnopqrstuvwxyzäöüß'

    # Default punctuation set; replaced by `non_default_punct_list` when one is provided to __init__.
    PUNCT_LIST = (
        ',', '.', '!', '?', '-',
        ':', ';', '/', '"', '(',
        ')', '[', ']', '{', '}',
    )

    def __init__(
        self,
        chars=_CHARSET_STR,
        punct=True,
        apostrophe=True,
        add_blank_at=None,
        pad_with_space=True,
        non_default_punct_list=_PUNCT_LIST,
        text_preprocessing_func=any_locale_text_preprocessing,
    ):
        tokens = []
        self.space, tokens = len(tokens), tokens + [' ']  # Space
        tokens.extend(chars)
        if apostrophe:
            tokens.append("'")  # Apostrophe, to preserve contractions such as "don't" and possessives such as "Joe's"
        if punct:
            if non_default_punct_list is not None:
                self.PUNCT_LIST = non_default_punct_list
            tokens.extend(self.PUNCT_LIST)
        super().__init__(tokens, add_blank_at=add_blank_at)
        self.punct = punct
        self.pad_with_space = pad_with_space
        self.text_preprocessing_func = text_preprocessing_func

    def encode(self, text):
        """See base class."""
        cs, space, tokens = [], self.tokens[self.space], set(self.tokens)

        text = self.text_preprocessing_func(text)
        for c in text:
            # Add a whitespace if the current char is a whitespace while the previous char is not a whitespace.
            if c == space and len(cs) > 0 and cs[-1] != space:
                cs.append(c)
            # Add the current char if it is an alphanumeric character or an apostrophe present in the vocabulary.
            elif (c.isalnum() or c == "'") and c in tokens:
                cs.append(c)
            # Add a single-char punctuation mark.
            elif (c in self.PUNCT_LIST) and self.punct:
                cs.append(c)
            # Warn about unknown chars (leading and repeated whitespace is silently dropped above).
            elif c != space:
                logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")

        # Remove trailing spaces.
        if cs:
            while cs[-1] == space:
                cs.pop()

        if self.pad_with_space:
            cs = [space] + cs + [space]

        return [self._token2id[p] for p in cs]
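

if __name__ == "__main__":
    # Minimal usage sketch (an added illustration, not part of the original file):
    # encode a short German sentence and decode it back. With pad_with_space=True
    # the decoded string is framed by single spaces, and any_locale_text_preprocessing
    # folds the typographic apostrophe to ASCII before lookup.
    tokenizer = GermanCharsTokenizer()
    ids = tokenizer.encode("Grüß dich, Welt!")
    print(ids)
    print(repr(tokenizer.decode(ids)))  # expected: ' Grüß dich, Welt! '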