import logging
import unicodedata
from abc import ABC, abstractmethod
from typing import List


def normalize_unicode_text(text: str) -> str:
    """Return `text` in Unicode NFC form, skipping the copy when it is already normalized."""
    if not unicodedata.is_normalized("NFC", text):
        text = unicodedata.normalize("NFC", text)
    return text


def any_locale_text_preprocessing(text: str) -> str:
    """Normalize `text` to NFC and fold typographic apostrophes to the ASCII apostrophe."""
    res = []
    for c in normalize_unicode_text(text):
        if c in ['’']:
            res.append("'")
        else:
            res.append(c)
    return ''.join(res)
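
# Illustrative example (not part of the original module): the typographic right
# single quotation mark U+2019 is folded to the ASCII apostrophe, so
#   any_locale_text_preprocessing("Joe’s") -> "Joe's"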


class BaseTokenizer(ABC):
    PAD, BLANK, OOV = '<pad>', '<blank>', '<oov>'

    def __init__(self, tokens, *, pad=PAD, blank=BLANK, oov=OOV, sep='', add_blank_at=None):
        """Abstract class for creating an arbitrary tokenizer to convert string to list of int tokens.

        Args:
            tokens: List of tokens.
            pad: Pad token as string.
            blank: Blank token as string.
            oov: OOV token as string.
            sep: Separation token as string.
            add_blank_at: Add blank to labels in the specified order ("last") or after tokens (any other
                non-None value); if None, no blank is added to labels.
        """
        super().__init__()

        tokens = list(tokens)
        # TODO @xueyang: in general, the IDs of pad, sil, blank, and oov should be reserved up front rather
        # than assigned dynamically from the running token count; dynamic assignment gives each of them a
        # different ID depending on the tokenizer configuration.
        self.pad, tokens = len(tokens), tokens + [pad]  # Padding

        if add_blank_at is not None:
            self.blank, tokens = len(tokens), tokens + [blank]  # Reserved for blank from asr-model
        else:
            # use add_blank_at=None only for ASR where blank is added automatically, disable blank here
            self.blank = None

        self.oov, tokens = len(tokens), tokens + [oov]  # Out Of Vocabulary

        if add_blank_at == "last":
            tokens[-1], tokens[-2] = tokens[-2], tokens[-1]
            self.oov, self.blank = self.blank, self.oov

        self.tokens = tokens
        self.sep = sep

        self._util_ids = {self.pad, self.blank, self.oov}
        self._token2id = {l: i for i, l in enumerate(tokens)}
        self._id2token = tokens
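
    # Illustrative ID layout (not part of the original module): with base tokens
    # ['a', 'b', 'c'] and add_blank_at="last", __init__ builds the vocabulary
    # ['a', 'b', 'c', '<pad>', '<oov>', '<blank>'], i.e. pad=3, oov=4, blank=5
    # after the final swap above.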

    def __call__(self, text: str) -> List[int]:
        return self.encode(text)

    @abstractmethod
    def encode(self, text: str) -> List[int]:
        """Turns str text into int tokens."""
        pass

    def decode(self, tokens: List[int]) -> str:
        """Turns int tokens into str text."""
        return self.sep.join(self._id2token[t] for t in tokens if t not in self._util_ids)


class GermanCharsTokenizer(BaseTokenizer):
    _PUNCT_LIST = [
        '!', '"', '(', ')', ',', '-', '.', '/', ':', ';', '?', '[', ']',
        '{', '}', '«', '»', '‒', '–', '—', '‘', '‚', '“', '„', '‹', '›',
    ]
    _CHARSET_STR = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÄÖÜẞabcdefghijklmnopqrstuvwxyzäöüß'

    # Default punctuation set; replaced by `non_default_punct_list` when one is passed to __init__.
    PUNCT_LIST = (
        ',', '.', '!', '?', '-',
        ':', ';', '/', '"', '(',
        ')', '[', ']', '{', '}',
    )

    def __init__(
        self,
        chars=_CHARSET_STR,
        punct=True,
        apostrophe=True,
        add_blank_at=None,
        pad_with_space=True,
        non_default_punct_list=_PUNCT_LIST,
        text_preprocessing_func=any_locale_text_preprocessing,
    ):
        tokens = []
        self.space, tokens = len(tokens), tokens + [' ']  # Space
        tokens.extend(chars)

        if apostrophe:
            tokens.append("'")  # Apostrophe, to preserve contractions such as "don't" and "Joe's"

        if punct:
            if non_default_punct_list is not None:
                self.PUNCT_LIST = non_default_punct_list
            tokens.extend(self.PUNCT_LIST)

        super().__init__(tokens, add_blank_at=add_blank_at)

        self.punct = punct
        self.pad_with_space = pad_with_space
        self.text_preprocessing_func = text_preprocessing_func

    def encode(self, text):
        """See base class."""
        cs, space, tokens = [], self.tokens[self.space], set(self.tokens)

        text = self.text_preprocessing_func(text)
        for c in text:
            # Keep a whitespace only when the previously kept char is not one, collapsing runs of
            # spaces (leading spaces are dropped because `cs` is still empty).
            if c == space and len(cs) > 0 and cs[-1] != space:
                cs.append(c)
            # Keep the current char if it is an alphanumeric or an apostrophe present in the vocabulary.
            elif (c.isalnum() or c == "'") and c in tokens:
                cs.append(c)
            # Keep a single-char punctuation mark.
            elif (c in self.PUNCT_LIST) and self.punct:
                cs.append(c)
            # Warn about unknown chars.
            elif c != space:
                logging.warning(f"Text: [{text}] contains unknown char: [{c}]. Symbol will be skipped.")

        # Remove trailing spaces; the emptiness check guards all-space input against an IndexError.
        while cs and cs[-1] == space:
            cs.pop()

        if self.pad_with_space:
            cs = [space] + cs + [space]

        return [self._token2id[p] for p in cs]
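

# A minimal usage sketch (not part of the original module). `decode` drops only
# the utility tokens, so the round-tripped text keeps the two padding spaces
# added by `pad_with_space`.
if __name__ == "__main__":
    tokenizer = GermanCharsTokenizer()
    ids = tokenizer("Grüß dich, Welt!")   # __call__ delegates to encode()
    print(ids)                            # token IDs, space-padded at both ends
    print(repr(tokenizer.decode(ids)))    # ' Grüß dich, Welt! '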