from string import punctuation

from tokenizers import Regex, Tokenizer
from tokenizers.normalizers import Replace, Sequence, Strip


def replace_normalizer(
    tokenizer: Tokenizer,
) -> Tokenizer:
    """
    Install a punctuation-isolating normalizer on the tokenizer.

    The installed normalizer is a ``Sequence`` made of, in order:

    1. the tokenizer's current normalizer, if it has one;
    2. one ``Replace`` per character in ``string.punctuation``, mapping the
       character to itself with a space on either side;
    3. a ``Replace`` that turns every match of the regex ``\\s+`` into a
       single space;
    4. ``Strip(right=True)``.

    The tokenizer is modified in place and the same object is returned.

    :param tokenizer: The tokenizer whose normalizer should be replaced.
    :return: The same tokenizer, now carrying the combined normalizer.
    """
    # Punctuation padding first, then whitespace collapsing, so the extra
    # spaces introduced by the padding are squeezed in the same pass.
    steps = [Replace(mark, f" {mark} ") for mark in punctuation]
    steps.append(Replace(Regex(r"\s+"), " "))
    # NOTE(review): only `right` is passed; `left` keeps the library default,
    # so confirm whether leading whitespace is stripped as well.
    steps.append(Strip(right=True))

    # Keep any pre-existing normalizer and run it ahead of the new steps.
    existing = tokenizer.normalizer
    if existing is not None:
        steps = [existing, *steps]

    tokenizer.normalizer = Sequence(steps)  # type: ignore
    return tokenizer