from __future__ import annotations
import json
import logging
from typing import TYPE_CHECKING, Any, cast
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast
from distiller.model2vec.tokenizer.datamodels import Token
from distiller.model2vec.tokenizer.model import process_tokenizer
from distiller.model2vec.tokenizer.normalizer import replace_normalizer
from distiller.model2vec.tokenizer.pretokenizer import replace_pretokenizer
if TYPE_CHECKING:
import re
from tokenizers.normalizers import Normalizer
from tokenizers.pre_tokenizers import (
PreTokenizer,
)
logger = logging.getLogger(__name__)
_DEFAULT_POST_PROCESSOR_TEMPLATE = {
"type": "TemplateProcessing",
"single": [{"Sequence": {"id": "A", "type_id": 0}}],
"pair": [{"Sequence": {"id": "A", "type_id": 0}}, {"Sequence": {"id": "B", "type_id": 0}}],
"special_tokens": {},
}
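# The template above is minimal: single and paired sequences are passed through with
# type_id 0 and no special tokens are inserted by the post-processor.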
def _remap_added_tokens(
special_tokens: list[dict[str, Any]],
vocabulary: list[str],
) -> list[dict[str, Any]]:
"""
Remap special tokens in the tokenizer.
This function updates the special tokens in the tokenizer based on a mapping provided.
It also ensures that the special tokens are present in the vocabulary.
:param special_tokens: The special tokens to remap.
:param vocabulary: The vocabulary as a list of tokens.
:return: The updated special tokens.
"""
    # Copy each token dict so we don't mutate the caller's list.
special_tokens = [{**x} for x in special_tokens]
for token in special_tokens:
token["id"] = vocabulary.index(token["content"])
return special_tokens
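
# Illustrative sketch (not part of the original module): with the new vocabulary
# ["[PAD]", "[UNK]", "▁hello"], an entry {"content": "[UNK]", "id": 100} is remapped
# so that its id becomes the token's index in the list:
#
#   remapped = _remap_added_tokens(
#       special_tokens=[{"content": "[UNK]", "id": 100}],
#       vocabulary=["[PAD]", "[UNK]", "▁hello"],
#   )
#   assert remapped[0]["id"] == 1
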
def replace_vocabulary(
tokenizer: Tokenizer, new_vocabulary: list[Token], unk_token: str | None, pad_token: str | None
) -> Tokenizer:
"""Replace the vocabulary of a tokenizer with a new one."""
tokenizer_json: dict[str, Any] = json.loads(tokenizer.to_str())
added_tokens: list[dict[str, Any]] = tokenizer_json["added_tokens"]
pre_tokenized_tokens = [x.normalized_form for x in new_vocabulary]
    # Rename the original unk/pad tokens to [UNK]/[PAD] so we can keep them while dropping all other added tokens.
added_tokens = _rename_added_token(unk_token, "[UNK]", added_tokens, pre_tokenized_tokens)
added_tokens = _rename_added_token(pad_token, "[PAD]", added_tokens, pre_tokenized_tokens)
# Remove old added tokens from added tokens
tokenizer_json["added_tokens"] = [x for x in added_tokens if x["content"] in {"[UNK]", "[PAD]"}]
tokenizer_json = process_tokenizer(
tokenizer_json, pre_tokenized_tokens, "[UNK]" if "[UNK]" in pre_tokenized_tokens else None
)
# Remap special tokens
tokenizer_json["added_tokens"] = _remap_added_tokens(
special_tokens=tokenizer_json["added_tokens"],
vocabulary=pre_tokenized_tokens,
)
tokenizer_json["post_processor"] = _DEFAULT_POST_PROCESSOR_TEMPLATE
return Tokenizer.from_str(json.dumps(tokenizer_json))
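
# Flow sketch (summary of the steps above): the JSON round-trip keeps only the renamed
# [UNK]/[PAD] added tokens, rebuilds the model over `pre_tokenized_tokens` via
# process_tokenizer, remaps the special-token ids to their positions in the new
# vocabulary, and resets the post-processor so nothing is added automatically.
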
def _rename_added_token(
form: str | None, new_form: str, added_tokens: list[dict[str, Any]], vocabulary: list[str]
) -> list[dict[str, Any]]:
"""Rename added tokens in the tokenizer."""
if form is None:
return added_tokens
idx = vocabulary.index(form)
added_token = [x for x in added_tokens if x["content"] == form]
if added_token:
added_token[0]["id"] = idx
added_token[0]["content"] = new_form
vocabulary[idx] = new_form
return added_tokens
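
# Illustrative sketch: with vocabulary ["<unk>", "▁hello"] and an added-token entry
# {"content": "<unk>", "id": 3}, calling
#
#   _rename_added_token("<unk>", "[UNK]", added_tokens, vocabulary)
#
# rewrites the entry to {"content": "[UNK]", "id": 0} and mutates the vocabulary
# in place to ["[UNK]", "▁hello"].
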
def clean_and_create_vocabulary(
tokenizer: PreTrainedTokenizerFast,
vocabulary: list[str],
token_remove_regex: re.Pattern | None,
) -> tuple[list[Token], Tokenizer]:
"""Cleans a vocabulary by removing duplicates and tokens that were already in the vocabulary."""
seen_tokens = set()
post_normalize_seen_tokens = set()
n_empty = 0
n_duplicates = 0
backend_tokenizer = tokenizer.backend_tokenizer
# Make a base list of tokens.
internal_vocab: dict[str, int] = tokenizer.get_vocab()
internal_tokens: list[str] = [k for k, _ in sorted(internal_vocab.items(), key=lambda x: x[1])]
cleaned_vocabulary = _process_internal_tokens(tokenizer, backend_tokenizer, internal_tokens, token_remove_regex)
# Copy the backend tokenizer to avoid modifying the original.
backend_tokenizer = backend_tokenizer.from_str(backend_tokenizer.to_str())
backend_tokenizer = replace_normalizer(backend_tokenizer)
internal_tokens_set = {token.form for token in cleaned_vocabulary}
normalizer: Normalizer | None = backend_tokenizer.normalizer
for token in vocabulary:
if normalizer is not None:
token = cast("str", normalizer.normalize_str(token))
if not token:
n_empty += 1
continue
pre_tokenizer: PreTokenizer | None = backend_tokenizer.pre_tokenizer
normalized_token = token
if pre_tokenizer is not None:
normalized_token = _normalize_vocabulary_token(
token=token,
pre_tokenizer=pre_tokenizer,
)
# We need to check whether the pretokenized token is in the vocabulary.
# But we need to return the original token, because that will be tokenized
# again by the tokenizer during featurization.
if normalized_token in seen_tokens or normalized_token in internal_tokens_set:
n_duplicates += 1
continue
# Add the possibly pretokenized token to seen
seen_tokens.add(normalized_token)
        # Once we know the token is new, we normalize it into the form it will take
        # in the new vocabulary. Byte tokens are left as-is; other tokens get a
        # metaspace prefix. For multiword tokens, any internal spaces are replaced
        # with the metaspace or byte prefix character.
if not normalized_token.startswith(("▁", "Ġ")):
normalized_token = normalized_token.replace(" ", "▁")
normalized_token = f"▁{normalized_token}"
else:
normalized_token = normalized_token.replace(" ", normalized_token[0])
if normalized_token in post_normalize_seen_tokens:
n_duplicates += 1
continue
post_normalize_seen_tokens.add(normalized_token)
# Add the original string to the vocabulary.
cleaned_vocabulary.append(
Token(form=token, normalized_form=normalized_token, is_subword=False, is_internal=False)
)
if n_duplicates:
logger.warning(f"Removed {n_duplicates} duplicate tokens.")
if n_empty:
logger.warning(f"Removed {n_empty} empty tokens.")
return cleaned_vocabulary, replace_pretokenizer(backend_tokenizer)
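
# Behaviour sketch (illustrative, assuming a BERT-style word-level pre-tokenizer):
# a user-supplied multiword token keeps its original surface form but gets a
# metaspace-normalized form, e.g.
#
#   tokens, backend = clean_and_create_vocabulary(tokenizer, ["hello world"], None)
#   extra = [t for t in tokens if not t.is_internal]
#   # extra[0] == Token(form="hello world", normalized_form="▁hello▁world",
#   #                   is_subword=False, is_internal=False)
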
def _process_internal_tokens(
tokenizer: PreTrainedTokenizerFast,
backend_tokenizer: Tokenizer,
internal_tokens: list[str],
token_remove_regex: re.Pattern | None,
) -> list[Token]:
"""Clean internal tokens."""
# Get the pad and unk token from the tokenizer.
pad_token: str | None = tokenizer.special_tokens_map.get("pad_token") # type: ignore[assignment]
unk_token: str | None = tokenizer.special_tokens_map.get("unk_token") # type: ignore[assignment]
# Empty set if no pad or unk token is set.
added_tokens_to_keep: set[str] = {x for x in (pad_token, unk_token) if x is not None}
added_tokens_to_remove = set(tokenizer.added_tokens_encoder) - added_tokens_to_keep
cleaned_internal_tokens: list[Token] = []
# Figure out whether token is a subword or not.
encoded = backend_tokenizer.encode(f" {'a' * 25}", add_special_tokens=False)
first_token, second_token, *_ = encoded.tokens
# Isolate the prefix. We can't do first_token[0] because we don't know
# how long the prefix is.
# e.g., "Ġaaaa" -> "Ġ"
a_index = None if "a" not in first_token else first_token.index("a")
word_prefix = first_token[:a_index]
is_byte_prefix = word_prefix == "Ġ"
    # The second token (unpacked above) is the first subword token.
# If a tokenizer uses subwords, this token will have been prefixed.
# We don't know how long the prefix is.
a_index = None if "a" not in second_token else second_token.index("a")
subword_prefix = second_token[:a_index]
pre_tokenizer: PreTokenizer | None = backend_tokenizer.pre_tokenizer
for token in internal_tokens:
        # Create the token objects. If this returns None, it was unsuccessful for some reason.
if token_object := _create_single_internal_token(
token=token,
subword_prefix=subword_prefix,
word_prefix=word_prefix,
pre_tokenizer=pre_tokenizer,
is_byte_prefix=is_byte_prefix,
token_remove_regex=token_remove_regex,
added_tokens_to_keep=added_tokens_to_keep,
added_tokens_to_remove=added_tokens_to_remove,
):
cleaned_internal_tokens.append(token_object)
if len(cleaned_internal_tokens) != len(internal_tokens):
logger.info(
f"Removed {len(internal_tokens) - len(cleaned_internal_tokens)} internal tokens from the vocabulary."
)
return cleaned_internal_tokens
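
# Prefix-detection sketch (illustrative): encoding " aaaaaaaaaaaaaaaaaaaaaaaaa" without
# special tokens yields, depending on the tokenizer family, something like
#   WordPiece (BERT):    ["aaa", "##aaa", ...]  -> word_prefix "",   subword_prefix "##"
#   Byte-level (GPT-2):  ["Ġaaa", "aaa", ...]   -> word_prefix "Ġ",  subword_prefix ""
#   Metaspace (T5-like): ["▁aaa", "aaa", ...]   -> word_prefix "▁",  subword_prefix ""
# The exact splits depend on the vocabulary; only the recovered prefixes matter here.
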
def _create_single_internal_token(
token: str,
subword_prefix: str,
word_prefix: str,
pre_tokenizer: PreTokenizer | None,
is_byte_prefix: bool,
token_remove_regex: re.Pattern | None,
added_tokens_to_keep: set[str],
added_tokens_to_remove: set[str],
) -> Token | None:
"""Create a token object from a string."""
if token in added_tokens_to_remove:
        # Remove any added tokens that we don't explicitly keep (i.e., not the unk/pad tokens).
return None
if token in added_tokens_to_keep:
# Don't put added tokens through the regular motions.
return Token(form=token, normalized_form=token, is_subword=False, is_internal=True)
if token_remove_regex and token_remove_regex.match(token):
# If the regex matches, remove the token.
return None
# A token is a subword if there is a subword prefix and the word
# starts with a subword prefix, or if there is a WORD prefix, and the word
# does not start with this prefix. For metaspace tokenizers, for example:
# "doghouse" -> ["_dog", "house"]
# So we can only tell that "house" is a subword by knowing that it is not prefixed
# and word-initial tokens are.
is_subword = False
if subword_prefix:
is_subword = bool(token.startswith(subword_prefix))
if word_prefix:
is_subword = not bool(token.startswith(word_prefix))
# Byte prefixed tokenizers don't need to be checked.
if pre_tokenizer is not None and not is_byte_prefix:
        # We need to check the form without prefixes. If we have a word prefix,
        # we need to check tokens that are subwords, and the other way around for
        # subword prefixes.
if (subword_prefix and not is_subword) or (word_prefix and is_subword):
            # If the pre-tokenizer splits the bare form into multiple pieces, the token is unreachable.
if len(pre_tokenizer.pre_tokenize_str(token)) > 1:
return None
# Turn a token into a normalized form for later processing.
normalized_form = _create_normalized_form(token, subword_prefix, word_prefix, is_byte_prefix, is_subword)
return Token(form=token, normalized_form=normalized_form, is_subword=is_subword, is_internal=True)
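
# Reachability example (hypothetical): if a WordPiece vocabulary contained a
# word-level token with internal punctuation such as "e-mail", the BERT pre-tokenizer
# would split it into ["e", "-", "mail"], so the token could never be produced by
# tokenization and is dropped here; "##mail" and "mail" are kept.
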
def _create_normalized_form(
token: str, subword_prefix: str, word_prefix: str, is_byte_prefix: bool, is_subword: bool
) -> str:
"""Turn an internal token string into a normalized form."""
# We don't need to check byte prefixed strings.
if is_byte_prefix:
return token
# We need to check if the token is a subword or not and remove the prefix.
if is_subword:
return token.removeprefix(subword_prefix)
# If the token is not a subword, we need to remove the word prefix, and add metaspace.
return f"▁{token.removeprefix(word_prefix)}"
def turn_tokens_into_ids(
tokens: list[Token], tokenizer: PreTrainedTokenizerFast, unk_token: str | None
) -> list[list[int]]:
"""
Convert a list of Token objects to their corresponding token ID sequences.
:param tokens: List of Token objects to convert
:param tokenizer: The tokenizer to use for converting tokens to IDs
:param unk_token: The string form of the unk token.
    :return: A list of token ID sequences, one per input token.
"""
unk_id = None if unk_token is None else tokenizer.convert_tokens_to_ids(unk_token)
prefix, suffix = find_eos_bos(tokenizer)
token_ids: list[list[int]] = []
for token in tokens:
if token.is_internal:
            # Careful: any incorrect token will just be mapped to `[UNK]`, so this could go horribly wrong.
# Cast because return type is wrong.
token_id: int = cast("int", tokenizer.convert_tokens_to_ids(token.form)) or 0
# Explicitly check and warn if `unk_id` appears, but don't crash.
if unk_id is not None and token_id == unk_id and token.form != unk_token:
logger.warning(f"Token {token.form} was set to unk. This is wrong.")
token_ids.append([*prefix, token_id, *suffix])
else:
token_ids.append(tokenizer.encode(token.form))
return token_ids
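
# Shape sketch (illustrative, assuming a BERT-style tokenizer that wraps sequences in
# [CLS] ... [SEP]): an internal token "dog" maps to [cls_id, dog_id, sep_id], while a
# new token such as "hello world" is re-encoded in full, e.g.
# [cls_id, hello_id, world_id, sep_id].
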
def find_eos_bos(tokenizer: PreTrainedTokenizerFast) -> tuple[list[int], list[int]]:
"""Finds the eos and bos tokens for a tokenizer."""
# Little bit complicated, because not all tokenizers have eos and bos tokens.
encoding = tokenizer.encode("a", add_special_tokens=True)
if len(encoding) != 3:
a_encoded = tokenizer.encode("a", add_special_tokens=False)
if len(a_encoded) != 1:
msg = f"Error while encoding, couldn't determine eos and bos tokens. The model tokenizes 'a' to '{a_encoded}'"
            raise ValueError(msg)
a_idx = encoding.index(a_encoded[0])
prefix, suffix = encoding[:a_idx], encoding[a_idx + 1 :]
else:
prefix, suffix = encoding[:1], encoding[2:]
return prefix, suffix
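
# Return-value sketch (illustrative): a BERT-style tokenizer yields
# ([cls_token_id], [sep_token_id]); a GPT-2-style tokenizer that adds no special
# tokens yields ([], []).
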
def _normalize_vocabulary_token(token: str, pre_tokenizer: PreTokenizer) -> str:
"""Normalize a token that is not in the initial token vocabulary."""
# Add prefix space for byte tokenizers.
prefixed_token = f" {token}"
pretokenized_tokens: tuple[str, ...]
pretokenized_tokens, offsets = zip(*pre_tokenizer.pre_tokenize_str(prefixed_token), strict=False)
# The first item is always the start of the token.
new_token = [pretokenized_tokens[0]]
# Loop over the subtokens and offsets.
for t, (s, _) in zip(pretokenized_tokens[1:], offsets[1:], strict=False):
# Do not prefix the token with a space if it starts with a metaspace.
if t.startswith("▁"):
new_token.append(t)
# If the character before the subtoken is a space, we have a
# multiword token. e.g., "room for the moon", which is split into
# ["room", "for", "the", "moon"].
# If it doesn't have a space, it is part of a complex multiword token,
# e.g., "chat-gpt", which is split into ["chat", "-", "gpt"].
elif prefixed_token[s - 1] == " ":
new_token.append(f" {t}")
else:
new_token.append(t)
return "".join(new_token)
def create_tokenizer(
tokenizer: PreTrainedTokenizerFast,
vocabulary: list[str],
token_remove_regex: re.Pattern | None = None,
) -> PreTrainedTokenizerFast:
"""
Create a tokenizer by adding tokens to the vocabulary.
This function turns any tokenizer into a supertoken tokenizer. It does the following:
1. Turns the tokenizer model into a unigram model.
2. Adds a new pretokenizer, splitting on punctuation.
3. Adds all tokens in vocabulary to the model.
4. Removes any internal tokens that conform to the regex.
:param tokenizer: The tokenizer to use.
:param vocabulary: The vocabulary to use.
:param token_remove_regex: The regex to use to remove tokens from the vocabulary.
:return: The created tokenizer.
"""
unk_token = cast("str | None", tokenizer.special_tokens_map.get("unk_token"))
pad_token = cast("str | None", tokenizer.special_tokens_map.get("pad_token"))
cleaned_vocabulary, backend_tokenizer = clean_and_create_vocabulary(tokenizer, vocabulary, token_remove_regex)
new_tokenizer = replace_vocabulary(backend_tokenizer, cleaned_vocabulary, unk_token, pad_token)
return PreTrainedTokenizerFast(tokenizer_object=new_tokenizer)
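
# Usage sketch (assumed setup, not part of the original module):
#
#   from transformers import AutoTokenizer
#   base = AutoTokenizer.from_pretrained("bert-base-uncased")  # any fast tokenizer
#   supertoken_tokenizer = create_tokenizer(
#       base, vocabulary=["hello world", "chat-gpt"], token_remove_regex=None
#   )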