import random
import unicodedata

import gradio as gr
from transformers import AutoTokenizer, PreTrainedTokenizerFast

# Tokenizers shown in the dropdown: two local Tabularis tokenizer files plus
# several German models (and DeepSeek-R1 for comparison) from the Hugging Face Hub.
tokenizers = {
    "Tabularis German Tokenizer_whiteS": PreTrainedTokenizerFast(tokenizer_file="tokenizer.json"),
    "Tabularis German Tokenizer": PreTrainedTokenizerFast(tokenizer_file="tokenizer_BPE.json"),
    "KoichiYasuoka/bert-base-german-upos": AutoTokenizer.from_pretrained("KoichiYasuoka/bert-base-german-upos"),
    "benjamin/gerpt2-large": AutoTokenizer.from_pretrained("benjamin/gerpt2-large"),
    "deepset/gbert-base": AutoTokenizer.from_pretrained("deepset/gbert-base"),
    "bert-base-german-cased Tokenizer": AutoTokenizer.from_pretrained("bert-base-german-cased"),
    "MiriUll/gpt2-wechsel-german_easy": AutoTokenizer.from_pretrained("MiriUll/gpt2-wechsel-german_easy"),
    "DeepSeek Tokenizer": AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1"),
}


def decode_byte_token(token):
    """Decode a byte-level BPE token (e.g. one with a leading 'Ġ') back to readable text."""
    token_clean = token.replace("Ġ", "")
    try:
        byte_seq = bytes([ord(c) for c in token_clean])
        return unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
    except Exception:
        return token_clean


def visualize_tokens(text, tokenizer_name, show_token_ids):
    tokenizer = tokenizers[tokenizer_name]
    encoded = tokenizer(text, add_special_tokens=False, return_tensors=None)
    token_ids = encoded["input_ids"]
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    def random_pastel():
        # Light RGB values so the token text stays readable on the colored background.
        r = lambda: random.randint(100, 255)
        return f"rgb({r()},{r()},{r()})"

    def is_special_token(token):
        return (
            token.startswith('[') and token.endswith(']')
            or token.startswith('<') and token.endswith('>')
            or token in tokenizer.all_special_tokens
        )

    html_tokens = []
    for token in tokens:
        prefix = ""
        token_body = token
        # Keep the 'Ġ' whitespace marker visible for the Tabularis tokenizers,
        # but decode the remainder of the token.
        if tokenizer_name.startswith("Tabularis") and token.startswith("Ġ"):
            prefix = "Ġ"
            token_body = token[1:]
        try:
            byte_seq = bytes([ord(c) for c in token_body])
            decoded = unicodedata.normalize("NFC", byte_seq.decode("utf-8"))
        except Exception:
            decoded = token_body
        label = f"{prefix}{decoded}"
        color = "lightgray" if is_special_token(token) else random_pastel()
        # Wrap each token in a colored <span> (inline styling kept minimal here).
        html_token = (
            f'<span style="background-color:{color}; padding:2px 4px; '
            f'margin:2px; border-radius:4px; display:inline-block;">{label}</span>'
        )
        html_tokens.append(html_token)

    html_output = "".join(html_tokens)
    if show_token_ids:
        html_output += "<br><br><b>Token IDs:</b><br>" + str(token_ids)

    try:
        decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
    except Exception:
        decoded_output = "[Could not decode using this tokenizer]"

    return html_output, f"🔢 Token Count: {len(tokens)}", decoded_output


# App
with gr.Blocks() as app:
    gr.Markdown("# 🚀 German Tokenizers")
    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(lines=4, label="Enter your text here", placeholder="Type or paste text...")
            tokenizer_choice = gr.Dropdown(list(tokenizers.keys()), label="Choose Tokenizer")
            show_ids = gr.Checkbox(label="Show Token IDs", value=False)
            tokenize_btn = gr.Button("Tokenize!")
        with gr.Column():
            html_output = gr.HTML(label="Tokens Visualized")
            token_count = gr.Label(label="Token Count")
            decoded_output = gr.Textbox(label="Decoded Text", lines=3)

    tokenize_btn.click(
        visualize_tokens,
        inputs=[text_input, tokenizer_choice, show_ids],
        outputs=[html_output, token_count, decoded_output],
    )

app.launch()