import gradio as gr
import pkg_resources

from turkish_tokenizer import TokenType, TurkishTokenizer

# Get the version from the installed package
try:
    VERSION = pkg_resources.get_distribution("turkish-tokenizer").version
except Exception:
    VERSION = "unknown"

tokenizer = TurkishTokenizer()

# Define colors for each token type
color_map = {
    TokenType.ROOT.name: "#FF6B6B",    # Red
    TokenType.SUFFIX.name: "#4ECDC4",  # Teal
    TokenType.BPE.name: "#FFE66D",     # Yellow
}


def tokenize_and_display(text):
    """
    Tokenizes the input text and prepares it for display in Gradio's
    HighlightedText component, along with the encoded token IDs, the
    decoded text, and an HTML statistics panel.
    """
    if not text:
        # Return a structure that matches all outputs to avoid errors
        return [], "", "", ""

    tokens, _ = tokenizer.tokenize_text(text)

    # Build the list of (token, label) pairs for HighlightedText
    highlighted_tokens = []
    token_stats = {"ROOT": 0, "SUFFIX": 0, "BPE": 0}

    for t in tokens:
        token_text = t["token"]
        token_type = t["type"].name
        # Count token types for statistics
        token_stats[token_type] = token_stats.get(token_type, 0) + 1
        highlighted_tokens.append((token_text, token_type))

    encoded_ids = tokenizer.encode(text)
    decoded_text = tokenizer.decode(encoded_ids)

    # Calculate statistics
    total_tokens = len(tokens)
    total_chars = len(text)
    compression_ratio = (1 - total_tokens / total_chars) * 100 if total_chars > 0 else 0

    # Define colors for the stats block
    bg_col, text_col, card_col, border_col = ('#f8f9fa', '#2d3748', '#ffffff', '#e2e8f0')

    # Create statistics HTML
    stats_html = f"""

    <div style="background:{bg_col}; color:{text_col}; border:1px solid {border_col}; border-radius:12px; padding:20px;">
        <h3 style="margin-top:0;">📊 Tokenization Statistics</h3>
        <div style="display:flex; gap:12px; margin-bottom:16px;">
            <div style="flex:1; background:{card_col}; border:1px solid {border_col}; border-radius:8px; padding:12px; text-align:center;">
                <div style="font-size:1.5em; font-weight:600;">{total_chars}</div>
                <div>Characters</div>
            </div>
            <div style="flex:1; background:{card_col}; border:1px solid {border_col}; border-radius:8px; padding:12px; text-align:center;">
                <div style="font-size:1.5em; font-weight:600;">{total_tokens}</div>
                <div>Tokens</div>
            </div>
            <div style="flex:1; background:{card_col}; border:1px solid {border_col}; border-radius:8px; padding:12px; text-align:center;">
                <div style="font-size:1.5em; font-weight:600;">{compression_ratio:.1f}%</div>
                <div>Compression</div>
            </div>
        </div>
        <div style="font-weight:600; margin-bottom:4px;">Token Type Distribution:</div>
        <span style="margin-right:16px;">🔴 Roots: {token_stats['ROOT']}</span>
        <span style="margin-right:16px;">🔵 Suffixes: {token_stats['SUFFIX']}</span>
        <span>🟡 BPE: {token_stats['BPE']}</span>
    </div>
""" return highlighted_tokens, str(encoded_ids), decoded_text, stats_html # Custom CSS for better styling custom_css = """ .gradio-container{font-family:'Inter',-apple-system,BlinkMacSystemFont,sans-serif;} .custom-button{background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);border:none;border-radius:8px;padding:12px 24px;color:white;font-weight:600;transition:all .3s ease;} .custom-button:hover{transform:translateY(-2px);box-shadow:0 8px 25px rgba(0,0,0,.15);} .input-textbox{border-radius:12px!important;border:2px solid #e2e8f0!important;transition:all .3s ease;} .input-textbox:focus{border-color:#667eea!important;box-shadow:0 0 0 3px rgba(102,126,234,.1)!important;} """ # Create the Gradio Interface with gr.Blocks(theme=gr.themes.Soft(), title="Turkish Tokenizer", css=custom_css) as demo: with gr.Row(): with gr.Column(scale=3): gr.Markdown(f""" # Turkish Tokenizer ### Advanced Turkish Text Tokenization with Visual Analysis Enter text to see how it's tokenized. Tokens are color-coded by type. """) input_text = gr.Textbox( label="📝 Input Text", placeholder="Merhaba Dünya, kitapları okumak güzeldir.", lines=4, elem_classes=["input-textbox"] ) with gr.Row(): process_button = gr.Button("🚀 Tokenize", variant="primary", elem_classes=["custom-button"], size="lg") clear_button = gr.Button("🗑️ Clear", variant="secondary", size="lg") gr.Markdown("---") gr.Markdown("### 🔄 Encoded & Decoded Output") with gr.Row(): encoded_output = gr.Textbox(label="🔢 Encoded Token IDs", interactive=False, lines=2) decoded_output = gr.Textbox(label="📝 Decoded Text", interactive=False, lines=2) gr.Markdown("### 💡 Example Texts") gr.Examples( examples=[ ["Merhaba Dünya! Bu bir gelişmiş Türkçe tokenizer testidir."], ["İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum."], ["KitapOkumak çok güzeldir ve bilgi verir."], ["Türkiye Cumhuriyeti'nin başkenti Ankara'dır."], ["Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor."], ], inputs=input_text, label="Try these examples:" ) gr.Markdown("---") gr.Markdown("### 🎨 Tokenization Output") highlighted_output = gr.HighlightedText( label="Colorized Tokens", color_map=color_map, show_legend=True ) gr.Markdown("---") gr.Markdown("### 📊 Statistics") stats_output = gr.HTML(label="") gr.Markdown(f"--- \n **Turkish Tokenizer v{VERSION}** - Advanced tokenization for Turkish text.") # --- Event Handlers --- def process_with_theme(text): return tokenize_and_display(text) def clear_all(): return "", [], "", "", "" # Connect the buttons to the functions process_button.click( fn=process_with_theme, inputs=[input_text], outputs=[highlighted_output, encoded_output, decoded_output, stats_output] ) clear_button.click( fn=clear_all, outputs=[input_text, highlighted_output, encoded_output, decoded_output, stats_output] ) # Auto-process on load with a default example demo.load( fn=lambda: tokenize_and_display("Merhaba Dünya!"), outputs=[highlighted_output, encoded_output, decoded_output, stats_output] ) if __name__ == "__main__": demo.launch(show_error=True)