import gradio as gr
import pkg_resources
from turkish_tokenizer import TokenType, TurkishTokenizer
# Resolve the installed package version; fall back gracefully when the
# distribution metadata is unavailable (e.g. running from a source checkout).
try:
    VERSION = pkg_resources.get_distribution("turkish-tokenizer").version
except Exception:
    # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt still
    # propagate; any metadata lookup failure degrades to "unknown".
    VERSION = "unknown"

# Single shared tokenizer instance reused by every request handler.
tokenizer = TurkishTokenizer()

# Highlight color per token type, consumed by gr.HighlightedText below.
color_map = {
    TokenType.ROOT.name: "#FF6B6B",    # Red: root tokens
    TokenType.SUFFIX.name: "#4ECDC4",  # Teal: suffix tokens
    TokenType.BPE.name: "#FFE66D",     # Yellow: BPE fallback tokens
}
def tokenize_and_display(text):
    """
    Tokenize *text* and build the four values the UI outputs expect.

    Parameters:
        text: raw input string from the textbox; may be empty.

    Returns a 4-tuple:
        highlighted_tokens: list of (token_text, type_name) pairs for
            gr.HighlightedText.
        encoded_ids_str: str() of the token-id list from tokenizer.encode.
        decoded_text: round-trip decode of those ids.
        stats_html: text block with compression ratio and per-type counts.
    """
    if not text:
        # One empty value per output component so Gradio gets a full tuple.
        return [], "", "", ""

    tokens, _ = tokenizer.tokenize_text(text)

    highlighted_tokens = []
    # Seed the known types so the stats template can index them unconditionally
    # even when a type never occurs in this input.
    token_stats = {"ROOT": 0, "SUFFIX": 0, "BPE": 0}
    for tok in tokens:
        type_name = tok["type"].name
        # .get() keeps counting robust if the tokenizer ever emits a new type.
        token_stats[type_name] = token_stats.get(type_name, 0) + 1
        highlighted_tokens.append((tok["token"], type_name))

    encoded_ids = tokenizer.encode(text)
    decoded_text = tokenizer.decode(encoded_ids)

    # Tokens-per-character compression. The zero guard is unreachable after
    # the early return above but kept as cheap defensive safety.
    total_chars = len(text)
    compression_ratio = (1 - len(tokens) / total_chars) * 100 if total_chars > 0 else 0

    # NOTE(review): the previous version assigned four unused color variables
    # here (bg_col/text_col/card_col/border_col) that the template never
    # referenced; they have been removed without changing the output.
    stats_html = f"""
📊 Tokenization Statistics
{compression_ratio:.1f}%
Compression
Token Type Distribution:
🔴 Roots: {token_stats['ROOT']}
🔵 Suffixes: {token_stats['SUFFIX']}
🟡 BPE: {token_stats['BPE']}
"""
    return highlighted_tokens, str(encoded_ids), decoded_text, stats_html
# Custom CSS injected into the Gradio page: global font stack, gradient
# primary button with a hover "lift" effect, and a rounded input textbox
# with a focus highlight. Class names are attached via elem_classes below.
custom_css = """
.gradio-container{font-family:'Inter',-apple-system,BlinkMacSystemFont,sans-serif;}
.custom-button{background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);border:none;border-radius:8px;padding:12px 24px;color:white;font-weight:600;transition:all .3s ease;}
.custom-button:hover{transform:translateY(-2px);box-shadow:0 8px 25px rgba(0,0,0,.15);}
.input-textbox{border-radius:12px!important;border:2px solid #e2e8f0!important;transition:all .3s ease;}
.input-textbox:focus{border-color:#667eea!important;box-shadow:0 0 0 3px rgba(102,126,234,.1)!important;}
"""
# --- Gradio UI ---
# Layout: header row, input + action buttons, encoded/decoded outputs,
# examples, colorized token view, statistics, footer.
with gr.Blocks(theme=gr.themes.Soft(), title="Turkish Tokenizer", css=custom_css) as demo:
    with gr.Row():
        with gr.Column(scale=3):
            # Plain string: the original used an f-string with no placeholders.
            gr.Markdown("""
# Turkish Tokenizer
### Advanced Turkish Text Tokenization with Visual Analysis
Enter text to see how it's tokenized. Tokens are color-coded by type.
""")

    input_text = gr.Textbox(
        label="📝 Input Text",
        placeholder="Merhaba Dünya, kitapları okumak güzeldir.",
        lines=4,
        elem_classes=["input-textbox"],
    )

    with gr.Row():
        process_button = gr.Button("🚀 Tokenize", variant="primary", elem_classes=["custom-button"], size="lg")
        clear_button = gr.Button("🗑️ Clear", variant="secondary", size="lg")

    gr.Markdown("---")
    gr.Markdown("### 🔄 Encoded & Decoded Output")
    with gr.Row():
        encoded_output = gr.Textbox(label="🔢 Encoded Token IDs", interactive=False, lines=2)
        decoded_output = gr.Textbox(label="📝 Decoded Text", interactive=False, lines=2)

    gr.Markdown("### 💡 Example Texts")
    gr.Examples(
        examples=[
            ["Merhaba Dünya! Bu bir gelişmiş Türkçe tokenizer testidir."],
            ["İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum."],
            ["KitapOkumak çok güzeldir ve bilgi verir."],
            ["Türkiye Cumhuriyeti'nin başkenti Ankara'dır."],
            ["Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor."],
        ],
        inputs=input_text,
        label="Try these examples:",
    )

    gr.Markdown("---")
    gr.Markdown("### 🎨 Tokenization Output")
    highlighted_output = gr.HighlightedText(
        label="Colorized Tokens",
        color_map=color_map,
        show_legend=True,
    )

    gr.Markdown("---")
    gr.Markdown("### 📊 Statistics")
    stats_output = gr.HTML(label="")

    gr.Markdown(f"--- \n **Turkish Tokenizer v{VERSION}** - Advanced tokenization for Turkish text.")

    # --- Event Handlers ---
    def clear_all():
        # One empty value per component wired into the clear_button outputs.
        return "", [], "", "", ""

    # Wire tokenize_and_display directly: the previous process_with_theme
    # wrapper forwarded its single argument unchanged and added nothing.
    process_button.click(
        fn=tokenize_and_display,
        inputs=[input_text],
        outputs=[highlighted_output, encoded_output, decoded_output, stats_output],
    )
    clear_button.click(
        fn=clear_all,
        outputs=[input_text, highlighted_output, encoded_output, decoded_output, stats_output],
    )

    # Auto-process on load with a default example so the page is never blank.
    demo.load(
        fn=lambda: tokenize_and_display("Merhaba Dünya!"),
        outputs=[highlighted_output, encoded_output, decoded_output, stats_output],
    )
# Launch the app only when this file is executed directly (not on import);
# show_error surfaces handler exceptions in the browser UI for debugging.
if __name__ == "__main__":
    demo.launch(show_error=True)