File size: 7,063 Bytes
f8c9370
d809532
a1c26c5
f8c9370
d809532
 
 
 
 
 
a1c26c5
f8c9370
a1c26c5
 
 
f8c9370
a1c26c5
26ddb6c
 
0e68577
f8c9370
 
 
 
 
0e68577
f8c9370
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1c26c5
 
f8c9370
 
 
 
 
 
a1c26c5
 
 
f8c9370
 
 
 
 
 
 
 
 
 
0e68577
f8c9370
 
 
 
 
 
 
 
 
 
 
 
 
 
d809532
f8c9370
 
d809532
f8c9370
 
 
 
 
 
 
 
26ddb6c
f8c9370
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26ddb6c
f8c9370
 
 
 
a1c26c5
f8c9370
 
 
 
 
 
26ddb6c
 
d809532
f8c9370
 
0e68577
 
f8c9370
 
 
 
 
 
 
0e68577
 
26ddb6c
f8c9370
 
 
 
26ddb6c
f8c9370
 
 
0e68577
 
f8c9370
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import gradio as gr
import pkg_resources
from turkish_tokenizer import TokenType, TurkishTokenizer

# Resolve the installed package version for display in the UI footer.
# Best-effort: fall back to "unknown" if the distribution metadata is
# missing (e.g. running from a source checkout).
try:
    VERSION = pkg_resources.get_distribution("turkish-tokenizer").version
except Exception:
    # Was a bare `except:` — that also swallowed SystemExit/KeyboardInterrupt.
    # Exception still covers DistributionNotFound and any metadata errors.
    VERSION = "unknown"

# Single shared tokenizer instance reused by every callback below.
tokenizer = TurkishTokenizer()

# Color assigned to each token type in the HighlightedText output/legend.
color_map = {
    TokenType.ROOT.name: "#FF6B6B",      # Red
    TokenType.SUFFIX.name: "#4ECDC4",    # Teal
    TokenType.BPE.name: "#FFE66D",       # Yellow
}

def tokenize_and_display(text):
    """
    Tokenize *text* and build the four values bound to the UI outputs:
    (token, label) pairs for HighlightedText, the encoded id list as a
    string, the round-tripped decoded text, and an HTML statistics panel.
    """
    if not text:
        # Keep the arity in sync with the four bound output components.
        return [], "", "", ""

    tokens, _ = tokenizer.tokenize_text(text)

    # Build the (token, label) pairs for HighlightedText while tallying
    # how many tokens of each type we saw.
    token_stats = {"ROOT": 0, "SUFFIX": 0, "BPE": 0}
    highlighted_tokens = []
    for entry in tokens:
        label = entry["type"].name
        token_stats[label] = token_stats.get(label, 0) + 1
        highlighted_tokens.append((entry["token"], label))

    encoded_ids = tokenizer.encode(text)
    decoded_text = tokenizer.decode(encoded_ids)

    # Summary numbers for the stats panel; guard the ratio against
    # division by zero on empty input.
    total_tokens = len(tokens)
    total_chars = len(text)
    if total_chars > 0:
        compression_ratio = (1 - total_tokens / total_chars) * 100
    else:
        compression_ratio = 0

    # Palette for the stats block.
    bg_col = '#f8f9fa'      # panel background
    text_col = '#2d3748'    # heading text
    card_col = '#ffffff'    # stat-card background
    border_col = '#e2e8f0'  # stat-card border

    # Render the statistics panel as inline-styled HTML.
    stats_html = f"""
    <div style="background:{bg_col};padding:20px;border-radius:12px;margin:20px 0;">
        <h4 style="color:{text_col};margin-bottom:15px;">📊 Tokenization Statistics</h4>
        <div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(150px,1fr));gap:15px;margin-bottom:20px;">
            <div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#3b82f6;">{total_chars}</div><div style="color:#64748b;font-size:14px;">Characters</div></div>
            <div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#10b981;">{total_tokens}</div><div style="color:#64748b;font-size:14px;">Tokens</div></div>
            <div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#f59e0b;">{compression_ratio:.1f}%</div><div style="color:#64748b;font-size:14px;">Compression</div></div>
        </div>
        <div>
            <h5 style="color:{text_col};margin-bottom:10px;">Token Type Distribution:</h5>
            <div style="display:flex;gap:15px;flex-wrap:wrap;">
                <div style="background:#FFADAD;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🔴 Roots: {token_stats['ROOT']}</div>
                <div style="background:#A0C4FF;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🔵 Suffixes: {token_stats['SUFFIX']}</div>
                <div style="background:#FDFFB6;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🟡 BPE: {token_stats['BPE']}</div>
            </div>
        </div>
    </div>"""
    return highlighted_tokens, str(encoded_ids), decoded_text, stats_html

# Custom CSS injected into the Blocks app below:
# - .gradio-container: Inter/system font stack for the whole app
# - .custom-button: purple gradient, hover lift + drop shadow
# - .input-textbox: rounded border with a focus highlight ring
custom_css = """
.gradio-container{font-family:'Inter',-apple-system,BlinkMacSystemFont,sans-serif;}
.custom-button{background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);border:none;border-radius:8px;padding:12px 24px;color:white;font-weight:600;transition:all .3s ease;}
.custom-button:hover{transform:translateY(-2px);box-shadow:0 8px 25px rgba(0,0,0,.15);}
.input-textbox{border-radius:12px!important;border:2px solid #e2e8f0!important;transition:all .3s ease;}
.input-textbox:focus{border-color:#667eea!important;box-shadow:0 0 0 3px rgba(102,126,234,.1)!important;}
"""

# Create the Gradio Interface.
# NOTE: component creation order inside this `with` block IS the rendered
# layout order, so statements here must not be reordered.
with gr.Blocks(theme=gr.themes.Soft(), title="Turkish Tokenizer", css=custom_css) as demo:
    # Header / title banner.
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown(f"""
                # Turkish Tokenizer
                ### Advanced Turkish Text Tokenization with Visual Analysis
                Enter text to see how it's tokenized. Tokens are color-coded by type.                
            """)

    # Main text input (styled via custom_css's .input-textbox rule).
    input_text = gr.Textbox(
        label="📝 Input Text",
        placeholder="Merhaba Dünya, kitapları okumak güzeldir.",
        lines=4,
        elem_classes=["input-textbox"]
    )

    # Action buttons: run the tokenizer / reset all fields.
    with gr.Row():
        process_button = gr.Button("🚀 Tokenize", variant="primary", elem_classes=["custom-button"], size="lg")
        clear_button = gr.Button("🗑️ Clear", variant="secondary", size="lg")

    # Read-only encode/decode round-trip display.
    gr.Markdown("---")
    gr.Markdown("### 🔄 Encoded & Decoded Output")
    with gr.Row():
        encoded_output = gr.Textbox(label="🔢 Encoded Token IDs", interactive=False, lines=2)
        decoded_output = gr.Textbox(label="📝 Decoded Text", interactive=False, lines=2)

    # Clickable example sentences that populate input_text.
    gr.Markdown("### 💡 Example Texts")
    gr.Examples(
        examples=[
            ["Merhaba Dünya! Bu bir gelişmiş Türkçe tokenizer testidir."],
            ["İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum."],
            ["KitapOkumak çok güzeldir ve bilgi verir."],
            ["Türkiye Cumhuriyeti'nin başkenti Ankara'dır."],
            ["Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor."],
        ],
        inputs=input_text,
        label="Try these examples:"
    )

    # Color-coded token view; colors come from the module-level color_map.
    gr.Markdown("---")
    gr.Markdown("### 🎨 Tokenization Output")
    highlighted_output = gr.HighlightedText(
        label="Colorized Tokens",
        color_map=color_map,
        show_legend=True
    )

    # HTML statistics panel produced by tokenize_and_display.
    gr.Markdown("---")
    gr.Markdown("### 📊 Statistics")
    stats_output = gr.HTML(label="")


    gr.Markdown(f"--- \n **Turkish Tokenizer v{VERSION}** - Advanced tokenization for Turkish text.")

    # --- Event Handlers ---
    def process_with_theme(text):
        """Thin wrapper around tokenize_and_display used as the click handler."""
        return tokenize_and_display(text)

    def clear_all():
        """Return blank values for all five components wired to the Clear button."""
        return "", [], "", "", ""

    # Connect the buttons to the functions.
    # Output order must match tokenize_and_display's 4-tuple return.
    process_button.click(
        fn=process_with_theme,
        inputs=[input_text],
        outputs=[highlighted_output, encoded_output, decoded_output, stats_output]
    )

    clear_button.click(
        fn=clear_all,
        outputs=[input_text, highlighted_output, encoded_output, decoded_output, stats_output]
    )

    # Auto-process on load with a default example
    demo.load(
        fn=lambda: tokenize_and_display("Merhaba Dünya!"),
        outputs=[highlighted_output, encoded_output, decoded_output, stats_output]
    )

# Script entry point: launch the app (show_error surfaces tracebacks in the UI).
if __name__ == "__main__":
    demo.launch(show_error=True)