File size: 7,063 Bytes
f8c9370
d809532
a1c26c5
f8c9370
d809532
 
 
 
 
 
a1c26c5
f8c9370
a1c26c5
 
 
f8c9370
a1c26c5
26ddb6c
 
0e68577
f8c9370
 
 
 
 
0e68577
f8c9370
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1c26c5
 
f8c9370
 
 
 
 
 
a1c26c5
 
 
f8c9370
 
 
 
 
 
 
 
 
 
0e68577
f8c9370
 
 
 
 
 
 
 
 
 
 
 
 
 
d809532
f8c9370
 
d809532
f8c9370
 
 
 
 
 
 
 
26ddb6c
f8c9370
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26ddb6c
f8c9370
 
 
 
a1c26c5
f8c9370
 
 
 
 
 
26ddb6c
 
d809532
f8c9370
 
0e68577
 
f8c9370
 
 
 
 
 
 
0e68577
 
26ddb6c
f8c9370
 
 
 
26ddb6c
f8c9370
 
 
0e68577
 
f8c9370
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import gradio as gr
import pkg_resources
from turkish_tokenizer import TokenType, TurkishTokenizer

# Resolve the installed package version for display in the UI footer.
# Best-effort: fall back to "unknown" if the distribution metadata is
# missing (e.g. running from a source checkout).
try:
    VERSION = pkg_resources.get_distribution("turkish-tokenizer").version
except Exception:
    # Was a bare `except:` — that also swallowed SystemExit/KeyboardInterrupt.
    # Exception still covers DistributionNotFound and any metadata errors.
    VERSION = "unknown"

# Single shared tokenizer instance reused by every callback below.
tokenizer = TurkishTokenizer()

# Color assigned to each token type in the HighlightedText output/legend.
color_map = {
    TokenType.ROOT.name: "#FF6B6B",      # Red
    TokenType.SUFFIX.name: "#4ECDC4",    # Teal
    TokenType.BPE.name: "#FFE66D",       # Yellow
}

def tokenize_and_display(text):
    """
    Tokenize *text* and build the four values bound to the UI outputs:
    (token, label) pairs for HighlightedText, the encoded id list as a
    string, the round-tripped decoded text, and an HTML statistics panel.
    """
    if not text:
        # Keep the arity in sync with the four bound output components.
        return [], "", "", ""

    tokens, _ = tokenizer.tokenize_text(text)

    # Build the (token, label) pairs for HighlightedText while tallying
    # how many tokens of each type we saw.
    token_stats = {"ROOT": 0, "SUFFIX": 0, "BPE": 0}
    highlighted_tokens = []
    for entry in tokens:
        label = entry["type"].name
        token_stats[label] = token_stats.get(label, 0) + 1
        highlighted_tokens.append((entry["token"], label))

    encoded_ids = tokenizer.encode(text)
    decoded_text = tokenizer.decode(encoded_ids)

    # Summary numbers for the stats panel; guard the ratio against
    # division by zero on empty input.
    total_tokens = len(tokens)
    total_chars = len(text)
    if total_chars > 0:
        compression_ratio = (1 - total_tokens / total_chars) * 100
    else:
        compression_ratio = 0

    # Palette for the stats block.
    bg_col = '#f8f9fa'      # panel background
    text_col = '#2d3748'    # heading text
    card_col = '#ffffff'    # stat-card background
    border_col = '#e2e8f0'  # stat-card border

    # Render the statistics panel as inline-styled HTML.
    stats_html = f"""
    <div style="background:{bg_col};padding:20px;border-radius:12px;margin:20px 0;">
        <h4 style="color:{text_col};margin-bottom:15px;">📊 Tokenization Statistics</h4>
        <div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(150px,1fr));gap:15px;margin-bottom:20px;">
            <div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#3b82f6;">{total_chars}</div><div style="color:#64748b;font-size:14px;">Characters</div></div>
            <div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#10b981;">{total_tokens}</div><div style="color:#64748b;font-size:14px;">Tokens</div></div>
            <div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#f59e0b;">{compression_ratio:.1f}%</div><div style="color:#64748b;font-size:14px;">Compression</div></div>
        </div>
        <div>
            <h5 style="color:{text_col};margin-bottom:10px;">Token Type Distribution:</h5>
            <div style="display:flex;gap:15px;flex-wrap:wrap;">
                <div style="background:#FFADAD;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🔴 Roots: {token_stats['ROOT']}</div>
                <div style="background:#A0C4FF;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🔵 Suffixes: {token_stats['SUFFIX']}</div>
                <div style="background:#FDFFB6;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🟡 BPE: {token_stats['BPE']}</div>
            </div>
        </div>
    </div>"""
    return highlighted_tokens, str(encoded_ids), decoded_text, stats_html

# Custom CSS injected into the Blocks app below:
# - .gradio-container: Inter/system font stack for the whole app
# - .custom-button: purple gradient, hover lift + drop shadow
# - .input-textbox: rounded border with a focus highlight ring
custom_css = """
.gradio-container{font-family:'Inter',-apple-system,BlinkMacSystemFont,sans-serif;}
.custom-button{background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);border:none;border-radius:8px;padding:12px 24px;color:white;font-weight:600;transition:all .3s ease;}
.custom-button:hover{transform:translateY(-2px);box-shadow:0 8px 25px rgba(0,0,0,.15);}
.input-textbox{border-radius:12px!important;border:2px solid #e2e8f0!important;transition:all .3s ease;}
.input-textbox:focus{border-color:#667eea!important;box-shadow:0 0 0 3px rgba(102,126,234,.1)!important;}
"""

# Create the Gradio Interface.
# NOTE: component creation order inside this `with` block IS the rendered
# layout order, so statements here must not be reordered.
with gr.Blocks(theme=gr.themes.Soft(), title="Turkish Tokenizer", css=custom_css) as demo:
    # Header / title banner.
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown(f"""
                # Turkish Tokenizer
                ### Advanced Turkish Text Tokenization with Visual Analysis
                Enter text to see how it's tokenized. Tokens are color-coded by type.                
            """)

    # Main text input (styled via custom_css's .input-textbox rule).
    input_text = gr.Textbox(
        label="📝 Input Text",
        placeholder="Merhaba Dünya, kitapları okumak güzeldir.",
        lines=4,
        elem_classes=["input-textbox"]
    )

    # Action buttons: run the tokenizer / reset all fields.
    with gr.Row():
        process_button = gr.Button("🚀 Tokenize", variant="primary", elem_classes=["custom-button"], size="lg")
        clear_button = gr.Button("🗑️ Clear", variant="secondary", size="lg")

    # Read-only encode/decode round-trip display.
    gr.Markdown("---")
    gr.Markdown("### 🔄 Encoded & Decoded Output")
    with gr.Row():
        encoded_output = gr.Textbox(label="🔢 Encoded Token IDs", interactive=False, lines=2)
        decoded_output = gr.Textbox(label="📝 Decoded Text", interactive=False, lines=2)

    # Clickable example sentences that populate input_text.
    gr.Markdown("### 💡 Example Texts")
    gr.Examples(
        examples=[
            ["Merhaba Dünya! Bu bir gelişmiş Türkçe tokenizer testidir."],
            ["İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum."],
            ["KitapOkumak çok güzeldir ve bilgi verir."],
            ["Türkiye Cumhuriyeti'nin başkenti Ankara'dır."],
            ["Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor."],
        ],
        inputs=input_text,
        label="Try these examples:"
    )

    # Color-coded token view; colors come from the module-level color_map.
    gr.Markdown("---")
    gr.Markdown("### 🎨 Tokenization Output")
    highlighted_output = gr.HighlightedText(
        label="Colorized Tokens",
        color_map=color_map,
        show_legend=True
    )

    # HTML statistics panel produced by tokenize_and_display.
    gr.Markdown("---")
    gr.Markdown("### 📊 Statistics")
    stats_output = gr.HTML(label="")


    gr.Markdown(f"--- \n **Turkish Tokenizer v{VERSION}** - Advanced tokenization for Turkish text.")

    # --- Event Handlers ---
    def process_with_theme(text):
        """Thin wrapper around tokenize_and_display used as the click handler."""
        return tokenize_and_display(text)

    def clear_all():
        """Return blank values for all five components wired to the Clear button."""
        return "", [], "", "", ""

    # Connect the buttons to the functions.
    # Output order must match tokenize_and_display's 4-tuple return.
    process_button.click(
        fn=process_with_theme,
        inputs=[input_text],
        outputs=[highlighted_output, encoded_output, decoded_output, stats_output]
    )

    clear_button.click(
        fn=clear_all,
        outputs=[input_text, highlighted_output, encoded_output, decoded_output, stats_output]
    )

    # Auto-process on load with a default example
    demo.load(
        fn=lambda: tokenize_and_display("Merhaba Dünya!"),
        outputs=[highlighted_output, encoded_output, decoded_output, stats_output]
    )

# Script entry point: launch the app (show_error surfaces tracebacks in the UI).
if __name__ == "__main__":
    demo.launch(show_error=True)