# NOTE(review): the lines below are web-scrape artifacts (Hugging Face Spaces
# page chrome: status badges, file size, git-blame hashes, line-number gutter).
# They are not part of the program; commented out so the file parses as Python.
# Spaces: Running / Running — File size: 7,063 Bytes
# blame hashes: f8c9370 d809532 a1c26c5 26ddb6c 0e68577 (repeated per line)
# [line-number gutter 1-165 removed]
import gradio as gr
import pkg_resources
from turkish_tokenizer import TokenType, TurkishTokenizer
# Resolve the installed package version for display in the UI footer.
# Uses the stdlib ``importlib.metadata`` API (Python 3.8+) instead of the
# deprecated ``pkg_resources``; falls back to "unknown" when the package is
# not installed (e.g. when running from a source checkout).
try:
    # Local import keeps this self-contained; stdlib only.
    from importlib.metadata import version as _dist_version
    VERSION = _dist_version("turkish-tokenizer")
except Exception:  # PackageNotFoundError or any other metadata lookup issue
    VERSION = "unknown"
# Single tokenizer instance shared by every request handled by this app.
tokenizer = TurkishTokenizer()

# Highlight color per token type, keyed by the TokenType member name so the
# keys match the labels produced in tokenize_and_display() for HighlightedText.
color_map = {
    TokenType.ROOT.name: "#FF6B6B",  # Red
    TokenType.SUFFIX.name: "#4ECDC4",  # Teal
    TokenType.BPE.name: "#FFE66D",  # Yellow
}
def tokenize_and_display(text):
    """Tokenize *text* and build all four UI outputs.

    Returns a 4-tuple:
      * list of ``(token, type_name)`` pairs for gr.HighlightedText
      * the encoded token ids, rendered as a string
      * the decoded (round-tripped) text
      * an HTML snippet with tokenization statistics
    """
    if not text:
        # Empty/None input: emit one blank value per output component
        # so Gradio's outputs stay consistent.
        return [], "", "", ""

    parsed, _ = tokenizer.tokenize_text(text)

    # Build the highlight pairs and tally the token types in one pass.
    counts = {"ROOT": 0, "SUFFIX": 0, "BPE": 0}
    pairs = []
    for entry in parsed:
        kind = entry["type"].name
        counts[kind] = counts.get(kind, 0) + 1
        pairs.append((entry["token"], kind))

    ids = tokenizer.encode(text)
    round_trip = tokenizer.decode(ids)

    # Summary statistics for the HTML card.
    n_tokens = len(parsed)
    n_chars = len(text)
    compression_ratio = (1 - n_tokens / n_chars) * 100 if n_chars > 0 else 0

    # Palette for the stats card.
    bg_col, text_col, card_col, border_col = ('#f8f9fa', '#2d3748', '#ffffff', '#e2e8f0')

    stats_html = f"""
<div style="background:{bg_col};padding:20px;border-radius:12px;margin:20px 0;">
<h4 style="color:{text_col};margin-bottom:15px;">📊 Tokenization Statistics</h4>
<div style="display:grid;grid-template-columns:repeat(auto-fit,minmax(150px,1fr));gap:15px;margin-bottom:20px;">
<div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#3b82f6;">{n_chars}</div><div style="color:#64748b;font-size:14px;">Characters</div></div>
<div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#10b981;">{n_tokens}</div><div style="color:#64748b;font-size:14px;">Tokens</div></div>
<div style="background:{card_col};padding:15px;border-radius:8px;text-align:center;border:1px solid {border_col};"><div style="font-size:24px;font-weight:bold;color:#f59e0b;">{compression_ratio:.1f}%</div><div style="color:#64748b;font-size:14px;">Compression</div></div>
</div>
<div>
<h5 style="color:{text_col};margin-bottom:10px;">Token Type Distribution:</h5>
<div style="display:flex;gap:15px;flex-wrap:wrap;">
<div style="background:#FFADAD;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🔴 Roots: {counts['ROOT']}</div>
<div style="background:#A0C4FF;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🔵 Suffixes: {counts['SUFFIX']}</div>
<div style="background:#FDFFB6;color:#2d3748;padding:8px 12px;border-radius:6px;font-size:14px;font-weight:600;">🟡 BPE: {counts['BPE']}</div>
</div>
</div>
</div>"""
    return pairs, str(ids), round_trip, stats_html
# Custom CSS injected into the Blocks app: app-wide font, gradient buttons
# with a hover lift, and rounded/focus-highlighted textboxes. The class names
# here are referenced via elem_classes on the components below.
custom_css = """
.gradio-container{font-family:'Inter',-apple-system,BlinkMacSystemFont,sans-serif;}
.custom-button{background:linear-gradient(135deg,#667eea 0%,#764ba2 100%);border:none;border-radius:8px;padding:12px 24px;color:white;font-weight:600;transition:all .3s ease;}
.custom-button:hover{transform:translateY(-2px);box-shadow:0 8px 25px rgba(0,0,0,.15);}
.input-textbox{border-radius:12px!important;border:2px solid #e2e8f0!important;transition:all .3s ease;}
.input-textbox:focus{border-color:#667eea!important;box-shadow:0 0 0 3px rgba(102,126,234,.1)!important;}
"""
# Create the Gradio Interface.
# NOTE(review): the original indentation was lost; the component nesting below
# is reconstructed from the reading order — confirm against the deployed app.
with gr.Blocks(theme=gr.themes.Soft(), title="Turkish Tokenizer", css=custom_css) as demo:
    # Header: title and a short usage hint.
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown(f"""
# Turkish Tokenizer
### Advanced Turkish Text Tokenization with Visual Analysis
Enter text to see how it's tokenized. Tokens are color-coded by type.
""")
            # Free-text input plus the action buttons.
            input_text = gr.Textbox(
                label="📝 Input Text",
                placeholder="Merhaba Dünya, kitapları okumak güzeldir.",
                lines=4,
                elem_classes=["input-textbox"]
            )
            with gr.Row():
                process_button = gr.Button("🚀 Tokenize", variant="primary", elem_classes=["custom-button"], size="lg")
                clear_button = gr.Button("🗑️ Clear", variant="secondary", size="lg")
    gr.Markdown("---")
    # Round-trip outputs: token ids and the text decoded back from them.
    gr.Markdown("### 🔄 Encoded & Decoded Output")
    with gr.Row():
        encoded_output = gr.Textbox(label="🔢 Encoded Token IDs", interactive=False, lines=2)
        decoded_output = gr.Textbox(label="📝 Decoded Text", interactive=False, lines=2)
    # Clickable example sentences that populate the input box.
    gr.Markdown("### 💡 Example Texts")
    gr.Examples(
        examples=[
            ["Merhaba Dünya! Bu bir gelişmiş Türkçe tokenizer testidir."],
            ["İstanbul'da yaşıyorum ve Türkçe dilini öğreniyorum."],
            ["KitapOkumak çok güzeldir ve bilgi verir."],
            ["Türkiye Cumhuriyeti'nin başkenti Ankara'dır."],
            ["Yapay zeka ve makine öğrenmesi teknolojileri gelişiyor."],
        ],
        inputs=input_text,
        label="Try these examples:"
    )
    gr.Markdown("---")
    # Color-coded token view; color_map keys match the labels emitted by
    # tokenize_and_display().
    gr.Markdown("### 🎨 Tokenization Output")
    highlighted_output = gr.HighlightedText(
        label="Colorized Tokens",
        color_map=color_map,
        show_legend=True
    )
    gr.Markdown("---")
    # Statistics card rendered as raw HTML.
    gr.Markdown("### 📊 Statistics")
    stats_output = gr.HTML(label="")
    gr.Markdown(f"--- \n **Turkish Tokenizer v{VERSION}** - Advanced tokenization for Turkish text.")

    # --- Event Handlers ---
    def process_with_theme(text):
        # Thin wrapper so the click handler has a named function.
        return tokenize_and_display(text)

    def clear_all():
        # One blank value per output: input box, highlight, ids, decoded, stats.
        return "", [], "", "", ""

    # Connect the buttons to the functions.
    process_button.click(
        fn=process_with_theme,
        inputs=[input_text],
        outputs=[highlighted_output, encoded_output, decoded_output, stats_output]
    )
    clear_button.click(
        fn=clear_all,
        outputs=[input_text, highlighted_output, encoded_output, decoded_output, stats_output]
    )
    # Auto-process on load with a default example so the page isn't empty.
    demo.load(
        fn=lambda: tokenize_and_display("Merhaba Dünya!"),
        outputs=[highlighted_output, encoded_output, decoded_output, stats_output]
    )

if __name__ == "__main__":
    demo.launch(show_error=True)