import gradio as gr
from transformers import PreTrainedTokenizerFast

# Load your tokenizer from the Hugging Face Hub
tok = PreTrainedTokenizerFast.from_pretrained("snskrt/Sanskrit_Tokenizer")


def infer_shloka(text: str):
    # Encode the input śloka without adding special tokens
    enc = tok(text, add_special_tokens=False)
    ids = enc["input_ids"]
    toks = tok.convert_ids_to_tokens(ids)
    # Manual merge: strip the "##" continuation prefix and re-join subwords
    detok = []
    for t in toks:
        if t.startswith("##"):
            if detok:
                detok[-1] += t[2:]   # continuation piece: glue onto the previous word
            else:
                detok.append(t[2:])  # guard: a leading "##" piece starts a new word
        else:
            detok.append(t)
    dec = " ".join(detok)

    # Format the results as Markdown for the gr.Markdown output
    out = (
        f"**Input IDs:**\n{ids}\n\n"
        f"**Tokens:**\n{toks}\n\n"
        f"**Decoded:**\n{dec}\n"
    )
    return out
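
# Illustration of the merge above with hypothetical subtokens (not actual
# output of snskrt/Sanskrit_Tokenizer): ["धर्", "##मो", "रक्ष", "##ति"]
# becomes ["धर्मो", "रक्षति"], which joins to "धर्मो रक्षति".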

# Three sample ślokas as examples
examples = [
    ["ॐ सर्वे भवन्तु सुखिनः ॥"],
    ["धर्मो रक्षति रक्षितः ॥"],
    ["यथा दीपः निवातस्थः प्रवर्तमानः ॥"],
]

iface = gr.Interface(
    fn=infer_shloka,
    inputs=gr.Textbox(lines=2, placeholder="Enter a Sanskrit śloka here…"),
    outputs=gr.Markdown(),
    examples=examples,
    title="Sanskrit-BPE Tokenizer Demo",
    description="Encode a Devanāgarī śloka with your custom BPE tokenizer, view IDs, subtokens, and detokenized output.",
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()
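
# Quick local check (a sketch, assuming this file is saved as app.py and that
# gradio and transformers are installed with access to the
# snskrt/Sanskrit_Tokenizer repo): call the function from a Python shell
# instead of launching the UI.
#
#   >>> from app import infer_shloka
#   >>> print(infer_shloka("धर्मो रक्षति रक्षितः ॥"))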