import gradio as gr
from transformers import PreTrainedTokenizerFast

# Marker that flags a token as a continuation of the previous token.
_CONTINUATION_PREFIX = "##"

# Load the tokenizer once at import time so every request reuses it.
tok = PreTrainedTokenizerFast.from_pretrained("snskrt/Sanskrit_Tokenizer")


def _merge_subword_tokens(tokens: list) -> str:
    """Glue "##"-prefixed continuation tokens onto the preceding token.

    Args:
        tokens: Token strings as returned by ``convert_ids_to_tokens``.

    Returns:
        The merged words joined by single spaces ("" for no tokens).
    """
    words = []
    for token in tokens:
        # Only merge when there is a previous word to attach to. A leading
        # "##" token has nothing before it, so it is kept verbatim as the
        # start of a new word instead of raising IndexError on words[-1].
        if token.startswith(_CONTINUATION_PREFIX) and words:
            words[-1] += token[len(_CONTINUATION_PREFIX):]
        else:
            words.append(token)
    return " ".join(words)


def infer_shloka(text: str) -> str:
    """Tokenize *text* and return a Markdown report of the result.

    The report lists the input IDs, the corresponding token strings, and
    the text rebuilt by merging "##" continuation tokens.

    Args:
        text: The string to encode (no special tokens are added).

    Returns:
        A Markdown-formatted string with three labelled sections.
    """
    # Encode without special tokens so only the user's text is shown.
    enc = tok(text, add_special_tokens=False)
    ids = enc["input_ids"]
    toks = tok.convert_ids_to_tokens(ids)

    # Manual merge: strip "##" and re-join.
    dec = _merge_subword_tokens(toks)

    # Format output.
    return (
        f"**Input IDs:**\n{ids}\n\n"
        f"**Tokens:**\n{toks}\n\n"
        f"**Decoded:**\n{dec}\n"
    )


# Three sample ślokas offered as examples (one input per row).
examples = [
    ["ॐ सर्वे भवन्तु सुखिनः ॥"],
    ["धर्मो रक्षति रक्षितः ॥"],
    ["यथा दीपः निवातस्थः प्रवर्तमानः ॥"],
]

iface = gr.Interface(
    fn=infer_shloka,
    inputs=gr.Textbox(lines=2, placeholder="Enter a Sanskrit śloka here…"),
    outputs=gr.Markdown(),
    examples=examples,
    title="Sanskrit-BPE Tokenizer Demo",
    description="Encode a Devanāgarī śloka with your custom BPE tokenizer, view IDs, subtokens, and detokenized output.",
    allow_flagging="never",
)

if __name__ == "__main__":
    iface.launch()