# 13Aluminium's picture
# Update app.py
# 5ee8e99 verified
import gradio as gr
from transformers import PreTrainedTokenizerFast
# Identifier handed to ``from_pretrained`` to locate the tokenizer files.
TOKENIZER_ID = "snskrt/Sanskrit_Tokenizer"

# Loaded once at import time so every call to the demo reuses the same object.
tok = PreTrainedTokenizerFast.from_pretrained(TOKENIZER_ID)
def infer_shloka(text: str) -> str:
    """Tokenize *text* and return a Markdown report of the result.

    The report shows the input IDs, the raw sub-word tokens, and a
    space-joined reconstruction in which every ``##``-prefixed continuation
    piece is glued onto the piece that precedes it.

    Args:
        text: The śloka to encode. ``None`` is treated as an empty string.

    Returns:
        A Markdown string with "Input IDs", "Tokens" and "Decoded" sections.
    """
    # Encode without special tokens so only the text's own pieces are listed.
    enc = tok(text or "", add_special_tokens=False)
    ids = enc["input_ids"]
    toks = tok.convert_ids_to_tokens(ids)

    # Manual merge: strip the "##" continuation marker and re-join the pieces.
    detok = []
    for t in toks:
        if detok and t.startswith("##"):
            detok[-1] += t[2:]
        else:
            # Either a word-initial token, or a "##" piece with nothing before
            # it to attach to (this used to raise IndexError); keep it as-is.
            detok.append(t)
    dec = " ".join(detok)

    # Format output
    return (
        f"**Input IDs:**\n{ids}\n\n"
        f"**Tokens:**\n{toks}\n\n"
        f"**Decoded:**\n{dec}\n"
    )
# Three sample ślokas offered as one-click examples. Each sample is wrapped in
# its own single-element list: one row per example, one value per input.
examples = [
    [shloka]
    for shloka in (
        "ॐ सर्वे भवन्तु सुखिनः ॥",
        "धर्मो रक्षति रक्षितः ॥",
        "यथा दीपः निवातस्थः प्रवर्तमानः ॥",
    )
]
# UI components: a two-line text box for the śloka and a Markdown panel that
# renders the report string returned by ``infer_shloka``.
shloka_box = gr.Textbox(lines=2, placeholder="Enter a Sanskrit śloka here…")
report_panel = gr.Markdown()

# Wire the components, the sample inputs and the callback into one interface.
iface = gr.Interface(
    fn=infer_shloka,
    inputs=shloka_box,
    outputs=report_panel,
    examples=examples,
    title="Sanskrit-BPE Tokenizer Demo",
    description=(
        "Encode a Devanāgarī śloka with your custom BPE tokenizer, view IDs, subtokens, and detokenized output."
    ),
    allow_flagging="never",
)

# Launch the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()