Spaces:
Sleeping
Sleeping
File size: 1,399 Bytes
4541c60 e5fc7ff 2ab9c8b e5fc7ff 6426bee 5ee8e99 6426bee e5fc7ff 6426bee e5fc7ff 6426bee 4541c60 6426bee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import gradio as gr
from transformers import PreTrainedTokenizerFast
# Load the tokenizer once at import time; infer_shloka() reuses this single
# module-level instance on every call. "snskrt/Sanskrit_Tokenizer" is the
# identifier handed to from_pretrained (presumably a Hugging Face Hub repo
# id -- confirm against the deployment).
tok = PreTrainedTokenizerFast.from_pretrained("snskrt/Sanskrit_Tokenizer")
def infer_shloka(text: str) -> str:
    """Tokenize *text* and report its IDs, tokens, and a re-joined string.

    The text is encoded with the module-level tokenizer ``tok`` without
    special tokens. Tokens that start with the ``"##"`` continuation marker
    are glued onto the preceding token with the marker removed, and the
    resulting words are joined with single spaces. This "decoded" text is a
    manual reconstruction from the token strings, not ``tok.decode``.

    Args:
        text: The śloka to tokenize. ``None`` is treated as empty text.

    Returns:
        A Markdown-formatted string with three sections: the input IDs,
        the token strings, and the re-joined text.
    """
    # Encode without special tokens so only the text's own pieces are shown.
    enc = tok(text or "", add_special_tokens=False)
    ids = enc["input_ids"]
    toks = tok.convert_ids_to_tokens(ids)

    # Manual merge: strip "##" and attach the piece to the previous word.
    detok = []
    for t in toks:
        # A "##" piece can only be attached when a previous word exists;
        # if it is the very first token, keep it as its own word instead
        # of indexing into an empty list (which raised IndexError).
        if detok and t.startswith("##"):
            detok[-1] += t[2:]
        else:
            detok.append(t)
    dec = " ".join(detok)

    # Format output as Markdown sections.
    out = (
        f"**Input IDs:**\n{ids}\n\n"
        f"**Tokens:**\n{toks}\n\n"
        f"**Decoded:**\n{dec}\n"
    )
    return out
# Three sample ślokas; each is wrapped in its own single-element row.
examples = [
    [shloka]
    for shloka in (
        "ॐ सर्वे भवन्तु सुखिनः ॥",
        "धर्मो रक्षति रक्षितः ॥",
        "यथा दीपः निवातस्थः प्रवर्तमानः ॥",
    )
]
# Wire the tokenizer function into a one-textbox, Markdown-output interface.
iface = gr.Interface(
    title="Sanskrit-BPE Tokenizer Demo",
    description="Encode a Devanāgarī śloka with your custom BPE tokenizer, view IDs, subtokens, and detokenized output.",
    fn=infer_shloka,
    # The input component is built before the output component, as before.
    inputs=gr.Textbox(placeholder="Enter a Sanskrit śloka here…", lines=2),
    outputs=gr.Markdown(),
    examples=examples,
    allow_flagging="never",
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|