|
--- |
|
license: apache-2.0 |
|
datasets: |
|
- lecslab/glosslm-corpus-split |
|
metrics: |
|
- accuracy |
|
- chrf |
|
- bleu |
|
base_model: |
|
- google/byt5-base |
|
library_name: transformers |
|
--- |
|
|
|
- Repo: https://github.com/foltaProject/glosslm |
|
- Paper: https://arxiv.org/abs/2403.06399 |
|
|
|
## Usage
|
```python |
|
import transformers

# Your inputs
transcription = "o sey xtok rixoqiil"  # line to be glossed
translation = "O sea busca esposa."  # free translation of the transcription
lang = "Uspanteco"  # language of the transcription
metalang = "Spanish"  # language of the translation
is_segmented = False  # whether `transcription` is already split into morphemes

# Build the instruction-style prompt.
# NOTE(review): the literal "\n" after {translation} and the exact wording
# presumably match the prompt format used at training time — verify against
# the repo before changing it.
prompt = f"""Provide the glosses for the following transcription in {lang}.

Transcription in {lang}: {transcription}
Transcription segmented: {is_segmented}
Translation in {metalang}: {translation}\n
Glosses:
"""

# GlossLM is a fine-tuned ByT5 model; the tokenizer comes from the base model.
model = transformers.T5ForConditionalGeneration.from_pretrained("lecslab/glosslm")
tokenizer = transformers.ByT5Tokenizer.from_pretrained(
    "google/byt5-base", use_fast=False
)

inputs = tokenizer(prompt, return_tensors="pt")
outputs = tokenizer.batch_decode(
    model.generate(**inputs, max_length=1024), skip_special_tokens=True
)
print(outputs[0])
# o sea COM-buscar E3S-esposa
|
``` |