# RuthLemm / app.py
# Author: Futyn-Maker — uploaded via huggingface_hub (commit b070a94, verified)
import os
import re
import tempfile
import time

import gradio as gr
import pandas as pd
from simpletransformers.seq2seq import Seq2SeqModel
# --- 1. MODEL LOADING ---
# Load models once at startup to avoid reloading on every request.
print("Loading models...")

# Maps a mode key to its Hugging Face checkpoint; both are BART
# seq2seq models and are run on CPU (use_cuda=False).
_MODEL_CHECKPOINTS = {
    "no_morph": "Futyn-Maker/RuthLemm",
    "morph": "Futyn-Maker/RuthLemm-morphology",
}
MODELS = {
    mode: Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name=checkpoint,
        use_cuda=False,
    )
    for mode, checkpoint in _MODEL_CHECKPOINTS.items()
}
print("Models loaded successfully!")
# --- 2. PREPROCESSING LOGIC ---
def preprocess_form(form, pos=""):
    """Normalize a surface form before it is fed to the lemmatizer.

    A token that is itself a bare bracket is returned unchanged;
    brackets embedded inside a word are stripped out.  Proper nouns
    (POS tag "PROPN") keep an initial capital, everything else is
    lowercased.
    """
    # Standalone bracket tokens are punctuation — pass them through.
    if form in {"(", ")", "[", "]"}:
        return form
    # Strip any brackets that occur inside the word itself.
    cleaned = form
    for bracket in "()[]":
        cleaned = cleaned.replace(bracket, "")
    # Case rule depends on the part of speech.
    return cleaned.capitalize() if pos == "PROPN" else cleaned.lower()
# --- 3. CORE FUNCTIONS FOR GRADIO TABS ---
def lemmatize_string(raw_text: str) -> str:
    """Lemmatize free-running text with the non-morphological model.

    The text is tokenized into words and punctuation, every word is
    normalized and sent to the model in one batch, and the predictions
    are stitched back together in the original token order.

    Args:
        raw_text: Arbitrary input text; may be empty or whitespace-only.

    Returns:
        The lemmatized text as a single string; "" for blank input, or
        the input unchanged when it contains no word tokens.
    """
    if not raw_text.strip():
        return ""
    # Tokenize while preserving punctuation as separate tokens.
    tokens = re.findall(r'\w+|[^\w\s]', raw_text)
    words_to_predict = []
    # A unique sentinel marks the positions where a model prediction
    # must be spliced back in; punctuation is stored verbatim.
    placeholder = object()
    reconstruction_map = []
    for token in tokens:
        if re.match(r'\w+', token):  # word token
            # Raw text carries no POS tags, so the default rule
            # (lowercasing) applies.
            words_to_predict.append(preprocess_form(token))
            reconstruction_map.append(placeholder)
        else:  # punctuation token
            reconstruction_map.append(token)
    if not words_to_predict:
        return raw_text
    # One batched call instead of per-token predictions.
    predictions = MODELS["no_morph"].predict(words_to_predict)
    pred_iter = iter(predictions)
    # Reconstruct the output in the original token order.
    output_parts = [
        next(pred_iter) if item is placeholder else item
        for item in reconstruction_map
    ]
    # Drop the artificial space we introduced before trailing
    # punctuation.  (Fix: the previous chain of .replace() calls only
    # handled ". , ? !"; the regex also covers ";" and ":" and does the
    # cleanup in a single pass.)
    return re.sub(r" ([.,?!;:])", r"\1", " ".join(output_parts))
def lemmatize_conllu(
        conllu_input_text: str,
        conllu_file_obj,
        use_morphology: bool):
    """Lemmatize CoNLL-U data, rewriting the LEMMA column in place.

    Args:
        conllu_input_text: CoNLL-U data pasted into the textbox.
        conllu_file_obj: Optional uploaded file object; when present it
            takes precedence over the pasted text.
        use_morphology: When True, the POS tag and morphological
            features are appended to each model input and the
            morphology-aware model is used.

    Returns:
        A tuple (lemmatized CoNLL-U text, path to a downloadable copy).
        Returns ("", None) for blank input and (original text, None)
        when no token lines are found.
    """
    # The uploaded file, when present, wins over the pasted text.
    if conllu_file_obj is not None:
        with open(conllu_file_obj.name, 'r', encoding='utf-8') as f:
            conllu_text = f.read()
    else:
        conllu_text = conllu_input_text
    if not conllu_text.strip():
        return "", None
    lines = conllu_text.strip().split('\n')
    inputs_for_model = []
    token_lines_indices = []  # indices of lines whose LEMMA we rewrite
    for i, line in enumerate(lines):
        if line.startswith('#') or not line.strip():
            continue  # comment line or sentence separator
        parts = line.split('\t')
        if len(parts) < 6:  # skip malformed lines
            continue
        # Fix: skip multiword-token ("1-2") and empty-node ("1.1")
        # lines; per the CoNLL-U spec their LEMMA column stays "_".
        if '-' in parts[0] or '.' in parts[0]:
            continue
        token_lines_indices.append(i)
        form, pos, features = parts[1], parts[3], parts[5]
        preprocessed_form = preprocess_form(form, pos)
        if use_morphology:
            inputs_for_model.append(f"{preprocessed_form} {pos} {features}")
        else:
            inputs_for_model.append(preprocessed_form)
    # If no valid token lines were found, return original text.
    if not inputs_for_model:
        return conllu_text, None
    # Select the model matching the checkbox and predict in one batch.
    model = MODELS["morph"] if use_morphology else MODELS["no_morph"]
    predictions = model.predict(inputs_for_model)
    pred_iter = iter(predictions)
    output_lines = list(lines)  # mutable copy; other lines pass through
    for line_idx in token_lines_indices:
        parts = output_lines[line_idx].split('\t')
        parts[2] = next(pred_iter)  # column 2 is the LEMMA
        output_lines[line_idx] = '\t'.join(parts)
    final_output = "\n".join(output_lines)
    # Create a file for download.  Fix: use the platform temp directory
    # instead of a hard-coded "/tmp", which does not exist on Windows.
    timestamp = int(time.time())
    output_filename = os.path.join(
        tempfile.gettempdir(), f"lemmatized_{timestamp}.conllu")
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(final_output)
    return final_output, output_filename
# --- 4. GRADIO UI ---
# Explanatory text
# Markdown shown at the top of the app (rendered via gr.Markdown).
readme_text = """
# RuthLemm Demo
This is a demonstration of **RuthLemm**, a transformer (BART-based) lemmatizer for the Old Belarusian (Ruthenian) language. It can process raw text or files in the CoNLL-U format used by Universal Dependencies.
### How to Use:
1. **Lemmatize String:** Enter any text in the text box. The tool will tokenize it, lemmatize each word, and return the result. This mode does not use morphological information.
2. **Lemmatize CoNLL-U:** Paste your CoNLL-U data into the text box or upload a `.conllu` file.
* You can choose whether to use morphological features to improve accuracy via the **"Use Morphology"** checkbox.
* The output will be the same CoNLL-U data with the `LEMMA` column updated. You can copy the result or download it as a file.
"""
# Build the two-tab Gradio interface.  Component construction order
# inside each context manager defines the on-screen layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(readme_text)
    with gr.Tabs():
        # Tab 1: plain-text lemmatization (no morphology available).
        with gr.TabItem("Lemmatize String"):
            with gr.Row():
                string_input = gr.Textbox(
                    lines=8,
                    label="Input Text",
                    placeholder="")
                string_output = gr.Textbox(
                    lines=8, label="Lemmatized Output", interactive=False)
            lemmatize_string_btn = gr.Button("Lemmatize", variant="primary")
        # Tab 2: CoNLL-U lemmatization with optional morphology.
        with gr.TabItem("Lemmatize CoNLL-U"):
            use_morphology_checkbox = gr.Checkbox(
                label="Use Morphology",
                value=False,
                info="Check this to use POS tags and morphological features for better accuracy.")
            with gr.Row():
                # Left column: input — pasted text or an uploaded file.
                with gr.Column():
                    conllu_input_text = gr.Textbox(
                        lines=10,
                        label="Paste CoNLL-U Data Here",
                        placeholder="")
                    conllu_upload = gr.File(
                        label="Or Upload a .conllu File",
                        file_types=[".conllu"])
                # Right column: output text plus a downloadable file.
                with gr.Column():
                    conllu_output_text = gr.Textbox(
                        lines=10,
                        label="Lemmatized CoNLL-U Output",
                        interactive=False,
                        show_copy_button=True)
                    conllu_download = gr.File(
                        label="Download Result", interactive=False)
            lemmatize_conllu_btn = gr.Button(
                "Lemmatize CoNLL-U", variant="primary")
    # Button click events
    lemmatize_string_btn.click(
        fn=lemmatize_string,
        inputs=[string_input],
        outputs=[string_output]
    )
    lemmatize_conllu_btn.click(
        fn=lemmatize_conllu,
        inputs=[conllu_input_text, conllu_upload, use_morphology_checkbox],
        outputs=[conllu_output_text, conllu_download]
    )
# Launch the Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()