import re
import time

import gradio as gr
from simpletransformers.seq2seq import Seq2SeqModel

# --- 1. MODEL LOADING ---
# Load models once at startup to avoid reloading on every request.
print("Loading models...")
MODELS = {
    "no_morph": Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name="Futyn-Maker/RuthLemm",
        use_cuda=False
    ),
    "morph": Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name="Futyn-Maker/RuthLemm-morphology",
        use_cuda=False
    )
}
print("Models loaded successfully!")

# --- 2. PREPROCESSING LOGIC ---


def preprocess_form(form, pos=""):
    """Normalizes a single token before it is passed to the model."""
    # Standalone brackets are treated as punctuation and kept as-is
    if form in {"(", ")", "[", "]"}:
        return form

    # Remove brackets from within words
    processed_form = re.sub(r"[()\[\]]", "", form)

    # Apply case rules based on Part-of-Speech (POS)
    if pos == "PROPN":
        return processed_form.capitalize()
    else:
        return processed_form.lower()
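
# Illustrative behaviour of preprocess_form (example tokens, not taken from
# the training data):
#   preprocess_form("(")                -> "("       # standalone bracket kept
#   preprocess_form("сло[в]о")          -> "слово"   # brackets stripped, lowercased
#   preprocess_form("вильня", "PROPN")  -> "Вильня"  # proper noun capitalised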

# --- 3. CORE FUNCTIONS FOR GRADIO TABS ---


def lemmatize_string(raw_text: str):
    """Lemmatizes a raw string using the non-morphological model."""
    if not raw_text.strip():
        return ""

    # Tokenize while preserving punctuation as separate tokens
    tokens = re.findall(r'\w+|[^\w\s]', raw_text)
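    # For reference, this splits words and punctuation into separate tokens, e.g.
    #   re.findall(r'\w+|[^\w\s]', "Слово, а не дѣло!")
    #   -> ['Слово', ',', 'а', 'не', 'дѣло', '!']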

    words_to_predict = []
    # A sentinel object marks the positions of words so the sentence can be
    # reconstructed after prediction
    placeholder = object()
    reconstruction_map = []

    for token in tokens:
        if re.match(r'\w+', token):  # It's a word
            # For raw text, we don't have POS tags, so we use the default rule
            # (lowercase)
            preprocessed = preprocess_form(token)
            words_to_predict.append(preprocessed)
            reconstruction_map.append(placeholder)
        else:  # It's punctuation
            reconstruction_map.append(token)

    # If there are no words at all, return the input unchanged
    if not words_to_predict:
        return raw_text

    predictions = MODELS["no_morph"].predict(words_to_predict)
    pred_iter = iter(predictions)

    # Reconstruct the output string
    output_parts = []
    for item in reconstruction_map:
        if item is placeholder:
            output_parts.append(next(pred_iter))
        else:
            output_parts.append(item)

    # Join with spaces, then remove the space inserted before sentence punctuation
    output_text = " ".join(output_parts)
    for punct in (".", ",", "?", "!"):
        output_text = output_text.replace(" " + punct, punct)
    return output_text


def lemmatize_conllu(
        conllu_input_text: str,
        conllu_file_obj,
        use_morphology: bool):
    """Lemmatizes a CoNLL-U formatted text using the selected model."""
    # Determine the input source
    if conllu_file_obj is not None:
        with open(conllu_file_obj.name, 'r', encoding='utf-8') as f:
            conllu_text = f.read()
    else:
        conllu_text = conllu_input_text

    if not conllu_text.strip():
        return "", None

    lines = conllu_text.strip().split('\n')
    inputs_for_model = []
    token_lines_indices = []  # Store indices of lines that are actual tokens

    for i, line in enumerate(lines):
        if line.startswith('#') or not line.strip():
            continue

        parts = line.split('\t')
        if len(parts) < 6:  # Skip malformed lines (FORM, UPOS and FEATS are needed)
            continue

        token_lines_indices.append(i)
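        # CoNLL-U token lines are tab-separated with ten columns:
        #   ID  FORM  LEMMA  UPOS  XPOS  FEATS  HEAD  DEPREL  DEPS  MISC
        # FORM, UPOS and FEATS are read here; LEMMA is overwritten later.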
        form = parts[1]
        pos = parts[3]
        features = parts[5]

        preprocessed_form = preprocess_form(form, pos)
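        # Build the model input. With morphology enabled this looks like
        # (illustrative) "слова NOUN Case=Gen|Number=Sing"; otherwise it is
        # just the preprocessed form.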

        if use_morphology:
            model_input = f"{preprocessed_form} {pos} {features}"
        else:
            model_input = preprocessed_form
        inputs_for_model.append(model_input)

    # If no valid token lines were found, return original text
    if not inputs_for_model:
        return conllu_text, None

    # Select model and predict
    model = MODELS["morph"] if use_morphology else MODELS["no_morph"]
    predictions = model.predict(inputs_for_model)

    # Replace lemma column with predictions
    pred_iter = iter(predictions)
    output_lines = list(lines)  # Make a mutable copy
    for line_idx in token_lines_indices:
        parts = output_lines[line_idx].split('\t')
        parts[2] = next(pred_iter)  # Index 2 is the LEMMA column
        output_lines[line_idx] = '\t'.join(parts)

    final_output = "\n".join(output_lines)

    # Create a file for download
    timestamp = int(time.time())
    output_filename = f"/tmp/lemmatized_{timestamp}.conllu"
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(final_output)

    return final_output, output_filename
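
# Illustrative call (the lemma values depend on the model):
#   text, path = lemmatize_conllu(
#       "1\tслова\t_\tNOUN\t_\tCase=Gen|Number=Sing", None, use_morphology=True)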

# --- 4. GRADIO UI ---


# Explanatory text
readme_text = """
# RuthLemm Demo

This is a demonstration of **RuthLemm**, a transformer (BART-based) lemmatizer for the Old Belarusian (Ruthenian) language. It can process raw text or files in the CoNLL-U format used by Universal Dependencies.

### How to Use:
1.  **Lemmatize String:** Enter any text in the text box. The tool will tokenize it, lemmatize each word, and return the result. This mode does not use morphological information.
2.  **Lemmatize CoNLL-U:** Paste your CoNLL-U data into the text box or upload a `.conllu` file.
    * You can choose whether to use morphological features to improve accuracy via the **"Use Morphology"** checkbox.
    * The output will be the same CoNLL-U data with the `LEMMA` column updated. You can copy the result or download it as a file.
"""

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(readme_text)

    with gr.Tabs():
        with gr.TabItem("Lemmatize String"):
            with gr.Row():
                string_input = gr.Textbox(
                    lines=8,
                    label="Input Text",
                    placeholder="")
                string_output = gr.Textbox(
                    lines=8, label="Lemmatized Output", interactive=False)
            lemmatize_string_btn = gr.Button("Lemmatize", variant="primary")

        with gr.TabItem("Lemmatize CoNLL-U"):
            use_morphology_checkbox = gr.Checkbox(
                label="Use Morphology",
                value=False,
                info="Check this to use POS tags and morphological features for better accuracy.")
            with gr.Row():
                with gr.Column():
                    conllu_input_text = gr.Textbox(
                        lines=10,
                        label="Paste CoNLL-U Data Here",
                        placeholder="")
                    conllu_upload = gr.File(
                        label="Or Upload a .conllu File",
                        file_types=[".conllu"])
                with gr.Column():
                    conllu_output_text = gr.Textbox(
                        lines=10,
                        label="Lemmatized CoNLL-U Output",
                        interactive=False,
                        show_copy_button=True)
                    conllu_download = gr.File(
                        label="Download Result", interactive=False)

            lemmatize_conllu_btn = gr.Button(
                "Lemmatize CoNLL-U", variant="primary")

    # Wire up the buttons: Gradio passes the values of `inputs` as positional
    # arguments and maps the returned values onto `outputs`
    lemmatize_string_btn.click(
        fn=lemmatize_string,
        inputs=[string_input],
        outputs=[string_output]
    )

    lemmatize_conllu_btn.click(
        fn=lemmatize_conllu,
        inputs=[conllu_input_text, conllu_upload, use_morphology_checkbox],
        outputs=[conllu_output_text, conllu_download]
    )

if __name__ == "__main__":
    demo.launch()