import re
import time

import gradio as gr
from simpletransformers.seq2seq import Seq2SeqModel

# --- 1. MODEL LOADING ---
# Load models once at startup to avoid reloading on every request.
print("Loading models...")
MODELS = {
    "no_morph": Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name="Futyn-Maker/RuthLemm",
        use_cuda=False
    ),
    "morph": Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name="Futyn-Maker/RuthLemm-morphology",
        use_cuda=False
    )
}
print("Models loaded successfully!")

# --- 2. PREPROCESSING LOGIC ---


def preprocess_form(form, pos=""):
    """Normalizes a single token before it is passed to the model."""
    # Standalone brackets are treated as punctuation and kept as-is
    if form in {"(", ")", "[", "]"}:
        return form

    # Remove brackets from within words
    processed_form = re.sub(r"[()\[\]]", "", form)

    # Apply case rules based on Part-of-Speech (POS)
    if pos == "PROPN":
        return processed_form.capitalize()
    else:
        return processed_form.lower()
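
# Illustrative behaviour of preprocess_form (example tokens, not taken from
# the training data):
#   preprocess_form("(")                -> "("       # standalone bracket kept
#   preprocess_form("сло[в]о")          -> "слово"   # brackets stripped, lowercased
#   preprocess_form("вильня", "PROPN")  -> "Вильня"  # proper noun capitalised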

# --- 3. CORE FUNCTIONS FOR GRADIO TABS ---


def lemmatize_string(raw_text: str):
    """Lemmatizes a raw string using the non-morphological model."""
    if not raw_text.strip():
        return ""

    # Tokenize while preserving punctuation as separate tokens
    tokens = re.findall(r'\w+|[^\w\s]', raw_text)
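    # For reference, this splits words and punctuation into separate tokens, e.g.
    #   re.findall(r'\w+|[^\w\s]', "Слово, а не дѣло!")
    #   -> ['Слово', ',', 'а', 'не', 'дѣло', '!']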

    words_to_predict = []
    # A sentinel object marks the positions of words so the sentence can be
    # reconstructed after prediction
    placeholder = object()
    reconstruction_map = []

    for token in tokens:
        if re.match(r'\w+', token):  # It's a word
            # For raw text, we don't have POS tags, so we use the default rule
            # (lowercase)
            preprocessed = preprocess_form(token)
            words_to_predict.append(preprocessed)
            reconstruction_map.append(placeholder)
        else:  # It's punctuation
            reconstruction_map.append(token)

    # If there are no words at all, return the input unchanged
    if not words_to_predict:
        return raw_text

    predictions = MODELS["no_morph"].predict(words_to_predict)
    pred_iter = iter(predictions)

    # Reconstruct the output string
    output_parts = []
    for item in reconstruction_map:
        if item is placeholder:
            output_parts.append(next(pred_iter))
        else:
            output_parts.append(item)

    # Join with spaces, then remove the space inserted before sentence punctuation
    output_text = " ".join(output_parts)
    for punct in (".", ",", "?", "!"):
        output_text = output_text.replace(" " + punct, punct)
    return output_text


def lemmatize_conllu(
        conllu_input_text: str,
        conllu_file_obj,
        use_morphology: bool):
    """Lemmatizes a CoNLL-U formatted text using the selected model."""
    # Determine the input source
    if conllu_file_obj is not None:
        with open(conllu_file_obj.name, 'r', encoding='utf-8') as f:
            conllu_text = f.read()
    else:
        conllu_text = conllu_input_text

    if not conllu_text.strip():
        return "", None

    lines = conllu_text.strip().split('\n')
    inputs_for_model = []
    token_lines_indices = []  # Store indices of lines that are actual tokens

    for i, line in enumerate(lines):
        if line.startswith('#') or not line.strip():
            continue

        parts = line.split('\t')
        if len(parts) < 6:  # Skip malformed lines (FORM, UPOS and FEATS are needed)
            continue

        token_lines_indices.append(i)
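        # CoNLL-U token lines are tab-separated with ten columns:
        #   ID  FORM  LEMMA  UPOS  XPOS  FEATS  HEAD  DEPREL  DEPS  MISC
        # FORM, UPOS and FEATS are read here; LEMMA is overwritten later.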
        form = parts[1]
        pos = parts[3]
        features = parts[5]

        preprocessed_form = preprocess_form(form, pos)
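        # Build the model input. With morphology enabled this looks like
        # (illustrative) "слова NOUN Case=Gen|Number=Sing"; otherwise it is
        # just the preprocessed form.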

        if use_morphology:
            model_input = f"{preprocessed_form} {pos} {features}"
        else:
            model_input = preprocessed_form
        inputs_for_model.append(model_input)

    # If no valid token lines were found, return original text
    if not inputs_for_model:
        return conllu_text, None

    # Select model and predict
    model = MODELS["morph"] if use_morphology else MODELS["no_morph"]
    predictions = model.predict(inputs_for_model)

    # Replace lemma column with predictions
    pred_iter = iter(predictions)
    output_lines = list(lines)  # Make a mutable copy
    for line_idx in token_lines_indices:
        parts = output_lines[line_idx].split('\t')
        parts[2] = next(pred_iter)  # Index 2 is the LEMMA column
        output_lines[line_idx] = '\t'.join(parts)

    final_output = "\n".join(output_lines)

    # Create a file for download
    timestamp = int(time.time())
    output_filename = f"/tmp/lemmatized_{timestamp}.conllu"
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(final_output)

    return final_output, output_filename
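
# Illustrative call (the lemma values depend on the model):
#   text, path = lemmatize_conllu(
#       "1\tслова\t_\tNOUN\t_\tCase=Gen|Number=Sing", None, use_morphology=True)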

# --- 4. GRADIO UI ---


# Explanatory text
readme_text = """
# RuthLemm Demo

This is a demonstration of **RuthLemm**, a transformer (BART-based) lemmatizer for the Old Belarusian (Ruthenian) language. It can process raw text or files in the CoNLL-U format used by Universal Dependencies.

### How to Use:
1.  **Lemmatize String:** Enter any text in the text box. The tool will tokenize it, lemmatize each word, and return the result. This mode does not use morphological information.
2.  **Lemmatize CoNLL-U:** Paste your CoNLL-U data into the text box or upload a `.conllu` file.
    * You can choose whether to use morphological features to improve accuracy via the **"Use Morphology"** checkbox.
    * The output will be the same CoNLL-U data with the `LEMMA` column updated. You can copy the result or download it as a file.
"""

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(readme_text)

    with gr.Tabs():
        with gr.TabItem("Lemmatize String"):
            with gr.Row():
                string_input = gr.Textbox(
                    lines=8,
                    label="Input Text",
                    placeholder="")
                string_output = gr.Textbox(
                    lines=8, label="Lemmatized Output", interactive=False)
            lemmatize_string_btn = gr.Button("Lemmatize", variant="primary")

        with gr.TabItem("Lemmatize CoNLL-U"):
            use_morphology_checkbox = gr.Checkbox(
                label="Use Morphology",
                value=False,
                info="Check this to use POS tags and morphological features for better accuracy.")
            with gr.Row():
                with gr.Column():
                    conllu_input_text = gr.Textbox(
                        lines=10,
                        label="Paste CoNLL-U Data Here",
                        placeholder="")
                    conllu_upload = gr.File(
                        label="Or Upload a .conllu File",
                        file_types=[".conllu"])
                with gr.Column():
                    conllu_output_text = gr.Textbox(
                        lines=10,
                        label="Lemmatized CoNLL-U Output",
                        interactive=False,
                        show_copy_button=True)
                    conllu_download = gr.File(
                        label="Download Result", interactive=False)

            lemmatize_conllu_btn = gr.Button(
                "Lemmatize CoNLL-U", variant="primary")

    # Wire up the buttons: Gradio passes the values of `inputs` as positional
    # arguments and maps the returned values onto `outputs`
    lemmatize_string_btn.click(
        fn=lemmatize_string,
        inputs=[string_input],
        outputs=[string_output]
    )

    lemmatize_conllu_btn.click(
        fn=lemmatize_conllu,
        inputs=[conllu_input_text, conllu_upload, use_morphology_checkbox],
        outputs=[conllu_output_text, conllu_download]
    )

if __name__ == "__main__":
    demo.launch()