# NOTE(review): removed Hugging Face Spaces page chrome (status banner, file
# size, commit hashes, line-number gutter) — scrape residue, not source code.
import os
import re
import tempfile
import time

import gradio as gr
import pandas as pd
from simpletransformers.seq2seq import Seq2SeqModel
# --- 1. MODEL LOADING ---
# Load models once at startup to avoid reloading on every request.
print("Loading models...")
MODELS = {
"no_morph": Seq2SeqModel(
encoder_decoder_type="bart",
encoder_decoder_name="Futyn-Maker/RuthLemm",
use_cuda=False
),
"morph": Seq2SeqModel(
encoder_decoder_type="bart",
encoder_decoder_name="Futyn-Maker/RuthLemm-morphology",
use_cuda=False
)
}
print("Models loaded successfully!")
# --- 2. PREPROCESSING LOGIC
# --- 2. PREPROCESSING LOGIC
def preprocess_form(form, pos=""):
    """Normalize a token before it is fed to the lemmatizer model.

    A standalone bracket token is returned untouched; brackets embedded
    inside a word are stripped out.  Tokens tagged PROPN keep an initial
    capital, everything else is lowercased.
    """
    # Standalone bracket tokens pass through unchanged.
    if form in ("(", ")", "[", "]"):
        return form

    # Strip any bracket characters embedded inside the word.
    cleaned = form
    for bracket in "()[]":
        cleaned = cleaned.replace(bracket, "")

    # Case rule: proper nouns are capitalized, all other tokens lowercased.
    return cleaned.capitalize() if pos == "PROPN" else cleaned.lower()
# --- 3. CORE FUNCTIONS FOR GRADIO TABS ---
def lemmatize_string(raw_text: str):
"""Lemmatizes a raw string using the non-morphological model."""
if not raw_text.strip():
return ""
# Tokenize while preserving punctuation as separate tokens
tokens = re.findall(r'\w+|[^\w\s]', raw_text)
words_to_predict = []
# We use a special object to mark where words were, to reconstruct the
# sentence later
placeholder = object()
reconstruction_map = []
for token in tokens:
if re.match(r'\w+', token): # It's a word
# For raw text, we don't have POS tags, so we use the default rule
# (lowercase)
preprocessed = preprocess_form(token)
words_to_predict.append(preprocessed)
reconstruction_map.append(placeholder)
else: # It's punctuation
reconstruction_map.append(token)
# Get predictions from the model if there are any words
if not words_to_predict:
return raw_text
predictions = MODELS["no_morph"].predict(words_to_predict)
pred_iter = iter(predictions)
# Reconstruct the output string
output_parts = []
for item in reconstruction_map:
if item is placeholder:
output_parts.append(next(pred_iter))
else:
output_parts.append(item)
# Join with spaces, but clean up spacing around punctuation
return " ".join(output_parts).replace(
" .",
".").replace(
" ,",
",").replace(
" ?",
"?").replace(
" !",
"!")
def lemmatize_conllu(
        conllu_input_text: str,
        conllu_file_obj,
        use_morphology: bool):
    """Lemmatize CoNLL-U data, rewriting the LEMMA column.

    Args:
        conllu_input_text: CoNLL-U data pasted into the textbox.
        conllu_file_obj: Optional uploaded file object (takes precedence
            over the pasted text when present).
        use_morphology: When True, append the POS tag and morphological
            features to each model input and use the "morph" model.

    Returns:
        A ``(output_text, output_file_path)`` tuple; the path is None
        when there was nothing to lemmatize.
    """
    # Determine the input source: an uploaded file wins over pasted text.
    if conllu_file_obj is not None:
        with open(conllu_file_obj.name, 'r', encoding='utf-8') as f:
            conllu_text = f.read()
    else:
        conllu_text = conllu_input_text
    if not conllu_text.strip():
        return "", None

    lines = conllu_text.strip().split('\n')
    inputs_for_model = []
    token_lines_indices = []  # indices of lines that are actual tokens
    for i, line in enumerate(lines):
        # Skip comment lines and blank sentence separators.
        if line.startswith('#') or not line.strip():
            continue
        parts = line.split('\t')
        if len(parts) < 6:  # skip malformed lines
            continue
        token_lines_indices.append(i)
        # CoNLL-U columns: 1=FORM, 3=UPOS, 5=FEATS.
        form, pos, features = parts[1], parts[3], parts[5]
        preprocessed_form = preprocess_form(form, pos)
        if use_morphology:
            inputs_for_model.append(f"{preprocessed_form} {pos} {features}")
        else:
            inputs_for_model.append(preprocessed_form)

    # If no valid token lines were found, return the original text.
    if not inputs_for_model:
        return conllu_text, None

    # Select the model and predict lemmas for all tokens in one batch.
    model = MODELS["morph"] if use_morphology else MODELS["no_morph"]
    predictions = model.predict(inputs_for_model)

    # Write predictions back into the LEMMA column (index 2).
    pred_iter = iter(predictions)
    output_lines = list(lines)  # mutable copy
    for line_idx in token_lines_indices:
        parts = output_lines[line_idx].split('\t')
        parts[2] = next(pred_iter)
        output_lines[line_idx] = '\t'.join(parts)
    final_output = "\n".join(output_lines)

    # Write the result to a unique temp file for download.  tempfile is
    # portable (the previous hard-coded "/tmp/..." path fails on
    # Windows) and cannot collide when two requests land in the same
    # second, unlike the old int(time.time()) filename.
    with tempfile.NamedTemporaryFile(
            mode="w", encoding="utf-8", prefix="lemmatized_",
            suffix=".conllu", delete=False) as out_file:
        out_file.write(final_output)
        output_filename = out_file.name
    return final_output, output_filename
# --- 4. GRADIO UI ---
# Explanatory text
readme_text = """
# RuthLemm Demo
This is a demonstration of **RuthLemm**, a transformer (BART-based) lemmatizer for the Old Belarusian (Ruthenian) language. It can process raw text or files in the CoNLL-U format used by Universal Dependencies.
### How to Use:
1. **Lemmatize String:** Enter any text in the text box. The tool will tokenize it, lemmatize each word, and return the result. This mode does not use morphological information.
2. **Lemmatize CoNLL-U:** Paste your CoNLL-U data into the text box or upload a `.conllu` file.
* You can choose whether to use morphological features to improve accuracy via the **"Use Morphology"** checkbox.
* The output will be the same CoNLL-U data with the `LEMMA` column updated. You can copy the result or download it as a file.
"""
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.Markdown(readme_text)
with gr.Tabs():
with gr.TabItem("Lemmatize String"):
with gr.Row():
string_input = gr.Textbox(
lines=8,
label="Input Text",
placeholder="")
string_output = gr.Textbox(
lines=8, label="Lemmatized Output", interactive=False)
lemmatize_string_btn = gr.Button("Lemmatize", variant="primary")
with gr.TabItem("Lemmatize CoNLL-U"):
use_morphology_checkbox = gr.Checkbox(
label="Use Morphology",
value=False,
info="Check this to use POS tags and morphological features for better accuracy.")
with gr.Row():
with gr.Column():
conllu_input_text = gr.Textbox(
lines=10,
label="Paste CoNLL-U Data Here",
placeholder="")
conllu_upload = gr.File(
label="Or Upload a .conllu File",
file_types=[".conllu"])
with gr.Column():
conllu_output_text = gr.Textbox(
lines=10,
label="Lemmatized CoNLL-U Output",
interactive=False,
show_copy_button=True)
conllu_download = gr.File(
label="Download Result", interactive=False)
lemmatize_conllu_btn = gr.Button(
"Lemmatize CoNLL-U", variant="primary")
# Button click events
lemmatize_string_btn.click(
fn=lemmatize_string,
inputs=[string_input],
outputs=[string_output]
)
lemmatize_conllu_btn.click(
fn=lemmatize_conllu,
inputs=[conllu_input_text, conllu_upload, use_morphology_checkbox],
outputs=[conllu_output_text, conllu_download]
)
if __name__ == "__main__":
demo.launch()