Spaces:
Sleeping
Sleeping
import os
import re
import tempfile
import time

import gradio as gr
import pandas as pd
from simpletransformers.seq2seq import Seq2SeqModel
# --- 1. MODEL LOADING ---
# Both checkpoints are instantiated once at import time so individual
# requests never pay the model-initialization cost.
print("Loading models...")


def _load_lemmatizer(checkpoint):
    """Build a CPU-only BART seq2seq model from the given HF checkpoint."""
    return Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name=checkpoint,
        use_cuda=False,
    )


MODELS = {
    "no_morph": _load_lemmatizer("Futyn-Maker/RuthLemm"),
    "morph": _load_lemmatizer("Futyn-Maker/RuthLemm-morphology"),
}
print("Models loaded successfully!")
# --- 2. PREPROCESSING LOGIC ---

# Translation table that strips editorial brackets from inside a token;
# built once so the hot path is a single C-level str.translate pass.
_BRACKETS_TO_STRIP = str.maketrans("", "", "()[]")


def preprocess_form(form: str, pos: str = "") -> str:
    """Normalize a surface form before it is fed to the model.

    A token that consists solely of a bracket is punctuation and is
    returned untouched.  Otherwise any brackets embedded in the word are
    removed and the case is normalized: proper nouns are capitalized,
    everything else is lowercased.

    Args:
        form: Surface token (raw text token or CoNLL-U FORM column).
        pos: Universal POS tag; only "PROPN" changes the casing rule.

    Returns:
        The cleaned, case-normalized form.
    """
    # Standalone bracket tokens pass through as-is.
    if form in {"(", ")", "[", "]"}:
        return form

    # One translate pass replaces four chained .replace() calls.
    processed_form = form.translate(_BRACKETS_TO_STRIP)

    # Proper nouns keep an initial capital; all other tokens are lowercased.
    if pos == "PROPN":
        return processed_form.capitalize()
    return processed_form.lower()
# --- 3. CORE FUNCTIONS FOR GRADIO TABS ---


def lemmatize_string(raw_text: str):
    """Lemmatize free-form text using the non-morphological model.

    The text is split into word and punctuation tokens, every word is
    normalized and sent to the model in a single batch, and the
    predictions are stitched back into a space-joined string.
    """
    if not raw_text.strip():
        return ""

    # Words and individual punctuation marks become separate tokens.
    tokens = re.findall(r'\w+|[^\w\s]', raw_text)

    word_slot = object()  # sentinel marking a slot filled by a prediction
    layout = []           # punctuation kept verbatim, words -> word_slot
    batch = []            # preprocessed words, in order of appearance
    for token in tokens:
        if re.match(r'\w+', token):
            # Raw text carries no POS information, so the default
            # (lowercasing) preprocessing rule is applied.
            batch.append(preprocess_form(token))
            layout.append(word_slot)
        else:
            layout.append(token)

    # Nothing to lemmatize — echo the input unchanged.
    if not batch:
        return raw_text

    lemmas = iter(MODELS["no_morph"].predict(batch))
    pieces = [next(lemmas) if slot is word_slot else slot for slot in layout]

    # Space-join everything, then tighten the gap before sentence marks.
    result = " ".join(pieces)
    for mark in (".", ",", "?", "!"):
        result = result.replace(" " + mark, mark)
    return result
def lemmatize_conllu(
        conllu_input_text: str,
        conllu_file_obj,
        use_morphology: bool):
    """Lemmatize CoNLL-U data, rewriting the LEMMA column in place.

    Args:
        conllu_input_text: CoNLL-U text pasted into the textbox.
        conllu_file_obj: Optional uploaded file object (Gradio File);
            takes precedence over the pasted text when present.
        use_morphology: If True, feed "FORM POS FEATS" to the
            morphology-aware model; otherwise feed the bare form.

    Returns:
        A (lemmatized_text, download_path) pair; download_path is None
        when there was nothing to lemmatize.
    """
    # An uploaded file takes precedence over pasted text.
    if conllu_file_obj is not None:
        with open(conllu_file_obj.name, 'r', encoding='utf-8') as f:
            conllu_text = f.read()
    else:
        conllu_text = conllu_input_text

    if not conllu_text.strip():
        return "", None

    lines = conllu_text.strip().split('\n')
    inputs_for_model = []
    token_lines_indices = []  # indices of lines holding real token rows

    for i, line in enumerate(lines):
        # Comments and sentence separators are copied through untouched.
        if line.startswith('#') or not line.strip():
            continue
        parts = line.split('\t')
        if len(parts) < 6:  # Skip malformed lines (need FORM/UPOS/FEATS)
            continue
        token_lines_indices.append(i)
        form = parts[1]
        pos = parts[3]
        features = parts[5]
        preprocessed_form = preprocess_form(form, pos)
        if use_morphology:
            model_input = f"{preprocessed_form} {pos} {features}"
        else:
            model_input = preprocessed_form
        inputs_for_model.append(model_input)

    # If no valid token lines were found, return the original text.
    if not inputs_for_model:
        return conllu_text, None

    # Select model and predict in one batch.
    model = MODELS["morph"] if use_morphology else MODELS["no_morph"]
    predictions = model.predict(inputs_for_model)

    # Write each prediction back into column 2 (LEMMA) of its line.
    pred_iter = iter(predictions)
    output_lines = list(lines)  # mutable copy
    for line_idx in token_lines_indices:
        parts = output_lines[line_idx].split('\t')
        parts[2] = next(pred_iter)
        output_lines[line_idx] = '\t'.join(parts)
    final_output = "\n".join(output_lines)

    # Persist the result to a unique temp file for the download widget.
    # tempfile replaces the original hard-coded "/tmp/lemmatized_<ts>"
    # scheme, which was non-portable (no /tmp on Windows) and collided
    # when two requests arrived within the same second.
    with tempfile.NamedTemporaryFile(
            mode="w", suffix=".conllu", prefix="lemmatized_",
            delete=False, encoding="utf-8") as out:
        out.write(final_output)
        output_filename = out.name

    return final_output, output_filename
# --- 4. GRADIO UI ---

# Introductory markdown rendered at the top of the app.
readme_text = """
# RuthLemm Demo
This is a demonstration of **RuthLemm**, a transformer (BART-based) lemmatizer for the Old Belarusian (Ruthenian) language. It can process raw text or files in the CoNLL-U format used by Universal Dependencies.
### How to Use:
1. **Lemmatize String:** Enter any text in the text box. The tool will tokenize it, lemmatize each word, and return the result. This mode does not use morphological information.
2. **Lemmatize CoNLL-U:** Paste your CoNLL-U data into the text box or upload a `.conllu` file.
* You can choose whether to use morphological features to improve accuracy via the **"Use Morphology"** checkbox.
* The output will be the same CoNLL-U data with the `LEMMA` column updated. You can copy the result or download it as a file.
"""

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(readme_text)

    with gr.Tabs():
        # Tab 1: raw-text lemmatization (no morphology available).
        with gr.TabItem("Lemmatize String"):
            with gr.Row():
                string_input = gr.Textbox(
                    lines=8,
                    label="Input Text",
                    placeholder="")
                string_output = gr.Textbox(
                    lines=8, label="Lemmatized Output", interactive=False)
            lemmatize_string_btn = gr.Button("Lemmatize", variant="primary")

        # Tab 2: CoNLL-U lemmatization, optionally morphology-aware.
        with gr.TabItem("Lemmatize CoNLL-U"):
            use_morphology_checkbox = gr.Checkbox(
                label="Use Morphology",
                value=False,
                info="Check this to use POS tags and morphological features for better accuracy.")
            with gr.Row():
                with gr.Column():
                    conllu_input_text = gr.Textbox(
                        lines=10,
                        label="Paste CoNLL-U Data Here",
                        placeholder="")
                    conllu_upload = gr.File(
                        label="Or Upload a .conllu File",
                        file_types=[".conllu"])
                with gr.Column():
                    conllu_output_text = gr.Textbox(
                        lines=10,
                        label="Lemmatized CoNLL-U Output",
                        interactive=False,
                        show_copy_button=True)
                    conllu_download = gr.File(
                        label="Download Result", interactive=False)
            lemmatize_conllu_btn = gr.Button(
                "Lemmatize CoNLL-U", variant="primary")

    # Wire buttons to the core functions defined above.
    lemmatize_string_btn.click(
        fn=lemmatize_string,
        inputs=[string_input],
        outputs=[string_output]
    )
    lemmatize_conllu_btn.click(
        fn=lemmatize_conllu,
        inputs=[conllu_input_text, conllu_upload, use_morphology_checkbox],
        outputs=[conllu_output_text, conllu_download]
    )

if __name__ == "__main__":
    demo.launch()