# RuthLemm / app.py
# Author: Futyn-Maker — uploaded via huggingface_hub (commit b070a94, verified)
import os
import re
import tempfile
import time

import gradio as gr
import pandas as pd
from simpletransformers.seq2seq import Seq2SeqModel
# --- 1. MODEL LOADING ---
# Load models once at startup to avoid reloading on every request.
print("Loading models...")

# Maps a mode key to its Hugging Face checkpoint; both are BART
# seq2seq models and are run on CPU (use_cuda=False).
_MODEL_CHECKPOINTS = {
    "no_morph": "Futyn-Maker/RuthLemm",
    "morph": "Futyn-Maker/RuthLemm-morphology",
}
MODELS = {
    mode: Seq2SeqModel(
        encoder_decoder_type="bart",
        encoder_decoder_name=checkpoint,
        use_cuda=False,
    )
    for mode, checkpoint in _MODEL_CHECKPOINTS.items()
}
print("Models loaded successfully!")
# --- 2. PREPROCESSING LOGIC ---
def preprocess_form(form, pos=""):
    """Normalize a surface form before it is fed to the lemmatizer.

    A token that is itself a bare bracket is returned unchanged;
    brackets embedded inside a word are stripped out.  Proper nouns
    (POS tag "PROPN") keep an initial capital, everything else is
    lowercased.
    """
    # Standalone bracket tokens are punctuation — pass them through.
    if form in {"(", ")", "[", "]"}:
        return form
    # Strip any brackets that occur inside the word itself.
    cleaned = form
    for bracket in "()[]":
        cleaned = cleaned.replace(bracket, "")
    # Case rule depends on the part of speech.
    return cleaned.capitalize() if pos == "PROPN" else cleaned.lower()
# --- 3. CORE FUNCTIONS FOR GRADIO TABS ---
def lemmatize_string(raw_text: str) -> str:
    """Lemmatize free-running text with the non-morphological model.

    The text is tokenized into words and punctuation, every word is
    normalized and sent to the model in one batch, and the predictions
    are stitched back together in the original token order.

    Args:
        raw_text: Arbitrary input text; may be empty or whitespace-only.

    Returns:
        The lemmatized text as a single string; "" for blank input, or
        the input unchanged when it contains no word tokens.
    """
    if not raw_text.strip():
        return ""
    # Tokenize while preserving punctuation as separate tokens.
    tokens = re.findall(r'\w+|[^\w\s]', raw_text)
    words_to_predict = []
    # A unique sentinel marks the positions where a model prediction
    # must be spliced back in; punctuation is stored verbatim.
    placeholder = object()
    reconstruction_map = []
    for token in tokens:
        if re.match(r'\w+', token):  # word token
            # Raw text carries no POS tags, so the default rule
            # (lowercasing) applies.
            words_to_predict.append(preprocess_form(token))
            reconstruction_map.append(placeholder)
        else:  # punctuation token
            reconstruction_map.append(token)
    if not words_to_predict:
        return raw_text
    # One batched call instead of per-token predictions.
    predictions = MODELS["no_morph"].predict(words_to_predict)
    pred_iter = iter(predictions)
    # Reconstruct the output in the original token order.
    output_parts = [
        next(pred_iter) if item is placeholder else item
        for item in reconstruction_map
    ]
    # Drop the artificial space we introduced before trailing
    # punctuation.  (Fix: the previous chain of .replace() calls only
    # handled ". , ? !"; the regex also covers ";" and ":" and does the
    # cleanup in a single pass.)
    return re.sub(r" ([.,?!;:])", r"\1", " ".join(output_parts))
def lemmatize_conllu(
        conllu_input_text: str,
        conllu_file_obj,
        use_morphology: bool):
    """Lemmatize CoNLL-U data, rewriting the LEMMA column in place.

    Args:
        conllu_input_text: CoNLL-U data pasted into the textbox.
        conllu_file_obj: Optional uploaded file object; when present it
            takes precedence over the pasted text.
        use_morphology: When True, the POS tag and morphological
            features are appended to each model input and the
            morphology-aware model is used.

    Returns:
        A tuple (lemmatized CoNLL-U text, path to a downloadable copy).
        Returns ("", None) for blank input and (original text, None)
        when no token lines are found.
    """
    # The uploaded file, when present, wins over the pasted text.
    if conllu_file_obj is not None:
        with open(conllu_file_obj.name, 'r', encoding='utf-8') as f:
            conllu_text = f.read()
    else:
        conllu_text = conllu_input_text
    if not conllu_text.strip():
        return "", None
    lines = conllu_text.strip().split('\n')
    inputs_for_model = []
    token_lines_indices = []  # indices of lines whose LEMMA we rewrite
    for i, line in enumerate(lines):
        if line.startswith('#') or not line.strip():
            continue  # comment line or sentence separator
        parts = line.split('\t')
        if len(parts) < 6:  # skip malformed lines
            continue
        # Fix: skip multiword-token ("1-2") and empty-node ("1.1")
        # lines; per the CoNLL-U spec their LEMMA column stays "_".
        if '-' in parts[0] or '.' in parts[0]:
            continue
        token_lines_indices.append(i)
        form, pos, features = parts[1], parts[3], parts[5]
        preprocessed_form = preprocess_form(form, pos)
        if use_morphology:
            inputs_for_model.append(f"{preprocessed_form} {pos} {features}")
        else:
            inputs_for_model.append(preprocessed_form)
    # If no valid token lines were found, return original text.
    if not inputs_for_model:
        return conllu_text, None
    # Select the model matching the checkbox and predict in one batch.
    model = MODELS["morph"] if use_morphology else MODELS["no_morph"]
    predictions = model.predict(inputs_for_model)
    pred_iter = iter(predictions)
    output_lines = list(lines)  # mutable copy; other lines pass through
    for line_idx in token_lines_indices:
        parts = output_lines[line_idx].split('\t')
        parts[2] = next(pred_iter)  # column 2 is the LEMMA
        output_lines[line_idx] = '\t'.join(parts)
    final_output = "\n".join(output_lines)
    # Create a file for download.  Fix: use the platform temp directory
    # instead of a hard-coded "/tmp", which does not exist on Windows.
    timestamp = int(time.time())
    output_filename = os.path.join(
        tempfile.gettempdir(), f"lemmatized_{timestamp}.conllu")
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(final_output)
    return final_output, output_filename
# --- 4. GRADIO UI ---
# Explanatory text
# Markdown shown at the top of the app (rendered via gr.Markdown).
readme_text = """
# RuthLemm Demo
This is a demonstration of **RuthLemm**, a transformer (BART-based) lemmatizer for the Old Belarusian (Ruthenian) language. It can process raw text or files in the CoNLL-U format used by Universal Dependencies.
### How to Use:
1. **Lemmatize String:** Enter any text in the text box. The tool will tokenize it, lemmatize each word, and return the result. This mode does not use morphological information.
2. **Lemmatize CoNLL-U:** Paste your CoNLL-U data into the text box or upload a `.conllu` file.
* You can choose whether to use morphological features to improve accuracy via the **"Use Morphology"** checkbox.
* The output will be the same CoNLL-U data with the `LEMMA` column updated. You can copy the result or download it as a file.
"""
# Build the two-tab Gradio interface.  Component construction order
# inside each context manager defines the on-screen layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown(readme_text)
    with gr.Tabs():
        # Tab 1: plain-text lemmatization (no morphology available).
        with gr.TabItem("Lemmatize String"):
            with gr.Row():
                string_input = gr.Textbox(
                    lines=8,
                    label="Input Text",
                    placeholder="")
                string_output = gr.Textbox(
                    lines=8, label="Lemmatized Output", interactive=False)
            lemmatize_string_btn = gr.Button("Lemmatize", variant="primary")
        # Tab 2: CoNLL-U lemmatization with optional morphology.
        with gr.TabItem("Lemmatize CoNLL-U"):
            use_morphology_checkbox = gr.Checkbox(
                label="Use Morphology",
                value=False,
                info="Check this to use POS tags and morphological features for better accuracy.")
            with gr.Row():
                # Left column: input — pasted text or an uploaded file.
                with gr.Column():
                    conllu_input_text = gr.Textbox(
                        lines=10,
                        label="Paste CoNLL-U Data Here",
                        placeholder="")
                    conllu_upload = gr.File(
                        label="Or Upload a .conllu File",
                        file_types=[".conllu"])
                # Right column: output text plus a downloadable file.
                with gr.Column():
                    conllu_output_text = gr.Textbox(
                        lines=10,
                        label="Lemmatized CoNLL-U Output",
                        interactive=False,
                        show_copy_button=True)
                    conllu_download = gr.File(
                        label="Download Result", interactive=False)
            lemmatize_conllu_btn = gr.Button(
                "Lemmatize CoNLL-U", variant="primary")
    # Button click events
    lemmatize_string_btn.click(
        fn=lemmatize_string,
        inputs=[string_input],
        outputs=[string_output]
    )
    lemmatize_conllu_btn.click(
        fn=lemmatize_conllu,
        inputs=[conllu_input_text, conllu_upload, use_morphology_checkbox],
        outputs=[conllu_output_text, conllu_download]
    )
# Launch the Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    demo.launch()