# Fine-tune google/mt5-small to restore missing accents in Spanish text.

import datasets
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

# Toy parallel corpus: unaccented Spanish sentences as inputs, correctly accented sentences as targets.
raw_data = {
    "train": [
        {"input_text": "El medico me receto analgesicos.", "target_text": "El médico me recetó analgésicos."},
        {"input_text": "La cancion tiene una melodia pegadiza.", "target_text": "La canción tiene una melodía pegadiza."},
        {"input_text": "La accion legal continuo.", "target_text": "La acción legal continuó."},
        {"input_text": "Manana habra reunion.", "target_text": "Mañana habrá reunión."},
    ],
    "validation": [
        {"input_text": "La informacion es util.", "target_text": "La información es útil."},
        {"input_text": "El analisis fue complejo.", "target_text": "El análisis fue complejo."},
    ],
}

# Each split is a list of example dicts, so Dataset.from_list (not from_dict) is the right constructor.
raw_datasets = datasets.DatasetDict({
    "train": Dataset.from_list(raw_data["train"]),
    "validation": Dataset.from_list(raw_data["validation"]),
})

print("Sample dataset loaded:")
print(raw_datasets)

# Load the pretrained checkpoint and its tokenizer.
model_checkpoint = "google/mt5-small"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, legacy=False)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

max_input_length = 128
max_target_length = 128


def preprocess_function(examples):
    inputs = [str(ex) for ex in examples["input_text"]]
    targets = [str(ex) for ex in examples["target_text"]]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")

    # text_target replaces the deprecated as_target_tokenizer() context manager (transformers >= 4.22).
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True, padding="max_length")

    # Replace padding token ids in the labels with -100 so they are ignored by the loss.
    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

print("\nTokenized dataset:")
print(tokenized_datasets)
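
# Optional sanity check (not part of the original script): decode the first tokenized
# training example to confirm that inputs and labels round-trip. The -100 sentinel has
# to be mapped back to the pad token id before decoding.
sample = tokenized_datasets["train"][0]
decoded_input = tokenizer.decode(sample["input_ids"], skip_special_tokens=True)
decoded_label = tokenizer.decode(
    [tok if tok != -100 else tokenizer.pad_token_id for tok in sample["labels"]],
    skip_special_tokens=True,
)
print(f"Decoded input: {decoded_input}")
print(f"Decoded label: {decoded_label}")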

# Training configuration.
output_dir = "./results-accent-corrector"

training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",  # renamed to eval_strategy in recent transformers releases
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=False,
    logging_steps=10,
)

# Collates examples into batches (inputs here are already padded to max_length).
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

print("\nStarting training...")
trainer.train()
print("Training complete.")

# Save the fine-tuned model and tokenizer.
final_model_path = f"{output_dir}/final_model"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
print(f"Final model and tokenizer saved to: {final_model_path}")


from transformers import pipeline

print("\nTesting the fine-tuned model...")

# Load the saved checkpoint into a text2text-generation pipeline for quick inference.
corrector_pipe = pipeline("text2text-generation", model=final_model_path, tokenizer=final_model_path)

test_sentences = [
    "Que dia es manana?",
    "La musica clasica me relaja.",
    "El exito requiere dedicacion.",
    "accion",
]

for sentence in test_sentences:
    result = corrector_pipe(sentence, max_length=50)
    corrected_text = result[0]['generated_text']
    print(f"Input: {sentence}")
    print(f"Output: {corrected_text}")
    print("---")

print("\nProcess complete!")