"""Fine-tune ``microsoft/mdeberta-v3-base`` with the project's two-output
``DebertaV2ForSequenceClassification`` and report score/binary metrics.

The train and evaluation CSV locations are taken from the environment
variables ``PATH_TO_TRAINSET`` and ``PATH_TO_TESTSET``.
"""

import os

import numpy as np
import pandas as pd
from datasets import Dataset
from scipy.special import expit
from scipy.stats import pearsonr
from sklearn.metrics import accuracy_score, precision_score, recall_score
from transformers import (AutoTokenizer, DataCollatorWithPadding, Trainer,
                          TrainingArguments)

from classifier import DebertaV2ForSequenceClassification

BASE_MODEL_NAME = "microsoft/mdeberta-v3-base"

# CSV locations. These names were referenced but never defined, which made
# the script fail with NameError; they are now read from the environment.
PATH_TO_TRAINSET = os.environ.get("PATH_TO_TRAINSET")
PATH_TO_TESTSET = os.environ.get("PATH_TO_TESTSET")

MAX_SEQUENCE_LENGTH = 512
# Upper bound on the number of evaluation rows drawn from the test CSV.
EVAL_SAMPLE_SIZE = 10_000
EVAL_SAMPLE_SEED = 42
# Labels at or above this value count as the positive class in the binary
# metrics.
BINARY_LABEL_THRESHOLD = 3

# Loaded at import time so that ``tokenize_function`` and the trainer share
# one tokenizer instance.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)


def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``.

    Delegates to ``scipy.special.expit``, which stays finite and
    warning-free for large-magnitude inputs where a direct ``np.exp(-x)``
    would overflow.
    """
    return expit(x)


def compute_metrics(eval_pred):
    """Compute evaluation metrics for the two model outputs.

    Args:
        eval_pred: Pair ``(predictions, labels)`` where ``predictions`` is
            itself a pair ``(scores, binary_logits)``.

    Returns:
        Dict with ``pearson`` (correlation between scores and labels) and
        ``accuracy`` / ``precision`` / ``recall`` of the thresholded binary
        output against ``labels >= 3``.
    """
    predictions, labels = eval_pred
    scores, binary_logits = predictions

    # reshape(-1) rather than squeeze(): squeeze() would collapse a
    # single-example evaluation into 0-d arrays that the metric functions
    # cannot consume.
    scores = np.asarray(scores).reshape(-1)
    labels = np.asarray(labels).reshape(-1)
    probs = sigmoid(np.asarray(binary_logits).reshape(-1))

    predicted_labels = (probs >= 0.5).astype(int)
    binary_labels = (labels >= BINARY_LABEL_THRESHOLD).astype(int)

    return {
        'pearson': pearsonr(scores, labels)[0],
        'accuracy': accuracy_score(binary_labels, predicted_labels),
        'precision': precision_score(binary_labels, predicted_labels),
        'recall': recall_score(binary_labels, predicted_labels),
    }


def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch, truncating to 512 tokens."""
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
    )


def train_classifier():
    """Train the classifier and evaluate it on a sample of the test set.

    Returns:
        The metrics dict produced by ``Trainer.evaluate()``.

    Raises:
        ValueError: If either dataset path is not configured.
    """
    if not PATH_TO_TRAINSET or not PATH_TO_TESTSET:
        raise ValueError(
            "Set the PATH_TO_TRAINSET and PATH_TO_TESTSET environment "
            "variables to the training and test CSV files."
        )

    train_csv = pd.read_csv(PATH_TO_TRAINSET)
    train_dataset = Dataset.from_pandas(train_csv)

    test_csv = pd.read_csv(PATH_TO_TESTSET)
    # Cap the sample size: DataFrame.sample raises when asked for more rows
    # than the frame holds (without replacement).
    test_csv = test_csv.sample(
        n=min(EVAL_SAMPLE_SIZE, len(test_csv)),
        random_state=EVAL_SAMPLE_SEED,
    )
    test_dataset = Dataset.from_pandas(test_csv)

    train_dataset = train_dataset.map(tokenize_function, batched=True)
    test_dataset = test_dataset.map(tokenize_function, batched=True)

    train_dataset = train_dataset.with_format("torch")
    test_dataset = test_dataset.with_format("torch")

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` (and Trainer's `tokenizer` to `processing_class`);
    # confirm against the pinned transformers version before upgrading.
    training_args = TrainingArguments(
        output_dir="./results",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
    )

    model = DebertaV2ForSequenceClassification.from_pretrained(BASE_MODEL_NAME)
    print("Freezing model embeddings!")
    model.freeze_embeddings()

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Evaluate the model; return the metrics instead of discarding them.
    metrics = trainer.evaluate()
    # Optional: trainer.push_to_hub(private=True, model_name="mFine-Edu-classifier")
    return metrics


if __name__ == "__main__":
    train_classifier()