# fine_tune_classifier.py
"""Fine-tune a pre-trained transformer checkpoint as a text classifier.

The script runs top to bottom:

1. Read ``DATA_FILE`` (a CSV with ``text`` and ``label`` columns).
2. Encode the labels as a ``ClassLabel`` feature and split 80/20.
3. Tokenize the text.
4. Fine-tune ``MODEL_CHECKPOINT`` with the Hugging Face ``Trainer``.
5. Save the model to ``OUTPUT_DIR`` and classify one example sentence.

The checkpoint can be overridden with the ``MODEL_CHECKPOINT`` environment
variable.
"""
import os

import pandas as pd
import torch
from datasets import ClassLabel, Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

# --- 1. Configuration ---
DATA_FILE = "df.csv"
MODEL_NAME = "mediawatch-el-climate"
MODEL_CHECKPOINT = os.getenv("MODEL_CHECKPOINT", "cvcio/roberta-el-news")
OUTPUT_DIR = f"{MODEL_NAME}/{MODEL_CHECKPOINT.replace('/', '-')}-finetuned"
NUM_EPOCHS = 4
BATCH_SIZE = 64
MAX_LENGTH = 512  # Upper bound on tokens per example (applied via truncation).
SEED = 42  # Makes the train/test split and the training run reproducible.

# --- 2. Load and Prepare the Dataset ---
print("Step 2: Loading and preparing the dataset...")

# Load the data from the CSV file and fail early if the schema is wrong.
df = pd.read_csv(DATA_FILE)
missing_columns = {"text", "label"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{DATA_FILE} is missing required column(s): {sorted(missing_columns)}"
    )

# Drop incomplete rows *before* casting to str, otherwise NaN would turn
# into the literal string "nan" and survive as a bogus example/label.
df = df.dropna(subset=["text", "label"]).reset_index(drop=True)
if df.empty:
    raise ValueError(f"{DATA_FILE} contains no rows with both 'text' and 'label'")
df["text"] = df["text"].astype(str)
df["label"] = df["label"].astype(str)  # ClassLabel names must be strings.

# Convert the pandas DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df, preserve_index=False)

# Sorted so the label ids do not depend on the row order of the CSV.
unique_labels = sorted(df["label"].unique().tolist())

# Create label-to-ID and ID-to-label mappings
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
num_labels = len(unique_labels)

print(f"Found {num_labels} unique labels: {unique_labels}")

# Encode the string labels with a single cast. The ClassLabel is built from
# the same ordered list as label2id/id2label, so the integer ids seen during
# training are exactly the ids stored in the model config. (Mapping to ints
# first and then calling class_encode_column re-sorts the stringified ints
# lexicographically -- "0", "1", "10", "2", ... -- which silently scrambles
# the ids once there are more than ten classes.)
class_label_feature = ClassLabel(names=unique_labels)
dataset = dataset.cast_column("label", class_label_feature)

# Split the dataset into training (80%) and testing (20%) sets.
# NOTE(review): stratify_by_column="label" was disabled in the original
# script; presumably some classes are too small to stratify -- confirm
# before enabling it.
train_test_split = dataset.train_test_split(test_size=0.2, seed=SEED)

# Create a DatasetDict
raw_datasets = DatasetDict(
    {
        "train": train_test_split["train"],
        "test": train_test_split["test"],
    }
)

print("Dataset prepared and split.")
print(raw_datasets)

# --- 3. Tokenization ---
print("\nStep 3: Tokenizing the text data...")

# Load the tokenizer associated with the pre-trained model
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_CHECKPOINT, model_max_length=MAX_LENGTH
)


def tokenize_function(examples):
    """Tokenize a batch of examples, truncating to the tokenizer's max length.

    Padding is intentionally left to the data collator so each batch is only
    padded to its own longest sequence instead of always to MAX_LENGTH.
    """
    return tokenizer(examples["text"], truncation=True)


# Apply the tokenization to the entire dataset
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
print("Tokenization complete.")

# --- 4. Model Training ---
print("\nStep 4: Setting up and training the model...")

# Load the pre-trained model, configured for our number of labels.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=id2label,  # Pass the mappings to the model
    label2id=label2id,
)


def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a (logits, labels) evaluation pair."""
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1_weighted": f1_score(labels, predictions, average="weighted"),
    }


# Define the training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="epoch",  # Evaluate at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch
    load_best_model_at_end=True,  # Load the best model found during training
    seed=SEED,
)

# Create the Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
    # Pads each batch to its longest member (dynamic padding).
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    tokenizer=tokenizer,
)

# Start the training
print("Starting training...")
trainer.train()
print("Training finished.")

# Save the final model and tokenizer
trainer.save_model(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")

# --- 5. Example Prediction ---
print("\nStep 5: Running an example prediction...")

# Text to classify
new_text = (
    "Λειψυδρία: Σε ανησυχητικό επίπεδο η στάθμη του νερού σε Πηνειό και Μόρνο – "
    "Καμπανάκι ΕΥΔΑΠ για τα αποθέματα : Έχουμε λιγότερο από τα μισά του 2019"
)

# Tokenize the new text (truncate so over-long inputs cannot exceed MAX_LENGTH)
inputs = tokenizer(new_text, return_tensors="pt", truncation=True)

# Move inputs to the same device as the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # Disable dropout so the prediction is deterministic.
inputs = {k: v.to(device) for k, v in inputs.items()}

# Get predictions
with torch.no_grad():
    logits = model(**inputs).logits

# Find the label with the highest score; id2label is the same mapping that
# was handed to the model config when the model was loaded.
predicted_class_id = logits.argmax(dim=-1).item()
predicted_label = id2label[predicted_class_id]

print(f"\nText: '{new_text}'")
print(f"Predicted Label: {predicted_label}")