# fine_tune_classifier.py
import os
import pandas as pd

from datasets import Dataset, DatasetDict, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score

import torch

# --- 1. Configuration ---
# CSV file with the training data; it must provide 'text' and 'label' columns.
DATA_FILE = "df.csv"

MODEL_NAME = "mediawatch-el-climate"
# Base checkpoint to fine-tune; can be overridden via the MODEL_CHECKPOINT
# environment variable.
MODEL_CHECKPOINT = os.getenv("MODEL_CHECKPOINT", "cvcio/roberta-el-news")
# Output location: "/" in the checkpoint id is replaced by "-" so that the
# checkpoint name contributes a single path component under MODEL_NAME.
OUTPUT_DIR = MODEL_NAME + "/" + MODEL_CHECKPOINT.replace("/", "-") + "-finetuned"

NUM_EPOCHS = 4
BATCH_SIZE = 64

# --- 2. Load and Prepare the Dataset ---
print("Step 2: Loading and preparing the dataset...")

# Load the data from the CSV file
df = pd.read_csv(DATA_FILE)

# Fail early, with a readable message, when a required column is absent
# (dropna below would otherwise raise a less specific KeyError).
missing_columns = [name for name in ('text', 'label') if name not in df.columns]
if missing_columns:
    raise KeyError(f"{DATA_FILE} is missing required column(s): {missing_columns}")

# Rows without a text or a label cannot be used for training.
df = df.dropna(subset=['text', 'label']).reset_index(drop=True)
if df.empty:
    raise ValueError(f"{DATA_FILE} contains no rows with both 'text' and 'label' set")

# Convert the pandas DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Unique labels, in order of first appearance in the CSV. The position of a
# label in this list is its integer class ID everywhere below.
unique_labels = df['label'].unique().tolist()

# Label <-> ID lookup tables (later handed to the model config)
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for i, label in enumerate(unique_labels)}

num_labels = len(unique_labels)
print(f"Found {num_labels} unique labels: {unique_labels}")

# ClassLabel feature sharing the same label -> ID ordering as the dicts above
class_label_feature = ClassLabel(names=unique_labels)

# Map string labels to integer IDs
def map_labels(example):
    """Replace the string label(s) in ``example`` with integer class IDs.

    The conversion is delegated to the module-level ``class_label_feature``.
    When applied through ``Dataset.map(..., batched=True)``, as this script
    does, ``example['label']`` is a list of labels for the whole batch.

    Args:
        example: Mapping with a ``'label'`` entry to convert.

    Returns:
        The same ``example`` object, with ``'label'`` overwritten in place.
    """
    encoded = class_label_feature.str2int(example['label'])
    example['label'] = encoded
    return example


# Turn the string labels into integer IDs (index into `unique_labels`).
dataset = dataset.map(map_labels, batched=True)

# Attach the ClassLabel feature built from `unique_labels` to the now-integer
# column. Using class_encode_column here would be wrong: on an integer column
# it stringifies the IDs and re-indexes them in lexicographic order
# ("0", "1", "10", "11", "2", ...), so with more than 10 labels the stored IDs
# would no longer agree with id2label/label2id.
dataset = dataset.cast_column("label", class_label_feature)

# Split the dataset into training (80%) and testing (20%) sets. The fixed seed
# makes the split, and therefore the evaluation numbers, reproducible.
# NOTE: the split is not stratified; `stratify_by_column="label"` could be
# passed here if per-class proportions need to be preserved.
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)

# Create a DatasetDict
raw_datasets = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test'],
})

print("Dataset prepared and split.")
print(raw_datasets)


# --- 3. Tokenization ---
print("\nStep 3: Tokenizing the text data...")

# Fetch the tokenizer that belongs to the checkpoint being fine-tuned,
# with its model_max_length set to 512.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_CHECKPOINT,
    model_max_length=512,
)

# Create a function to tokenize the text
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module tokenizer.

    Inputs are truncated to the tokenizer's maximum length but deliberately
    not padded here. Padding every example to the maximum length up front
    makes each batch as long as the longest allowed sequence, however short
    the texts are. Padding is instead left to batch collation: this script
    passes the tokenizer to ``Trainer``, which then pads each batch to its
    own longest sequence (NOTE: relies on Trainer's default padding collator;
    supply a ``DataCollatorWithPadding`` explicitly if that ever changes).

    Args:
        examples: Mapping with a ``"text"`` entry (a batch of strings when
            used with ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the given texts.
    """
    return tokenizer(examples["text"], truncation=True)

# Apply the tokenization to both splits; batched=True passes many rows per call
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

print("Tokenization complete.")

# --- 4. Model Training ---
print("\nStep 4: Setting up and training the model...")

# Load the pre-trained checkpoint as a sequence-classification model
# configured for our number of labels and label names.
# No `max_length` is passed here: input length is bounded on the tokenizer
# side (model_max_length=512 plus truncation). On the model side the kwarg
# would only be stored on the config, where `max_length` is a text-generation
# setting that a classifier does not use.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_labels,
    id2label=id2label,  # Pass the mappings to the model
    label2id=label2id,
)

# Define a function to compute metrics during evaluation
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from evaluation predictions.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the argmax of its logits over the last axis.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1_weighted"``.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)

    accuracy = accuracy_score(labels, predicted_ids)
    weighted_f1 = f1_score(labels, predicted_ids, average="weighted")
    return {"accuracy": accuracy, "f1_weighted": weighted_f1}

# Hyper-parameters and bookkeeping options handed to the Trainer
training_args = TrainingArguments(
    # Where checkpoints and logs go
    output_dir=OUTPUT_DIR,
    logging_dir='./logs',
    logging_steps=10,
    # Optimisation settings
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    warmup_steps=50,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch, then restore the best
    # checkpoint when training ends
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire model, data, tokenizer and metrics together
train_split = tokenized_datasets["train"]
eval_split = tokenized_datasets["test"]
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

# Run the fine-tuning loop
print("Starting training...")
trainer.train()
print("Training finished.")

# Persist the trained model under OUTPUT_DIR
trainer.save_model(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")


# --- 5. Example Prediction ---
print("\nStep 5: Running an example prediction...")

# Text to classify
new_text = "Λειψυδρία: Σε ανησυχητικό επίπεδο η στάθμη του νερού σε Πηνειό και Μόρνο – Καμπανάκι ΕΥΔΑΠ για τα αποθέματα : Έχουμε λιγότερο από τα μισά του 2019"

# Tokenize the new text. Truncation keeps the input within the tokenizer's
# maximum length, so longer texts substituted here do not overflow the model.
inputs = tokenizer(new_text, return_tensors="pt", truncation=True)

# Move the model and the inputs to the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Switch to inference mode explicitly so that dropout is disabled no matter
# which mode the training run left the model in.
model.eval()
inputs = {k: v.to(device) for k, v in inputs.items()}

# Forward pass without gradient tracking
with torch.no_grad():
    logits = model(**inputs).logits

# The predicted class is the highest-scoring logit along the class axis
# (the batch holds exactly one example, so .item() yields a single ID).
predicted_class_id = logits.argmax(dim=-1).item()
predicted_label = id2label[predicted_class_id]

print(f"\nText: '{new_text}'")
print(f"Predicted Label: {predicted_label}")