"""Score GSM8K predictions from a results CSV.

Reads (true_answer, predicted_answer) pairs, computes exact-match
accuracy plus macro-averaged precision/recall/F1 over the answer
strings, then writes the metrics to JSON and prints them.
"""

import csv
import json

from sklearn.metrics import f1_score, precision_score, recall_score
from tqdm import tqdm

# File paths
CSV_FILE = "gsm8k_llama3_results.csv"
METRICS_FILE = "gsm8k_llama3_metrics.json"


def _load_answers(csv_path):
    """Read parallel lists of stripped (true, predicted) answer strings.

    Expects the CSV to have ``true_answer`` and ``predicted_answer``
    columns; raises KeyError if either is missing.
    """
    # First pass only counts data rows so tqdm can show a real total.
    with open(csv_path, mode="r", encoding="utf-8", newline="") as f:
        total_rows = sum(1 for _ in f) - 1  # subtract the header line

    true_answers = []
    predicted_answers = []
    # newline="" is required by the csv module so quoted fields that
    # contain embedded newlines are parsed correctly.
    with open(csv_path, mode="r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        for row in tqdm(reader, total=total_rows, desc="Evaluating"):
            true_answers.append(row["true_answer"].strip())
            predicted_answers.append(row["predicted_answer"].strip())
    return true_answers, predicted_answers


def _compute_metrics(true_answers, predicted_answers):
    """Return the metrics dict for the given label lists.

    Handles the empty case explicitly: the original code guarded only
    accuracy, but sklearn's scorers raise on empty inputs, so all
    metrics are reported as 0.0 when there are no examples.
    """
    total_count = len(true_answers)
    if total_count == 0:
        return {
            "accuracy": 0.0,
            "precision": 0.0,
            "recall": 0.0,
            "f1_score": 0.0,
            "total_examples": 0,
        }

    correct_count = sum(t == p for t, p in zip(true_answers, predicted_answers))
    accuracy = correct_count / total_count

    # Each distinct answer string is treated as a class label.
    # Macro-averaging weights every label equally; zero_division=0
    # suppresses warnings for labels never predicted.
    precision = precision_score(
        true_answers, predicted_answers, average="macro", zero_division=0
    )
    recall = recall_score(
        true_answers, predicted_answers, average="macro", zero_division=0
    )
    f1 = f1_score(
        true_answers, predicted_answers, average="macro", zero_division=0
    )

    return {
        "accuracy": round(accuracy, 4),
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "f1_score": round(f1, 4),
        "total_examples": total_count,
    }


def main():
    """Load predictions, compute metrics, write JSON, and print them."""
    true_answers, predicted_answers = _load_answers(CSV_FILE)
    metrics = _compute_metrics(true_answers, predicted_answers)

    with open(METRICS_FILE, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)

    print(json.dumps(metrics, indent=2))


if __name__ == "__main__":
    main()