"""Score GSM8K predictions from a results CSV.

Reads (true_answer, predicted_answer) pairs, computes exact-match
accuracy plus macro-averaged precision/recall/F1 over the answer
strings, then writes the metrics to JSON and prints them.
"""

import csv
import json

from sklearn.metrics import f1_score, precision_score, recall_score
from tqdm import tqdm

# File paths
CSV_FILE = "gsm8k_llama3_results.csv"
METRICS_FILE = "gsm8k_llama3_metrics.json"


def _load_answers(csv_path):
    """Read parallel lists of stripped (true, predicted) answer strings.

    Expects the CSV to have ``true_answer`` and ``predicted_answer``
    columns; raises KeyError if either is missing.
    """
    # First pass only counts data rows so tqdm can show a real total.
    with open(csv_path, mode="r", encoding="utf-8", newline="") as f:
        total_rows = sum(1 for _ in f) - 1  # subtract the header line

    true_answers = []
    predicted_answers = []
    # newline="" is required by the csv module so quoted fields that
    # contain embedded newlines are parsed correctly.
    with open(csv_path, mode="r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        for row in tqdm(reader, total=total_rows, desc="Evaluating"):
            true_answers.append(row["true_answer"].strip())
            predicted_answers.append(row["predicted_answer"].strip())
    return true_answers, predicted_answers


def _compute_metrics(true_answers, predicted_answers):
    """Return the metrics dict for the given label lists.

    Handles the empty case explicitly: the original code guarded only
    accuracy, but sklearn's scorers raise on empty inputs, so all
    metrics are reported as 0.0 when there are no examples.
    """
    total_count = len(true_answers)
    if total_count == 0:
        return {
            "accuracy": 0.0,
            "precision": 0.0,
            "recall": 0.0,
            "f1_score": 0.0,
            "total_examples": 0,
        }

    correct_count = sum(t == p for t, p in zip(true_answers, predicted_answers))
    accuracy = correct_count / total_count

    # Each distinct answer string is treated as a class label.
    # Macro-averaging weights every label equally; zero_division=0
    # suppresses warnings for labels never predicted.
    precision = precision_score(
        true_answers, predicted_answers, average="macro", zero_division=0
    )
    recall = recall_score(
        true_answers, predicted_answers, average="macro", zero_division=0
    )
    f1 = f1_score(
        true_answers, predicted_answers, average="macro", zero_division=0
    )

    return {
        "accuracy": round(accuracy, 4),
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "f1_score": round(f1, 4),
        "total_examples": total_count,
    }


def main():
    """Load predictions, compute metrics, write JSON, and print them."""
    true_answers, predicted_answers = _load_answers(CSV_FILE)
    metrics = _compute_metrics(true_answers, predicted_answers)

    with open(METRICS_FILE, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)

    print(json.dumps(metrics, indent=2))


if __name__ == "__main__":
    main()