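"""Evaluate a GSM8K results CSV produced by a Llama-3 run.

Expects columns `true_answer` and `predicted_answer`, computes exact-match
accuracy plus macro-averaged precision/recall/F1 over the answer strings,
and writes the metrics to a JSON file.
"""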
import csv
import json
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
# File paths
csv_file = "gsm8k_llama3_results.csv"
metrics_file = "gsm8k_llama3_metrics.json"
# Initialize
true_answers = []
predicted_answers = []
correct_count = 0
total_count = 0
# Count total lines for tqdm
with open(csv_file, mode='r', encoding='utf-8') as f:
    total_lines = sum(1 for _ in f) - 1  # Subtract header
# Read CSV and collect predictions
with open(csv_file, mode='r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in tqdm(reader, total=total_lines, desc="Evaluating"):
        true = row["true_answer"].strip()
        pred = row["predicted_answer"].strip()
        true_answers.append(true)
        predicted_answers.append(pred)
        if true == pred:
            correct_count += 1
        total_count += 1
# Accuracy
accuracy = correct_count / total_count if total_count > 0 else 0.0
# Precision, Recall, F1 — treating answers as string labels
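# Macro averaging treats each distinct answer string as its own class and
# weights all classes equally; zero_division=0 silences the undefined-metric
# warning for answer strings that appear only among predictions or only
# among true labels.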
precision = precision_score(true_answers, predicted_answers, average="macro", zero_division=0)
recall = recall_score(true_answers, predicted_answers, average="macro", zero_division=0)
f1 = f1_score(true_answers, predicted_answers, average="macro", zero_division=0)
# Save to JSON
metrics = {
    "accuracy": round(accuracy, 4),
    "precision": round(precision, 4),
    "recall": round(recall, 4),
    "f1_score": round(f1, 4),
    "total_examples": total_count,
}
with open(metrics_file, "w") as f:
    json.dump(metrics, f, indent=2)
print(json.dumps(metrics, indent=2))
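
# Note: exact string match is strict for GSM8K; numeric answers like "1,000"
# vs "1000" count as wrong here. Normalizing both sides (e.g. stripping
# commas and whitespace) before comparison is a common refinement.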