# metrics.py — evaluation metrics for GSM8K Llama-3 results
# Provenance: my-model-repoLama1 (ksimdeep), uploaded via huggingface_hub,
# revision 47fcd4a (verified).
import csv
import json
from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score
# File paths
csv_file = "gsm8k_llama3_results.csv"
metrics_file = "gsm8k_llama3_metrics.json"
# Initialize
true_answers = []
predicted_answers = []
correct_count = 0
total_count = 0
# Count total lines for tqdm
with open(csv_file, mode='r', encoding='utf-8') as f:
total_lines = sum(1 for _ in f) - 1 # Subtract header
# Read CSV and collect predictions
with open(csv_file, mode='r', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in tqdm(reader, total=total_lines, desc="Evaluating"):
true = row["true_answer"].strip()
pred = row["predicted_answer"].strip()
true_answers.append(true)
predicted_answers.append(pred)
if true == pred:
correct_count += 1
total_count += 1
# Accuracy
accuracy = correct_count / total_count if total_count > 0 else 0.0
# Precision, Recall, F1 — treating answers as string labels
precision = precision_score(true_answers, predicted_answers, average="macro", zero_division=0)
recall = recall_score(true_answers, predicted_answers, average="macro", zero_division=0)
f1 = f1_score(true_answers, predicted_answers, average="macro", zero_division=0)
# Save to JSON
metrics = {
"accuracy": round(accuracy, 4),
"precision": round(precision, 4),
"recall": round(recall, 4),
"f1_score": round(f1, 4),
"total_examples": total_count
}
with open(metrics_file, "w") as f:
json.dump(metrics, f, indent=2)
print(json.dumps(metrics, indent=2))