import csv
import json

from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score

csv_file = "gsm8k_llama3_results.csv"
metrics_file = "gsm8k_llama3_metrics.json"

true_answers = []
predicted_answers = []
correct_count = 0
total_count = 0

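# First pass: count the data rows (total lines minus the header) so tqdm can show a progress total.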
with open(csv_file, mode='r', encoding='utf-8') as f:
    total_lines = sum(1 for _ in f) - 1

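# Second pass: stream the rows and compare each predicted answer to the gold answer.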
with open(csv_file, mode='r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in tqdm(reader, total=total_lines, desc="Evaluating"):
        true = row["true_answer"].strip()
        pred = row["predicted_answer"].strip()

        true_answers.append(true)
        predicted_answers.append(pred)

        if true == pred:
            correct_count += 1
        total_count += 1

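# Exact-match accuracy over every evaluated example.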
accuracy = correct_count / total_count if total_count > 0 else 0.0

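# Macro-averaged precision/recall/F1 treat each distinct answer string as a class
# label; zero_division=0 suppresses warnings for labels that never get predicted.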
precision = precision_score(true_answers, predicted_answers, average="macro", zero_division=0)
recall = recall_score(true_answers, predicted_answers, average="macro", zero_division=0)
f1 = f1_score(true_answers, predicted_answers, average="macro", zero_division=0)

metrics = {
    "accuracy": round(accuracy, 4),
    "precision": round(precision, 4),
    "recall": round(recall, 4),
    "f1_score": round(f1, 4),
    "total_examples": total_count,
}

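# Write the metrics to a JSON file and echo them to stdout.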
with open(metrics_file, "w") as f:
    json.dump(metrics, f, indent=2)

print(json.dumps(metrics, indent=2))