import csv
import json

from tqdm import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score

csv_file = "gsm8k_llama3_results.csv"
metrics_file = "gsm8k_llama3_metrics.json"

true_answers = []
predicted_answers = []
correct_count = 0
total_count = 0

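# First pass: count the data rows (total lines minus the header) so tqdm can show a progress total.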
with open(csv_file, mode='r', encoding='utf-8') as f:
    total_lines = sum(1 for _ in f) - 1

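# Second pass: stream the rows and compare each predicted answer to the gold answer.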
with open(csv_file, mode='r', encoding='utf-8') as f:
    reader = csv.DictReader(f)
    for row in tqdm(reader, total=total_lines, desc="Evaluating"):
        true = row["true_answer"].strip()
        pred = row["predicted_answer"].strip()

        true_answers.append(true)
        predicted_answers.append(pred)

        if true == pred:
            correct_count += 1
        total_count += 1

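# Exact-match accuracy over every evaluated example.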
accuracy = correct_count / total_count if total_count > 0 else 0.0

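# Macro-averaged precision/recall/F1 treat each distinct answer string as a class
# label; zero_division=0 suppresses warnings for labels that never get predicted.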
precision = precision_score(true_answers, predicted_answers, average="macro", zero_division=0)
recall = recall_score(true_answers, predicted_answers, average="macro", zero_division=0)
f1 = f1_score(true_answers, predicted_answers, average="macro", zero_division=0)

metrics = {
    "accuracy": round(accuracy, 4),
    "precision": round(precision, 4),
    "recall": round(recall, 4),
    "f1_score": round(f1, 4),
    "total_examples": total_count,
}

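# Write the metrics to a JSON file and echo them to stdout.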
with open(metrics_file, "w") as f:
    json.dump(metrics, f, indent=2)

print(json.dumps(metrics, indent=2))