# Source: my-model-repoLama1 / gsm8k_eval.py
# Uploaded by ksimdeep ("Upload folder using huggingface_hub"),
# commit 47fcd4a (verified).
import torch
import csv
import json
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
from bert_score import score as bert_score
import evaluate
from warnings import filterwarnings
filterwarnings("ignore")
# Model setup: tokenizer plus a half-precision causal LM. Device placement is
# delegated to the loader through device_map="auto".
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
)

# GSM8K, "main" configuration. Only the *train* split is loaded here.
dataset = load_dataset(path="gsm8k", name="main", split="train")

# Metric objects from the `evaluate` library.
f1_metric = evaluate.load("f1")
accuracy_metric = evaluate.load("accuracy")
# Results file: truncate any previous run's output and write the column
# header once. Per-example rows are appended to this same file later on.
csv_file = "gsm8k_llama3_results.csv"
with open(csv_file, "w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(
        ["question", "true_answer", "predicted_answer", "full_response", "correct"]
    )
# Per-example results, consumed by the metric computation after the loop.
true_answers = []
predicted_answers = []
correct_flags = []

# Signed integers/decimals with optional thousands separators ("1,000.50").
# The look-behind keeps a hyphen that directly follows a digit ("10-3") from
# being read as a minus sign. Compiled once instead of on every iteration.
NUMBER_PATTERN = re.compile(
    r"(?<!\d)[-+]?(?:\d{1,3}(?:,\d{3})+(?!\d)|\d+)(?:\.\d+)?"
    r"|(?<!\d)[-+]?\.\d+"
)


def _normalize_number(text):
    """Return a canonical string for a numeric token.

    Thousands separators and surrounding whitespace are removed, and values
    that are mathematically integers are rendered without a fractional part,
    so "1,000", "1000" and "1000.0" all become "1000". Text that is not a
    number is returned stripped but otherwise unchanged.
    """
    cleaned = text.replace(",", "").strip()
    try:
        return str(int(cleaned))
    except ValueError:
        pass
    try:
        value = float(cleaned)
    except ValueError:
        return cleaned
    return str(int(value)) if value.is_integer() else repr(value)


def _extract_answer(completion):
    """Return the last number in *completion* (normalized), or "N/A"."""
    matches = NUMBER_PATTERN.findall(completion)
    return _normalize_number(matches[-1]) if matches else "N/A"


# Sampling is used below; seed it so repeated runs produce the same scores.
torch.manual_seed(0)

# Inference loop. The results file is opened once in append mode (the header
# row is already in it) and flushed after every row, so partial results
# survive an interrupted run without re-opening the file per example.
with open(csv_file, mode="a", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    for example in tqdm(dataset, desc="Evaluating"):
        question = example["question"]
        # GSM8K reference solutions end with "#### <final answer>".
        true_answer = _normalize_number(example["answer"].split("####")[-1])

        prompt = f"Q: {question}\nA:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                # temperature only has an effect when sampling is enabled;
                # make that explicit instead of relying on the checkpoint's
                # default generation config.
                do_sample=True,
                temperature=0.7,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Full text (prompt + completion) is what gets logged to the CSV.
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract the prediction from the newly generated tokens only;
        # otherwise a number-free completion would "predict" the last number
        # that appears in the question itself.
        prompt_length = inputs["input_ids"].shape[1]
        completion = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        )
        predicted_answer = _extract_answer(completion)

        # Both sides are normalized, so "18.0" vs "18" counts as a match.
        is_correct = predicted_answer == true_answer
        correct_flags.append(is_correct)
        true_answers.append(true_answer)
        predicted_answers.append(predicted_answer)

        writer.writerow(
            [question, true_answer, predicted_answer, response, is_correct]
        )
        file.flush()
# Metrics
if not correct_flags:
    # Avoid an opaque ZeroDivisionError when the loop produced no results.
    raise SystemExit("No examples were evaluated; nothing to score.")

# Exact-match accuracy over all evaluated examples.
accuracy = sum(correct_flags) / len(correct_flags)

# The `evaluate` F1 metric takes integer class labels, so answer strings such
# as "N/A" or "3.5" cannot be passed directly. Map every distinct answer
# string to an integer id first; macro-F1 depends only on label identity, so
# the score is the same one the string labels would have produced.
label_ids = {
    label: index
    for index, label in enumerate(dict.fromkeys(true_answers + predicted_answers))
}
f1 = f1_metric.compute(
    predictions=[label_ids[answer] for answer in predicted_answers],
    references=[label_ids[answer] for answer in true_answers],
    average="macro",
)["f1"]

# NOTE(review): BERTScore compares text embeddings; on bare numeric answers
# it is presumably not a meaningful correctness signal. Kept so the metrics
# file retains its existing keys.
P, R, F1 = bert_score(predicted_answers, true_answers, lang="en")
bert_f1 = F1.mean().item()

# Save metrics to JSON
metrics = {
    "accuracy": accuracy,
    "f1_score": f1,
    "bert_score_f1": bert_f1,
}
with open("gsm8k_llama3_metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)

# Print summary
print(json.dumps(metrics, indent=2))