|
import torch
|
|
import csv
|
|
import json
|
|
import re
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
from datasets import load_dataset
|
|
from tqdm import tqdm
|
|
from bert_score import score as bert_score
|
|
import evaluate
|
|
from warnings import filterwarnings
|
|
filterwarnings("ignore")
|
|
|
|
|
|
# --- Model, data and metric setup -------------------------------------------
# The original span was interleaved with stray "|" separator lines, which are
# not valid Python; the statements themselves are unchanged.

# Hugging Face Hub identifier of the checkpoint under evaluation.
model_id = "meta-llama/Llama-3.2-1B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Weights are requested in float16 and placement is delegated to the library
# through device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

# GSM8K "main" configuration, train split.
dataset = load_dataset("gsm8k", "main", split="train")

# Metric objects from the `evaluate` library.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

# --- Per-example results file and accumulators -------------------------------
# Path of the CSV that receives one row per evaluated example.
csv_file = "gsm8k_llama3_results.csv"

# Start a fresh file (mode "w" truncates) containing only the header row.
# newline='' is what the csv module expects so it can control line endings.
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["question", "true_answer", "predicted_answer", "full_response", "correct"])

# Parallel lists, one entry per evaluated example.
true_answers = []
predicted_answers = []
correct_flags = []

# --- Evaluation loop ---------------------------------------------------------
# Matches integers and decimals with an optional sign and optional thousands
# separators (e.g. "-3", "1,234.5", ".5"). The look-behind stops the "-" in
# "20-8" from being read as a sign and prevents matches starting mid-number.
# Compiled once because it is applied to every example.
_NUMBER_PATTERN = re.compile(r"(?<![\d.])[-+]?(?:\d[\d,]*(?:\.\d+)?|\.\d+)")


def _normalize_number(text):
    """Return a canonical string for a numeric answer.

    Thousands separators, a dollar sign and surrounding whitespace are
    removed, and integral values are rendered without a decimal part, so
    "1,000", "1000" and "1000.0" all become "1000". Text that does not parse
    as a number is returned cleaned but otherwise unchanged.
    """
    cleaned = text.replace(",", "").replace("$", "").strip()
    try:
        value = float(cleaned)
    except ValueError:
        return cleaned
    return str(int(value)) if value.is_integer() else repr(value)


# Open the results file once in append mode (the header row is expected to be
# there already) instead of reopening it for every example.
with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    for example in tqdm(dataset, desc="Evaluating"):
        question = example["question"]
        # The gold final answer is whatever follows the last "####" marker.
        true_answer = _normalize_number(example["answer"].split("####")[-1])

        prompt = f"Q: {question}\nA:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # NOTE(review): `temperature` only has an effect when sampling is
        # enabled; confirm against the model's generation config.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                temperature=0.7,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Full decoded sequence, kept for the "full_response" CSV column.
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract the answer from the newly generated tokens only; otherwise
        # numbers that appear in the question could be picked up as the
        # prediction when the model produces no number of its own.
        prompt_length = inputs["input_ids"].shape[-1]
        completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        # The last number in the completion is taken as the model's answer.
        pred_numbers = _NUMBER_PATTERN.findall(completion)
        predicted_answer = _normalize_number(pred_numbers[-1]) if pred_numbers else "N/A"

        is_correct = predicted_answer == true_answer
        correct_flags.append(is_correct)
        true_answers.append(true_answer)
        predicted_answers.append(predicted_answer)

        writer.writerow([question, true_answer, predicted_answer, response, is_correct])
        # Flush per row so partial results survive an interrupted run.
        file.flush()

# --- Aggregate metrics -------------------------------------------------------
if correct_flags:
    # Exact-match accuracy over all evaluated examples.
    accuracy = sum(correct_flags) / len(correct_flags)

    # The `evaluate` F1 metric works on integer class labels, while the
    # answers here are strings (including the "N/A" placeholder). Give every
    # distinct answer string its own integer id so that macro-F1 treats each
    # distinct answer as a class.
    label_ids = {}
    reference_ids = [label_ids.setdefault(answer, len(label_ids)) for answer in true_answers]
    prediction_ids = [label_ids.setdefault(answer, len(label_ids)) for answer in predicted_answers]
    f1 = float(
        f1_metric.compute(
            predictions=prediction_ids,
            references=reference_ids,
            average="macro",
        )["f1"]
    )

    # BERTScore between predicted and gold answer strings; only F1 is reported.
    _precision, _recall, bert_f1_scores = bert_score(predicted_answers, true_answers, lang="en")
    bert_f1 = bert_f1_scores.mean().item()
else:
    # Nothing was evaluated: report zeros rather than dividing by zero.
    accuracy = f1 = bert_f1 = 0.0

metrics = {
    "accuracy": accuracy,
    "f1_score": f1,
    "bert_score_f1": bert_f1,
}

# Persist the summary next to the per-example CSV and echo it to stdout.
with open("gsm8k_llama3_metrics.json", "w", encoding="utf-8") as f:
    json.dump(metrics, f, indent=2)

print(json.dumps(metrics, indent=2))