"""Evaluate a Llama 3.2 instruct model on GSM8K and record per-example results.

For every question the script generates a completion, takes the last number in
that completion as the predicted answer, compares it with the reference answer
and appends a row to a CSV file.  Accuracy, macro F1 and BERTScore F1 are then
written to a JSON file and printed.
"""

import csv
import json
import re
from decimal import Decimal, InvalidOperation
from typing import List, Sequence, Tuple
from warnings import filterwarnings

MODEL_ID = "meta-llama/Llama-3.2-1B-Instruct"
CSV_FILE = "gsm8k_llama3_results.csv"
METRICS_FILE = "gsm8k_llama3_metrics.json"
MAX_NEW_TOKENS = 100
TEMPERATURE = 0.7
NO_ANSWER = "N/A"  # placeholder prediction when the completion has no number

_NUMBER_RE = re.compile(
    r"""
    (?:(?<![\w.)])[-+])?              # sign, unless it directly follows an operand
    (?:
        (?:\d{1,3}(?:,\d{3})+|\d+)    # integer part, optionally comma-grouped
        (?:\.\d+)?                    # optional fraction
      |
        \.\d+                         # bare fraction
    )
    """,
    re.VERBOSE,
)


def normalize_number(text: str) -> str:
    """Return a canonical string for a numeric answer.

    Surrounding whitespace, thousands separators and dollar signs are removed,
    integral values lose any fractional zeros ("18.00" -> "18") and other
    values lose trailing zeros ("3.50" -> "3.5").  Text that is not a finite
    number is returned stripped but otherwise unchanged, so placeholders such
    as ``NO_ANSWER`` pass through.
    """
    stripped = text.strip()
    cleaned = stripped.replace(",", "").replace("$", "")
    try:
        value = Decimal(cleaned)
    except InvalidOperation:
        return stripped
    if not value.is_finite():
        return stripped
    if value == value.to_integral_value():
        return str(int(value))
    return format(value.normalize(), "f")


def extract_final_number(text: str) -> str:
    """Return the last number found in *text*, normalised, or ``NO_ANSWER``."""
    matches = _NUMBER_RE.findall(text)
    if not matches:
        return NO_ANSWER
    return normalize_number(matches[-1])


def encode_labels(
    predictions: Sequence[str], references: Sequence[str]
) -> Tuple[List[int], List[int]]:
    """Map answer strings to integer class ids shared by both sequences.

    The F1 metric works on integer labels, so every distinct answer string
    (including ``NO_ANSWER``) gets its own id.  References are numbered first,
    in order of first appearance, then any prediction-only strings.

    Returns:
        ``(prediction_ids, reference_ids)``, aligned with the inputs.
    """
    ids = {}

    def encode(values: Sequence[str]) -> List[int]:
        return [ids.setdefault(value, len(ids)) for value in values]

    reference_ids = encode(references)
    prediction_ids = encode(predictions)
    return prediction_ids, reference_ids


def main() -> None:
    """Run the evaluation and write the CSV results and JSON metrics."""
    # The ML stack is imported here rather than at module level so that the
    # pure helpers above can be imported and tested without these packages.
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM
    from datasets import load_dataset
    from tqdm import tqdm
    from bert_score import score as bert_score
    import evaluate

    filterwarnings("ignore")

    # Model setup
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID, torch_dtype=torch.float16, device_map="auto"
    )

    # NOTE(review): this evaluates on the "train" split; confirm that is
    # intended rather than the "test" split.
    dataset = load_dataset("gsm8k", "main", split="train")

    f1_metric = evaluate.load("f1")

    true_answers = []
    predicted_answers = []
    correct_flags = []

    # The CSV is opened once and flushed after every row, so partial results
    # survive an interrupted run without reopening the file per example.
    with open(CSV_FILE, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(
            ["question", "true_answer", "predicted_answer", "full_response", "correct"]
        )

        for example in tqdm(dataset, desc="Evaluating"):
            question = example["question"]
            # The reference answer is the text after the last "####" marker.
            true_answer = normalize_number(example["answer"].split("####")[-1])

            prompt = f"Q: {question}\nA:"
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=MAX_NEW_TOKENS,
                    # temperature only takes effect when sampling is enabled.
                    do_sample=True,
                    temperature=TEMPERATURE,
                    pad_token_id=tokenizer.eos_token_id,
                )

            response = tokenizer.decode(outputs[0], skip_special_tokens=True)

            # The output sequence starts with the prompt tokens.  Only the
            # newly generated part is searched, so a number from the question
            # can never be mistaken for the model's answer.
            prompt_length = inputs["input_ids"].shape[1]
            completion = tokenizer.decode(
                outputs[0][prompt_length:], skip_special_tokens=True
            )
            predicted_answer = extract_final_number(completion)

            is_correct = predicted_answer == true_answer
            correct_flags.append(is_correct)
            true_answers.append(true_answer)
            predicted_answers.append(predicted_answer)

            writer.writerow(
                [question, true_answer, predicted_answer, response, is_correct]
            )
            file.flush()

    if not correct_flags:
        raise SystemExit("No examples were evaluated; nothing to score.")

    # Metrics
    accuracy = sum(correct_flags) / len(correct_flags)
    prediction_ids, reference_ids = encode_labels(predicted_answers, true_answers)
    f1 = f1_metric.compute(
        predictions=prediction_ids, references=reference_ids, average="macro"
    )["f1"]
    _, _, bert_f1_scores = bert_score(predicted_answers, true_answers, lang="en")
    bert_f1 = bert_f1_scores.mean().item()

    # Save metrics to JSON
    metrics = {
        "accuracy": accuracy,
        "f1_score": float(f1),
        "bert_score_f1": bert_f1,
    }
    with open(METRICS_FILE, "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)

    # Print summary
    print(json.dumps(metrics, indent=2))


if __name__ == "__main__":
    main()