|
import torch
|
|
import csv
|
|
import json
|
|
import re
|
|
from transformers import AutoTokenizer, AutoModelForCausalLM
|
|
from datasets import load_dataset
|
|
from tqdm import tqdm
|
|
from warnings import filterwarnings
|
|
|
|
# Suppress every Python warning for the rest of the run so the progress bar
# and the preview prints stay readable.
filterwarnings(action="ignore")

# Hugging Face Hub identifier of the checkpoint under evaluation.
model_id = "meta-llama/Llama-3.2-1B-Instruct"

# Tokenizer and causal-LM weights are both resolved from the same identifier.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # load the weights in half precision
    device_map="auto",          # let the library choose the device placement
)

# Evaluation subset: the first 100 rows of the "main" config's train split.
dataset = load_dataset("gsm8k", "main", split="train[:100]")
|
|
|
|
|
|
# Destination of the per-example results table.
csv_file = "gsm8k_llama3_results_1.csv"

# One unsigned numeric literal: digits grouped with thousands separators
# ("1,250" / "1,250.75"), a decimal ("3.5" / ".5"), or a plain integer.
_NUMBER = r"(?:\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d*\.\d+|\d+)"

# Preferred form: the number that follows an explicit "Answer:" tag.  Markdown
# emphasis and a currency sign between the tag and the number are skipped.
_ANSWER_RE = re.compile(r"Answer:[\s*$]*([-+]?" + _NUMBER + r")")

# Fallback form: any number in the text.  The lookbehind keeps the "-" in
# "10-5" from being read as the sign of the second operand.
_ANY_NUMBER_RE = re.compile(r"(?<!\d)[-+]?" + _NUMBER)


def _extract_answer(text):
    """Return the numeric answer found in *text* as a string.

    The number following the first "Answer:" tag wins; when there is no such
    tag, the last number appearing anywhere in *text* is used.  Thousands
    separators are removed from the result.  Returns "N/A" when *text*
    contains no number at all.
    """
    tagged = _ANSWER_RE.search(text)
    if tagged:
        return tagged.group(1).replace(",", "")
    candidates = _ANY_NUMBER_RE.findall(text)
    return candidates[-1].replace(",", "") if candidates else "N/A"


# The context manager guarantees that rows already written are flushed and the
# handle is closed even if generation fails part-way through the run.
with open(csv_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["question", "true_answer", "predicted_answer", "full_response"])

    for idx, example in enumerate(tqdm(dataset, desc="Evaluating")):
        question = example["question"]
        # The reference answer is whatever follows the last "####" marker; any
        # thousands separators are dropped so it shares the prediction's form.
        true_answer = example["answer"].split("####")[-1].strip().replace(",", "")

        # NOTE(review): the prompt is passed as raw text rather than through
        # tokenizer.apply_chat_template -- confirm this is intended.
        prompt = (
            f"Question: {question}\n\n"
            "Please solve this step-by-step and finally answer in this format:\n"
            "Answer: <final numeric answer>\n"
        )

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        prompt_len = inputs["input_ids"].shape[1]

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=300,
                # temperature only matters when sampling is enabled; request it
                # explicitly instead of relying on the checkpoint's defaults.
                do_sample=True,
                temperature=0.7,
                pad_token_id=tokenizer.eos_token_id,
            )

        # Prompt + completion, stored verbatim in the "full_response" column.
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # The answer is parsed from the newly generated tokens only, so numbers
        # that appear in the question can never be mistaken for the prediction.
        completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
        predicted_answer = _extract_answer(completion)

        # Echo the first few examples to the console as a sanity check.
        if idx < 5:
            print("=" * 50)
            print(f"Question: {question}")
            print(f"Response: {response}")
            print(f"True Answer: {true_answer}")
            print(f"Predicted Answer: {predicted_answer}")

        writer.writerow([question, true_answer, predicted_answer, response])

print("Evaluation complete. Results saved to:", csv_file)
|
|
|