File size: 2,434 Bytes
6cb5132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import torch
import csv
import json
import re
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
from warnings import filterwarnings

filterwarnings("ignore")

# Model setup
model_id = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

# Load subset of GSM8K dataset for debugging
dataset = load_dataset("gsm8k", "main", split="train[:100]")

# Output CSV
csv_file = "gsm8k_llama3_results_1.csv"
file = open(csv_file, mode='w', newline='', encoding='utf-8')
writer = csv.writer(file)
writer.writerow(["question", "true_answer", "predicted_answer", "full_response"])

# Inference loop
for idx, example in enumerate(tqdm(dataset, desc="Evaluating")):
    question = example["question"]
    true_answer = example["answer"].split("####")[-1].strip()

    # Better prompting with fixed answer format
    prompt = (
        f"Question: {question}\n\n"
        "Please solve this step-by-step and finally answer in this format:\n"
        "Answer: <final numeric answer>\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=300,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Better extraction: look for "Answer: <number>" pattern first
    match = re.search(r"Answer:\s*([-+]?\d*\.?\d+)", response)
    if match:
        predicted_answer = match.group(1)
    else:
        # Fallback: extract all numbers, take most frequent or last
        pred_numbers = re.findall(r"[-+]?\d*\.\d+|\d+", response)
        predicted_answer = pred_numbers[-1] if pred_numbers else "N/A"

    # Print few examples for debugging
    if idx < 5:  
        print("="*50)
        print(f"Question: {question}")
        print(f"Response: {response}")
        print(f"True Answer: {true_answer}")
        print(f"Predicted Answer: {predicted_answer}")

    # Write to CSV
    writer.writerow([question, true_answer, predicted_answer, response])

# Close CSV file
file.close()

print("Evaluation complete. Results saved to:", csv_file)