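"""Evaluate meta-llama/Llama-3.2-1B on the RiddleSense validation split.

Each riddle is prompted with its lettered choices, the first A-E letter in
the generation is taken as the prediction, and results are summarised with a
per-class classification report plus BERTScore over predicted vs. gold
choice texts.
"""
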
import re

import pandas as pd
import torch
from bert_score import score as bert_score
from datasets import load_dataset
from sklearn.metrics import classification_report
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the model and tokenizer, then move the model to GPU when available.
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Evaluate on the validation split, which has public answer keys.
dataset = load_dataset("INK-USC/riddle_sense", trust_remote_code=True)["validation"]

def format_prompt(example):
    """Render a RiddleSense example as a lettered multiple-choice prompt."""
    prompt = f"Question: {example['question']}\nChoices:\n"
    for label, text in zip(example["choices"]["label"], example["choices"]["text"]):
        prompt += f"{label}. {text}\n"
    prompt += "Answer:"
    return prompt

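# The resulting prompt looks like (illustrative values, not a real dataset row):
#   Question: <riddle text>
#   Choices:
#   A. <choice text>
#   ...
#   Answer:
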
# Map answer letters to integer class ids for sklearn, plus the inverse.
letter_to_label = {l: i for i, l in enumerate(["A", "B", "C", "D", "E"])}
label_to_letter = {i: l for l, i in letter_to_label.items()}

true_labels, pred_labels = [], []
true_texts, pred_texts = [], []

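# Greedy-decode a short continuation for each riddle and parse out the answer.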
for i, example in enumerate(tqdm(dataset, desc="Evaluating")):
    prompt = format_prompt(example)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    # pad_token_id is set explicitly because Llama tokenizers define no pad token.
    outputs = model.generate(
        **inputs, max_new_tokens=10, pad_token_id=tokenizer.eos_token_id
    )
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Decode only the newly generated tokens: slicing the decoded string with
    # len(prompt) is fragile, since detokenization need not round-trip exactly.
    gen_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    answer_section = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()

    # Take the first standalone A-E letter as the prediction; "Z" marks a miss.
    match = re.search(r"\b([A-E])\b", answer_section)
    pred_letter = match.group(1) if match else "Z"

    # Unparseable predictions fall back to the sentinel class -1.
    pred_label = letter_to_label.get(pred_letter, -1)

    true_letter = example["answerKey"]
    true_label = letter_to_label.get(true_letter, -1)

    true_labels.append(true_label)
    pred_labels.append(pred_label)

    # Recover the surface text of the predicted and gold choices for BERTScore.
    pred_text = ""
    if pred_letter in example["choices"]["label"]:
        pred_index = example["choices"]["label"].index(pred_letter)
        pred_text = example["choices"]["text"][pred_index]

    true_text = ""
    if true_letter in example["choices"]["label"]:
        true_index = example["choices"]["label"].index(true_letter)
        true_text = example["choices"]["text"][true_index]

    pred_texts.append(pred_text)
    true_texts.append(true_text)

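    # Log each example so predictions can be inspected qualitatively.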
print(f"\n--- Example {i + 1} ---") |
|
print(f"Riddle: {example['question']}") |
|
print(f"Model's answer: {pred_letter} → {pred_text}") |
|
print(f"Correct answer: {true_letter} → {true_text}") |
|
print(f"Raw output:\n{output_text}") |
|
|
|
|
|
# Per-letter precision/recall/F1; zero_division=0 guards classes that were
# never predicted.
report = classification_report(true_labels, pred_labels, output_dict=True, zero_division=0)
df_report = pd.DataFrame(report).transpose()

# BERTScore compares predicted choice texts (candidates) against gold choice
# texts (references); missed examples contribute empty candidate strings.
P, R, F1 = bert_score(pred_texts, true_texts, lang="en", verbose=True)
df_report.loc["BERTScore", ["precision", "recall", "f1-score"]] = [
    P.mean().item(),
    R.mean().item(),
    F1.mean().item(),
]

df_report.to_csv("riddlesense_classification_report.csv")
print("\nFinal Report:\n", df_report)