# llama_riddle.py
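# Note: meta-llama checkpoints are gated on the Hugging Face Hub, so this
# script may require an authenticated session (e.g. via `huggingface-cli login`).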
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import classification_report
from bert_score import score as bert_score
import pandas as pd
import torch
import re
from tqdm import tqdm
# Load tokenizer and model, moving the model to GPU if one is available
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
dataset = load_dataset("INK-USC/riddle_sense", trust_remote_code=True)["validation"]
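# Each example carries 'question', 'choices' ({'label': [...], 'text': [...]}),
# and 'answerKey' (a letter A–E); those are the only fields used below.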
# Format prompt
def format_prompt(example):
    prompt = f"Question: {example['question']}\nChoices:\n"
    for label, text in zip(example['choices']['label'], example['choices']['text']):
        prompt += f"{label}. {text}\n"
    prompt += "Answer:"
    return prompt
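# A rendered prompt looks like:
#   Question: <riddle text>
#   Choices:
#   A. <choice text>
#   ...
#   E. <choice text>
#   Answer: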
# Map choice letters A–E to integer labels and back
letter_to_label = {l: i for i, l in enumerate(["A", "B", "C", "D", "E"])}
label_to_letter = {i: l for l, i in letter_to_label.items()}
true_labels, pred_labels = [], []
true_texts, pred_texts = [], []
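# Integer labels feed the classification report; answer texts feed BERTScore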
# Run for whole dataset
for i, example in enumerate(tqdm(dataset, desc="Evaluating")):
    prompt = format_prompt(example)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        # Llama has no pad token; passing eos as pad silences the generate() warning
        outputs = model.generate(**inputs, max_new_tokens=10,
                                 pad_token_id=tokenizer.eos_token_id)
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Decode only the newly generated tokens; slicing the decoded string with
    # len(prompt) is fragile, since detokenization need not reproduce the
    # prompt character-for-character
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    answer_section = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    # Take the first standalone A–E as the prediction; "Z" marks an unparsed output
    match = re.search(r"\b([A-E])\b", answer_section)
    pred_letter = match.group(1) if match else "Z"
    pred_label = letter_to_label.get(pred_letter, -1)
    true_letter = example["answerKey"]
    true_label = letter_to_label.get(true_letter, -1)
    true_labels.append(true_label)
    pred_labels.append(pred_label)
    # Look up the answer texts for BERTScore; fall back to "" when the
    # letter does not appear among the choices
    pred_text = ""
    if pred_letter in example['choices']['label']:
        pred_index = example['choices']['label'].index(pred_letter)
        pred_text = example['choices']['text'][pred_index]
    if true_letter in example['choices']['label']:
        true_index = example['choices']['label'].index(true_letter)
        true_text = example['choices']['text'][true_index]
    else:
        true_text = ""
    pred_texts.append(pred_text)
    true_texts.append(true_text)
    print(f"\n--- Example {i + 1} ---")
    print(f"Riddle: {example['question']}")
    print(f"Model's answer: {pred_letter} → {pred_text}")
    print(f"Correct answer: {true_letter} → {true_text}")
    print(f"Raw output:\n{output_text}")
# Generate per-class classification report over the letter labels
report = classification_report(true_labels, pred_labels, output_dict=True, zero_division=0)
df_report = pd.DataFrame(report).transpose()
# Compute BERTScore between predicted and gold answer texts
P, R, F1 = bert_score(pred_texts, true_texts, lang="en", verbose=True)
df_report.loc["BERTScore", ["precision", "recall", "f1-score"]] = [
    P.mean().item(),
    R.mean().item(),
    F1.mean().item(),
]
# Save the combined report
df_report.to_csv("riddlesense_classification_report.csv")
print("\nFinal Report:\n", df_report)