from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from sklearn.metrics import classification_report
from bert_score import score as bert_score
import pandas as pd
import torch
import re
from tqdm import tqdm

# Load model and tokenizer
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Load the RiddleSense validation split
dataset = load_dataset("INK-USC/riddle_sense", trust_remote_code=True)["validation"]

# Build a multiple-choice prompt ending in "Answer:" so the model
# is nudged to emit a single choice letter
def format_prompt(example):
    prompt = f"Question: {example['question']}\nChoices:\n"
    for label, text in zip(example["choices"]["label"], example["choices"]["text"]):
        prompt += f"{label}. {text}\n"
    prompt += "Answer:"
    return prompt

# Map choice letters to integer labels (and back) for sklearn
letter_to_label = {l: i for i, l in enumerate(["A", "B", "C", "D", "E"])}
label_to_letter = {i: l for l, i in letter_to_label.items()}

true_labels, pred_labels = [], []
true_texts, pred_texts = [], []

# Run over the whole validation set
for i, example in enumerate(tqdm(dataset, desc="Evaluating")):
    prompt = format_prompt(example)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=10,
        pad_token_id=tokenizer.eos_token_id,  # Llama has no pad token; this silences the warning
    )

    # Decode only the newly generated tokens. Slicing the decoded string by
    # len(prompt) is fragile, because detokenization does not always
    # reproduce the prompt verbatim.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    answer_section = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Take the first standalone A-E as the prediction; "Z" marks a parse failure
    match = re.search(r"\b([A-E])\b", answer_section)
    pred_letter = match.group(1) if match else "Z"
    pred_label = letter_to_label.get(pred_letter, -1)

    true_letter = example["answerKey"]
    true_label = letter_to_label.get(true_letter, -1)

    true_labels.append(true_label)
    pred_labels.append(pred_label)

    # Recover the answer texts for BERTScore
    pred_text = ""
    if pred_letter in example["choices"]["label"]:
        pred_index = example["choices"]["label"].index(pred_letter)
        pred_text = example["choices"]["text"][pred_index]

    if true_letter in example["choices"]["label"]:
        true_index = example["choices"]["label"].index(true_letter)
        true_text = example["choices"]["text"][true_index]
    else:
        true_text = ""

    pred_texts.append(pred_text)
    true_texts.append(true_text)

    print(f"\n--- Example {i + 1} ---")
    print(f"Riddle: {example['question']}")
    print(f"Model's answer: {pred_letter} → {pred_text}")
    print(f"Correct answer: {true_letter} → {true_text}")
    print(f"Raw output:\n{output_text}")

# Generate the per-class classification report
report = classification_report(true_labels, pred_labels, output_dict=True, zero_division=0)
df_report = pd.DataFrame(report).transpose()

# Compute BERTScore between predicted and gold answer texts
P, R, F1 = bert_score(pred_texts, true_texts, lang="en", verbose=True)
df_report.loc["BERTScore", ["precision", "recall", "f1-score"]] = [
    P.mean().item(),
    R.mean().item(),
    F1.mean().item(),
]

# Save the combined report
df_report.to_csv("riddlesense_classification_report.csv")
print("\nFinal Report:\n", df_report)
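
# Optional sanity check (a small addition, not in the original script): plain
# exact-match accuracy over the letter predictions. It should agree with the
# "accuracy" row that classification_report produces.
accuracy = sum(t == p for t, p in zip(true_labels, pred_labels)) / len(true_labels)
print(f"Exact-match accuracy: {accuracy:.4f}")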
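
# Alternative decoding strategy (a hedged sketch, not used above): instead of
# free-form generation plus regex parsing, score each candidate answer by the
# model's log-likelihood and pick the best one. This avoids parse failures
# ("Z") entirely. score_choices is a hypothetical helper, not part of the
# original pipeline.
def score_choices(example):
    base = format_prompt(example)  # ends with "Answer:"
    scores = {}
    for letter in example["choices"]["label"]:
        ids = tokenizer(base + " " + letter, return_tensors="pt").input_ids.to(device)
        with torch.no_grad():
            out = model(ids, labels=ids)  # out.loss is the mean token NLL
        scores[letter] = -out.loss.item()  # higher is better
    return max(scores, key=scores.get)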