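"""Gradio quiz app: answer randomly sampled questions from popular Hugging Face evaluation datasets."""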
import logging
import random
import re
from typing import List, Tuple
import gradio as gr
from datasets import load_dataset, get_dataset_config_names
# Set up logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
# Popular evaluation datasets with their configurations
EVAL_DATASETS = {
"openai/gsm8k": {
"name": "GSM8K - Grade School Math",
"type": "qa",
"config": "main",
"question_field": "question",
"answer_field": "answer",
"split": "train",
},
"cais/mmlu": {
"name": "MMLU - Massive Multitask Language Understanding",
"type": "multiple_choice",
"config": "all",
"question_field": "question",
"choices_field": "choices",
"answer_field": "answer",
"split": "test",
},
"allenai/ai2_arc": {
"name": "AI2 ARC - Science Questions",
"type": "multiple_choice",
"config": "ARC-Challenge",
"question_field": "question",
"choices_field": "choices",
"answer_field": "answerKey",
"split": "train",
},
"Rowan/hellaswag": {
"name": "HellaSwag - Commonsense NLI",
"type": "multiple_choice",
"question_field": "ctx",
"choices_field": "endings",
"answer_field": "label",
"split": "train",
},
"allenai/winogrande": {
"name": "WinoGrande - Winograd Schema",
"type": "binary_choice",
"config": "winogrande_xl",
"question_field": "sentence",
"option1_field": "option1",
"option2_field": "option2",
"answer_field": "answer",
"split": "train",
},
"google/boolq": {
"name": "BoolQ - Boolean Questions",
"type": "true_false",
"question_field": "question",
"context_field": "passage",
"answer_field": "answer",
"split": "train",
},
"rajpurkar/squad": {
"name": "SQuAD - Reading Comprehension",
"type": "extractive_qa",
"question_field": "question",
"context_field": "context",
"answer_field": "answers",
"split": "train",
},
"allenai/piqa": {
"name": "PIQA - Physical Reasoning",
"type": "binary_choice",
"question_field": "goal",
"option1_field": "sol1",
"option2_field": "sol2",
"answer_field": "label",
"split": "train",
},
}
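# To support another dataset, add an entry with the same shape. Illustrative
# sketch only: the dataset id and field names below are placeholders.
# "org/my_dataset": {
#     "name": "My Dataset",
#     "type": "multiple_choice",  # qa, multiple_choice, binary_choice, true_false, or extractive_qa
#     "question_field": "question",
#     "choices_field": "choices",
#     "answer_field": "answer",
#     "split": "test",
# },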
class QuizApp:
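    """Holds quiz state (loaded questions, current index, score) and the answer-checking logic."""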
def __init__(self):
self.current_dataset = None
self.current_dataset_name = None
self.questions = []
self.current_question_idx = 0
self.score = 0
self.total_questions = 0
def load_dataset_questions(self, dataset_name: str, num_questions: int = 10):
"""Load random questions from the selected dataset"""
try:
config = EVAL_DATASETS[dataset_name]
# Try to load dataset with config if specified
try:
if "config" in config:
dataset = load_dataset(
dataset_name, config["config"], split=config["split"]
)
else:
dataset = load_dataset(dataset_name, split=config["split"])
except ValueError as e:
# If config is missing, try to get available configs
if "Config name is missing" in str(e):
configs = get_dataset_config_names(dataset_name)
                    # Prefer the "all" config if available, otherwise fall back to the first one
                    if "all" in configs:
                        selected_config = "all"
                    else:
                        selected_config = configs[0]
                    logging.info(
                        f"Auto-selected config '{selected_config}' for {dataset_name}"
                    )
dataset = load_dataset(
dataset_name, selected_config, split=config["split"]
)
else:
raise e
# Sample random questions
total_examples = len(dataset)
num_questions = min(num_questions, total_examples)
indices = random.sample(range(total_examples), num_questions)
self.questions = []
for idx in indices:
example = dataset[idx]
self.questions.append(example)
self.current_dataset = config
self.current_dataset_name = dataset_name
self.current_question_idx = 0
self.score = 0
self.total_questions = len(self.questions)
return True, f"Loaded {num_questions} questions from {config['name']}"
except Exception as e:
return False, f"Error loading dataset: {str(e)}"
def get_current_question(self) -> Tuple[str, List[str], str]:
"""Get the current question formatted for display"""
if not self.questions or self.current_question_idx >= len(self.questions):
return "", [], ""
question_data = self.questions[self.current_question_idx]
config = self.current_dataset
logging.info(f"\n{'=' * 60}")
logging.info(f"Dataset: {self.current_dataset_name}")
logging.info(f"Question {self.current_question_idx + 1}/{self.total_questions}")
logging.info(f"Raw question data: {repr(question_data)}")
logging.info(f"{'=' * 60}\n")
# Format question based on dataset type
question_type = config["type"]
if question_type == "multiple_choice":
            question = question_data[config["question_field"]]
            choices = question_data[config["choices_field"]]
            # AI2 ARC stores choices as {"text": [...], "label": [...]};
            # the other datasets store them as a plain list of strings
            if isinstance(choices, dict) and "text" in choices:
                choices = choices["text"]
            # Format choices with letters
            formatted_choices = [
                f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)
            ]
return question, formatted_choices, question_type
elif question_type == "true_false":
question = question_data[config["question_field"]]
if "context_field" in config:
context = question_data[config["context_field"]]
question = f"Context: {context}\n\nQuestion: {question}"
return question, ["True", "False"], question_type
elif question_type == "binary_choice":
question = question_data[config["question_field"]]
option1 = question_data[config["option1_field"]]
option2 = question_data[config["option2_field"]]
return question, [f"A. {option1}", f"B. {option2}"], question_type
elif question_type == "qa" or question_type == "extractive_qa":
question = question_data[config["question_field"]]
if "context_field" in config and config["context_field"] in question_data:
context = question_data[config["context_field"]]
question = f"Context: {context[:500]}...\n\nQuestion: {question}"
return question, [], question_type
return "", [], ""
def format_answer(self, answer: str, dataset_name: str) -> str:
"""Format answer based on dataset type for better readability"""
# Convert <<equation>> to show the math clearly
# Extract the equation and its result, show just the result with equation in parentheses
def format_equation(match):
equation = match.group(1)
# Check if it's in format "calculation=result"
if '=' in equation:
parts = equation.split('=')
if len(parts) == 2:
calculation, result = parts[0], parts[1]
return f"{result} (={calculation})"
return f"[{equation}]"
answer = re.sub(r"<<([^>]+)>>", format_equation, answer)
# Dataset-specific formatting
if dataset_name == "openai/gsm8k":
# Format the final answer line
answer = answer.replace("####", "\n\nFinal Answer:")
# Ensure proper line breaks after periods for readability
answer = re.sub(r'\. (?=[A-Z])', '.\n', answer)
return answer
def check_answer(self, user_answer: str) -> Tuple[bool, str]:
"""Check if the user's answer is correct"""
if not self.questions or self.current_question_idx >= len(self.questions):
return False, "No question available"
question_data = self.questions[self.current_question_idx]
config = self.current_dataset
question_type = config["type"]
        if question_type == "multiple_choice":
            correct_answer_idx = question_data[config["answer_field"]]
            choices = question_data[config["choices_field"]]
            # Handle the different answer encodings used by these datasets:
            # MMLU stores an int index, AI2 ARC a letter key (with choices as a
            # {"text": [...], "label": [...]} dict), HellaSwag a digit string.
            if isinstance(choices, dict) and "label" in choices:
                if correct_answer_idx in choices["label"]:
                    correct_answer_idx = choices["label"].index(correct_answer_idx)
                choices = choices["text"]
            if isinstance(correct_answer_idx, str) and correct_answer_idx.isdigit():
                correct_answer_idx = int(correct_answer_idx)
            if isinstance(correct_answer_idx, int):
                correct_letter = chr(65 + correct_answer_idx)
            else:
                correct_letter = str(correct_answer_idx)
            user_letter = user_answer.strip().upper()[:1] if user_answer else ""
            is_correct = user_letter == correct_letter
            if is_correct:
                return True, '✅ Correct!'
            else:
                correct_choice = (
                    choices[correct_answer_idx]
                    if isinstance(correct_answer_idx, int)
                    else correct_answer_idx
                )
                logging.info(f"Raw answer (multiple choice): {repr(correct_choice)}")
                formatted_answer = self.format_answer(
                    correct_choice, self.current_dataset_name
                )
                return (
                    False,
                    f'❌ Incorrect\n\nThe correct answer was {correct_letter}:\n\n{formatted_answer}',
                )
elif question_type == "true_false":
correct_answer = question_data[config["answer_field"]]
user_bool = user_answer.lower().strip() == "true"
is_correct = user_bool == correct_answer
if is_correct:
return True, '✅ Correct!'
else:
return (
False,
f'❌ Incorrect\n\nThe correct answer was {correct_answer}',
)
        elif question_type == "binary_choice":
            correct_answer_idx = question_data[config["answer_field"]]
            # PIQA stores a 0/1 int label; WinoGrande stores the strings "1"/"2"
            if isinstance(correct_answer_idx, str) and correct_answer_idx.isdigit():
                correct_answer_idx = int(correct_answer_idx) - 1
            user_idx = 0 if user_answer.strip().upper().startswith("A") else 1
            is_correct = user_idx == correct_answer_idx
if is_correct:
return True, '✅ Correct!'
else:
correct_letter = "A" if correct_answer_idx == 0 else "B"
option_field = (
config["option1_field"]
if correct_answer_idx == 0
else config["option2_field"]
)
correct_option = question_data[option_field]
logging.info(f"Raw answer (binary choice): {repr(correct_option)}")
formatted_answer = self.format_answer(
correct_option, self.current_dataset_name
)
return (
False,
f'❌ Incorrect\n\nThe correct answer was {correct_letter}:\n\n{formatted_answer}',
)
elif question_type in ["qa", "extractive_qa"]:
            # For QA, we'll do a simple check - in a real app, you'd want more sophisticated matching
correct_answer = question_data[config["answer_field"]]
if isinstance(correct_answer, dict) and "text" in correct_answer:
correct_answer = (
correct_answer["text"][0] if correct_answer["text"] else ""
)
elif isinstance(correct_answer, list) and len(correct_answer) > 0:
correct_answer = (
correct_answer[0]["text"]
if isinstance(correct_answer[0], dict)
else str(correct_answer[0])
)
else:
correct_answer = str(correct_answer)
# Extract final answer for GSM8K and similar datasets
# For GSM8K, extract the final answer after ####
if "####" in correct_answer:
final_answer_match = re.search(r"####\s*(.+)", correct_answer)
if final_answer_match:
final_answer = final_answer_match.group(1).strip()
else:
final_answer = correct_answer
else:
final_answer = correct_answer
# First check if user answer is empty
if not user_answer or not user_answer.strip():
is_correct = False
else:
# Extract numbers from both answers for comparison
correct_numbers = re.findall(r"-?\d+\.?\d*", final_answer)
user_numbers = re.findall(r"-?\d+\.?\d*", user_answer)
# Check if answers match
is_correct = False
# If both have numbers, compare the numbers
if correct_numbers and user_numbers:
# Convert to float for comparison to handle decimals
try:
correct_num = float(
correct_numbers[-1]
) # Take the last number as final answer
user_num = float(user_numbers[-1]) # Take the last number from user
is_correct = (
abs(correct_num - user_num) < 0.0001
) # Small tolerance for float comparison
except ValueError:
# Fall back to string comparison
is_correct = correct_numbers[-1] == user_numbers[-1]
elif correct_numbers and not user_numbers:
# If correct answer has numbers but user answer doesn't, it's wrong
is_correct = False
else:
# Fall back to substring matching for non-numeric answers
# But ensure both strings are non-empty
is_correct = (
user_answer.lower().strip() in correct_answer.lower()
or correct_answer.lower() in user_answer.lower().strip()
) and len(user_answer.strip()) > 0
if is_correct:
return True, '✅ Correct!'
else:
logging.info(f"Raw answer (QA): {repr(correct_answer)}")
logging.info(f"Extracted final answer: {repr(final_answer)}")
logging.info(
f"Correct numbers: {correct_numbers}, User numbers: {user_numbers}"
)
formatted_answer = self.format_answer(
correct_answer, self.current_dataset_name
)
                # Debug: log the formatted answer
                logging.info(f"Formatted answer: {repr(formatted_answer)}")
return (
False,
f'❌ Incorrect\n\nThe correct answer was:\n\n{formatted_answer}',
)
return False, "Unknown question type"
# Create global quiz app instance
quiz_app = QuizApp()
def create_dataset_display():
"""Create the dataset listing display"""
dataset_info = []
for dataset_id, config in EVAL_DATASETS.items():
dataset_info.append(
f"**{config['name']}**\n- Dataset: {dataset_id}\n- Type: {config['type']}"
)
return "\n\n".join(dataset_info)
def start_quiz(dataset_choice: str, num_questions: int):
"""Start a new quiz with the selected dataset"""
# Extract dataset ID from the choice
dataset_id = None
for did, config in EVAL_DATASETS.items():
if config["name"] in dataset_choice:
dataset_id = did
break
if not dataset_id:
return (
"Please select a dataset",
gr.update(visible=False), # question_display
gr.update(visible=False), # answer_radio
gr.update(visible=False), # answer_textbox
gr.update(visible=False), # submit_button
gr.update(visible=False), # progress_text
)
success, message = quiz_app.load_dataset_questions(dataset_id, num_questions)
if success:
question, choices, q_type = quiz_app.get_current_question()
if q_type in ["multiple_choice", "true_false", "binary_choice"]:
return (
message,
gr.update(value=question, visible=True), # question_display
gr.update(choices=choices, visible=True, value=None), # answer_radio
gr.update(visible=False), # answer_textbox
gr.update(visible=True), # submit_button
gr.update(value=f"Question 1/{quiz_app.total_questions}", visible=True), # progress_text
)
else:
return (
message,
gr.update(value=question, visible=True), # question_display
gr.update(visible=False), # answer_radio
gr.update(visible=True, value=""), # answer_textbox
gr.update(visible=True), # submit_button
gr.update(value=f"Question 1/{quiz_app.total_questions}", visible=True), # progress_text
)
else:
return (
message,
gr.update(visible=False), # question_display
gr.update(visible=False), # answer_radio
gr.update(visible=False), # answer_textbox
gr.update(visible=False), # submit_button
gr.update(visible=False), # progress_text
)
def submit_answer(answer_choice, answer_text):
"""Submit answer and show feedback"""
# Determine which answer to use
if answer_choice:
answer = answer_choice
else:
answer = answer_text
is_correct, feedback = quiz_app.check_answer(answer)
if is_correct:
quiz_app.score += 1
return gr.update(value=feedback, visible=True), gr.update(visible=True)
def next_question():
"""Move to the next question"""
quiz_app.current_question_idx += 1
if quiz_app.current_question_idx >= quiz_app.total_questions:
# Quiz complete
final_score = f'🎉 Quiz Complete!\n\nYour score: {quiz_app.score}/{quiz_app.total_questions} ({quiz_app.score / quiz_app.total_questions * 100:.1f}%)'
return (
gr.update(value=final_score, visible=True),
"",
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
"Quiz Complete",
)
question, choices, q_type = quiz_app.get_current_question()
if q_type in ["multiple_choice", "true_false", "binary_choice"]:
return (
gr.update(value="", visible=False), # Clear feedback
gr.update(value=question), # question_display
gr.update(choices=choices, visible=True, value=None),
gr.update(visible=False),
gr.update(visible=True),
gr.update(visible=False),
gr.update(value=f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}"),
)
else:
return (
gr.update(value="", visible=False), # Clear feedback
gr.update(value=question), # question_display
gr.update(visible=False),
gr.update(visible=True, value=""),
gr.update(visible=True),
gr.update(visible=False),
gr.update(value=f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}"),
)
# Create Gradio interface
with gr.Blocks(title="HuggingFace Evaluation Dataset Quiz") as demo:
gr.Markdown("# 🤗 Evaluation Dataset Quiz")
gr.Markdown(
"Test yourself with questions from popular HuggingFace evaluation datasets!"
)
# Dataset Selection Section
with gr.Row():
dataset_dropdown = gr.Dropdown(
choices=[config["name"] for config in EVAL_DATASETS.values()],
label="Select Dataset",
value=list(EVAL_DATASETS.values())[0]["name"],
)
num_questions_slider = gr.Slider(
minimum=5, maximum=20, value=10, step=1, label="Number of Questions"
)
start_button = gr.Button("Start Quiz", variant="primary")
status_message = gr.Textbox(label="Status", interactive=False)
# Quiz Section - shown when quiz starts
gr.Markdown("---") # Separator
progress_text = gr.Textbox(label="Progress", value="0/0", interactive=False, visible=False)
question_display = gr.Textbox(label="Question", lines=5, interactive=False, visible=False)
# Answer inputs (one will be visible at a time)
answer_radio = gr.Radio(label="Select your answer", visible=False)
answer_textbox = gr.Textbox(label="Type your answer (Raw number)", visible=False)
submit_button = gr.Button("Submit Answer", variant="primary", visible=False)
feedback_display = gr.Textbox(
label="Feedback",
visible=False,
lines=10,
max_lines=20,
interactive=False
)
next_button = gr.Button("Next Question", visible=False)
# Connect events
start_button.click(
start_quiz,
inputs=[dataset_dropdown, num_questions_slider],
outputs=[
status_message,
question_display,
answer_radio,
answer_textbox,
submit_button,
progress_text,
],
)
submit_button.click(
submit_answer,
inputs=[answer_radio, answer_textbox],
outputs=[feedback_display, next_button],
)
next_button.click(
next_question,
outputs=[
feedback_display,
question_display,
answer_radio,
answer_textbox,
submit_button,
next_button,
progress_text,
],
)
if __name__ == "__main__":
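    # launch() serves the app locally; passing share=True would also create a temporary public link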
demo.launch()