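"""Gradio quiz app: answer randomly sampled questions from popular Hugging Face evaluation datasets."""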
import logging
import random
import re
from typing import List, Tuple
import gradio as gr
from datasets import load_dataset, get_dataset_config_names
# Set up logging
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
# Popular evaluation datasets with their configurations
EVAL_DATASETS = {
"openai/gsm8k": {
"name": "GSM8K - Grade School Math",
"type": "qa",
"config": "main",
"question_field": "question",
"answer_field": "answer",
"split": "train",
},
"cais/mmlu": {
"name": "MMLU - Massive Multitask Language Understanding",
"type": "multiple_choice",
"config": "all",
"question_field": "question",
"choices_field": "choices",
"answer_field": "answer",
"split": "test",
},
"allenai/ai2_arc": {
"name": "AI2 ARC - Science Questions",
"type": "multiple_choice",
"config": "ARC-Challenge",
"question_field": "question",
"choices_field": "choices",
"answer_field": "answerKey",
"split": "train",
},
"Rowan/hellaswag": {
"name": "HellaSwag - Commonsense NLI",
"type": "multiple_choice",
"question_field": "ctx",
"choices_field": "endings",
"answer_field": "label",
"split": "train",
},
"allenai/winogrande": {
"name": "WinoGrande - Winograd Schema",
"type": "binary_choice",
"config": "winogrande_xl",
"question_field": "sentence",
"option1_field": "option1",
"option2_field": "option2",
"answer_field": "answer",
"split": "train",
},
"google/boolq": {
"name": "BoolQ - Boolean Questions",
"type": "true_false",
"question_field": "question",
"context_field": "passage",
"answer_field": "answer",
"split": "train",
},
"rajpurkar/squad": {
"name": "SQuAD - Reading Comprehension",
"type": "extractive_qa",
"question_field": "question",
"context_field": "context",
"answer_field": "answers",
"split": "train",
},
"allenai/piqa": {
"name": "PIQA - Physical Reasoning",
"type": "binary_choice",
"question_field": "goal",
"option1_field": "sol1",
"option2_field": "sol2",
"answer_field": "label",
"split": "train",
},
}
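# To support another dataset, add an entry with the same shape. Illustrative
# sketch only: the dataset id and field names below are placeholders.
# "org/my_dataset": {
#     "name": "My Dataset",
#     "type": "multiple_choice",  # qa, multiple_choice, binary_choice, true_false, or extractive_qa
#     "question_field": "question",
#     "choices_field": "choices",
#     "answer_field": "answer",
#     "split": "test",
# },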
class QuizApp:
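    """Holds quiz state (loaded questions, current index, score) and the answer-checking logic."""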
def __init__(self):
self.current_dataset = None
self.current_dataset_name = None
self.questions = []
self.current_question_idx = 0
self.score = 0
self.total_questions = 0
def load_dataset_questions(self, dataset_name: str, num_questions: int = 10):
"""Load random questions from the selected dataset"""
try:
config = EVAL_DATASETS[dataset_name]
# Try to load dataset with config if specified
try:
if "config" in config:
dataset = load_dataset(
dataset_name, config["config"], split=config["split"]
)
else:
dataset = load_dataset(dataset_name, split=config["split"])
except ValueError as e:
# If config is missing, try to get available configs
if "Config name is missing" in str(e):
configs = get_dataset_config_names(dataset_name)
                    # Prefer the "all" config if available, otherwise fall back to the first one
                    if "all" in configs:
                        selected_config = "all"
                    else:
                        selected_config = configs[0]
                    logging.info(
                        f"Auto-selected config '{selected_config}' for {dataset_name}"
                    )
dataset = load_dataset(
dataset_name, selected_config, split=config["split"]
)
else:
raise e
# Sample random questions
total_examples = len(dataset)
num_questions = min(num_questions, total_examples)
indices = random.sample(range(total_examples), num_questions)
self.questions = []
for idx in indices:
example = dataset[idx]
self.questions.append(example)
self.current_dataset = config
self.current_dataset_name = dataset_name
self.current_question_idx = 0
self.score = 0
self.total_questions = len(self.questions)
return True, f"Loaded {num_questions} questions from {config['name']}"
except Exception as e:
return False, f"Error loading dataset: {str(e)}"
def get_current_question(self) -> Tuple[str, List[str], str]:
"""Get the current question formatted for display"""
if not self.questions or self.current_question_idx >= len(self.questions):
return "", [], ""
question_data = self.questions[self.current_question_idx]
config = self.current_dataset
logging.info(f"\n{'=' * 60}")
logging.info(f"Dataset: {self.current_dataset_name}")
logging.info(f"Question {self.current_question_idx + 1}/{self.total_questions}")
logging.info(f"Raw question data: {repr(question_data)}")
logging.info(f"{'=' * 60}\n")
# Format question based on dataset type
question_type = config["type"]
if question_type == "multiple_choice":
            question = question_data[config["question_field"]]
            choices = question_data[config["choices_field"]]
            # AI2 ARC stores choices as {"text": [...], "label": [...]};
            # the other datasets store them as a plain list of strings
            if isinstance(choices, dict) and "text" in choices:
                choices = choices["text"]
            # Format choices with letters
            formatted_choices = [
                f"{chr(65 + i)}. {choice}" for i, choice in enumerate(choices)
            ]
return question, formatted_choices, question_type
elif question_type == "true_false":
question = question_data[config["question_field"]]
if "context_field" in config:
context = question_data[config["context_field"]]
question = f"Context: {context}\n\nQuestion: {question}"
return question, ["True", "False"], question_type
elif question_type == "binary_choice":
question = question_data[config["question_field"]]
option1 = question_data[config["option1_field"]]
option2 = question_data[config["option2_field"]]
return question, [f"A. {option1}", f"B. {option2}"], question_type
elif question_type == "qa" or question_type == "extractive_qa":
question = question_data[config["question_field"]]
if "context_field" in config and config["context_field"] in question_data:
context = question_data[config["context_field"]]
question = f"Context: {context[:500]}...\n\nQuestion: {question}"
return question, [], question_type
return "", [], ""
def format_answer(self, answer: str, dataset_name: str) -> str:
"""Format answer based on dataset type for better readability"""
# Convert <<equation>> to show the math clearly
# Extract the equation and its result, show just the result with equation in parentheses
def format_equation(match):
equation = match.group(1)
# Check if it's in format "calculation=result"
if '=' in equation:
parts = equation.split('=')
if len(parts) == 2:
calculation, result = parts[0], parts[1]
return f"{result} (={calculation})"
return f"[{equation}]"
answer = re.sub(r"<<([^>]+)>>", format_equation, answer)
# Dataset-specific formatting
if dataset_name == "openai/gsm8k":
# Format the final answer line
answer = answer.replace("####", "\n\nFinal Answer:")
# Ensure proper line breaks after periods for readability
answer = re.sub(r'\. (?=[A-Z])', '.\n', answer)
return answer
def check_answer(self, user_answer: str) -> Tuple[bool, str]:
"""Check if the user's answer is correct"""
if not self.questions or self.current_question_idx >= len(self.questions):
return False, "No question available"
question_data = self.questions[self.current_question_idx]
config = self.current_dataset
question_type = config["type"]
        if question_type == "multiple_choice":
            correct_answer_idx = question_data[config["answer_field"]]
            choices = question_data[config["choices_field"]]
            # Handle the different answer encodings used by these datasets:
            # MMLU stores an int index, AI2 ARC a letter key (with choices as a
            # {"text": [...], "label": [...]} dict), HellaSwag a digit string.
            if isinstance(choices, dict) and "label" in choices:
                if correct_answer_idx in choices["label"]:
                    correct_answer_idx = choices["label"].index(correct_answer_idx)
                choices = choices["text"]
            if isinstance(correct_answer_idx, str) and correct_answer_idx.isdigit():
                correct_answer_idx = int(correct_answer_idx)
            if isinstance(correct_answer_idx, int):
                correct_letter = chr(65 + correct_answer_idx)
            else:
                correct_letter = str(correct_answer_idx)
            user_letter = user_answer.strip().upper()[:1] if user_answer else ""
            is_correct = user_letter == correct_letter
            if is_correct:
                return True, '✅ Correct!'
            else:
                correct_choice = (
                    choices[correct_answer_idx]
                    if isinstance(correct_answer_idx, int)
                    else correct_answer_idx
                )
                logging.info(f"Raw answer (multiple choice): {repr(correct_choice)}")
                formatted_answer = self.format_answer(
                    correct_choice, self.current_dataset_name
                )
                return (
                    False,
                    f'❌ Incorrect\n\nThe correct answer was {correct_letter}:\n\n{formatted_answer}',
                )
elif question_type == "true_false":
correct_answer = question_data[config["answer_field"]]
user_bool = user_answer.lower().strip() == "true"
is_correct = user_bool == correct_answer
if is_correct:
return True, '✅ Correct!'
else:
return (
False,
f'❌ Incorrect\n\nThe correct answer was {correct_answer}',
)
        elif question_type == "binary_choice":
            correct_answer_idx = question_data[config["answer_field"]]
            # PIQA stores a 0/1 int label; WinoGrande stores the strings "1"/"2"
            if isinstance(correct_answer_idx, str) and correct_answer_idx.isdigit():
                correct_answer_idx = int(correct_answer_idx) - 1
            user_idx = 0 if user_answer.strip().upper().startswith("A") else 1
            is_correct = user_idx == correct_answer_idx
if is_correct:
return True, '✅ Correct!'
else:
correct_letter = "A" if correct_answer_idx == 0 else "B"
option_field = (
config["option1_field"]
if correct_answer_idx == 0
else config["option2_field"]
)
correct_option = question_data[option_field]
logging.info(f"Raw answer (binary choice): {repr(correct_option)}")
formatted_answer = self.format_answer(
correct_option, self.current_dataset_name
)
return (
False,
f'❌ Incorrect\n\nThe correct answer was {correct_letter}:\n\n{formatted_answer}',
)
elif question_type in ["qa", "extractive_qa"]:
            # For QA, we'll do a simple check - in a real app, you'd want more sophisticated matching
correct_answer = question_data[config["answer_field"]]
if isinstance(correct_answer, dict) and "text" in correct_answer:
correct_answer = (
correct_answer["text"][0] if correct_answer["text"] else ""
)
elif isinstance(correct_answer, list) and len(correct_answer) > 0:
correct_answer = (
correct_answer[0]["text"]
if isinstance(correct_answer[0], dict)
else str(correct_answer[0])
)
else:
correct_answer = str(correct_answer)
# Extract final answer for GSM8K and similar datasets
# For GSM8K, extract the final answer after ####
if "####" in correct_answer:
final_answer_match = re.search(r"####\s*(.+)", correct_answer)
if final_answer_match:
final_answer = final_answer_match.group(1).strip()
else:
final_answer = correct_answer
else:
final_answer = correct_answer
# First check if user answer is empty
if not user_answer or not user_answer.strip():
is_correct = False
else:
# Extract numbers from both answers for comparison
correct_numbers = re.findall(r"-?\d+\.?\d*", final_answer)
user_numbers = re.findall(r"-?\d+\.?\d*", user_answer)
# Check if answers match
is_correct = False
# If both have numbers, compare the numbers
if correct_numbers and user_numbers:
# Convert to float for comparison to handle decimals
try:
correct_num = float(
correct_numbers[-1]
) # Take the last number as final answer
user_num = float(user_numbers[-1]) # Take the last number from user
is_correct = (
abs(correct_num - user_num) < 0.0001
) # Small tolerance for float comparison
except ValueError:
# Fall back to string comparison
is_correct = correct_numbers[-1] == user_numbers[-1]
elif correct_numbers and not user_numbers:
# If correct answer has numbers but user answer doesn't, it's wrong
is_correct = False
else:
# Fall back to substring matching for non-numeric answers
# But ensure both strings are non-empty
is_correct = (
user_answer.lower().strip() in correct_answer.lower()
or correct_answer.lower() in user_answer.lower().strip()
) and len(user_answer.strip()) > 0
if is_correct:
return True, '✅ Correct!'
else:
logging.info(f"Raw answer (QA): {repr(correct_answer)}")
logging.info(f"Extracted final answer: {repr(final_answer)}")
logging.info(
f"Correct numbers: {correct_numbers}, User numbers: {user_numbers}"
)
formatted_answer = self.format_answer(
correct_answer, self.current_dataset_name
)
                # Debug: log the formatted answer
                logging.info(f"Formatted answer: {repr(formatted_answer)}")
return (
False,
f'❌ Incorrect\n\nThe correct answer was:\n\n{formatted_answer}',
)
return False, "Unknown question type"
# Create global quiz app instance
quiz_app = QuizApp()
def create_dataset_display():
"""Create the dataset listing display"""
dataset_info = []
for dataset_id, config in EVAL_DATASETS.items():
dataset_info.append(
f"**{config['name']}**\n- Dataset: {dataset_id}\n- Type: {config['type']}"
)
return "\n\n".join(dataset_info)
def start_quiz(dataset_choice: str, num_questions: int):
"""Start a new quiz with the selected dataset"""
# Extract dataset ID from the choice
dataset_id = None
for did, config in EVAL_DATASETS.items():
if config["name"] in dataset_choice:
dataset_id = did
break
if not dataset_id:
return (
"Please select a dataset",
gr.update(visible=False), # question_display
gr.update(visible=False), # answer_radio
gr.update(visible=False), # answer_textbox
gr.update(visible=False), # submit_button
gr.update(visible=False), # progress_text
)
success, message = quiz_app.load_dataset_questions(dataset_id, num_questions)
if success:
question, choices, q_type = quiz_app.get_current_question()
if q_type in ["multiple_choice", "true_false", "binary_choice"]:
return (
message,
gr.update(value=question, visible=True), # question_display
gr.update(choices=choices, visible=True, value=None), # answer_radio
gr.update(visible=False), # answer_textbox
gr.update(visible=True), # submit_button
gr.update(value=f"Question 1/{quiz_app.total_questions}", visible=True), # progress_text
)
else:
return (
message,
gr.update(value=question, visible=True), # question_display
gr.update(visible=False), # answer_radio
gr.update(visible=True, value=""), # answer_textbox
gr.update(visible=True), # submit_button
gr.update(value=f"Question 1/{quiz_app.total_questions}", visible=True), # progress_text
)
else:
return (
message,
gr.update(visible=False), # question_display
gr.update(visible=False), # answer_radio
gr.update(visible=False), # answer_textbox
gr.update(visible=False), # submit_button
gr.update(visible=False), # progress_text
)
def submit_answer(answer_choice, answer_text):
"""Submit answer and show feedback"""
# Determine which answer to use
if answer_choice:
answer = answer_choice
else:
answer = answer_text
is_correct, feedback = quiz_app.check_answer(answer)
if is_correct:
quiz_app.score += 1
return gr.update(value=feedback, visible=True), gr.update(visible=True)
def next_question():
"""Move to the next question"""
quiz_app.current_question_idx += 1
if quiz_app.current_question_idx >= quiz_app.total_questions:
# Quiz complete
final_score = f'🎉 Quiz Complete!\n\nYour score: {quiz_app.score}/{quiz_app.total_questions} ({quiz_app.score / quiz_app.total_questions * 100:.1f}%)'
return (
gr.update(value=final_score, visible=True),
"",
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
gr.update(visible=False),
"Quiz Complete",
)
question, choices, q_type = quiz_app.get_current_question()
if q_type in ["multiple_choice", "true_false", "binary_choice"]:
return (
gr.update(value="", visible=False), # Clear feedback
gr.update(value=question), # question_display
gr.update(choices=choices, visible=True, value=None),
gr.update(visible=False),
gr.update(visible=True),
gr.update(visible=False),
gr.update(value=f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}"),
)
else:
return (
gr.update(value="", visible=False), # Clear feedback
gr.update(value=question), # question_display
gr.update(visible=False),
gr.update(visible=True, value=""),
gr.update(visible=True),
gr.update(visible=False),
gr.update(value=f"Question {quiz_app.current_question_idx + 1}/{quiz_app.total_questions}"),
)
# Create Gradio interface
with gr.Blocks(title="HuggingFace Evaluation Dataset Quiz") as demo:
gr.Markdown("# 🤗 Evaluation Dataset Quiz")
gr.Markdown(
"Test yourself with questions from popular HuggingFace evaluation datasets!"
)
# Dataset Selection Section
with gr.Row():
dataset_dropdown = gr.Dropdown(
choices=[config["name"] for config in EVAL_DATASETS.values()],
label="Select Dataset",
value=list(EVAL_DATASETS.values())[0]["name"],
)
num_questions_slider = gr.Slider(
minimum=5, maximum=20, value=10, step=1, label="Number of Questions"
)
start_button = gr.Button("Start Quiz", variant="primary")
status_message = gr.Textbox(label="Status", interactive=False)
# Quiz Section - shown when quiz starts
gr.Markdown("---") # Separator
progress_text = gr.Textbox(label="Progress", value="0/0", interactive=False, visible=False)
question_display = gr.Textbox(label="Question", lines=5, interactive=False, visible=False)
# Answer inputs (one will be visible at a time)
answer_radio = gr.Radio(label="Select your answer", visible=False)
answer_textbox = gr.Textbox(label="Type your answer (Raw number)", visible=False)
submit_button = gr.Button("Submit Answer", variant="primary", visible=False)
feedback_display = gr.Textbox(
label="Feedback",
visible=False,
lines=10,
max_lines=20,
interactive=False
)
next_button = gr.Button("Next Question", visible=False)
# Connect events
start_button.click(
start_quiz,
inputs=[dataset_dropdown, num_questions_slider],
outputs=[
status_message,
question_display,
answer_radio,
answer_textbox,
submit_button,
progress_text,
],
)
submit_button.click(
submit_answer,
inputs=[answer_radio, answer_textbox],
outputs=[feedback_display, next_button],
)
next_button.click(
next_question,
outputs=[
feedback_display,
question_display,
answer_radio,
answer_textbox,
submit_button,
next_button,
progress_text,
],
)
if __name__ == "__main__":
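    # launch() serves the app locally; passing share=True would also create a temporary public link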
demo.launch()