Spaces:

Narsil
/

eval_playground

Sleeping

File size: 22,487 Bytes

import gradio as gr
from datasets import load_dataset, get_dataset_config_names
import random
from typing import List, Tuple
import logging

# Configure the root logger once at import time: INFO level and above, each
# record rendered as "<timestamp> - <level> - <message>". The quiz code below
# logs through the root logger (logging.info), so this format applies to it.
logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)

# Popular evaluation datasets with their configurations.
#
# Keys are Hugging Face dataset ids; each value describes how QuizApp loads
# and reads that dataset:
#   name           - human-readable label (shown in the dropdown and in status
#                    messages)
#   type           - question style; QuizApp branches on "qa",
#                    "extractive_qa", "multiple_choice", "binary_choice" and
#                    "true_false"
#   config         - optional dataset configuration name passed to
#                    load_dataset; when absent, load_dataset is called without
#                    one
#   split          - dataset split to sample questions from
#   question_field - column holding the question text
#   answer_field   - column holding the reference answer / answer key
#   choices_field  - (multiple_choice) column holding the answer options
#   option1_field / option2_field - (binary_choice) columns holding the two
#                    options
#   context_field  - (true_false / extractive_qa) column holding a passage
#                    shown before the question
# NOTE(review): the column names are assumed to match the upstream dataset
# schemas - verify against the dataset cards when adding or updating entries.
EVAL_DATASETS = {
    "openai/gsm8k": {
        "name": "GSM8K - Grade School Math",
        "type": "qa",
        "config": "main",
        "question_field": "question",
        "answer_field": "answer",
        "split": "train",
    },
    "cais/mmlu": {
        "name": "MMLU - Massive Multitask Language Understanding",
        "type": "multiple_choice",
        "config": "all",
        "question_field": "question",
        "choices_field": "choices",
        "answer_field": "answer",
        "split": "test",
    },
    "allenai/ai2_arc": {
        "name": "AI2 ARC - Science Questions",
        "type": "multiple_choice",
        "config": "ARC-Challenge",
        "question_field": "question",
        "choices_field": "choices",
        "answer_field": "answerKey",
        "split": "train",
    },
    "Rowan/hellaswag": {
        "name": "HellaSwag - Commonsense NLI",
        "type": "multiple_choice",
        "question_field": "ctx",
        "choices_field": "endings",
        "answer_field": "label",
        "split": "train",
    },
    "allenai/winogrande": {
        "name": "WinoGrande - Winograd Schema",
        "type": "binary_choice",
        "config": "winogrande_xl",
        "question_field": "sentence",
        "option1_field": "option1",
        "option2_field": "option2",
        "answer_field": "answer",
        "split": "train",
    },
    "google/boolq": {
        "name": "BoolQ - Boolean Questions",
        "type": "true_false",
        "question_field": "question",
        "context_field": "passage",
        "answer_field": "answer",
        "split": "train",
    },
    "rajpurkar/squad": {
        "name": "SQuAD - Reading Comprehension",
        "type": "extractive_qa",
        "question_field": "question",
        "context_field": "context",
        "answer_field": "answers",
        "split": "train",
    },
    "allenai/piqa": {
        "name": "PIQA - Physical Reasoning",
        "type": "binary_choice",
        "question_field": "goal",
        "option1_field": "sol1",
        "option2_field": "sol2",
        "answer_field": "label",
        "split": "train",
    },
}


class QuizApp:
    """In-memory state and grading logic for one quiz session.

    A session consists of a list of raw dataset rows (``questions``), the
    ``EVAL_DATASETS`` entry describing how to read them (``current_dataset``),
    a cursor (``current_question_idx``) and a running ``score``. The score is
    not updated by this class; callers increment it after ``check_answer``.
    """

    def __init__(self):
        self.current_dataset = None  # EVAL_DATASETS entry of the loaded dataset
        self.current_dataset_name = None  # dataset id, e.g. "openai/gsm8k"
        self.questions = []  # raw dataset rows sampled for this quiz
        self.current_question_idx = 0  # index into self.questions
        self.score = 0  # number of correct answers (maintained by callers)
        self.total_questions = 0  # len(self.questions) at load time

    def load_dataset_questions(
        self, dataset_name: str, num_questions: int = 10
    ) -> Tuple[bool, str]:
        """Load a random sample of questions from the selected dataset.

        Args:
            dataset_name: Key of ``EVAL_DATASETS`` identifying the dataset.
            num_questions: Desired sample size; capped at the split size.

        Returns:
            ``(True, status_message)`` on success, or
            ``(False, error_message)`` if anything went wrong. Exceptions are
            never propagated; on failure the previous session state is kept.
        """
        try:
            config = EVAL_DATASETS[dataset_name]
            split = config["split"]

            # Try to load dataset with config if specified
            try:
                if "config" in config:
                    dataset = load_dataset(dataset_name, config["config"], split=split)
                else:
                    dataset = load_dataset(dataset_name, split=split)
            except ValueError as e:
                # Only the "missing config" error is recoverable here.
                if "Config name is missing" not in str(e):
                    raise
                configs = get_dataset_config_names(dataset_name)
                # Use "all" if available, otherwise the first config
                selected_config = "all" if "all" in configs else configs[0]
                logging.info(
                    "Auto-selected config '%s' for %s", selected_config, dataset_name
                )
                dataset = load_dataset(dataset_name, selected_config, split=split)

            # Sample random questions. int() guards against a float coming
            # from a UI widget, which random.sample would reject.
            total_examples = len(dataset)
            num_questions = min(int(num_questions), total_examples)
            indices = random.sample(range(total_examples), num_questions)
            questions = [dataset[idx] for idx in indices]

            # Commit the new session only after every row was fetched, so a
            # failure above never leaves a half-populated question list.
            self.questions = questions
            self.current_dataset = config
            self.current_dataset_name = dataset_name
            self.current_question_idx = 0
            self.score = 0
            self.total_questions = len(self.questions)

            return True, f"Loaded {num_questions} questions from {config['name']}"

        except Exception as e:
            # Boundary handler: callers expect a (success, message) tuple.
            logging.exception("Failed to load dataset %s", dataset_name)
            return False, f"Error loading dataset: {str(e)}"

    @staticmethod
    def _split_choices(choices) -> Tuple[List[str], List[str]]:
        """Return ``(texts, labels)`` for a multiple-choice options value.

        A plain sequence yields its items as texts and no labels. A mapping is
        read as ``{"text": [...], "label": [...]}``.
        NOTE(review): the mapping shape is presumably what AI2 ARC uses -
        confirm against the dataset card.
        """
        if isinstance(choices, dict):
            texts = choices.get("text") or []
            labels = [str(label) for label in (choices.get("label") or [])]
        else:
            texts, labels = choices, []
        return [str(text) for text in texts], labels

    @staticmethod
    def _resolve_choice_index(raw_key, labels: List[str], num_choices: int):
        """Map a dataset answer key to a 0-based choice index, or ``None``.

        Accepted keys: an int index, a value present in ``labels``, a decimal
        string (read as a 0-based index) or a single letter ("A" -> 0).
        NOTE(review): decimal strings are assumed to be 0-based, as HellaSwag
        labels appear to be - confirm against the dataset card.
        """
        if isinstance(raw_key, int):
            index = raw_key
        else:
            key = str(raw_key).strip()
            if key in labels:
                index = labels.index(key)
            elif key.isdecimal():
                index = int(key)
            elif len(key) == 1 and key.isalpha():
                index = ord(key.upper()) - ord("A")
            else:
                return None
        return index if 0 <= index < num_choices else None

    @staticmethod
    def _resolve_binary_index(raw_key):
        """Map a binary-choice answer key to 0 (option 1) or 1 (option 2).

        Ints are taken as 0-based. Strings "1"/"2" are taken as 1-based option
        numbers. Anything else yields ``None``.
        NOTE(review): the string convention is assumed from WinoGrande, whose
        answers appear to be "1"/"2" - confirm against the dataset card.
        """
        if isinstance(raw_key, int):
            index = int(raw_key)
        else:
            index = {"1": 0, "2": 1}.get(str(raw_key).strip())
        return index if index in (0, 1) else None

    @staticmethod
    def _extract_numbers(text: str) -> List[str]:
        """Return all numeric tokens in ``text``, ignoring thousands commas."""
        import re

        # "1,000" must be read as one number, not as "1" followed by "000".
        without_separators = re.sub(r"(?<=\d),(?=\d{3}(?!\d))", "", text)
        return re.findall(r"-?\d+\.?\d*", without_separators)

    def get_current_question(self) -> Tuple[str, List[str], str]:
        """Get the current question formatted for display.

        Returns:
            ``(question_text, choices, question_type)``. ``choices`` is empty
            for free-text questions. ``("", [], "")`` is returned when there
            is no current question or the question type is unknown.
        """
        if not self.questions or self.current_question_idx >= len(self.questions):
            return "", [], ""

        question_data = self.questions[self.current_question_idx]
        config = self.current_dataset

        separator = "=" * 60
        logging.info("\n%s", separator)
        logging.info("Dataset: %s", self.current_dataset_name)
        logging.info(
            "Question %s/%s", self.current_question_idx + 1, self.total_questions
        )
        logging.info("Raw question data: %r", question_data)
        logging.info("%s\n", separator)

        # Format question based on dataset type
        question_type = config["type"]

        if question_type == "multiple_choice":
            question = question_data[config["question_field"]]
            texts, _ = self._split_choices(question_data[config["choices_field"]])
            # Options are always lettered by position; check_answer maps the
            # dataset's own answer key onto the same positions.
            formatted_choices = [
                f"{chr(65 + i)}. {text}" for i, text in enumerate(texts)
            ]
            return question, formatted_choices, question_type

        if question_type == "true_false":
            question = question_data[config["question_field"]]
            if "context_field" in config:
                context = question_data[config["context_field"]]
                question = f"Context: {context}\n\nQuestion: {question}"
            return question, ["True", "False"], question_type

        if question_type == "binary_choice":
            question = question_data[config["question_field"]]
            option1 = question_data[config["option1_field"]]
            option2 = question_data[config["option2_field"]]
            return question, [f"A. {option1}", f"B. {option2}"], question_type

        if question_type in ("qa", "extractive_qa"):
            question = question_data[config["question_field"]]
            if "context_field" in config and config["context_field"] in question_data:
                context = question_data[config["context_field"]]
                # Only the first 500 characters of the context are shown.
                question = f"Context: {context[:500]}...\n\nQuestion: {question}"
            return question, [], question_type

        return "", [], ""

    def format_answer(self, answer: str, dataset_name: str) -> str:
        """Format a reference answer for better readability.

        ``<<calculation=result>>`` annotations become ``result (=calculation)``
        and any other ``<<...>>`` annotation becomes ``[...]``. For GSM8K the
        ``####`` marker is replaced by a "Final Answer:" line and a line break
        is inserted after each sentence that is followed by a capital letter.
        """
        import re

        def format_equation(match):
            equation = match.group(1)
            # Only the "calculation=result" form (exactly one "=") is rewritten.
            parts = equation.split("=")
            if len(parts) == 2:
                calculation, result = parts
                return f"{result} (={calculation})"
            return f"[{equation}]"

        answer = re.sub(r"<<([^>]+)>>", format_equation, str(answer))

        # Dataset-specific formatting
        if dataset_name == "openai/gsm8k":
            # Format the final answer line
            answer = answer.replace("####", "\n\nFinal Answer:")
            # Ensure proper line breaks after periods for readability
            answer = re.sub(r"\. (?=[A-Z])", ".\n", answer)

        return answer

    def check_answer(self, user_answer: str) -> Tuple[bool, str]:
        """Check whether the user's answer to the current question is correct.

        Args:
            user_answer: The selected option label (e.g. ``"B. green"``,
                ``"True"``) or the typed free-text answer. ``None`` and blank
                answers are always graded as incorrect.

        Returns:
            ``(is_correct, feedback_message)``.
        """
        if not self.questions or self.current_question_idx >= len(self.questions):
            return False, "No question available"

        question_data = self.questions[self.current_question_idx]
        config = self.current_dataset
        question_type = config["type"]
        answer = "" if user_answer is None else str(user_answer)

        if question_type == "multiple_choice":
            return self._check_multiple_choice(question_data, config, answer)
        if question_type == "true_false":
            return self._check_true_false(question_data, config, answer)
        if question_type == "binary_choice":
            return self._check_binary_choice(question_data, config, answer)
        if question_type in ("qa", "extractive_qa"):
            return self._check_free_text(question_data, config, answer)

        return False, "Unknown question type"

    def _check_multiple_choice(self, question_data, config, user_answer: str):
        """Grade a lettered multiple-choice answer."""
        texts, labels = self._split_choices(question_data[config["choices_field"]])
        raw_key = question_data[config["answer_field"]]
        correct_idx = self._resolve_choice_index(raw_key, labels, len(texts))

        if correct_idx is None:
            # Unrecognised key: fall back to comparing against the raw value.
            correct_letter = str(raw_key)
            correct_choice = str(raw_key)
        else:
            correct_letter = chr(65 + correct_idx)
            correct_choice = texts[correct_idx]

        # [:1] instead of [0] so a blank answer cannot raise IndexError.
        user_letter = user_answer.strip().upper()[:1]
        if user_letter and user_letter == correct_letter:
            return True, '✅ Correct!'

        logging.info("Raw answer (multiple choice): %r", correct_choice)
        formatted_answer = self.format_answer(correct_choice, self.current_dataset_name)
        return (
            False,
            f'❌ Incorrect\n\nThe correct answer was {correct_letter}:\n\n{formatted_answer}',
        )

    def _check_true_false(self, question_data, config, user_answer: str):
        """Grade a True/False answer; anything but "true"/"false" is wrong."""
        correct_answer = question_data[config["answer_field"]]
        normalized = user_answer.strip().lower()
        is_correct = (
            normalized in ("true", "false")
            and (normalized == "true") == correct_answer
        )

        if is_correct:
            return True, '✅ Correct!'
        return (
            False,
            f'❌ Incorrect\n\nThe correct answer was {correct_answer}',
        )

    def _check_binary_choice(self, question_data, config, user_answer: str):
        """Grade an A/B answer against the dataset's binary answer key."""
        correct_idx = self._resolve_binary_index(question_data[config["answer_field"]])
        if correct_idx is None:
            return False, "No reference answer is available for this question"

        selection = user_answer.strip().upper()
        if selection.startswith("A"):
            user_idx = 0
        elif selection.startswith("B"):
            user_idx = 1
        else:
            user_idx = None  # blank / unrecognised answers never match

        if user_idx == correct_idx:
            return True, '✅ Correct!'

        correct_letter = "A" if correct_idx == 0 else "B"
        option_field = (
            config["option1_field"] if correct_idx == 0 else config["option2_field"]
        )
        correct_option = question_data[option_field]
        logging.info("Raw answer (binary choice): %r", correct_option)
        formatted_answer = self.format_answer(correct_option, self.current_dataset_name)
        return (
            False,
            f'❌ Incorrect\n\nThe correct answer was {correct_letter}:\n\n{formatted_answer}',
        )

    def _check_free_text(self, question_data, config, user_answer: str):
        """Grade a typed answer by numeric or substring comparison.

        This is a simple heuristic, not a rigorous matcher: when the reference
        contains numbers, the last number of each side is compared; otherwise
        a case-insensitive substring match in either direction is accepted.
        """
        import re

        correct_answer = question_data[config["answer_field"]]
        if isinstance(correct_answer, dict) and "text" in correct_answer:
            # e.g. {"text": [...], ...}: use the first listed answer text
            correct_answer = (
                str(correct_answer["text"][0]) if correct_answer["text"] else ""
            )
        elif isinstance(correct_answer, list) and len(correct_answer) > 0:
            first = correct_answer[0]
            correct_answer = str(first["text"] if isinstance(first, dict) else first)
        else:
            correct_answer = str(correct_answer)

        # For GSM8K-style answers, the final answer follows the "####" marker.
        final_answer = correct_answer
        final_answer_match = re.search(r"####\s*(.+)", correct_answer)
        if final_answer_match:
            final_answer = final_answer_match.group(1).strip()

        candidate = user_answer.strip()
        # Bound unconditionally: both lists are logged below even when the
        # user's answer is empty.
        correct_numbers = self._extract_numbers(final_answer)
        user_numbers = self._extract_numbers(candidate)

        is_correct = False
        if not candidate:
            pass  # an empty answer is never correct
        elif correct_numbers and user_numbers:
            # Compare the last number on each side, as floats to handle decimals
            try:
                is_correct = (
                    abs(float(correct_numbers[-1]) - float(user_numbers[-1])) < 0.0001
                )
            except ValueError:
                # Fall back to string comparison
                is_correct = correct_numbers[-1] == user_numbers[-1]
        elif correct_numbers:
            # Reference has numbers but the user's answer has none: wrong
            is_correct = False
        else:
            # Non-numeric reference: substring match in either direction
            lowered_candidate = candidate.lower()
            lowered_reference = correct_answer.lower()
            is_correct = (
                lowered_candidate in lowered_reference
                or lowered_reference in lowered_candidate
            )

        if is_correct:
            return True, '✅ Correct!'

        logging.info("Raw answer (QA): %r", correct_answer)
        logging.info("Extracted final answer: %r", final_answer)
        logging.info(
            "Correct numbers: %s, User numbers: %s", correct_numbers, user_numbers
        )
        formatted_answer = self.format_answer(correct_answer, self.current_dataset_name)
        # Debug: log the formatted answer
        logging.info("Formatted answer with LaTeX: %r", formatted_answer)
        return (
            False,
            f'❌ Incorrect\n\nThe correct answer was:\n\n{formatted_answer}',
        )


# Create global quiz app instance.
# Every UI callback below (start_quiz, submit_answer, next_question) reads and
# mutates this single module-level object.
# NOTE(review): because the instance is shared, concurrent users would
# presumably overwrite each other's quiz state - confirm whether per-session
# state is needed.
quiz_app = QuizApp()


def create_dataset_display():
    """Build a Markdown summary listing every configured dataset.

    Each entry of ``EVAL_DATASETS`` contributes a bold title followed by its
    dataset id and question type; entries are separated by a blank line.
    """
    return "\n\n".join(
        f"**{entry['name']}**\n- Dataset: {key}\n- Type: {entry['type']}"
        for key, entry in EVAL_DATASETS.items()
    )


def start_quiz(dataset_choice: str, num_questions: int):
    """Begin a fresh quiz on the chosen dataset.

    Returns a 6-tuple of updates, in order: status message, question_display,
    answer_radio, answer_textbox, submit_button, progress_text. When no
    dataset matches or loading fails, all five components are hidden and only
    the status message carries information.
    """

    def all_hidden(status):
        # Status text followed by five "hide" updates, one per quiz component.
        return (status,) + tuple(gr.update(visible=False) for _ in range(5))

    # Resolve the dropdown label back to its dataset id (first match wins).
    dataset_id = next(
        (
            did
            for did, config in EVAL_DATASETS.items()
            if config["name"] in dataset_choice
        ),
        None,
    )
    if not dataset_id:
        return all_hidden("Please select a dataset")

    success, message = quiz_app.load_dataset_questions(dataset_id, num_questions)
    if not success:
        return all_hidden(message)

    question, choices, q_type = quiz_app.get_current_question()

    # Exactly one of the two answer widgets is shown, depending on the type.
    if q_type in ["multiple_choice", "true_false", "binary_choice"]:
        radio_update = gr.update(choices=choices, visible=True, value=None)
        textbox_update = gr.update(visible=False)
    else:
        radio_update = gr.update(visible=False)
        textbox_update = gr.update(visible=True, value="")

    return (
        message,
        gr.update(value=question, visible=True),  # question_display
        radio_update,  # answer_radio
        textbox_update,  # answer_textbox
        gr.update(visible=True),  # submit_button
        gr.update(value=f"Question 1/{quiz_app.total_questions}", visible=True),  # progress_text
    )


def submit_answer(answer_choice, answer_text):
    """Grade the submitted answer and reveal the feedback and Next button.

    Args:
        answer_choice: Current value of the radio widget (may be ``None``).
        answer_text: Current value of the free-text widget (may be empty).

    Returns:
        A pair of updates: the feedback box (made visible, showing the grading
        message) and the Next button (made visible).
    """
    # Pick the input that belongs to the active question type. Choosing
    # "whichever is non-empty" is wrong: a hidden radio can still hold a
    # selection from an earlier choice-based quiz, which would then shadow
    # the text the user actually typed.
    config = quiz_app.current_dataset or {}
    if config.get("type") in ("multiple_choice", "true_false", "binary_choice"):
        answer = answer_choice
    else:
        answer = answer_text

    # check_answer expects a string; treat "nothing selected" as empty.
    is_correct, feedback = quiz_app.check_answer(answer or "")

    # NOTE(review): submitting the same question repeatedly adds to the score
    # each time - confirm whether the submit button should be disabled here.
    if is_correct:
        quiz_app.score += 1

    return gr.update(value=feedback, visible=True), gr.update(visible=True)


def next_question():
    """Advance to the next question, or show the final score when done.

    Returns a 7-tuple of updates, in order: feedback_display,
    question_display, answer_radio, answer_textbox, submit_button,
    next_button, progress_text.
    """
    quiz_app.current_question_idx += 1
    total = quiz_app.total_questions

    if quiz_app.current_question_idx >= total:
        # Quiz complete. Guard the percentage against an empty quiz so this
        # callback cannot raise ZeroDivisionError.
        percentage = quiz_app.score / total * 100 if total else 0.0
        final_score = f'🎉 Quiz Complete!\n\nYour score: {quiz_app.score}/{total} ({percentage:.1f}%)'
        return (
            gr.update(value=final_score, visible=True),  # feedback_display
            "",  # question_display
            gr.update(visible=False),  # answer_radio
            gr.update(visible=False),  # answer_textbox
            gr.update(visible=False),  # submit_button
            gr.update(visible=False),  # next_button
            "Quiz Complete",  # progress_text
        )

    question, choices, q_type = quiz_app.get_current_question()

    # Exactly one of the two answer widgets is shown, depending on the type.
    if q_type in ["multiple_choice", "true_false", "binary_choice"]:
        radio_update = gr.update(choices=choices, visible=True, value=None)
        textbox_update = gr.update(visible=False)
    else:
        radio_update = gr.update(visible=False)
        textbox_update = gr.update(visible=True, value="")

    return (
        gr.update(value="", visible=False),  # Clear feedback
        gr.update(value=question),  # question_display
        radio_update,  # answer_radio
        textbox_update,  # answer_textbox
        gr.update(visible=True),  # submit_button
        gr.update(visible=False),  # next_button (hidden until the next submit)
        gr.update(value=f"Question {quiz_app.current_question_idx + 1}/{total}"),  # progress_text
    )


# Create Gradio interface.
# Layout: a dataset picker and question-count slider, a Start button with a
# status line, then the quiz widgets, which are all created hidden and are
# revealed by the callbacks wired up at the bottom of this block.
with gr.Blocks(title="HuggingFace Evaluation Dataset Quiz") as demo:
    gr.Markdown("# 🤗 Evaluation Dataset Quiz")
    gr.Markdown(
        "Test yourself with questions from popular HuggingFace evaluation datasets!"
    )

    # Dataset Selection Section
    with gr.Row():
        # Choices are the human-readable names; start_quiz maps the selected
        # name back to its dataset id. The first configured dataset is the
        # default.
        dataset_dropdown = gr.Dropdown(
            choices=[config["name"] for config in EVAL_DATASETS.values()],
            label="Select Dataset",
            value=list(EVAL_DATASETS.values())[0]["name"],
        )
        num_questions_slider = gr.Slider(
            minimum=5, maximum=20, value=10, step=1, label="Number of Questions"
        )

    start_button = gr.Button("Start Quiz", variant="primary")
    status_message = gr.Textbox(label="Status", interactive=False)
    
    # Quiz Section - shown when quiz starts
    gr.Markdown("---")  # Separator
    
    progress_text = gr.Textbox(label="Progress", value="0/0", interactive=False, visible=False)
    question_display = gr.Textbox(label="Question", lines=5, interactive=False, visible=False)

    # Answer inputs (one will be visible at a time)
    answer_radio = gr.Radio(label="Select your answer", visible=False)
    answer_textbox = gr.Textbox(label="Type your answer (Raw number)", visible=False)

    submit_button = gr.Button("Submit Answer", variant="primary", visible=False)

    feedback_display = gr.Textbox(
        label="Feedback",
        visible=False,
        lines=10,
        max_lines=20,
        interactive=False
    )
    next_button = gr.Button("Next Question", visible=False)

    # Connect events.
    # The order of each `outputs` list must match the order of the tuple
    # returned by the corresponding callback.
    start_button.click(
        start_quiz,
        inputs=[dataset_dropdown, num_questions_slider],
        outputs=[
            status_message,
            question_display,
            answer_radio,
            answer_textbox,
            submit_button,
            progress_text,
        ],
    )

    # Both answer widgets are passed in; submit_answer decides which to use.
    submit_button.click(
        submit_answer,
        inputs=[answer_radio, answer_textbox],
        outputs=[feedback_display, next_button],
    )

    # next_question takes no inputs; it advances the shared quiz_app state.
    next_button.click(
        next_question,
        outputs=[
            feedback_display,
            question_display,
            answer_radio,
            answer_textbox,
            submit_button,
            next_button,
            progress_text,
        ],
    )

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()