Spaces:

TeddyYao
/

grok4-gpqa-eval

Running

File size: 4,184 Bytes

8474f02

import gradio as gr
import pandas as pd
import json
import os
from datetime import datetime
from dotenv import load_dotenv
import time

# Load environment variables
load_dotenv()

RESULTS_DIR = "results"
PROGRESS_FILE = os.path.join(RESULTS_DIR, "gpqa_progress.json")

def load_progress():
    if not os.path.exists(PROGRESS_FILE):
        return pd.DataFrame(), "No progress file found. The evaluation might be starting up.", "N/A"

    try:
        df = pd.read_json(PROGRESS_FILE)
        if df.empty:
            return pd.DataFrame(), "Progress file is empty.", "N/A"

        # Calculate metrics
        total_questions = len(df)
        correct_answers = df['is_correct'].sum()
        accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
        avg_response_time = df['response_time'].mean()

        summary_text = f"""
        ## Evaluation Progress
        - **Questions Processed:** {total_questions} / 448
        - **Current Accuracy:** {accuracy:.2f}%
        - **Correct Answers:** {correct_answers}
        - **Average Response Time:** {avg_response_time:.2f} seconds/question
        """
        
        # Get last modified time
        last_modified_time = datetime.fromtimestamp(os.path.getmtime(PROGRESS_FILE)).strftime('%Y-%m-%d %H:%M:%S')
        
        return df, summary_text, f"Last updated: {last_modified_time}"
    except Exception as e:
        return pd.DataFrame(), f"Error loading progress file: {e}", "N/A"

def create_ui():
    df, summary, last_updated = load_progress()
    
    with gr.Blocks(theme=gr.themes.Soft(), title="GPQA Evaluation Progress") as demo:
        gr.Markdown("# Real-Time GPQA Evaluation Dashboard")
        gr.Markdown("This dashboard shows the progress of the GPQA benchmark evaluation for the `grok-4-0709` model.")
        
        with gr.Row():
            summary_box = gr.Markdown(summary)
            last_updated_box = gr.Markdown(last_updated)
        
        with gr.Row():
            # Create a simple plot: number of correct vs incorrect answers
            if not df.empty:
                correct_counts = df['is_correct'].value_counts().rename({True: 'Correct', False: 'Incorrect'})
                plot = gr.BarPlot(correct_counts, x="Answer Status", y="Count", title="Correct vs. Incorrect Answers", interactive=False)

        gr.Markdown("## Raw Results")
        gr.DataFrame(df, wrap=True)

    return demo

def check_environment():
    """Check if all required environment variables are set"""
    issues = []
    
    if not os.getenv('GROK_API_KEY'):
        issues.append("GROK_API_KEY not found in environment")
    
    if not os.getenv('HF_TOKEN'):
        issues.append("HF_TOKEN not found (required for GPQA dataset access)")
    
    return issues

def start_evaluation_safe():
    """Safely start the evaluation process with error handling"""
    issues = check_environment()
    if issues:
        print("⚠️  Environment issues detected:")
        for issue in issues:
            print(f"   - {issue}")
        print("\nPlease set the required environment variables in .env or Hugging Face Secrets")
        return None
    
    import subprocess
    import sys
    
    print("Starting background evaluation process...")
    command = [
        sys.executable, 
        "run_evaluation.py", 
        "--config", "official_config.yaml", 
        "--models", "grok-4-0709", 
        "--benchmarks", "gpqa"
    ]
    
    try:
        # Use Popen to run in the background
        process = subprocess.Popen(command)
        print(f"Evaluation process started with PID: {process.pid}")
        return process
    except Exception as e:
        print(f"Failed to start evaluation: {e}")
        return None

if __name__ == "__main__":
    # Check environment first
    issues = check_environment()
    
    if issues:
        # Create UI with warning message
        ui = create_ui()
        print("\n⚠️  Running in demo mode due to missing configuration")
    else:
        # Start evaluation process
        process = start_evaluation_safe()
        ui = create_ui()
    
    # Launch the UI
    ui.launch(server_name="0.0.0.0", server_port=7860)