File size: 4,184 Bytes
8474f02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import gradio as gr
import pandas as pd
import json
import os
from datetime import datetime
from dotenv import load_dotenv
import time

# Load environment variables
load_dotenv()

RESULTS_DIR = "results"
PROGRESS_FILE = os.path.join(RESULTS_DIR, "gpqa_progress.json")

def load_progress():
    if not os.path.exists(PROGRESS_FILE):
        return pd.DataFrame(), "No progress file found. The evaluation might be starting up.", "N/A"

    try:
        df = pd.read_json(PROGRESS_FILE)
        if df.empty:
            return pd.DataFrame(), "Progress file is empty.", "N/A"

        # Calculate metrics
        total_questions = len(df)
        correct_answers = df['is_correct'].sum()
        accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
        avg_response_time = df['response_time'].mean()

        summary_text = f"""
        ## Evaluation Progress
        - **Questions Processed:** {total_questions} / 448
        - **Current Accuracy:** {accuracy:.2f}%
        - **Correct Answers:** {correct_answers}
        - **Average Response Time:** {avg_response_time:.2f} seconds/question
        """
        
        # Get last modified time
        last_modified_time = datetime.fromtimestamp(os.path.getmtime(PROGRESS_FILE)).strftime('%Y-%m-%d %H:%M:%S')
        
        return df, summary_text, f"Last updated: {last_modified_time}"
    except Exception as e:
        return pd.DataFrame(), f"Error loading progress file: {e}", "N/A"

def create_ui():
    df, summary, last_updated = load_progress()
    
    with gr.Blocks(theme=gr.themes.Soft(), title="GPQA Evaluation Progress") as demo:
        gr.Markdown("# Real-Time GPQA Evaluation Dashboard")
        gr.Markdown("This dashboard shows the progress of the GPQA benchmark evaluation for the `grok-4-0709` model.")
        
        with gr.Row():
            summary_box = gr.Markdown(summary)
            last_updated_box = gr.Markdown(last_updated)
        
        with gr.Row():
            # Create a simple plot: number of correct vs incorrect answers
            if not df.empty:
                correct_counts = df['is_correct'].value_counts().rename({True: 'Correct', False: 'Incorrect'})
                plot = gr.BarPlot(correct_counts, x="Answer Status", y="Count", title="Correct vs. Incorrect Answers", interactive=False)

        gr.Markdown("## Raw Results")
        gr.DataFrame(df, wrap=True)

    return demo

def check_environment():
    """Check if all required environment variables are set"""
    issues = []
    
    if not os.getenv('GROK_API_KEY'):
        issues.append("GROK_API_KEY not found in environment")
    
    if not os.getenv('HF_TOKEN'):
        issues.append("HF_TOKEN not found (required for GPQA dataset access)")
    
    return issues

def start_evaluation_safe():
    """Safely start the evaluation process with error handling"""
    issues = check_environment()
    if issues:
        print("⚠️  Environment issues detected:")
        for issue in issues:
            print(f"   - {issue}")
        print("\nPlease set the required environment variables in .env or Hugging Face Secrets")
        return None
    
    import subprocess
    import sys
    
    print("Starting background evaluation process...")
    command = [
        sys.executable, 
        "run_evaluation.py", 
        "--config", "official_config.yaml", 
        "--models", "grok-4-0709", 
        "--benchmarks", "gpqa"
    ]
    
    try:
        # Use Popen to run in the background
        process = subprocess.Popen(command)
        print(f"Evaluation process started with PID: {process.pid}")
        return process
    except Exception as e:
        print(f"Failed to start evaluation: {e}")
        return None

if __name__ == "__main__":
    # Check environment first
    issues = check_environment()
    
    if issues:
        # Create UI with warning message
        ui = create_ui()
        print("\n⚠️  Running in demo mode due to missing configuration")
    else:
        # Start evaluation process
        process = start_evaluation_safe()
        ui = create_ui()
    
    # Launch the UI
    ui.launch(server_name="0.0.0.0", server_port=7860)