# grok4-gpqa-eval / app.py
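"""Gradio dashboard for monitoring a GPQA benchmark evaluation of the
grok-4-0709 model.

The dashboard reads incremental results from results/gpqa_progress.json and,
when the required credentials are available, launches the background
evaluation process (run_evaluation.py)."""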
import gradio as gr
import pandas as pd
import json
import os
from datetime import datetime
from dotenv import load_dotenv
import time
# Load environment variables
load_dotenv()
RESULTS_DIR = "results"
PROGRESS_FILE = os.path.join(RESULTS_DIR, "gpqa_progress.json")
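# NOTE (assumed schema): the progress file is expected to be a JSON array of
# per-question records. load_progress() relies on each record having at least
# an 'is_correct' (bool) and a 'response_time' (seconds, float) field; any
# other fields are shown verbatim in the raw-results table.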
def load_progress():
    """Load the progress JSON and return (DataFrame, summary markdown, last-updated text)."""
    if not os.path.exists(PROGRESS_FILE):
        return pd.DataFrame(), "No progress file found. The evaluation might be starting up.", "N/A"
    try:
        df = pd.read_json(PROGRESS_FILE)
        if df.empty:
            return pd.DataFrame(), "Progress file is empty.", "N/A"

        # Calculate running metrics
        total_questions = len(df)
        correct_answers = df['is_correct'].sum()
        accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
        avg_response_time = df['response_time'].mean()

        summary_text = f"""
## Evaluation Progress
- **Questions Processed:** {total_questions} / 448
- **Current Accuracy:** {accuracy:.2f}%
- **Correct Answers:** {correct_answers}
- **Average Response Time:** {avg_response_time:.2f} seconds/question
"""
        # Get the last-modified time of the progress file
        last_modified_time = datetime.fromtimestamp(os.path.getmtime(PROGRESS_FILE)).strftime('%Y-%m-%d %H:%M:%S')
        return df, summary_text, f"Last updated: {last_modified_time}"
    except Exception as e:
        return pd.DataFrame(), f"Error loading progress file: {e}", "N/A"
def create_ui():
    """Build the Gradio Blocks dashboard from the current progress data."""
    df, summary, last_updated = load_progress()
    with gr.Blocks(theme=gr.themes.Soft(), title="GPQA Evaluation Progress") as demo:
        gr.Markdown("# Real-Time GPQA Evaluation Dashboard")
        gr.Markdown("This dashboard shows the progress of the GPQA benchmark evaluation for the `grok-4-0709` model.")
        with gr.Row():
            summary_box = gr.Markdown(summary)
            last_updated_box = gr.Markdown(last_updated)
        with gr.Row():
            # Simple plot: number of correct vs. incorrect answers.
            # gr.BarPlot expects a DataFrame whose columns match the x/y names,
            # so the value counts are reshaped into an 'Answer Status'/'Count' frame.
            if not df.empty:
                correct_counts = (
                    df['is_correct']
                    .value_counts()
                    .rename({True: 'Correct', False: 'Incorrect'})
                    .rename_axis('Answer Status')
                    .reset_index(name='Count')
                )
                gr.BarPlot(correct_counts, x="Answer Status", y="Count", title="Correct vs. Incorrect Answers", interactive=False)
        gr.Markdown("## Raw Results")
        gr.DataFrame(df, wrap=True)
    return demo
def check_environment():
    """Check if all required environment variables are set"""
    issues = []
    if not os.getenv('GROK_API_KEY'):
        issues.append("GROK_API_KEY not found in environment")
    if not os.getenv('HF_TOKEN'):
        issues.append("HF_TOKEN not found (required for GPQA dataset access)")
    return issues
def start_evaluation_safe():
    """Safely start the evaluation process with error handling"""
    issues = check_environment()
    if issues:
        print("⚠️ Environment issues detected:")
        for issue in issues:
            print(f"  - {issue}")
        print("\nPlease set the required environment variables in .env or Hugging Face Secrets")
        return None

    import subprocess
    import sys

    print("Starting background evaluation process...")
    command = [
        sys.executable,
        "run_evaluation.py",
        "--config", "official_config.yaml",
        "--models", "grok-4-0709",
        "--benchmarks", "gpqa"
    ]
    try:
        # Use Popen so the evaluation runs in the background without blocking the UI
        process = subprocess.Popen(command)
        print(f"Evaluation process started with PID: {process.pid}")
        return process
    except Exception as e:
        print(f"Failed to start evaluation: {e}")
        return None
if __name__ == "__main__":
    # Check environment first
    issues = check_environment()
    if issues:
        # Create UI anyway and warn about the missing configuration
        ui = create_ui()
        print("\n⚠️ Running in demo mode due to missing configuration")
    else:
        # Start the evaluation process in the background, then build the dashboard
        process = start_evaluation_safe()
        ui = create_ui()

    # Launch the UI
    ui.launch(server_name="0.0.0.0", server_port=7860)
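# Example local run (assumes the required keys are set, e.g. via a .env file):
#   GROK_API_KEY=... HF_TOKEN=... python app.py
# The dashboard is then served on port 7860 (http://localhost:7860).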