"""Gradio dashboard for monitoring GPQA benchmark evaluation progress in real time."""
import json
import os
import time
from datetime import datetime

import gradio as gr
import pandas as pd
from dotenv import load_dotenv

# Load environment variables (API keys, HF token) from a local .env file.
load_dotenv()

# The evaluation runner appends per-question records to PROGRESS_FILE as it
# works through the benchmark; this dashboard only ever reads it.
RESULTS_DIR = "results"
PROGRESS_FILE = os.path.join(RESULTS_DIR, "gpqa_progress.json")
def load_progress():
    """Load the progress file and compute summary metrics for the dashboard.

    Returns:
        tuple: ``(df, summary_markdown, last_updated)`` where ``df`` is the
        per-question results DataFrame (empty on any failure),
        ``summary_markdown`` is a markdown progress summary or an error
        message, and ``last_updated`` is a human-readable timestamp or "N/A".
    """
    if not os.path.exists(PROGRESS_FILE):
        return pd.DataFrame(), "No progress file found. The evaluation might be starting up.", "N/A"
    try:
        df = pd.read_json(PROGRESS_FILE)
        if df.empty:
            return pd.DataFrame(), "Progress file is empty.", "N/A"

        # Aggregate metrics over the records processed so far.
        total_questions = len(df)
        correct_answers = df['is_correct'].sum()
        accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
        avg_response_time = df['response_time'].mean()

        # NOTE(review): 448 is the assumed GPQA question count — confirm
        # against the benchmark config used by the runner.
        summary_text = f"""
## Evaluation Progress
- **Questions Processed:** {total_questions} / 448
- **Current Accuracy:** {accuracy:.2f}%
- **Correct Answers:** {correct_answers}
- **Average Response Time:** {avg_response_time:.2f} seconds/question
"""
        # Surface the file's mtime so viewers can tell whether the run stalled.
        last_modified_time = datetime.fromtimestamp(os.path.getmtime(PROGRESS_FILE)).strftime('%Y-%m-%d %H:%M:%S')
        return df, summary_text, f"Last updated: {last_modified_time}"
    except Exception as e:
        # The runner may be mid-write (truncated JSON) — report, don't crash.
        return pd.DataFrame(), f"Error loading progress file: {e}", "N/A"
def create_ui():
    """Build the Gradio Blocks dashboard from the current progress snapshot.

    Returns:
        gr.Blocks: The assembled (not yet launched) dashboard.
    """
    df, summary, last_updated = load_progress()

    with gr.Blocks(theme=gr.themes.Soft(), title="GPQA Evaluation Progress") as demo:
        gr.Markdown("# Real-Time GPQA Evaluation Dashboard")
        gr.Markdown("This dashboard shows the progress of the GPQA benchmark evaluation for the `grok-4-0709` model.")

        with gr.Row():
            gr.Markdown(summary)
            gr.Markdown(last_updated)

        with gr.Row():
            # Simple plot: number of correct vs incorrect answers.
            if not df.empty:
                # BUG FIX: gr.BarPlot expects a DataFrame whose columns match
                # the x/y names. The original passed a renamed Series, so the
                # "Answer Status"/"Count" columns never existed.
                correct = int(df['is_correct'].sum())
                plot_df = pd.DataFrame({
                    "Answer Status": ["Correct", "Incorrect"],
                    "Count": [correct, len(df) - correct],
                })
                gr.BarPlot(
                    plot_df,
                    x="Answer Status",
                    y="Count",
                    title="Correct vs. Incorrect Answers",
                    interactive=False,
                )

        gr.Markdown("## Raw Results")
        gr.DataFrame(df, wrap=True)

    return demo
def check_environment():
    """Check that all required environment variables are set.

    Returns:
        list[str]: A human-readable description for each missing variable;
        empty when the environment is fully configured.
    """
    issues = []
    if not os.getenv('GROK_API_KEY'):
        issues.append("GROK_API_KEY not found in environment")
    if not os.getenv('HF_TOKEN'):
        issues.append("HF_TOKEN not found (required for GPQA dataset access)")
    return issues
def start_evaluation_safe():
    """Safely start the background evaluation process with error handling.

    Returns:
        subprocess.Popen | None: Handle of the launched process, or ``None``
        when the environment is incomplete or the launch failed.
    """
    issues = check_environment()
    if issues:
        print("⚠️ Environment issues detected:")
        for issue in issues:
            print(f" - {issue}")
        print("\nPlease set the required environment variables in .env or Hugging Face Secrets")
        return None

    # Imported lazily: only needed when we actually launch the runner.
    import subprocess
    import sys

    print("Starting background evaluation process...")
    # List form (shell=False) avoids shell-injection and quoting issues.
    command = [
        sys.executable,
        "run_evaluation.py",
        "--config", "official_config.yaml",
        "--models", "grok-4-0709",
        "--benchmarks", "gpqa",
    ]
    try:
        # Popen so the dashboard keeps serving while the evaluation runs.
        process = subprocess.Popen(command)
        print(f"Evaluation process started with PID: {process.pid}")
        return process
    except Exception as e:
        print(f"Failed to start evaluation: {e}")
        return None
if __name__ == "__main__": | |
# Check environment first | |
issues = check_environment() | |
if issues: | |
# Create UI with warning message | |
ui = create_ui() | |
print("\n⚠️ Running in demo mode due to missing configuration") | |
else: | |
# Start evaluation process | |
process = start_evaluation_safe() | |
ui = create_ui() | |
# Launch the UI | |
ui.launch(server_name="0.0.0.0", server_port=7860) | |