"""Gradio dashboard showing live progress of a GPQA benchmark evaluation.

Reads ``results/gpqa_progress.json`` (written by the background evaluation
process) and renders accuracy metrics, a correct/incorrect bar chart, and the
raw per-question results table.
"""

import json
import os
import time
from datetime import datetime

import gradio as gr
import pandas as pd
from dotenv import load_dotenv

# Load environment variables from .env (API keys, tokens).
load_dotenv()

RESULTS_DIR = "results"
PROGRESS_FILE = os.path.join(RESULTS_DIR, "gpqa_progress.json")


def load_progress():
    """Load the evaluation progress file and compute summary metrics.

    Returns:
        tuple[pd.DataFrame, str, str]: (results dataframe, markdown summary,
        last-updated label).  On any failure the dataframe is empty and the
        summary explains what went wrong.
    """
    if not os.path.exists(PROGRESS_FILE):
        return (
            pd.DataFrame(),
            "No progress file found. The evaluation might be starting up.",
            "N/A",
        )
    try:
        df = pd.read_json(PROGRESS_FILE)
        if df.empty:
            return pd.DataFrame(), "Progress file is empty.", "N/A"

        # Calculate metrics from the per-question records.
        total_questions = len(df)
        correct_answers = df["is_correct"].sum()
        accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
        avg_response_time = df["response_time"].mean()

        summary_text = f"""
## Evaluation Progress
- **Questions Processed:** {total_questions} / 448
- **Current Accuracy:** {accuracy:.2f}%
- **Correct Answers:** {correct_answers}
- **Average Response Time:** {avg_response_time:.2f} seconds/question
"""

        # Use the file's mtime as the freshness indicator.
        last_modified_time = datetime.fromtimestamp(
            os.path.getmtime(PROGRESS_FILE)
        ).strftime("%Y-%m-%d %H:%M:%S")
        return df, summary_text, f"Last updated: {last_modified_time}"
    except Exception as e:
        # Top-level boundary: surface the error in the UI rather than crash.
        return pd.DataFrame(), f"Error loading progress file: {e}", "N/A"


def _status_counts_frame(df):
    """Build a two-column DataFrame of correct/incorrect answer counts.

    gr.BarPlot binds ``x``/``y`` to *column names*, so the raw
    ``value_counts()`` Series must be reshaped into labelled columns first.
    """
    return (
        df["is_correct"]
        .value_counts()
        .rename({True: "Correct", False: "Incorrect"})
        .rename_axis("Answer Status")
        .reset_index(name="Count")
    )


def create_ui():
    """Construct and return the Gradio Blocks dashboard."""
    df, summary, last_updated = load_progress()

    with gr.Blocks(theme=gr.themes.Soft(), title="GPQA Evaluation Progress") as demo:
        gr.Markdown("# Real-Time GPQA Evaluation Dashboard")
        gr.Markdown(
            "This dashboard shows the progress of the GPQA benchmark evaluation "
            "for the `grok-4-0709` model."
        )

        with gr.Row():
            summary_box = gr.Markdown(summary)
            last_updated_box = gr.Markdown(last_updated)

        with gr.Row():
            # Simple plot: number of correct vs incorrect answers.
            # Bug fix: BarPlot needs a DataFrame whose columns match x/y —
            # the previous code passed a Series with no such columns.
            if not df.empty:
                gr.BarPlot(
                    _status_counts_frame(df),
                    x="Answer Status",
                    y="Count",
                    title="Correct vs. Incorrect Answers",
                    interactive=False,
                )

        gr.Markdown("## Raw Results")
        results_table = gr.DataFrame(df, wrap=True)

        # Bug fix: a "real-time" dashboard that never re-reads the progress
        # file is stale forever; let the user pull fresh numbers on demand.
        def _refresh():
            new_df, new_summary, new_updated = load_progress()
            return new_summary, new_updated, new_df

        refresh_btn = gr.Button("Refresh")
        refresh_btn.click(
            _refresh,
            inputs=None,
            outputs=[summary_box, last_updated_box, results_table],
        )

    return demo


def check_environment():
    """Check if all required environment variables are set.

    Returns:
        list[str]: human-readable descriptions of missing variables
        (empty when the environment is fully configured).
    """
    issues = []
    if not os.getenv("GROK_API_KEY"):
        issues.append("GROK_API_KEY not found in environment")
    if not os.getenv("HF_TOKEN"):
        issues.append("HF_TOKEN not found (required for GPQA dataset access)")
    return issues


def start_evaluation_safe():
    """Safely start the evaluation process with error handling.

    Returns:
        subprocess.Popen | None: the background process handle, or None when
        the environment is misconfigured or the launch fails.
    """
    issues = check_environment()
    if issues:
        print("⚠️ Environment issues detected:")
        for issue in issues:
            print(f"  - {issue}")
        print("\nPlease set the required environment variables in .env or Hugging Face Secrets")
        return None

    import subprocess
    import sys

    print("Starting background evaluation process...")
    command = [
        sys.executable,
        "run_evaluation.py",
        "--config", "official_config.yaml",
        "--models", "grok-4-0709",
        "--benchmarks", "gpqa",
    ]
    try:
        # Use Popen so the evaluation runs in the background while the UI serves.
        process = subprocess.Popen(command)
        print(f"Evaluation process started with PID: {process.pid}")
        return process
    except Exception as e:
        print(f"Failed to start evaluation: {e}")
        return None


if __name__ == "__main__":
    # Check environment first
    issues = check_environment()
    if issues:
        # Create UI with warning message
        ui = create_ui()
        print("\n⚠️ Running in demo mode due to missing configuration")
    else:
        # Start evaluation process
        process = start_evaluation_safe()
        ui = create_ui()

    # Launch the UI
    ui.launch(server_name="0.0.0.0", server_port=7860)