# grok4-gpqa-eval / app.py
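"""Gradio dashboard for monitoring a GPQA benchmark evaluation of the
grok-4-0709 model.

The dashboard reads incremental results from results/gpqa_progress.json and,
when the required credentials are available, launches the background
evaluation process (run_evaluation.py)."""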
import gradio as gr
import pandas as pd
import json
import os
from datetime import datetime
from dotenv import load_dotenv
import time
# Load environment variables
load_dotenv()
RESULTS_DIR = "results"
PROGRESS_FILE = os.path.join(RESULTS_DIR, "gpqa_progress.json")
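# NOTE (assumed schema): the progress file is expected to be a JSON array of
# per-question records. load_progress() relies on each record having at least
# an 'is_correct' (bool) and a 'response_time' (seconds, float) field; any
# other fields are shown verbatim in the raw-results table.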
def load_progress():
    """Load the progress JSON and return (DataFrame, summary markdown, last-updated text)."""
    if not os.path.exists(PROGRESS_FILE):
        return pd.DataFrame(), "No progress file found. The evaluation might be starting up.", "N/A"
    try:
        df = pd.read_json(PROGRESS_FILE)
        if df.empty:
            return pd.DataFrame(), "Progress file is empty.", "N/A"

        # Calculate running metrics
        total_questions = len(df)
        correct_answers = df['is_correct'].sum()
        accuracy = (correct_answers / total_questions) * 100 if total_questions > 0 else 0
        avg_response_time = df['response_time'].mean()

        summary_text = f"""
## Evaluation Progress
- **Questions Processed:** {total_questions} / 448
- **Current Accuracy:** {accuracy:.2f}%
- **Correct Answers:** {correct_answers}
- **Average Response Time:** {avg_response_time:.2f} seconds/question
"""
        # Get the last-modified time of the progress file
        last_modified_time = datetime.fromtimestamp(os.path.getmtime(PROGRESS_FILE)).strftime('%Y-%m-%d %H:%M:%S')
        return df, summary_text, f"Last updated: {last_modified_time}"
    except Exception as e:
        return pd.DataFrame(), f"Error loading progress file: {e}", "N/A"
def create_ui():
    """Build the Gradio Blocks dashboard from the current progress data."""
    df, summary, last_updated = load_progress()
    with gr.Blocks(theme=gr.themes.Soft(), title="GPQA Evaluation Progress") as demo:
        gr.Markdown("# Real-Time GPQA Evaluation Dashboard")
        gr.Markdown("This dashboard shows the progress of the GPQA benchmark evaluation for the `grok-4-0709` model.")
        with gr.Row():
            summary_box = gr.Markdown(summary)
            last_updated_box = gr.Markdown(last_updated)
        with gr.Row():
            # Simple plot: number of correct vs. incorrect answers.
            # gr.BarPlot expects a DataFrame whose columns match the x/y names,
            # so the value counts are reshaped into an 'Answer Status'/'Count' frame.
            if not df.empty:
                correct_counts = (
                    df['is_correct']
                    .value_counts()
                    .rename({True: 'Correct', False: 'Incorrect'})
                    .rename_axis('Answer Status')
                    .reset_index(name='Count')
                )
                gr.BarPlot(correct_counts, x="Answer Status", y="Count", title="Correct vs. Incorrect Answers", interactive=False)
        gr.Markdown("## Raw Results")
        gr.DataFrame(df, wrap=True)
    return demo
def check_environment():
    """Check if all required environment variables are set"""
    issues = []
    if not os.getenv('GROK_API_KEY'):
        issues.append("GROK_API_KEY not found in environment")
    if not os.getenv('HF_TOKEN'):
        issues.append("HF_TOKEN not found (required for GPQA dataset access)")
    return issues
def start_evaluation_safe():
    """Safely start the evaluation process with error handling"""
    issues = check_environment()
    if issues:
        print("⚠️ Environment issues detected:")
        for issue in issues:
            print(f"  - {issue}")
        print("\nPlease set the required environment variables in .env or Hugging Face Secrets")
        return None

    import subprocess
    import sys

    print("Starting background evaluation process...")
    command = [
        sys.executable,
        "run_evaluation.py",
        "--config", "official_config.yaml",
        "--models", "grok-4-0709",
        "--benchmarks", "gpqa"
    ]
    try:
        # Use Popen so the evaluation runs in the background without blocking the UI
        process = subprocess.Popen(command)
        print(f"Evaluation process started with PID: {process.pid}")
        return process
    except Exception as e:
        print(f"Failed to start evaluation: {e}")
        return None
if __name__ == "__main__":
    # Check environment first
    issues = check_environment()
    if issues:
        # Create UI anyway and warn about the missing configuration
        ui = create_ui()
        print("\n⚠️ Running in demo mode due to missing configuration")
    else:
        # Start the evaluation process in the background, then build the dashboard
        process = start_evaluation_safe()
        ui = create_ui()

    # Launch the UI
    ui.launch(server_name="0.0.0.0", server_port=7860)
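# Example local run (assumes the required keys are set, e.g. via a .env file):
#   GROK_API_KEY=... HF_TOKEN=... python app.py
# The dashboard is then served on port 7860 (http://localhost:7860).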