import gradio as gr
import pandas as pd
from huggingface_hub import snapshot_download, create_repo
from huggingface_hub.utils import RepositoryNotFoundError
import os

from src.about import (
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    AutoEvalColumn,
    fields,
)
from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN, OWNER
from src.populate import get_leaderboard_df


def create_results_dataframe():
    """Create and return the results DataFrame for display"""
    import sys

    sys.stderr.write("\nšŸ“Š CREATE_RESULTS_DATAFRAME CALLED\n")
    sys.stderr.flush()

    df = get_leaderboard_df(EVAL_RESULTS_PATH, COLS, BENCHMARK_COLS)
    sys.stderr.write(f"šŸ“‹ Retrieved leaderboard df: {df.shape if df is not None else 'None'}\n")
    sys.stderr.flush()

    if df is None or df.empty:
        sys.stderr.write("āš ļø DataFrame is None or empty, returning empty DataFrame\n")
        sys.stderr.flush()
        # Return empty DataFrame with proper columns
        return pd.DataFrame(columns=["Model", "Match P-Value", "Type", "Precision"])

    sys.stderr.write(f"šŸ“Š Original DataFrame columns: {list(df.columns)}\n")
    sys.stderr.flush()

    # Check if required columns exist - only p-values matter
    required_cols = [
        AutoEvalColumn.model.name,
        AutoEvalColumn.model_trace_p_value.name,
        AutoEvalColumn.model_type.name,
        AutoEvalColumn.precision.name,
    ]
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        sys.stderr.write(f"āš ļø Missing columns in DataFrame: {missing_cols}\n")
        sys.stderr.flush()
        # Add missing columns with default values
        for col in missing_cols:
            if col == AutoEvalColumn.model_trace_p_value.name:
                df[col] = None
                sys.stderr.write(f"āž• Added {col} column with None values\n")

    # Select and rename columns for display
    try:
        display_df = df[required_cols].copy()
        sys.stderr.write(f"āœ… Selected columns successfully: {list(display_df.columns)}\n")
    except Exception as e:
        sys.stderr.write(f"šŸ’„ Error selecting columns: {e}\n")
        sys.stderr.flush()
        return pd.DataFrame(columns=["Model", "Match P-Value", "Type", "Precision"])

    # Rename columns for better display
    display_df.columns = ["Model", "Match P-Value", "Type", "Precision"]

    sys.stderr.write(f"šŸŽÆ Final display DataFrame shape: {display_df.shape}\n")
    sys.stderr.write(f"šŸŽÆ Final columns: {list(display_df.columns)}\n")

    # Check p-value column
    if "Match P-Value" in display_df.columns:
        p_value_stats = display_df["Match P-Value"].describe()
        sys.stderr.write(f"šŸ“ˆ P-Value column stats:\n{p_value_stats}\n")
    sys.stderr.flush()

    return display_df


# Perplexity testing removed - we only focus on p-values now

# Initialize results repository and directory
try:
    # Try to download existing repository
    try:
        snapshot_download(
            repo_id=RESULTS_REPO,
            local_dir=EVAL_RESULTS_PATH,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except RepositoryNotFoundError:
        # Create the repository if it doesn't exist
        print(f"Creating new results repository: {RESULTS_REPO}")
        create_repo(
            repo_id=RESULTS_REPO,
            repo_type="dataset",
            private=False,
            token=TOKEN,
        )

    # Create local directory
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
except Exception as e:
    print(f"Error initializing results: {e}")
    # Ensure local directory exists even if repo operations fail
    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)

# Initialize allowed models
import sys

from src.evaluation.initialize_models import initialize_allowed_models

sys.stderr.write("\nšŸš€ STARTING GRADIO APP INITIALIZATION\n")
sys.stderr.write("šŸ“Š Initializing allowed models...\n")
sys.stderr.flush()

# Initialize the allowed models
initialize_allowed_models()

sys.stderr.write("šŸ“Š Creating initial results DataFrame...\n")
sys.stderr.flush()

RESULTS_DF = create_results_dataframe()

sys.stderr.write(f"āœ… Initial DataFrame created with shape: {RESULTS_DF.shape}\n")
sys.stderr.write(f"šŸ“‹ Columns: {list(RESULTS_DF.columns)}\n")
sys.stderr.flush()

# Create the Gradio interface
sys.stderr.write("šŸŽØ Creating Gradio interface...\n")
sys.stderr.flush()

demo = gr.Blocks(css=custom_css)

with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("šŸ… Results", elem_id="results-tab", id=0):
            gr.Markdown("## Model Evaluation Results")
            results_table = gr.DataFrame(
                value=RESULTS_DF,
                headers=["Model", "Match P-Value", "Type", "Precision"],
                interactive=False,
                wrap=False,
            )

        with gr.TabItem("šŸ“ About", elem_id="about-tab", id=1):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("šŸ”¬ Analysis", elem_id="analysis-tab", id=2):
            gr.Markdown("## Model Tracing Analysis\n\nP-values are computed automatically for all supported models.")
            gr.Markdown("""
### Current Analysis Status:
- **P-values are computed automatically** using the model tracing pipeline
- **Lower p-values indicate higher structural similarity** to Llama-2-7B
- **Analysis compares neuron organization** across transformer layers
- **Results appear in the main table** once computation is complete

### Supported Models:
- `lmsys/vicuna-7b-v1.5` - Vicuna 7B v1.5
- `ibm-granite/granite-7b-base` - IBM Granite 7B Base
- `EleutherAI/llemma_7b` - LLeMa 7B

### How it works:
1. Models are automatically analyzed against Llama-2-7B base
2. Match statistic with alignment is computed
3. P-values indicate structural similarity preservation
4. Results appear in the main Results tab
""")

sys.stderr.write("šŸŽÆ GRADIO INTERFACE SETUP COMPLETE\n")
sys.stderr.write("šŸš€ LAUNCHING GRADIO APP WITH MODEL TRACING ANALYSIS\n")
sys.stderr.write("šŸ“Š Features enabled:\n")
sys.stderr.write("  - Model trace p-value computation (vs Llama-2-7B base)\n")
sys.stderr.write("  - Match statistic with alignment\n")
sys.stderr.write("  - Structural similarity analysis\n")
sys.stderr.write("šŸŽ‰ Ready to display p-values!\n")
sys.stderr.flush()

demo.queue(default_concurrency_limit=5).launch()