""" Initialize the leaderboard with specific models and compute their p-values. This module ensures only the specified models are included in the leaderboard and their model trace p-values are computed. """ import os import json import sys from src.evaluation.model_trace_eval import compute_model_trace_p_value from src.envs import EVAL_RESULTS_PATH # The specific models we want to include ALLOWED_MODELS = [ "lmsys/vicuna-7b-v1.5", "ibm-granite/granite-7b-base", "EleutherAI/llemma_7b" ] def create_model_result_file(model_name, precision="float16"): """ Create a result file for a model with computed p-value. Args: model_name: HuggingFace model identifier precision: Model precision """ sys.stderr.write(f"\n๐Ÿ”ง CREATING RESULT FILE FOR: {model_name}\n") sys.stderr.flush() # Create the results directory if it doesn't exist os.makedirs(EVAL_RESULTS_PATH, exist_ok=True) # Generate a safe filename safe_name = model_name.replace("/", "_").replace("-", "_") result_file = os.path.join(EVAL_RESULTS_PATH, f"{safe_name}_{precision}.json") sys.stderr.write(f"๐Ÿ“ Result file path: {result_file}\n") sys.stderr.flush() # Check if file already exists if os.path.exists(result_file): sys.stderr.write(f"โœ… Result file already exists: {result_file}\n") sys.stderr.flush() return result_file # Create basic result structure result_data = { "config": { "model_dtype": f"torch.{precision}", "model_name": model_name, "model_sha": "main" }, "results": { # No perplexity - we only care about p-values } } # Save the result file try: with open(result_file, 'w') as f: json.dump(result_data, f, indent=2) sys.stderr.write(f"โœ… Created result file: {result_file}\n") sys.stderr.flush() return result_file except Exception as e: sys.stderr.write(f"โŒ Failed to create result file: {e}\n") sys.stderr.flush() return None def clean_non_allowed_results(): """ Remove result files for models that are not in the allowed list. """ sys.stderr.write(f"\n๐Ÿงน CLEANING NON-ALLOWED RESULT FILES\n") sys.stderr.flush() if not os.path.exists(EVAL_RESULTS_PATH): sys.stderr.write("๐Ÿ“ Results directory doesn't exist, nothing to clean\n") sys.stderr.flush() return removed_count = 0 # Walk through all files in the results directory for root, dirs, files in os.walk(EVAL_RESULTS_PATH): for file in files: if not file.endswith('.json'): continue file_path = os.path.join(root, file) try: # Try to extract model name from the result file with open(file_path, 'r') as f: data = json.load(f) config = data.get("config", {}) model_name = config.get("model_name", "") if model_name and not is_model_allowed(model_name): sys.stderr.write(f"๐Ÿ—‘๏ธ Removing non-allowed model result: {file_path} (model: {model_name})\n") os.remove(file_path) removed_count += 1 elif not model_name: sys.stderr.write(f"โš ๏ธ Skipping file with no model_name: {file_path}\n") except Exception as e: sys.stderr.write(f"โš ๏ธ Error processing file {file_path}: {e}\n") continue sys.stderr.write(f"โœ… Removed {removed_count} non-allowed result files\n") sys.stderr.flush() def initialize_allowed_models(): """ Initialize result files for all allowed models. """ sys.stderr.write(f"\n๐Ÿš€ INITIALIZING ALLOWED MODELS\n") sys.stderr.write(f"๐Ÿ“‹ Models to initialize: {ALLOWED_MODELS}\n") sys.stderr.flush() # First, clean up any existing non-allowed results clean_non_allowed_results() created_files = [] for model_name in ALLOWED_MODELS: try: result_file = create_model_result_file(model_name) if result_file: created_files.append(result_file) except Exception as e: sys.stderr.write(f"โŒ Failed to initialize {model_name}: {e}\n") sys.stderr.flush() continue sys.stderr.write(f"โœ… Initialized {len(created_files)} model result files\n") sys.stderr.flush() return created_files def is_model_allowed(model_name): """ Check if a model is in the allowed list. Args: model_name: HuggingFace model identifier Returns: bool: True if model is allowed """ return model_name in ALLOWED_MODELS def get_allowed_models(): """ Get the list of allowed models. Returns: list: List of allowed model names """ return ALLOWED_MODELS.copy()