"""Task definitions and static text constants for the Model Tracing Leaderboard.

The strings below are Markdown/plain-text snippets; headings and list items
are kept on their own lines so that they parse as Markdown.
"""

from dataclasses import dataclass
from enum import Enum


# NO TASKS - ONLY P-VALUES
# ---------------------------------------------------
class Tasks(Enum):
    """Benchmark tasks; intentionally empty because only p-values are reported."""


NUM_FEWSHOT = 0  # Not used
# ---------------------------------------------------


# Your leaderboard name
# NOTE(review): only the bare title text is visible in this file; if the title
# is meant to carry HTML markup (e.g. a heading tag), confirm it was not lost.
TITLE = """

Model Tracing Leaderboard

"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
This leaderboard evaluates specific language models based on their structural similarity to Llama-2-7B using model tracing analysis.

**Models Evaluated:**
- `lmsys/vicuna-7b-v1.5` - Vicuna 7B v1.5
- `ibm-granite/granite-7b-base` - IBM Granite 7B Base
- `EleutherAI/llemma_7b` - LLeMa 7B

**Metric:**
- **Match P-Value**: Lower p-values indicate the model preserves structural similarity to Llama-2-7B after fine-tuning (neuron organization is maintained).
"""

# Which evaluations are you running?
LLM_BENCHMARKS_TEXT = """
## How it works

The evaluation runs model tracing analysis on the supported language models:

### Supported Models
- **Vicuna 7B v1.5** (`lmsys/vicuna-7b-v1.5`) - Chat-optimized LLaMA variant
- **IBM Granite 7B** (`ibm-granite/granite-7b-base`) - IBM's foundational language model
- **LLeMa 7B** (`EleutherAI/llemma_7b`) - EleutherAI's mathematical language model

### Model Tracing Analysis
Compares each model's internal structure to Llama-2-7B using the "match" statistic:
- **Base Model**: Llama-2-7B (`meta-llama/Llama-2-7b-hf`)
- **Comparison Models**: The 3 supported models listed above
- **Method**: Neuron matching analysis across transformer layers
- **Alignment**: Models are aligned before comparison using the Hungarian algorithm
- **Output**: P-value indicating structural similarity (lower = more similar to Llama-2-7B)

The match statistic tests whether neurons in corresponding layers maintain similar functional roles between the base model and the comparison models.
"""

# Text for the evaluation-queue section: lists the fixed set of analyzed models.
EVALUATION_QUEUE_TEXT = """
## Model Analysis

This leaderboard analyzes structural similarity between specific models and Llama-2-7B:

1. **Vicuna 7B v1.5** - Chat-optimized variant of LLaMA
2. **IBM Granite 7B Base** - IBM's foundational language model
3. **LLeMa 7B** - EleutherAI's mathematical language model

The p-values are computed automatically using the model tracing analysis.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# NOTE(review): the citation snippet is empty in this file; confirm that no
# BibTeX entry was intended here.
CITATION_BUTTON_TEXT = ""