Commit 99399ee · root committed
1 Parent(s): 6dd5c50

v2 update
Files changed:
- src/constants.py +60 -0
- src/css.py +22 -0
- src/logo.png +0 -0
- src/md.py +106 -0
- src/plt.py +53 -0
- src/utils.py +174 -0
src/constants.py
ADDED
@@ -0,0 +1,60 @@
+# reference for length bias categories
+length_categories = {
+    'alpacaeval-easy': 'True',
+    'alpacaeval-hard': 'True',
+    'alpacaeval-length': 'Neutral',
+    'donotanswer': 'False',
+    'hep-cpp': 'Neutral',
+    'hep-go': 'Neutral',
+    'hep-java': 'Neutral',
+    'hep-js': 'Neutral',
+    'hep-python': 'Neutral',
+    'hep-rust': 'Neutral',
+    'llmbar-adver-GPTInst': 'False',
+    'llmbar-adver-GPTOut': 'Neutral',
+    'llmbar-adver-manual': 'False',
+    'llmbar-adver-neighbor': 'False',
+    'llmbar-natural': 'Neutral',
+    'math-prm': 'Neutral',
+    'mt-bench-easy': 'False',
+    'mt-bench-hard': 'False',
+    'mt-bench-med': 'Neutral',
+    'refusals-dangerous': 'False',
+    'refusals-offensive': 'False',
+    'xstest-should-refuse': 'False',
+    'xstest-should-respond': 'True'
+}
+
+example_counts = {
+    "alpacaeval-easy": 100,
+    "alpacaeval-length": 95,
+    "alpacaeval-hard": 95,
+    "mt-bench-easy": 28,
+    "mt-bench-med": 40,
+    "mt-bench-hard": 37,
+    "math-prm": 984,  # actual length 447, upweighting to be equal to code
+    "refusals-dangerous": 100,
+    "refusals-offensive": 100,
+    "llmbar-natural": 100,
+    "llmbar-adver-neighbor": 134,
+    "llmbar-adver-GPTInst": 92,
+    "llmbar-adver-GPTOut": 47,
+    "llmbar-adver-manual": 46,
+    "xstest-should-refuse": 154,
+    "xstest-should-respond": 250,  # Note: refuse and respond were accidentally swapped until 9 Sept 2024
+    "donotanswer": 136,
+    "hep-cpp": 164,
+    "hep-go": 164,
+    "hep-java": 164,
+    "hep-js": 164,
+    "hep-python": 164,
+    "hep-rust": 164
+}
+
+# note: this order should match the dataframe
+subset_mapping = {
+    "Chat": ['alpacaeval-easy', 'alpacaeval-hard', 'alpacaeval-length', 'mt-bench-easy', 'mt-bench-med'],
+    "Chat Hard": ['llmbar-adver-GPTInst', 'llmbar-adver-GPTOut', 'llmbar-adver-manual', 'llmbar-adver-neighbor', 'llmbar-natural', 'mt-bench-hard'],
+    "Safety": ['donotanswer', 'refusals-dangerous', 'refusals-offensive', 'xstest-should-refuse', 'xstest-should-respond'],
+    "Reasoning": ["hep-cpp", "hep-go", "hep-java", "hep-js", "hep-python", "hep-rust", "math-prm"]
+}
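To illustrate how these constants fit together, here is a minimal sketch (not part of this commit, and assuming `src` is importable as a package) that rolls hypothetical per-subset accuracies up into the four section scores, weighting each subset by its entry in `example_counts`. This weighting is why `math-prm` is upweighted to 984, so math balances the six 164-example code subsets within Reasoning.

    # Minimal sketch: per-prompt weighted section scores from hypothetical subset accuracies.
    from src.constants import example_counts, subset_mapping

    def section_scores(subset_accuracies):
        # weight each subset's accuracy by its (possibly upweighted) example count
        scores = {}
        for section, subsets in subset_mapping.items():
            total = sum(example_counts[s] for s in subsets)
            scores[section] = sum(subset_accuracies[s] * example_counts[s] for s in subsets) / total
        return scores

    # hypothetical accuracies, only to show the call shape
    accs = {s: 0.8 for subsets in subset_mapping.values() for s in subsets}
    print(section_scores(accs))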
src/css.py
ADDED
@@ -0,0 +1,22 @@
+custom_css = """
+
+/* Full width space */
+.gradio-container {
+    max-width: 95%;
+}
+
+/* Text style and margins */
+.markdown-text {
+    font-size: 17px !important;
+}
+
+.tab-buttons button {
+    font-size: 20px;
+}
+
+h1 {
+    font-size: 32px !important;
+    margin-top: 0px !important;
+}
+
+"""
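The selectors above (`.gradio-container`, `.tab-buttons`, `.markdown-text`) suggest a Gradio app, but the app file is not part of this commit, so the wiring below is only an assumed usage sketch.

    # Assumed usage sketch: pass the stylesheet to a Gradio Blocks app.
    import gradio as gr
    from src.css import custom_css

    with gr.Blocks(css=custom_css) as demo:
        gr.Markdown("# RewardBench", elem_classes="markdown-text")

    demo.launch()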
src/logo.png
ADDED
(binary file)
src/md.py
ADDED
@@ -0,0 +1,106 @@
+from datetime import datetime
+import pytz
+
+ABOUT_TEXT = """
+We compute the win percentage for a reward model on hand-curated chosen-rejected pairs for each prompt.
+A win is when the score for the chosen response is higher than the score for the rejected response.
+
+Note: Models with (*) after the model name are independently submitted model scores which have not been verified by the RewardBench team.
+
+## Overview
+
+We average over 4 core sections (per-prompt weighting):
+1. **Chat**: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
+2. **Chat Hard**: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
+3. **Safety**: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
+4. **Reasoning**: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
+
+For Reasoning, we increase the weight of the PRM-Math subset so code and math abilities are weighed equally in the final number, rather than increasing the relevance of code.
+We add a final column, **Prior Sets**, which includes the test sets [anthropic_helpful](https://huggingface.co/datasets/Anthropic/hh-rlhf), [anthropic_hhh](https://huggingface.co/datasets/HuggingFaceH4/hhh_alignment), [shp](https://huggingface.co/datasets/stanfordnlp/SHP), and [summarize](https://huggingface.co/datasets/openai/summarize_from_feedback).
+Prior Sets is weighted 0.5x in the final score to avoid gamification by training on the available training sets of Anthropic HH, SHP, and Summarize.
+
+Once all subsets' weighted averages are computed, the final RewardBench score is the average across the 5 section scores.
+
+
+We include multiple types of reward models in this evaluation:
+1. **Sequence Classifiers** (Seq. Classifier): A model, normally trained with HuggingFace AutoModelForSequenceClassification, that takes in a prompt and a response and outputs a score.
+2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
+3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed. *Note*: This also includes other models trained with implicit rewards, such as those trained with [KTO](https://arxiv.org/abs/2402.01306).
+4. **Random**: Random choice baseline.
+5. **Generative**: Prompting fine-tuned models to choose between two answers, similar to MT Bench and AlpacaEval.
+
+All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
+*Note*: The reference models for DPO models (and other implicit rewards) can be found in two ways.
+* Click on a specific model in the results and you'll see a key `ref_model`, e.g. [Qwen](https://huggingface.co/datasets/allenai/reward-bench-results/blob/main/eval-set/Qwen/Qwen1.5-72B-Chat.json).
+* All the reference models are listed in the [evaluation configs](https://github.com/allenai/reward-bench/blob/main/scripts/configs/eval_configs.yaml).
+
+
+### Subset Details
+
+The total number of prompts is 2985, filtered from 5123.
+
+| Subset | Num. Samples (Pre-filtering, post-filtering) | Description |
+| :---------- | :-----: | :---------: |
+| alpacaeval-easy | 805, 100 | Great model vs poor model |
+| alpacaeval-length | 805, 95 | Good model vs low model, equal length |
+| alpacaeval-hard | 805, 95 | Great model vs baseline model |
+| mt-bench-easy | 28, 28 | MT Bench 10s vs 1s |
+| mt-bench-medium | 45, 40 | MT Bench 9s vs 2-5s |
+| mt-bench-hard | 45, 37 | MT Bench 7-8 vs 5-6 |
+| refusals-dangerous | 505, 100 | Dangerous response vs no response |
+| refusals-offensive | 704, 100 | Offensive response vs no response |
+| llmbar-natural | 100 | (See [paper](https://arxiv.org/abs/2310.07641)) Manually curated instruction pairs |
+| llmbar-adver-neighbor | 134 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. off-topic prompt response |
+| llmbar-adver-GPTInst | 92 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. GPT4 generated off-topic prompt response |
+| llmbar-adver-GPTOut | 47 | (See [paper](https://arxiv.org/abs/2310.07641)) Instruction response vs. unhelpful-prompted GPT4 responses |
+| llmbar-adver-manual | 46 | (See [paper](https://arxiv.org/abs/2310.07641)) Challenge set chosen vs. rejected |
+| xstest-should-refuse | 450, 154 | False response dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
+| xstest-should-respond | 450, 250 | False refusal dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
+| do not answer | 939, 136 | [Prompts which responsible LLMs do not answer](https://huggingface.co/datasets/LibrAI/do-not-answer) |
+| math-prm | 447 | Human references vs. model error from OpenAI's Let's Verify Step by Step |
+| hep-cpp | 164 | C++ code revisions (See [dataset](https://huggingface.co/datasets/bigcode/humanevalpack) or [paper](https://arxiv.org/abs/2308.07124)) |
+| hep-go | 164 | Go code |
+| hep-java | 164 | Java code |
+| hep-js | 164 | Javascript code |
+| hep-python | 164 | Python code |
+| hep-rust | 164 | Rust code |
+
+Lengths (mean, std. dev.) include the prompt.
+
+| subset | length bias | chosen_chars | rejected_chars | chosen_tokens | rejected_tokens | chosen_unique_tokens | rejected_unique_tokens |
+|-----------------------|-------------|----------------|------------------|-----------------|-------------------|------------------------|--------------------------|
+| alpacaeval-easy | True | 2283 (1138) | 646 (482) | 591 (303) | 167 (139) | 253 (117) | 83 (46) |
+| alpacaeval-hard | True | 1590 (769) | 526 (430) | 412 (199) | 137 (117) | 173 (67) | 71 (48) |
+| alpacaeval-length | Neutral | 2001 (1137) | 2127 (1787) | 511 (283) | 597 (530) | 192 (85) | 189 (99) |
+| donotanswer | False | 755 (722) | 1389 (695) | 170 (161) | 320 (164) | 104 (82) | 157 (73) |
+| hep-cpp | Neutral | 709 (341) | 705 (342) | 261 (125) | 259 (125) | 100 (29) | 99 (29) |
+| hep-go | Neutral | 738 (361) | 734 (361) | 266 (118) | 265 (118) | 100 (29) | 99 (29) |
+| hep-java | Neutral | 821 (393) | 814 (390) | 263 (123) | 261 (122) | 102 (30) | 102 (30) |
+| hep-js | Neutral | 677 (341) | 673 (339) | 251 (129) | 250 (128) | 93 (29) | 93 (29) |
+| hep-python | Neutral | 618 (301) | 616 (300) | 212 (98) | 211 (98) | 86 (26) | 85 (26) |
+| hep-rust | Neutral | 666 (391) | 660 (391) | 221 (132) | 219 (132) | 95 (29) | 95 (29) |
+| llmbar-adver-GPTInst | False | 735 (578) | 1623 (1055) | 170 (135) | 377 (245) | 93 (59) | 179 (106) |
+| llmbar-adver-GPTOut | Neutral | 378 (339) | 359 (319) | 96 (81) | 101 (94) | 60 (45) | 55 (41) |
+| llmbar-adver-manual | False | 666 (584) | 1139 (866) | 160 (134) | 264 (194) | 92 (63) | 140 (90) |
+| llmbar-adver-neighbor | False | 287 (297) | 712 (749) | 70 (76) | 173 (175) | 43 (31) | 91 (70) |
+| llmbar-natural | Neutral | 553 (644) | 530 (597) | 139 (162) | 130 (140) | 75 (71) | 70 (62) |
+| mt-bench-easy | False | 1563 (720) | 2129 (1520) | 377 (159) | 551 (415) | 166 (55) | 116 (62) |
+| mt-bench-hard | False | 1225 (499) | 1471 (1016) | 284 (116) | 349 (234) | 131 (45) | 136 (58) |
+| mt-bench-med | Neutral | 1558 (729) | 1733 (1312) | 377 (170) | 410 (311) | 162 (58) | 145 (88) |
+| refusals-dangerous | False | 597 (81) | 1828 (547) | 131 (20) | 459 (136) | 90 (12) | 211 (50) |
+| refusals-offensive | False | 365 (116) | 1092 (1146) | 82 (25) | 299 (278) | 64 (15) | 134 (101) |
+| xstest-should-refuse | False | 584 (419) | 904 (493) | 129 (89) | 217 (115) | 81 (47) | 116 (53) |
+| xstest-should-respond | True | 771 (420) | 466 (427) | 189 (105) | 107 (94) | 104 (48) | 67 (48) |
+
+For more details, see the [dataset](https://huggingface.co/datasets/allenai/reward-bench).
+"""
+
+# Get Pacific time zone (handles PST/PDT automatically)
+pacific_tz = pytz.timezone('America/Los_Angeles')
+current_time = datetime.now(pacific_tz).strftime("%H:%M %Z, %d %b %Y")
+
+TOP_TEXT = f"""# RewardBench: Evaluating Reward Models
+### Evaluating the capabilities, safety, and pitfalls of reward models
+[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Prior Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | [Paper](https://arxiv.org/abs/2403.13787) | Total models: {{}} | * Unverified models | ⚠️ Dataset Contamination | Last restart (PST): {current_time}
+
+⚠️ Many of the top models were trained on unintentionally contaminated, AI-generated data; for more information, see this [gist](https://gist.github.com/natolambert/1aed306000c13e0e8c5bc17c1a5dd300)."""
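The win-percentage definition in ABOUT_TEXT ("a win is when the score for the chosen response is higher than the score for the rejected response") reduces to a few lines; the sketch below is illustrative only and not part of this commit.

    # Minimal sketch of the per-subset win percentage described above.
    def win_percentage(chosen_scores, rejected_scores):
        # a win is a chosen score strictly higher than the rejected score
        wins = sum(c > r for c, r in zip(chosen_scores, rejected_scores))
        return 100 * wins / len(chosen_scores)

    print(win_percentage([0.9, 0.2, 0.7], [0.1, 0.4, 0.6]))  # 66.67 (hypothetical scores)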
src/plt.py
ADDED
@@ -0,0 +1,53 @@
+import matplotlib.pyplot as plt
+import pandas as pd
+from .utils import undo_hyperlink
+
+def plot_avg_correlation(df1, df2):
+    """
+    Plots the "average" column for each unique model that appears in both dataframes.
+
+    Parameters:
+    - df1: pandas DataFrame containing columns "model" and "average".
+    - df2: pandas DataFrame containing columns "model" and "average".
+    """
+    # Identify the unique models that appear in both DataFrames
+    common_models = pd.Series(list(set(df1['model']) & set(df2['model'])))
+
+    # Set up the plot
+    plt.figure(figsize=(13, 6), constrained_layout=True)
+
+    # axis limits for x and y
+    plt.xlim(0.475, 0.8)
+    plt.ylim(0.475, 0.8)
+
+    # larger fonts
+    plt.rcParams.update({'font.size': 12, 'axes.labelsize': 14, 'axes.titlesize': 14})
+    # plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1)
+    # plt.tight_layout()
+    # plt.margins(0,0)
+
+    for model in common_models:
+        # Filter data for the current model
+        df1_model_data = df1[df1['model'] == model]['average'].values
+        df2_model_data = df2[df2['model'] == model]['average'].values
+
+        # Plotting
+        plt.scatter(df1_model_data, df2_model_data, label=model)
+        m_name = undo_hyperlink(model)
+        if m_name == "No text found":
+            m_name = "Random"
+        # Add text next to each point, e.g.
+        # plt.text(x[i] + 0.1, y[i] + 0.1, label, ha='left', va='bottom')
+        plt.text(df1_model_data - .005, df2_model_data, m_name, horizontalalignment='right', verticalalignment='center')
+
+    # add correlation line to scatter plot
+    # first, compute correlation
+    corr = df1['average'].corr(df2['average'])
+    # add correlation line based on corr
+
+    plt.xlabel('HERM Eval. Set Avg.', fontsize=16)
+    plt.ylabel('Pref. Test Sets Avg.', fontsize=16)
+    # plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
+    return plt
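A usage sketch for plot_avg_correlation (not in the commit): it assumes two dataframes with "model" and "average" columns, where the model entries are HTML links like those produced by model_hyperlink so that undo_hyperlink can recover readable labels; the model names and averages below are hypothetical.

    import pandas as pd
    from src.plt import plot_avg_correlation

    # hypothetical leaderboard averages on the eval set and the prior test sets
    link = '<a target="_blank" href="https://example.org">{}</a>'
    eval_df = pd.DataFrame({"model": [link.format("model-a"), link.format("model-b")],
                            "average": [0.71, 0.55]})
    prior_df = pd.DataFrame({"model": [link.format("model-a"), link.format("model-b")],
                             "average": [0.68, 0.57]})

    fig = plot_avg_correlation(eval_df, prior_df)  # returns the pyplot module
    fig.savefig("correlation.png")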
src/utils.py
ADDED
@@ -0,0 +1,174 @@
+import pandas as pd
+from pathlib import Path
+from datasets import load_dataset
+import numpy as np
+import os
+import re
+
+UNVERIFIED_MODELS = [
+    "nvidia/Nemotron-4-340B-Reward",
+    "nvidia/Llama3-70B-SteerLM-RM",
+    "Cohere May 2024",
+    "google/gemini-1.5-pro-0514",
+    "google/flame-24b-july-2024",
+    "Cohere March 2024",
+    "facebook/Self-taught-Llama-3-70B",
+    "facebook/Self-taught-evaluator-llama3.1-70B",
+    "google/flame-1.0-24B-july-2024",
+    "Salesforce/SFR-LLaMa-3.1-70B-Judge-r",
+    "Salesforce/SFR-nemo-12B-Judge-r",
+    "Salesforce/SFR-LLaMa-3.1-8B-Judge-r",
+    "SF-Foundation/TextEval-OffsetBias-12B",
+    "SF-Foundation/TextEval-Llama3.1-70B",
+    "nvidia/Llama-3.1-Nemotron-70B-Reward",
+]
+
+CONTAMINATED_MODELS = [
+    "Skywork/Skywork-Reward-Gemma-2-27B",
+    "Skywork/Skywork-Critic-Llama-3.1-70B",
+    "LxzGordon/URM-LLaMa-3.1-8B",
+    "Skywork/Skywork-Reward-Llama-3.1-8B",
+    "Ray2333/GRM-Llama3-8B-rewardmodel-ft",
+    "nicolinho/QRM-Llama3.1-8B",
+    "nicolinho/QRM-Llama3-8B",
+    "general-preference/GPM-Llama-3.1-8B",
+    "SF-Foundation/TextEval-Llama3.1-70B",
+    "ZiyiYe/Con-J-Qwen2-7B",
+    "Ray2333/Gemma-2B-rewardmodel-ft",
+    "Ray2333/GRM-Gemma-2B-rewardmodel-ft"
+]
+
+# From Open LLM Leaderboard
+def model_hyperlink(link, model_name):
+    # if model_name is above 50 characters, return first 47 characters and "..."
+    if len(model_name) > 50:
+        model_name = model_name[:47] + "..."
+    if model_name == "random":
+        output = "random"
+    elif model_name == "Cohere March 2024":
+        output = f'<a target="_blank" href="https://huggingface.co/Cohere" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+    elif "openai" == model_name.split("/")[0]:
+        output = f'<a target="_blank" href="https://huggingface.co/openai" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+    elif "Anthropic" == model_name.split("/")[0]:
+        output = f'<a target="_blank" href="https://huggingface.co/Anthropic" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+    elif "google" == model_name.split("/")[0]:
+        output = f'<a target="_blank" href="https://huggingface.co/google" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+    elif "PoLL" == model_name.split("/")[0]:
+        output = model_name
+    else:
+        output = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+    if model_name in UNVERIFIED_MODELS:
+        output += " *"
+    if model_name in CONTAMINATED_MODELS:
+        output += " ⚠️"
+    return output
+
+def undo_hyperlink(html_string):
+    # Regex pattern to match content inside > and <
+    pattern = r'>[^<]+<'
+    match = re.search(pattern, html_string)
+    if match:
+        # Extract the matched text and remove leading '>' and trailing '<'
+        return match.group(0)[1:-1]
+    else:
+        return "No text found"
+
+
+# Define a function to fetch and process data
+def load_all_data(data_repo, subdir: str, subsubsets=False):  # use HF api to pull the git repo
+    dir = Path(data_repo)
+    data_dir = dir / subdir
+    orgs = [d for d in os.listdir(data_dir) if os.path.isdir(os.path.join(data_dir, d))]
+    # get all files within the sub folders orgs
+    models_results = []
+    for org in orgs:
+        org_dir = data_dir / org
+        files = [f for f in os.listdir(org_dir) if os.path.isfile(os.path.join(org_dir, f))]
+        for file in files:
+            if file.endswith(".json"):
+                models_results.append(org + "/" + file)
+
+    # create empty dataframe to add all data to
+    df = pd.DataFrame()
+
+    # load all json data in the list models_results one by one to avoid not having the same entries
+    for model in models_results:
+        model_data = load_dataset("json", data_files=data_repo + subdir + "/" + model, split="train")
+        df2 = pd.DataFrame(model_data)
+        # add to df
+        df = pd.concat([df2, df])
+
+    # remove chat_template column
+    df = df.drop(columns=["chat_template"])
+
+    # sort columns alphabetically
+    df = df.reindex(sorted(df.columns), axis=1)
+
+    # move column "model" to the front
+    cols = list(df.columns)
+    cols.insert(0, cols.pop(cols.index('model')))
+    df = df.loc[:, cols]
+
+    # select all columns except "model"
+    cols = df.columns.tolist()
+    cols.remove("model")
+    # if model_type is a column (pref tests may not have it)
+    if "model_type" in cols:
+        cols.remove("model_type")
+    # remove ref_model if in columns
+    if "ref_model" in cols:
+        cols.remove("ref_model")
+    # remove model_beaker from dataframe
+    if "model_beaker" in cols:
+        cols.remove("model_beaker")
+        df = df.drop(columns=["model_beaker"])
+
+    # remove column xstest (outdated data)
+    if "xstest" in cols:
+        df = df.drop(columns=["xstest"])
+        cols.remove("xstest")
+
+    if "ref_model" in df.columns:
+        df = df.drop(columns=["ref_model"])
+
+    # remove columns anthropic and summarize_prompted (outdated data)
+    if "anthropic" in cols:
+        df = df.drop(columns=["anthropic"])
+        cols.remove("anthropic")
+    if "summarize_prompted" in cols:
+        df = df.drop(columns=["summarize_prompted"])
+        cols.remove("summarize_prompted")
+    # remove pku_better and pku_safer (removed from the leaderboard)
+    if "pku_better" in cols:
+        df = df.drop(columns=["pku_better"])
+        cols.remove("pku_better")
+    if "pku_safer" in cols:
+        df = df.drop(columns=["pku_safer"])
+        cols.remove("pku_safer")
+
+    # convert to score
+    df[cols] = (df[cols] * 100)
+    avg = np.nanmean(df[cols].values, axis=1)
+    # add average column
+    df["average"] = avg
+
+    # apply model_hyperlink function to column "model"
+    df["model"] = df["model"].apply(lambda x: model_hyperlink(f"https://huggingface.co/{x}", x))
+
+    # move average column to the second position
+    cols = list(df.columns)
+    cols.insert(1, cols.pop(cols.index('average')))
+    df = df.loc[:, cols]
+
+    # move model_type column to the front
+    if "model_type" in cols:
+        cols = list(df.columns)
+        cols.insert(1, cols.pop(cols.index('model_type')))
+        df = df.loc[:, cols]
+
+    # remove models with DPO Ref. Free as type (future work)
+    df = df[~df["model_type"].str.contains("DPO Ref. Free", na=False)]
+
+    return df
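A usage sketch for load_all_data (not in the commit): it assumes the results dataset has first been pulled locally, e.g. with huggingface_hub.snapshot_download; the repo id matches the links in md.py, and the trailing slash matters because load_all_data concatenates data_repo + subdir as plain strings when loading each JSON file.

    from huggingface_hub import snapshot_download
    from src.utils import load_all_data, undo_hyperlink

    # download a local snapshot of the results repo referenced in md.py
    repo_dir = snapshot_download("allenai/reward-bench-results", repo_type="dataset")
    df = load_all_data(repo_dir + "/", subdir="eval-set")

    print(df[["model", "average"]].head())
    # "model" holds the HTML links built by model_hyperlink; undo_hyperlink recovers the plain name
    print(undo_hyperlink(df["model"].iloc[0]))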