Upload folder using huggingface_hub #1
by ZiyiXia - opened
- .gitignore +15 -0
- Makefile +13 -0
- app.py +79 -0
- pyproject.toml +13 -0
- requirements.txt +16 -0
- results.csv +12 -0
- src/about.py +102 -0
- src/display/css_html_js.py +105 -0
- src/display/formatting.py +27 -0
- src/display/utils.py +103 -0
- src/envs.py +25 -0
.gitignore
ADDED
@@ -0,0 +1,15 @@
```
auto_evals/
venv/
__pycache__/
.env
.ipynb_checkpoints
*ipynb
.vscode/

eval-queue/
eval-results/
eval-queue-bk/
eval-results-bk/
logs/

construct_df.py
```
Makefile
ADDED
@@ -0,0 +1,13 @@
```makefile
.PHONY: style quality


style:
	python -m black --line-length 119 .
	python -m isort .
	ruff check --fix .


quality:
	python -m black --check --line-length 119 .
	python -m isort --check-only .
	ruff check .
```
app.py
ADDED
@@ -0,0 +1,79 @@
```python
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    SUBMIT_FORM,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision,
)
from src.envs import API, REPO_ID


def restart_space():
    API.restart_space(repo_id=REPO_ID)


LEADERBOARD_DF = pd.read_csv("results.csv")

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 MVRB", elem_id="llm-benchmark-tab-table", id=0):
            Leaderboard(
                value=LEADERBOARD_DF,
                datatype=[c.type for c in fields(AutoEvalColumn)],
                select_columns=SelectColumns(
                    default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
                    cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
                    label="Select Columns to Display:",
                ),
                search_columns=[AutoEvalColumn.model.name],
                hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
                filter_columns=[
                    ColumnFilter("#Params (B)", type="slider", default=[0.428, 8.0], label="Number of parameters (B)"),
                ],
                interactive=True,
            )

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit here!", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(SUBMIT_FORM, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=8,
                elem_id="citation-button",
                show_copy_button=True,
            )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()
```
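Since the leaderboard tab renders whatever `results.csv` provides, a quick local check that the CSV matches the columns `AutoEvalColumn` declares can catch schema drift before the Space restarts. A minimal sketch (the check itself is illustrative, not part of this PR):

```python
# Hypothetical pre-deploy check: results.csv must carry every column the
# Leaderboard component is configured to show (see src/display/utils.py).
import pandas as pd

EXPECTED_COLS = ["Rank", "Model", "#Params (B)", "Overall", "SR", "CSR", "SQA", "OVC"]

df = pd.read_csv("results.csv")
missing = [c for c in EXPECTED_COLS if c not in df.columns]
assert not missing, f"results.csv is missing columns: {missing}"
print(df[["Model", "Overall"]].to_string(index=False))
```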
pyproject.toml
ADDED
@@ -0,0 +1,13 @@
```toml
[tool.ruff]
# Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
select = ["E", "F"]
ignore = ["E501"]  # line too long (black is taking care of this)
line-length = 119
fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]

[tool.isort]
profile = "black"
line_length = 119

[tool.black]
line-length = 119
```
requirements.txt
ADDED
@@ -0,0 +1,16 @@
```
APScheduler
black
datasets
gradio
gradio[oauth]
gradio_leaderboard==0.0.13
gradio_client
huggingface-hub>=0.18.0
matplotlib
numpy
pandas
python-dateutil
tqdm
transformers
tokenizers>=0.15.0
sentencepiece
```
results.csv
ADDED
@@ -0,0 +1,12 @@
```csv
Rank,Model,#Params (B),Overall,SR,CSR,SQA,OVC
1,BGE-VL-Screenshot,3.75,60.61,70.09,59.58,53.1,54.46
2,"<a href=""https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct"">GME</a>",2.21,48.14,61.62,37.68,37.78,47.98
3,"<a href=""https://huggingface.co/Tevatron/dse-phi3-v1.0"">DSE</a>",4.15,45.21,61.54,37.78,39.24,31.51
4,"<a href=""https://huggingface.co/vidore/colpali"">ColPali</a>",2.92,43.64,61.73,35.0,35.32,31.04
5,"<a href=""https://huggingface.co/nvidia/MM-Embed"">MM-Embed</a>",7.57,34.48,25.86,40.93,42.83,32.67
6,"<a href=""https://huggingface.co/google/siglip-so400m-patch14-384"">SigLIP</a>",0.878,33.34,38.33,34.48,19.6,40.64
7,"<a href=""https://huggingface.co/TIGER-Lab/VLM2Vec-Full"">VLM2Vec</a>",4.15,32.19,15.93,48.05,49.42,23.24
8,"<a href=""https://huggingface.co/royokong/e5-v"">E5-V</a>",8.35,25.13,34.11,26.59,5.23,32.85
9,"<a href=""https://huggingface.co/openai/clip-vit-large-patch14"">CLIP</a>",0.428,23.75,18.89,25.39,23.9,30.4
10,"<a href=""https://huggingface.co/TIGER-Lab/UniIR"">Uni-IR</a>",0.428,19.63,12.35,35.92,29.68,20.06
11,"<a href=""https://huggingface.co/OpenDriveLab/Vista"">VISTA</a>",0.196,13.85,5.21,11.29,25.78,16.61
```
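The `Overall` column is consistent with a task-count-weighted mean of the four category scores, using the task counts listed in `src/about.py` (7 SR, 4 CSR, 5 SQA, 4 OVC); this weighting is inferred from the numbers, not stated in the PR. A sketch of the check:

```python
# Inferred, not stated in the PR: Overall = mean over the 20 individual tasks,
# i.e. category scores weighted by task count (SR: 7, CSR: 4, SQA: 5, OVC: 4).
WEIGHTS = {"SR": 7, "CSR": 4, "SQA": 5, "OVC": 4}

def overall(scores: dict) -> float:
    return round(sum(scores[k] * w for k, w in WEIGHTS.items()) / sum(WEIGHTS.values()), 2)

# BGE-VL-Screenshot row: reproduces the published 60.61
print(overall({"SR": 70.09, "CSR": 59.58, "SQA": 53.1, "OVC": 54.46}))
```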
src/about.py
ADDED
@@ -0,0 +1,102 @@
````python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("anli_r1", "acc", "ANLI")
    task1 = Task("logiqa", "acc_norm", "LogiQA")

NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------


# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">MVRB Leaderboard</h1>"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
**MVRB (Massive Visualized IR Benchmark)** evaluates multimodal retrievers' performance on general **Vis-IR** tasks. The benchmark includes various task types, such as screenshot-based multimodal retrieval (screenshot to anything, anything to screenshot) and screenshot-conditioned retrieval (e.g., searching for documents using queries conditioned on screenshots). It also covers a variety of important domains, including news, products, papers, and charts.

More details can be found at:
- Paper: https://arxiv.org/pdf/2502.11431
- Repo: https://github.com/VectorSpaceLab/Vis-IR
"""

# Which evaluations are you running? how can people reproduce what you have?
LLM_BENCHMARKS_TEXT = f"""
## Tasks
- **Screenshot Retrieval (SR)** consists of evaluation samples, each comprising a textual query *q* and its relevant screenshot *s*: *(q, s)*. The retrieval model needs to precisely retrieve the relevant screenshot for a test query from a given corpus *S*. Each evaluation sample is created in two steps: 1) sample a screenshot *s*; 2) prompt the LLM to generate a search query based on the caption of the screenshot. We consider seven tasks under this category: product retrieval, paper retrieval, repo retrieval, news retrieval, chart retrieval, document retrieval, and slide retrieval.

- **Composed Screenshot Retrieval (CSR)** is made up of sq2s triplets. Given a screenshot *s1* and a query *q* conditioned on *s1*, the retrieval model needs to retrieve the relevant screenshot *s2* from the corpus *S*. We define four tasks for this category: product discovery, news-to-Wiki, knowledge relation, and Wiki-to-product. All tasks in this category are created by human annotators. For each task, annotators are instructed to identify relevant screenshot pairs and write queries to retrieve *s2* based on *s1*.

- **Screenshot Question Answering (SQA)** comprises sq2a triplets. Given a screenshot *s* and a question *q* conditioned on *s*, the retrieval model needs to retrieve the correct answer *a* from a candidate corpus *A*. Each evaluation sample is created in three steps: 1) sample a screenshot *s*; 2) prompt the MLLM to generate a question *q*; 3) prompt the MLLM to generate the answer *a* for *q* based on *s*. The following tasks are included in this category: product-QA, news-QA, Wiki-QA, paper-QA, and repo-QA.

- **Open-Vocab Classification (OVC)** is performed using evaluation samples of screenshots and their textual class labels. Given a screenshot *s* and the label class *C*, the retrieval model needs to discriminate the correct label *c* from *C* based on embedding similarity. We include the following tasks in this category: product classification, news-topic classification, academic-field classification, and knowledge classification. For each task, we employ human labelers to create the label class and assign each screenshot its correct label.
"""

EVALUATION_QUEUE_TEXT = """
## Some good practices before submitting a model

### 1) Make sure you can load your model and tokenizer using AutoClasses:
```python
from transformers import AutoConfig, AutoModel, AutoTokenizer
config = AutoConfig.from_pretrained("your model name", revision=revision)
model = AutoModel.from_pretrained("your model name", revision=revision)
tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
```
If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.

Note: make sure your model is public!
Note: if your model needs `trust_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!

### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!

### 3) Make sure your model has an open license!
This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗

### 4) Fill up your model card
When we add extra information about models to the leaderboard, it will be automatically taken from the model card.

## In case of model failure
If your model is displayed in the `FAILED` category, its execution stopped.
Make sure you have followed the above steps first.
If everything is done, check that you can launch the EleutherAI Harness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
"""

SUBMIT_FORM = """
## Make sure you submit your evaluation results in a JSON file with the following format:
```json
{
    "Model": "<Model Name>",
    "URL (optional)": "<Model/Repo/Paper URL>",
    "#params": "7.11B",
    "Overall": 30.00,
    "SR": 30.00,
    "CSR": 30.00,
    "SQA": 30.00,
    "OVC": 30.00
}
```
Then send an email to xxx@xx.xx with the JSON file attached. We will review your submission and add it to the leaderboard.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite MVRB:"
CITATION_BUTTON_TEXT = """
@article{liu2025any,
  title={Any Information Is Just Worth One Single Screenshot: Unifying Search With Visualized Information Retrieval},
  author={Liu, Ze and Liang, Zhengyang and Zhou, Junjie and Liu, Zheng and Lian, Defu},
  journal={arXiv preprint arXiv:2502.11431},
  year={2025}
}
"""
````
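Since submissions arrive by email as JSON, a small validator mirroring the `SUBMIT_FORM` schema can save a review round trip. A sketch, assuming the field names shown above (the validator itself is hypothetical, not part of the Space):

```python
# Hypothetical validator for the submission format described in SUBMIT_FORM.
import json

REQUIRED_SCORES = ["Overall", "SR", "CSR", "SQA", "OVC"]

def validate_submission(path: str) -> dict:
    with open(path) as f:
        data = json.load(f)  # also rejects malformed JSON, e.g. trailing commas
    if "Model" not in data:
        raise ValueError("missing required field: Model")
    for key in REQUIRED_SCORES:
        if not isinstance(data.get(key), (int, float)):
            raise ValueError(f"{key} must be a number, got {data.get(key)!r}")
    return data
```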
src/display/css_html_js.py
ADDED
@@ -0,0 +1,105 @@
```python
custom_css = """

.markdown-text {
    font-size: 16px !important;
}

#models-to-add-text {
    font-size: 18px !important;
}

#citation-button span {
    font-size: 16px !important;
}

#citation-button textarea {
    font-size: 16px !important;
}

#citation-button > label > button {
    margin: 6px;
    transform: scale(1.3);
}

#leaderboard-table {
    margin-top: 15px
}

#leaderboard-table-lite {
    margin-top: 15px
}

#search-bar-table-box > div:first-child {
    background: none;
    border: none;
}

#search-bar {
    padding: 0px;
}

/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
#leaderboard-table td:nth-child(2),
#leaderboard-table th:nth-child(2) {
    max-width: 400px;
    overflow: auto;
    white-space: nowrap;
}

.tab-buttons button {
    font-size: 20px;
}

#scale-logo {
    border-style: none !important;
    box-shadow: none;
    display: block;
    margin-left: auto;
    margin-right: auto;
    max-width: 600px;
}

#scale-logo .download {
    display: none;
}
#filter_type {
    border: 0;
    padding-left: 0;
    padding-top: 0;
}
#filter_type label {
    display: flex;
}
#filter_type label > span {
    margin-top: var(--spacing-lg);
    margin-right: 0.5em;
}
#filter_type label > .wrap {
    width: 103px;
}
#filter_type label > .wrap .wrap-inner {
    padding: 2px;
}
#filter_type label > .wrap .wrap-inner input {
    width: 1px
}
#filter-columns-type {
    border: 0;
    padding: 0.5;
}
#filter-columns-size {
    border: 0;
    padding: 0.5;
}
#box-filter > .form {
    border: 0
}
"""

get_window_url_params = """
function(url_params) {
    const params = new URLSearchParams(window.location.search);
    url_params = Object.fromEntries(params);
    return url_params;
}
"""
```
src/display/formatting.py
ADDED
@@ -0,0 +1,27 @@
```python
def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_name):
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)


def styled_error(error):
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"


def styled_warning(warn):
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"


def styled_message(message):
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"


def has_no_nan_values(df, columns):
    return df[columns].notna().all(axis=1)


def has_nan_values(df, columns):
    return df[columns].isna().any(axis=1)
```
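For reference, this is how the HTML anchors stored in `results.csv`'s Model column could be produced with these helpers (illustrative usage, not code from the PR):

```python
from src.display.formatting import make_clickable_model

# Produces the same kind of <a target="_blank" href="https://huggingface.co/...">
# cell that results.csv stores for each model.
print(make_clickable_model("Alibaba-NLP/gme-Qwen2-VL-2B-Instruct"))
```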
src/display/utils.py
ADDED
@@ -0,0 +1,103 @@
```python
from dataclasses import dataclass, make_dataclass
from enum import Enum

import pandas as pd

from src.about import Tasks


def fields(raw_class):
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


# These classes are for user facing column names,
# to avoid having to change them all around the code
# when a modif is needed
@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

## Leaderboard columns
auto_eval_column_dict = []
# Init
auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent("Rank", "number", True, never_hidden=True)])
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
auto_eval_column_dict.append(["num_params", ColumnContent, ColumnContent("#Params (B)", "number", True)])
# Scores
auto_eval_column_dict.append(["overall", ColumnContent, ColumnContent("Overall", "number", True, never_hidden=True)])
auto_eval_column_dict.append(["SR", ColumnContent, ColumnContent("SR", "number", True)])
auto_eval_column_dict.append(["CSR", ColumnContent, ColumnContent("CSR", "number", True)])
auto_eval_column_dict.append(["SQA", ColumnContent, ColumnContent("SQA", "number", True)])
auto_eval_column_dict.append(["OVC", ColumnContent, ColumnContent("OVC", "number", True)])

# We use make_dataclass to dynamically fill the scores from Tasks
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

## For the queue columns in the submission tab
@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    model = ColumnContent("model", "markdown", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    precision = ColumnContent("precision", "str", True)
    weight_type = ColumnContent("weight_type", "str", True)  # was "Original"; the third field is displayed_by_default (bool)
    status = ColumnContent("status", "str", True)

## All the model information that we might need
@dataclass
class ModelDetails:
    name: str
    display_name: str = ""
    symbol: str = ""  # emoji


class ModelType(Enum):
    PT = ModelDetails(name="pretrained", symbol="🟢")
    FT = ModelDetails(name="fine-tuned", symbol="🔶")
    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
    RL = ModelDetails(name="RL-tuned", symbol="🟦")
    Unknown = ModelDetails(name="", symbol="?")

    def to_str(self, separator=" "):
        return f"{self.value.symbol}{separator}{self.value.name}"

    @staticmethod
    def from_str(type):
        if "fine-tuned" in type or "🔶" in type:
            return ModelType.FT
        if "pretrained" in type or "🟢" in type:
            return ModelType.PT
        if "RL-tuned" in type or "🟦" in type:
            return ModelType.RL
        if "instruction-tuned" in type or "⭕" in type:
            return ModelType.IFT
        return ModelType.Unknown


class WeightType(Enum):
    Adapter = ModelDetails("Adapter")
    Original = ModelDetails("Original")
    Delta = ModelDetails("Delta")


class Precision(Enum):
    float16 = ModelDetails("float16")
    bfloat16 = ModelDetails("bfloat16")
    Unknown = ModelDetails("?")

    @staticmethod
    def from_str(precision):
        if precision in ["torch.float16", "float16"]:
            return Precision.float16
        if precision in ["torch.bfloat16", "bfloat16"]:
            return Precision.bfloat16
        return Precision.Unknown


# Column selection
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

BENCHMARK_COLS = [t.value.col_name for t in Tasks]
```
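To see what app.py derives from these declarations at startup, a quick interactive check (illustrative):

```python
from src.display.utils import AutoEvalColumn, COLS, fields

# fields() walks the dataclass defaults, so column order matches declaration order.
print(COLS)
# ['Rank', 'Model', '#Params (B)', 'Overall', 'SR', 'CSR', 'SQA', 'OVC']
print([c.type for c in fields(AutoEvalColumn)])
# ['number', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number']
```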
src/envs.py
ADDED
@@ -0,0 +1,25 @@
```python
import os

from huggingface_hub import HfApi

# Info to change for your repository
# ----------------------------------
TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org

OWNER = "demo-leaderboard-backend"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
# ----------------------------------

REPO_ID = f"{OWNER}/leaderboard"
QUEUE_REPO = f"{OWNER}/requests"
RESULTS_REPO = f"{OWNER}/results"

# If you setup a cache later, just change HF_HOME
CACHE_PATH = os.getenv("HF_HOME", ".")

# Local caches
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

API = HfApi(token=TOKEN)
```
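A quick way to confirm the Space has the credentials it needs before the restart job is scheduled (assumes `HF_TOKEN` is set in the environment; the check is illustrative):

```python
from src.envs import API, REPO_ID

print(REPO_ID)                # demo-leaderboard-backend/leaderboard
print(API.whoami()["name"])   # raises if HF_TOKEN is missing or invalid
```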