import pandas as pd
import gradio as gr
import csv
import json
import os
import requests
import io
import shutil
from huggingface_hub import Repository
HF_TOKEN = os.environ.get("HF_TOKEN")
BASE_COLS = ["Rank", "Models", "Model Size(B)", "Type", "Frames"]
TASKS = ["LP_Open", "LP_MCQ", "LR_Open", "LR_MCQ", "HP_Open", "HP_MCQ", "HR_Open", "HR_MCQ", "Overall_Open", "Overall_MCQ"]
OPEN_TASKS = ["LP_Open", "LR_Open", "HP_Open", "HR_Open", "Overall_Open"]
MCQ_TASKS = ["LP_MCQ", "LR_MCQ", "HP_MCQ", "HR_MCQ", "Overall_MCQ"]
DEFAULT_NAMES = BASE_COLS + OPEN_TASKS
COLUMN_NAMES = BASE_COLS + TASKS
GROUP_FIELD = "Type" # "Proprietary" or "Open-source"
# Gradio datatypes, one entry per column in COLUMN_NAMES (BASE_COLS first, then the task scores)
DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'str', 'str'] + \
                  ['number'] * len(TASKS)
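# Illustrative sketch (not taken from the app script): how COLUMN_NAMES and
# DATA_TITLE_TYPE are typically paired when building the leaderboard table.
# The function name and wiring below are assumptions for illustration only.
def build_leaderboard_table(df):
    return gr.Dataframe(
        value=df[COLUMN_NAMES],
        headers=COLUMN_NAMES,
        datatype=DATA_TITLE_TYPE,
        interactive=False,
    )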
LEADERBOARD_INTRODUCTION = """
# 🥇 **VideoEval-Pro Leaderboard**
### A More Robust and Realistic QA Evaluation Benchmark for Multimodal LLMs in Long Video Understanding
## Introduction
Do existing long video benchmarks faithfully reflect models' real capacity to understand long video content? Do the gains reported by newer models genuinely translate into stronger long video comprehension, or are they illusory? To probe these questions, we present VideoEval-Pro, a more robust and realistic long video understanding benchmark containing open-ended, short-answer QA problems. To construct VideoEval-Pro, we source questions from four existing long video understanding MCQ benchmarks and reformat them into free-form questions. We apply a series of filtering steps based on video duration, question and answer type, answerability, and QA difficulty to ensure the quality of the benchmark. The final benchmark contains a total of 1,289 short-answer questions based on 465 videos, with an average duration of 38 minutes. \n
| [**📈Overview**](https://tiger-ai-lab.github.io/VideoEval-Pro)
| [**👨‍💻Github**](https://github.com/TIGER-AI-Lab/VideoEval-Pro)
| [**📖VideoEval-Pro Paper**](https://arxiv.org/abs/2505.14640)
| [**🤗HuggingFace**](https://huggingface.co/datasets/TIGER-Lab/VideoEval-Pro) |
"""
TABLE_INTRODUCTION = """Models are ranked based on Overall_Open."""
LEADERBOARD_INFO = """
## Dataset Statistics and Tasks Info
* Local Perception (LP): LP focuses on identifying and retrieving visual elements or actions from a short clip within a long video. Subtypes in this category include Segment QA, Needle-In-A-Haystack (NIAH) QA, Attribute Perception, Action Recognition, Object Recognition, Entity Recognition, Key Information Retrieval, and a combined Other subtype.
* Local Reasoning (LR): LR focuses on reasoning within short temporal windows, such as inferring causality, temporal order, or changes that happen over a local sequence of events. The four subtypes in this category are Egocentric Video Reasoning, Object Reasoning, Temporal Reasoning and Action Reasoning.
* Holistic Perception (HP): HP involves a global and holistic understanding of statistical, structural, or spatial information, typically requiring visual aggregation. In VideoEval-Pro, HP consists of Visual Counting problems.
* Holistic Reasoning (HR): HR requires abstract or high-level understanding of long videos across events or scenes, often involving narrative or intent understanding. The two subtypes for HR are Event Understanding and Plot Reasoning.
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""@misc{ma2025videoevalprorobustrealisticlong,
title={VideoEval-Pro: Robust and Realistic Long Video Understanding Evaluation},
author={Wentao Ma and Weiming Ren and Yiming Jia and Zhuofeng Li and Ping Nie and Ge Zhang and Wenhu Chen},
year={2025},
eprint={2505.14640},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2505.14640},
}"""
SUBMIT_INTRODUCTION = """# Submit to the VideoEval-Pro Leaderboard
## The evaluation model to be used is *GPT-4o-0806*
## ⚠ Please note that you need to submit a JSON file in the following format:
```json
[
    {
        "Models": "<Model Name>",
        "Model Size(B)": "100 or -",
        "Frames": "<Number of Frames>",
        "Type": "Proprietary or Open-source",
        "URL": "<Model URL>" or null,
        "LP_Open": 50.0 or null,
        "LP_MCQ": 50.0 or null,
        "LR_Open": 50.0 or null,
        "LR_MCQ": 50.0 or null,
        "HP_Open": 50.0 or null,
        "HP_MCQ": 50.0 or null,
        "HR_Open": 50.0 or null,
        "HR_MCQ": 50.0 or null,
        "Overall_Open": 50.0,
        "Overall_MCQ": 50.0
    }
]
```
You may refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VideoEval-Pro) for instructions on evaluating your model. \n
Please send us an email at tonyyyma@gmail.com, attaching the JSON file. We will review your submission and update the leaderboard accordingly.
"""
def create_hyperlinked_names(df):
    """Render each model name as an HTML link when a URL is available."""
    def convert_url(url, model_name):
        # pd.notna also guards against NaN/None URLs loaded from the JSONL file
        return f'<a href="{url}">{model_name}</a>' if pd.notna(url) else model_name

    def add_link_to_model_name(row):
        row['Models'] = convert_url(row['URL'], row['Models'])
        return row

    df = df.copy()
    df = df.apply(add_link_to_model_name, axis=1)
    return df
# def fetch_data(file: str) -> pd.DataFrame:
#     # fetch the leaderboard data from remote
#     if file is None:
#         raise ValueError("URL Not Provided")
#     url = f"https://huggingface.co/spaces/TIGER-Lab/MMEB/resolve/main/{file}"
#     print(f"Fetching data from {url}")
#     response = requests.get(url)
#     if response.status_code != 200:
#         raise requests.HTTPError(f"Failed to fetch data: HTTP status code {response.status_code}")
#     return pd.read_json(io.StringIO(response.text), orient='records', lines=True)
def get_df(file="results.jsonl"):
    """Load the leaderboard results, normalize sizes/scores, and rank by Overall_Open."""
    df = pd.read_json(file, orient='records', lines=True)
    df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
    # Show '-' for tasks a model has not reported
    for task in TASKS:
        if df[task].isnull().any():
            df[task] = df[task].apply(lambda score: '-' if pd.isna(score) else score)
    df = df.sort_values(by=['Overall_Open'], ascending=False)
    df = create_hyperlinked_names(df)
    df['Rank'] = range(1, len(df) + 1)
    return df
def refresh_data():
    df = get_df()
    return df[DEFAULT_NAMES]
def search_and_filter_models(df, query, min_size, max_size):
    filtered_df = df.copy()
    if query:
        filtered_df = filtered_df[filtered_df['Models'].str.contains(query, case=False, na=False)]
    # Models with unknown size are kept whenever the requested range is valid
    size_mask = filtered_df['Model Size(B)'].apply(lambda x:
        (min_size <= max_size) if x == 'unknown'
        else (min_size <= x <= max_size))
    filtered_df = filtered_df[size_mask]
    return filtered_df[COLUMN_NAMES]
def search_models(df, query):
    if query:
        return df[df['Models'].str.contains(query, case=False, na=False)]
    return df
def get_size_range(df):
    sizes = df['Model Size(B)'].apply(lambda x: 0.0 if x == 'unknown' else x)
    if (sizes == 0.0).all():
        return 0.0, 1000.0
    return float(sizes.min()), float(sizes.max())
def process_model_size(size):
    if pd.isna(size) or size == 'unk':
        return 'unknown'
    try:
        val = float(size)
        return val
    except (ValueError, TypeError):
        return 'unknown'
def filter_columns_by_tasks(df, selected_tasks=None):
    # With no task selection, fall back to the full column set
    if selected_tasks is None or len(selected_tasks) == 0:
        return df[COLUMN_NAMES]
    base_columns = ['Models', 'Model Size(B)', 'Frames', 'Type', 'Overall_Open']
    selected_columns = base_columns + selected_tasks
    available_columns = [col for col in selected_columns if col in df.columns]
    return df[available_columns]
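
# Minimal local smoke test (added for illustration; not part of the Gradio app).
# It assumes a results.jsonl file is present next to this script, as get_df() expects.
if __name__ == "__main__":
    leaderboard = get_df()
    print(leaderboard[DEFAULT_NAMES].head())
    # Example: restrict to open-ended local tasks for models between 1B and 100B
    print(filter_columns_by_tasks(
        search_and_filter_models(leaderboard, query="", min_size=1, max_size=100),
        selected_tasks=["LP_Open", "LR_Open"],
    ).head())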