import pandas as pd
import gradio as gr
import csv
import json
import os
import requests
import io
import shutil
from huggingface_hub import Repository

HF_TOKEN = os.environ.get("HF_TOKEN")

BASE_COLS = ["Rank", "Models", "Model Size(B)", "Type", "Frames"]
TASKS = ["LP_Open", "LP_MCQ", "LR_Open", "LR_MCQ", "HP_Open", "HP_MCQ",
         "HR_Open", "HR_MCQ", "Overall_Open", "Overall_MCQ"]
OPEN_TASKS = ["LP_Open", "LR_Open", "HP_Open", "HR_Open", "Overall_Open"]
MCQ_TASKS = ["LP_MCQ", "LR_MCQ", "HP_MCQ", "HR_MCQ", "Overall_MCQ"]

DEFAULT_NAMES = BASE_COLS + OPEN_TASKS
COLUMN_NAMES = BASE_COLS + TASKS

GROUP_FIELD = "Type"  # "Proprietary" or "Open-source"

# One Gradio datatype per entry in COLUMN_NAMES: Rank, Models (rendered as a link),
# Model Size(B) (float or 'unknown'), Type, Frames, then one 'number' per task column.
DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'str', 'str'] + \
    ['number'] * len(TASKS)

LEADERBOARD_INTRODUCTION = """
# 🥇 **VideoEval-Pro Leaderboard**

### A More Robust and Realistic QA Evaluation Benchmark for Multi-modal LLMs in Long Video Understanding

## Introduction
Do existing long video benchmarks faithfully reflect models' real capacity to understand long video content? Do the gains reported by newer models genuinely translate into stronger long video comprehension capability, or are they illusory? To probe these questions, we present VideoEval-Pro, a more robust and realistic long video understanding benchmark containing open-ended, short-answer QA problems. To construct VideoEval-Pro, we source questions from four existing long video understanding MCQ benchmarks and reformat them into free-form questions. We apply a series of filtering methods based on video duration, question and answer type, answerability, and QA difficulty to ensure the quality of our benchmark. Our final benchmark contains a total of 1,289 short-answer questions based on 465 videos, with an average duration of 38 minutes.

| [**📈Overview**](https://tiger-ai-lab.github.io/VideoEval-Pro) | [**👨‍💻Github**](https://github.com/TIGER-AI-Lab/VideoEval-Pro) | [**📖VideoEval-Pro Paper**](https://arxiv.org/abs/2505.14640) | [**🤗HuggingFace**](https://huggingface.co/datasets/TIGER-Lab/VideoEval-Pro) |
"""

TABLE_INTRODUCTION = """Models are ranked by Overall_Open."""

LEADERBOARD_INFO = """
## Dataset Statistics and Task Info

* Local Perception (LP): LP focuses on identifying and retrieving visual elements or actions from a short clip within a long video. Subtypes in this category include Segment QA, Needle-In-A-Haystack (NIAH) QA, Attribute Perception, Action Recognition, Object Recognition, Entity Recognition, Key Information Retrieval, and a combined Other subtype.
* Local Reasoning (LR): LR focuses on reasoning within short temporal windows, such as inferring causality, temporal order, or changes that happen over a local sequence of events. The four subtypes in this category are Egocentric Video Reasoning, Object Reasoning, Temporal Reasoning, and Action Reasoning.
* Holistic Perception (HP): HP involves a global and holistic understanding of statistical, structural, or spatial information, typically requiring visual aggregation. In VideoEval-Pro, HP consists of Visual Counting problems.
* Holistic Reasoning (HR): HR requires abstract or high-level understanding of long videos across events or scenes, often involving narrative or intent understanding. The two subtypes for HR are Event Understanding and Plot Reasoning.
""" CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" CITATION_BUTTON_TEXT = r"""@misc{ma2025videoevalprorobustrealisticlong, title={VideoEval-Pro: Robust and Realistic Long Video Understanding Evaluation}, author={Wentao Ma and Weiming Ren and Yiming Jia and Zhuofeng Li and Ping Nie and Ge Zhang and Wenhu Chen}, year={2025}, eprint={2505.14640}, archivePrefix={arXiv}, primaryClass={cs.CV}, url={https://arxiv.org/abs/2505.14640}, }""" SUBMIT_INTRODUCTION = """# Submit on VideoEval-Pro Leaderboard Introduction ## The evaluattion model should be used is *GPT-4o-0806* ## ⚠ Please note that you need to submit the JSON file with the following format: ```json [ { "Models": "", "Model Size(B)": "100 or -", "Frames": "", "Type": "Proprietary or Open-source", "URL": "" or null, "LP_Open": 50.0 or null, "LP_MCQ": 50.0 or null, "LR_Open": 50.0 or null, "LR_MCQ": 50.0 or null, "HP_Open": 50.0 or null, "HP_MCQ": 50.0 or null, "HR_Open": 50.0 or null, "HR_MCQ": 50.0 or null, "Overall_Open": 50.0, "Overall_MCQ": 50.0, }, ] ``` You may refer to the [**GitHub page**](https://github.com/TIGER-AI-Lab/VideoEval-Pro) for instructions about evaluating your model. \n Please send us an email at tonyyyma@gmail.com, attaching the JSON file. We will review your submission and update the leaderboard accordingly. """ def create_hyperlinked_names(df): def convert_url(url, model_name): return f'{model_name}' if url is not None else model_name def add_link_to_model_name(row): row['Models'] = convert_url(row['URL'], row['Models']) return row df = df.copy() df = df.apply(add_link_to_model_name, axis=1) return df # def fetch_data(file: str) -> pd.DataFrame: # # fetch the leaderboard data from remote # if file is None: # raise ValueError("URL Not Provided") # url = f"https://huggingface.co/spaces/TIGER-Lab/MMEB/resolve/main/{file}" # print(f"Fetching data from {url}") # response = requests.get(url) # if response.status_code != 200: # raise requests.HTTPError(f"Failed to fetch data: HTTP status code {response.status_code}") # return pd.read_json(io.StringIO(response.text), orient='records', lines=True) def get_df(file="results.jsonl"): df = pd.read_json(file, orient='records', lines=True) df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size) for task in TASKS: if df[task].isnull().any(): df[task] = df[task].apply(lambda score: '-' if pd.isna(score) else score) df = df.sort_values(by=['Overall_Open'], ascending=False) df = create_hyperlinked_names(df) df['Rank'] = range(1, len(df) + 1) return df def refresh_data(): df = get_df() return df[DEFAULT_NAMES] def search_and_filter_models(df, query, min_size, max_size): filtered_df = df.copy() if query: filtered_df = filtered_df[filtered_df['Models'].str.contains(query, case=False, na=False)] size_mask = filtered_df['Model Size(B)'].apply(lambda x: (min_size <= max_size) if x == 'unknown' else (min_size <= x <= max_size)) filtered_df = filtered_df[size_mask] return filtered_df[COLUMN_NAMES] def search_models(df, query): if query: return df[df['Models'].str.contains(query, case=False, na=False)] return df def get_size_range(df): sizes = df['Model Size(B)'].apply(lambda x: 0.0 if x == 'unknown' else x) if (sizes == 0.0).all(): return 0.0, 1000.0 return float(sizes.min()), float(sizes.max()) def process_model_size(size): if pd.isna(size) or size == 'unk': return 'unknown' try: val = float(size) return val except (ValueError, TypeError): return 'unknown' def filter_columns_by_tasks(df, selected_tasks=None): if selected_tasks is None or 
len(selected_tasks) == 0: return df[COLUMN_NAMES] base_columns = ['Models', 'Model Size(B)', 'Frames', 'Type', 'Overall_Open'] selected_columns = base_columns + selected_tasks available_columns = [col for col in selected_columns if col in df.columns] return df[available_columns]
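

# ---------------------------------------------------------------------------
# Illustrative usage only: a minimal sketch of how the constants and helpers
# above could be wired into a Gradio Blocks demo. This is NOT the Space's
# actual app code; it assumes a local results.jsonl, and the component names
# (`demo`, `table`, `search_box`, the sliders, `refresh_button`) and layout
# are assumptions made for demonstration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    leaderboard_df = get_df()  # expects results.jsonl next to this file
    min_size, max_size = get_size_range(leaderboard_df)

    def _search(query, lo, hi):
        # Re-read results.jsonl so the view always reflects the latest data.
        return search_and_filter_models(get_df(), query, lo, hi)

    with gr.Blocks() as demo:
        gr.Markdown(LEADERBOARD_INTRODUCTION)
        gr.Markdown(TABLE_INTRODUCTION)

        search_box = gr.Textbox(label="Search models", placeholder="e.g. GPT-4o")
        min_slider = gr.Slider(minimum=0.0, maximum=max_size, value=min_size, label="Min size (B)")
        max_slider = gr.Slider(minimum=0.0, maximum=max_size, value=max_size, label="Max size (B)")

        table = gr.Dataframe(
            value=leaderboard_df[COLUMN_NAMES],
            datatype=DATA_TITLE_TYPE,
            interactive=False,
        )
        refresh_button = gr.Button("Refresh")

        inputs = [search_box, min_slider, max_slider]
        search_box.change(fn=_search, inputs=inputs, outputs=table)
        min_slider.change(fn=_search, inputs=inputs, outputs=table)
        max_slider.change(fn=_search, inputs=inputs, outputs=table)
        # Keep the full column set on refresh so the datatype list stays aligned.
        refresh_button.click(fn=lambda: get_df()[COLUMN_NAMES], inputs=None, outputs=table)

        gr.Markdown(LEADERBOARD_INFO)
        gr.Markdown(SUBMIT_INTRODUCTION)

    demo.launch()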