# CONSTANTS-URL
URL = "http://opencompass.openxlab.space/assets/SpatialLB.json"
# CONSTANTS-CITATION
CITATION_BUTTON_TEXT = r"""@inproceedings{duan2024vlmevalkit,
title={Vlmevalkit: An open-source toolkit for evaluating large multi-modality models},
author={Duan, Haodong and Yang, Junming and Qiao, Yuxuan and Fang, Xinyu and Chen, Lin and Liu, Yuan and Dong, Xiaoyi and Zang, Yuhang and Zhang, Pan and Wang, Jiaqi and others},
booktitle={Proceedings of the 32nd ACM International Conference on Multimedia},
pages={11198--11201},
year={2024}
}
"""
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# CONSTANTS-TEXT
LEADERBORAD_INTRODUCTION = """This leaderboard provides a comprehensive evaluation of the spatial understanding capabilities of LMMs. \
It currently aggregates evaluation results on 5 multi-modal spatial understanding benchmarks, all with image(s) as visual input. \
All evaluation results are obtained with VLMEvalKit; the corresponding dataset names are:
- Spatial457 [VQA]: A synthetic benchmark designed to evaluate LMMs on progressively complex 2D to 6D spatial reasoning tasks. https://arxiv.org/abs/2502.08636
- 3DSRBench_circular [MCQ]: A comprehensive benchmark assessing LMMs’ 3D spatial reasoning abilities through 2,700+ real-world VQAs. https://arxiv.org/abs/2412.07825
- LEGO_circular [MCQ]: A benchmark designed to evaluate LMMs on multi-step spatial reasoning tasks using LEGO-based visual questions. https://arxiv.org/abs/2503.19990
- BLINK_circular [MCQ]: A benchmark that evaluates LMMs on core visual perception tasks, most of which relate to spatial understanding. https://arxiv.org/abs/2404.12390
- MMSIBench_circular [MCQ]: A VQA benchmark designed to evaluate LMMs on multi-image spatial reasoning. https://arxiv.org/abs/2505.23764
We adopt the CircularEval paradigm (proposed in MMBench) for all MCQ benchmarks. In particular, for 3DSRBench, CircularEval and FlipEval are performed together: \
an LMM must correctly solve all problem versions, with flipped images and circularly-shifted choices, to receive credit for a question. \
For all MCQ benchmarks, we report both CircularEval Acc and VanillaEval Acc as metrics.
To suggest new models or benchmarks for this leaderboard, please contact opencompass@pjlab.org.cn.
"""
# CONSTANTS-FIELDS
DATASETS_ALL = ['BLINK', 'LEGO', '3DSRBench', 'Spatial457', 'MMSIBench']
DATASETS_ESS = ['BLINK', 'LEGO', '3DSRBench', 'Spatial457', 'MMSIBench']
NON_MCQ_DATASETS = ['Spatial457']
META_FIELDS = ['Method', 'Param (B)', 'Language Model', 'Vision Model', 'OpenSource', 'Verified', 'Org']
MODEL_SIZE = ['<4B', '4B-10B', '10B-20B', '20B-40B', '>40B', 'Unknown']
MODEL_TYPE = ['OpenSource', 'API']
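
# --- Illustrative sketch (not part of the leaderboard code) ---
# A hypothetical example of how the constants above could be consumed: fetch
# the leaderboard JSON from URL and map a model's parameter count (in billions)
# to one of the MODEL_SIZE buckets. The JSON layout and the helper names
# (`fetch_leaderboard`, `size_bucket`) are assumptions for illustration only.
import json
import urllib.request


def fetch_leaderboard(url=URL):
    # Download and parse the leaderboard results published at `url`.
    with urllib.request.urlopen(url) as resp:
        return json.loads(resp.read().decode('utf-8'))


def size_bucket(param_b):
    # `param_b` is the parameter count in billions; None for API/unknown models.
    if param_b is None:
        return 'Unknown'
    if param_b < 4:
        return '<4B'
    if param_b < 10:
        return '4B-10B'
    if param_b < 20:
        return '10B-20B'
    if param_b < 40:
        return '20B-40B'
    return '>40B'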