|
import streamlit as st |
|
from app.draw_diagram import * |
|
from app.content import * |
|
from app.summarization import * |
|
from app.show_examples import * |
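# The wildcard imports above are assumed to supply, among others: draw() from
# app.draw_diagram; the *_datasets lists, dataset_diaplay_information, and
# metrics_info from app.content (names kept as spelled in those modules,
# including dataset_diaplay_information); sum_table_mulit_metrix() from
# app.summarization; and show_dataset_examples() from app.show_examples.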
|
|
|
def dataset_contents(dataset, metrics): |
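    '''Render the dataset and metric descriptions as small styled HTML blocks.'''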
|
|
|
custom_css = """ |
|
<style> |
|
.my-dataset-info { |
|
        /* background-color: #F9EBEA; */

        /* padding: 10px; */
|
color: #050505; |
|
font-style: normal; |
|
font-size: 8px; |
|
height: auto; |
|
} |
|
</style> |
|
""" |
|
st.markdown(custom_css, unsafe_allow_html=True) |
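    # The CSS above is injected once; the HTML blocks below pick up the
    # .my-dataset-info class.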
|
st.markdown(f"""<div class="my-dataset-info"> |
|
<p><b>About this dataset</b>:</p> |
|
<p>{dataset}</p> |
|
</div>""", unsafe_allow_html=True) |
|
st.markdown(f"""<div class="my-dataset-info"> |
|
<p><b>About this metric</b>:</p> |
|
<p>{metrics}</p> |
|
</div>""", unsafe_allow_html=True) |
|
|
|
|
|
def dashboard(): |
|
|
|
with st.container(): |
|
st.title("Leaderboard for AudioBench") |
|
|
|
st.markdown(""" |
|
[gh1]: https://github.com/AudioLLMs/AudioBench |
|
[gh2]: https://github.com/AudioLLMs/AudioBench |
|
**Toolkit:** [GitHub][gh1] |
|
[**Paper @ NAACL 2025**](https://arxiv.org/abs/2406.16020) | |
|
**Resource for AudioLLMs:** [GitHub][gh2]
|
""") |
|
|
|
|
|
st.markdown(""" |
|
#### Recent updates |
|
- **May 2025**: Further expanded the ASR task to seven languages; added MERaLiON-2 to the leaderboard.
|
- **Jan. 2025**: AudioBench is officially accepted to NAACL 2025! |
|
- **Jan. 2025**: Updated the layout.
|
- **Dec. 2024**: Added the MuChoMusic dataset for Music Understanding - MCQ Questions (paper: https://arxiv.org/abs/2408.01337).
|
- **Dec. 2024**: Singlish ASR task added! The datasets are available on [HF](https://huggingface.co/datasets/MERaLiON/MNSC). |
|
- **Dec. 2024**: Updated the layout and added support for comparing models of similar sizes: 1) reorganized the layout for a better user experience; 2) added a performance summary for each task.
|
- **Aug. 2024**: Initial leaderboard is now online. |
|
""") |
|
|
|
st.divider() |
|
|
|
st.markdown(""" |
|
#### Evaluating Audio-based Large Language Models |
|
|
|
- AudioBench is a comprehensive evaluation benchmark designed for general instruction-following audio large language models. |
|
- The benchmark is continually improved and maintained.
|
|
|
Below are the initial 26 datasets included in AudioBench. The benchmark has since been extended to over 40 datasets, with more to come.
|
""" |
|
) |
|
|
|
|
|
with st.container(): |
|
|
|
|
|
|
st.markdown("###### :dart: Our Benchmark includes: ") |
|
cols = st.columns(8) |
|
cols[0].metric(label="Tasks", value=">8") |
|
cols[1].metric(label="Datasets", value=">40") |
|
cols[2].metric(label="Evaluated Models", value=">5") |
|
|
|
st.divider() |
|
with st.container(): |
|
left_co, right_co = st.columns([1, 0.1]) |
|
|
|
with left_co: |
|
st.markdown(""" |
|
##### Citations :round_pushpin: |
|
``` |
|
@article{wang2024audiobench, |
|
title={AudioBench: A Universal Benchmark for Audio Large Language Models}, |
|
author={Wang, Bin and Zou, Xunlong and Lin, Geyu and Sun, Shuo and Liu, Zhuohan and Zhang, Wenyu and Liu, Zhengyuan and Aw, AiTi and Chen, Nancy F}, |
|
journal={NAACL}, |
|
year={2025} |
|
} |
|
``` |
|
``` |
|
@article{zhang2024mowe, |
|
title={MoWE-Audio: Multitask AudioLLMs with Mixture of Weak Encoders}, |
|
author={Zhang, Wenyu and Sun, Shuo and Wang, Bin and Zou, Xunlong and Liu, Zhuohan and He, Yingxu and Lin, Geyu and Chen, Nancy F and Aw, Ai Ti}, |
|
journal={ICASSP}, |
|
year={2025} |
|
} |
|
``` |
|
``` |
|
@article{wang2025advancing, |
|
title={Advancing Singlish Understanding: Bridging the Gap with Datasets and Multimodal Models}, |
|
author={Wang, Bin and Zou, Xunlong and Sun, Shuo and Zhang, Wenyu and He, Yingxu and Liu, Zhuohan and Wei, Chengwei and Chen, Nancy F and Aw, AiTi}, |
|
journal={arXiv preprint arXiv:2501.01034}, |
|
year={2025} |
|
} |
|
``` |
|
``` |
|
@article{he2024meralion, |
|
title={MERaLiON-AudioLLM: Technical Report}, |
|
author={He, Yingxu and Liu, Zhuohan and Sun, Shuo and Wang, Bin and Zhang, Wenyu and Zou, Xunlong and Chen, Nancy F and Aw, Ai Ti}, |
|
journal={arXiv preprint arXiv:2412.09818}, |
|
year={2024} |
|
} |
|
``` |
|
""") |
|
|
|
def show_examples_in_page(data): |
|
|
|
    '''

    Show examples from the selected dataset, toggled via a button.

    '''
|
|
|
|
|
if "show_dataset_examples" not in st.session_state: |
|
st.session_state.show_dataset_examples = False |
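    # The button below toggles this session-state flag so the examples stay
    # visible (or hidden) across Streamlit reruns.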
|
|
|
|
|
if st.button("Show Dataset Examples"): |
|
st.session_state.show_dataset_examples = not st.session_state.show_dataset_examples |
|
|
|
if st.session_state.show_dataset_examples: |
|
try: |
|
show_dataset_examples(data) |
|
        except Exception:

            st.markdown('Examples for this dataset are not yet available.')
|
|
|
|
|
|
|
def asr_english(): |
|
st.title("Task: Automatic Speech Recognition - English") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + asr_english_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('asr_english', ['wer']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer']) |
|
show_examples_in_page(filter_1) |
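            # draw() (from app.draw_diagram) renders the per-dataset results
            # chart; its first argument appears to be a task-category key
            # ('su', 'asu', or 'vu' elsewhere in this file).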
|
draw('su', 'asr_english', filter_1, 'wer', cus_sort=True) |
|
|
|
|
|
def asr_singlish(): |
|
st.title("Task: Automatic Speech Recognition - Singlish") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + asr_singlish_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('asr_singlish', ['wer']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer']) |
|
show_examples_in_page(filter_1) |
|
draw('su', 'asr_singlish', filter_1, 'wer') |
|
|
|
|
|
def asr_mandarin(): |
|
st.title("Task: Automatic Speech Recognition - Mandarin") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + asr_mandarin_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('asr_mandarin', ['wer']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer']) |
|
show_examples_in_page(filter_1) |
|
draw('su', 'asr_mandarin', filter_1, 'wer') |
|
|
|
|
|
def asr_malay(): |
|
st.title("Task: Automatic Speech Recognition - Malay") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + asr_malay_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('asr_malay', ['wer']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer']) |
|
show_examples_in_page(filter_1) |
|
draw('su', 'asr_malay', filter_1, 'wer') |
|
|
|
|
|
def asr_tamil(): |
|
st.title("Task: Automatic Speech Recognition - Tamil") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + asr_tamil_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('asr_tamil', ['wer']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer']) |
|
show_examples_in_page(filter_1) |
|
draw('su', 'asr_tamil', filter_1, 'wer') |
|
|
|
|
|
def asr_indonesian(): |
|
st.title("Task: Automatic Speech Recognition - Indonesian") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + asr_indonesian_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('asr_indonesian', ['wer']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer']) |
|
show_examples_in_page(filter_1) |
|
draw('su', 'asr_indonesian', filter_1, 'wer') |
|
|
|
|
|
def asr_thai(): |
|
st.title("Task: Automatic Speech Recognition - Thai") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + asr_thai_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('asr_thai', ['wer']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer']) |
|
show_examples_in_page(filter_1) |
|
draw('su', 'asr_thai', filter_1, 'wer') |
|
|
|
|
|
def asr_vietnamese(): |
|
st.title("Task: Automatic Speech Recognition - Vietnamese") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + asr_vietnamese_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('asr_vietnamese', ['wer']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer']) |
|
show_examples_in_page(filter_1) |
|
draw('su', 'asr_vietnamese', filter_1, 'wer') |
|
|
|
|
|
def asr_private(): |
|
st.title("Task: Automatic Speech Recognition - Private Datasets") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + asr_private_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('asr_private', ['wer']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer']) |
|
show_examples_in_page(filter_1) |
|
draw('su', 'asr_private', filter_1, 'wer') |
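# The ASR page functions above share one structure and differ only in the page
# title, dataset list, task key, and sort option. Below is a minimal
# consolidation sketch (a hypothetical helper, not wired in; it assumes
# draw()'s cus_sort keyword defaults to False, as the calls above suggest):

def _asr_task_page(language_title, task_key, datasets, cus_sort=False):

    st.title(f"Task: Automatic Speech Recognition - {language_title}")

    filters_levelone = ['Overall'] + datasets

    left, _, _, _, _ = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])

    with left:

        filter_1 = st.selectbox('Dataset', filters_levelone)

    if filter_1 == 'Overall':

        sum_table_mulit_metrix(task_key, ['wer'])

    elif filter_1:

        dataset_contents(dataset_diaplay_information[filter_1], metrics_info['wer'])

        show_examples_in_page(filter_1)

        draw('su', task_key, filter_1, 'wer', cus_sort=cus_sort)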
|
|
|
|
|
def speech_translation(): |
|
st.title("Task: Speech Translation") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + speech_translation_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('st', ['bleu']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['bleu']) |
|
show_examples_in_page(filter_1) |
|
draw('su', 'ST', filter_1, 'bleu') |
|
|
|
|
|
def speech_question_answering_english(): |
|
st.title("Task: Spoken Question Answering - English") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + speech_qa_english_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
            sum_table_mulit_metrix('sqa_english', ['llama3_70b_judge'])

        else:
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge']) |
|
show_examples_in_page(filter_1) |
|
draw('su', 'sqa_english', filter_1, 'llama3_70b_judge') |
|
|
|
|
|
def speech_question_answering_singlish(): |
|
st.title("Task: Spoken Question Answering - Singlish") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + speech_qa_singlish_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
            sum_table_mulit_metrix('sqa_singlish', ['llama3_70b_judge'])

        else:
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge']) |
|
show_examples_in_page(filter_1) |
|
draw('su', 'sqa_singlish', filter_1, 'llama3_70b_judge') |
|
|
|
|
|
def spoken_dialogue_summarization_singlish(): |
|
st.title("Task: Spoken Dialogue Summarization - Singlish") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + sds_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('sds_singlish', ['llama3_70b_judge']) |
|
|
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge']) |
|
show_examples_in_page(filter_1) |
|
draw('su', 'sds_singlish', filter_1, 'llama3_70b_judge') |
|
|
|
|
|
def speech_instruction(): |
|
st.title("Task: Speech Instruction") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + si_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('speech_instruction', ['llama3_70b_judge']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge']) |
|
show_examples_in_page(filter_1) |
|
draw('su', 'speech_instruction', filter_1, 'llama3_70b_judge') |
|
|
|
|
|
def audio_captioning(): |
|
st.title("Task: Audio Captioning") |
|
|
|
filters_levelone = ac_datasets |
|
|
|
filters_leveltwo = ['Llama3-70b-judge', 'Meteor'] |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
with middle: |
|
metric = st.selectbox('Metric', filters_leveltwo) |
|
|
|
    if filter_1 and metric:
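        # The displayed metric name maps to its metric key via
        # lower().replace('-', '_'), e.g. 'Llama3-70b-judge' -> 'llama3_70b_judge'.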
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info[metric.lower().replace('-', '_')]) |
|
show_examples_in_page(filter_1) |
|
draw('asu', 'audio_captioning', filter_1, metric.lower().replace('-', '_')) |
|
|
|
|
|
def audio_scene_question_answering(): |
|
st.title("Task: Audio Scene Question Answering") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + asqa_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('audio_scene_question_answering', ['llama3_70b_judge']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge']) |
|
show_examples_in_page(filter_1) |
|
draw('asu', 'audio_scene_question_answering', filter_1, 'llama3_70b_judge') |
|
|
|
|
|
def emotion_recognition(): |
|
st.title("Task: Emotion Recognition") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + er_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('emotion_recognition', ['llama3_70b_judge']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge']) |
|
show_examples_in_page(filter_1) |
|
draw('vu', 'emotion_recognition', filter_1, 'llama3_70b_judge') |
|
|
|
|
|
def accent_recognition(): |
|
st.title("Task: Accent Recognition") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + ar_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('accent_recognition', ['llama3_70b_judge']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge']) |
|
show_examples_in_page(filter_1) |
|
draw('vu', 'accent_recognition', filter_1, 'llama3_70b_judge') |
|
|
|
|
|
def gender_recognition(): |
|
st.title("Task: Gender Recognition") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + gr_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('gender_recognition', ['llama3_70b_judge']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge']) |
|
show_examples_in_page(filter_1) |
|
draw('vu', 'gender_recognition', filter_1, 'llama3_70b_judge') |
|
|
|
|
|
def music_understanding(): |
|
st.title("Task: Music Understanding - MCQ Questions") |
|
|
|
sum = ['Overall'] |
|
|
|
filters_levelone = sum + music_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
if filter_1: |
|
if filter_1 in sum: |
|
sum_table_mulit_metrix('music_understanding', ['llama3_70b_judge']) |
|
else: |
|
dataset_contents(dataset_diaplay_information[filter_1], metrics_info['llama3_70b_judge']) |
|
show_examples_in_page(filter_1) |
|
draw('vu', 'music_understanding', filter_1, 'llama3_70b_judge') |
|
|
|
|
|
def under_development(): |
|
st.title("Task: Under Development") |
|
|
|
filters_levelone = non_wer_development_datasets + wer_development_datasets |
|
|
|
    left, center, _, middle, right = st.columns([0.4, 0.2, 0.2, 0.2, 0.2])
|
|
|
with left: |
|
filter_1 = st.selectbox('Dataset', filters_levelone) |
|
|
|
    dataset_contents(dataset_diaplay_information[filter_1], 'Metrics for this dataset are still under development.')
|
|
|
show_examples_in_page(filter_1) |
|
|
|
if filter_1 in wer_development_datasets: |
|
draw('vu', 'under_development_wer', filter_1, 'wer') |
|
|
|
elif filter_1 in non_wer_development_datasets: |
|
draw('vu', 'under_development_llama3_70b_judge', filter_1, 'llama3_70b_judge') |
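# A minimal sketch (hypothetical; never called in this module) of how an entry
# script might wire these page functions into a sidebar router:

def _navigation_sketch():

    pages = {

        'Dashboard': dashboard,

        'ASR - English': asr_english,

        'ASR - Singlish': asr_singlish,

        'Speech Translation': speech_translation,

        'Spoken QA - English': speech_question_answering_english,

        'Music Understanding': music_understanding,

        'Under Development': under_development,

    }

    choice = st.sidebar.selectbox('Task', list(pages))

    pages[choice]()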
|
|