TRUEBench / src /data_loader.py
송종윤/AI Productivity팀(SR)/삼성전자
Initial commit
8a254d6
import pandas as pd
from pathlib import Path
from typing import Optional
# Module-level caches. Each one holds the DataFrame produced by the matching
# CSV loader once it has been requested, and is None while nothing is loaded
# (initially, and again after clear_cache() is called).
_category_df_cache: Optional[pd.DataFrame] = None
_language_df_cache: Optional[pd.DataFrame] = None
def _load_category_csv() -> pd.DataFrame:
    """Read the category statistics table stored next to this module.

    The file ``data/stats.csv`` is located relative to this module's
    directory and parsed as UTF-8, tab-separated text.

    Returns:
        pd.DataFrame: The parsed contents of the file.
    """
    csv_path = Path(__file__).parent.joinpath("data/stats.csv")
    table = pd.read_csv(str(csv_path), encoding="utf-8", sep="\t")
    return table.copy()
def _load_language_csv() -> pd.DataFrame:
    """Read the per-language statistics table stored next to this module.

    The file ``data/stats_lang.csv`` is located relative to this module's
    directory and parsed as UTF-8, tab-separated text.

    Returns:
        pd.DataFrame: The parsed contents of the file.
    """
    csv_path = Path(__file__).parent.joinpath("data/stats_lang.csv")
    table = pd.read_csv(str(csv_path), encoding="utf-8", sep="\t")
    return table.copy()
def get_category_dataframe(processed: bool = True) -> pd.DataFrame:
    """Return a copy of the cached category statistics table.

    The CSV is read through ``_load_category_csv`` on first use and kept in
    the module-level ``_category_df_cache``. Every call works on a copy, so
    callers may modify the result without affecting the cache.

    Args:
        processed: If True, return the table prepared for vis_utils.py:
            missing expected columns are added, the columns listed in
            ``constants.NUMERIC_COLS_CATEGORY`` are converted to numbers and
            rounded, and remaining missing values become empty strings.
            If False, return the raw table sorted by "Overall" in
            descending order (for data_utils.py).

    Returns:
        pd.DataFrame: The category dataframe.

    Raises:
        KeyError: If ``processed`` is False and the table has no "Overall"
            column (raised by ``DataFrame.sort_values``).
    """
    global _category_df_cache
    if _category_df_cache is None:
        _category_df_cache = _load_category_csv()
    df = _category_df_cache.copy()

    if not processed:
        # data_utils.py path: raw values, best "Overall" first.
        return df.sort_values("Overall", ascending=False)

    # vis_utils.py path.
    required_cols = [
        'Model Name', 'Link', "Group", "Overall", "Med. Len.",
        "Med. Resp. Len.", "Parameter Size (B)", "Type", "Model Type",
        "Think", 'Content Generation', 'Editing', 'Data Analysis',
        'Reasoning', 'Hallucination', 'Safety', 'Repetition',
        'Summarization', 'Translation', 'Multi-Turn',
    ]
    # Defaults for absent columns; anything not listed here falls back to 0.
    # "Think" must be handled here: filling it with the generic 0 would make
    # a later `"Think" not in df.columns` check unreachable, so the intended
    # "Off" default would never be applied.
    missing_defaults = {"Link": "", "Group": "", "Think": "Off"}
    for col in required_cols:
        if col not in df.columns:
            df[col] = missing_defaults.get(col, 0)

    # Project-local constants are imported lazily, so the unprocessed path
    # above does not depend on them.
    from constants import NUMERIC_COLS_CATEGORY, NUMERIC_INT_COLS_CATEGORY

    for col in NUMERIC_COLS_CATEGORY:
        if col not in df.columns:
            df[col] = 0
            continue
        # Integer-like columns are rounded to whole numbers, the rest to
        # three decimals; unparseable cells become NaN (blanked below).
        decimals = 0 if col in NUMERIC_INT_COLS_CATEGORY else 3
        df[col] = pd.to_numeric(df[col], errors='coerce').round(decimals)

    return df.fillna('')
def get_language_dataframe(processed: bool = True) -> pd.DataFrame:
    """Return a copy of the cached per-language statistics table.

    The CSV is read through ``_load_language_csv`` on first use and kept in
    the module-level ``_language_df_cache``; each call works on a copy.

    Args:
        processed: If True, return the table prepared for vis_utils.py:
            missing expected columns are added ("" for "Link"/"Group",
            0 otherwise), the columns listed in
            ``constants.NUMERIC_COLS_LANGUAGE`` are converted to numbers and
            rounded, and remaining missing values become empty strings.
            If False, return the raw table sorted by "Overall" in
            descending order (for data_utils.py).

    Returns:
        pd.DataFrame: The language dataframe.
    """
    global _language_df_cache
    if _language_df_cache is None:
        _language_df_cache = _load_language_csv()
    frame = _language_df_cache.copy()

    if not processed:
        # data_utils.py path: raw values, best "Overall" first.
        return frame.sort_values("Overall", ascending=False)

    # vis_utils.py path: make sure every expected column exists.
    expected = (
        'Model Name', 'Link', "Group", "Overall", "Med. Len.",
        "Med. Resp. Len.", "Parameter Size (B)", "Type", "Model Type",
        "Think", 'KO', 'EN', 'JA', 'ZH', 'PL', 'DE', 'PT', 'ES', 'FR',
        'IT', 'RU', 'VI',
    )
    for name in expected:
        if name in frame.columns:
            continue
        frame[name] = "" if name in ("Link", "Group") else 0

    from constants import NUMERIC_COLS_LANGUAGE, NUMERIC_INT_COLS_LANGUAGE

    for name in NUMERIC_COLS_LANGUAGE:
        if name not in frame.columns:
            frame[name] = 0
            continue
        # Whole numbers for integer-like columns, three decimals otherwise;
        # cells that cannot be parsed become NaN and are blanked below.
        decimals = 0 if name in NUMERIC_INT_COLS_LANGUAGE else 3
        frame[name] = pd.to_numeric(frame[name], errors='coerce').round(decimals)

    return frame.fillna('')
def clear_cache():
    """Drop both cached dataframes.

    Resets the category and language caches to None, so the next call to
    either getter reads its CSV file again.
    """
    global _category_df_cache, _language_df_cache
    _category_df_cache = _language_df_cache = None