import pandas as pd
import json
import os
import glob
import gradio as gr
import traceback
import re
import plotly.express as px
import plotly.graph_objects as go
from src.envs import API, TOKEN, REPO_ID
import requests
import logging
from datetime import datetime
from dotenv import load_dotenv
from utils.rag_score_calculator import RAGScoreCalculator


logger = logging.getLogger("mezura.utils")


# Dedicated logger that records model submissions to their own log file
submission_logger = logging.getLogger("mezura.submissions")
submission_handler = logging.FileHandler("submissions.log")
submission_formatter = logging.Formatter('%(asctime)s - %(message)s')
submission_handler.setFormatter(submission_formatter)
submission_logger.addHandler(submission_handler)
submission_logger.setLevel(logging.INFO)


# Static license/dtype metadata for known models, keyed by the raw model name
MODEL_METADATA_LOOKUP = {
    "mistralai/Magistral-Small-2506": {"license": "Apache 2.0", "dtype": "bfloat16"},
    "newmindai/Qwen2.5-72B-Instruct": {"license": "Qwen", "dtype": "bfloat16"},
    "Qwen/Qwen2.5-72B-Instruct": {"license": "Qwen", "dtype": "bfloat16"},
    "deepseek-ai/DeepSeek-R1": {"license": "MIT", "dtype": "bfloat16"},
    "Qwen/Qwen3-32B": {"license": "Qwen", "dtype": "bfloat16"},
    "newmindai/QwQ-32B-r1": {"license": "Apache 2.0", "dtype": "bfloat16"},
    "google/gemma-3-27b-it": {"license": "Gemma", "dtype": "bfloat16"},
    "Qwen/Qwen3-14B": {"license": "Apache 2.0", "dtype": "bfloat16"},
    "newmindai/Llama-3.3-70b-Instruct": {"license": "Llama-3.3", "dtype": "bfloat16"},
    "Qwen/QwQ-32B": {"license": "Apache 2.0", "dtype": "bfloat16"},
    "microsoft/phi-4": {"license": "MIT", "dtype": "bfloat16"},
    "meta-llama/Meta-Llama-3.1-70B-Instruct": {"license": "Llama 3.1", "dtype": "bfloat16"},
    "grok-3": {"license": "Proprietary", "dtype": "Unknown"},
    "grok-3-mini-fast": {"license": "Proprietary", "dtype": "Unknown"},
    "meta-llama/Llama-3.3-70B-Instruct": {"license": "Llama-3.3", "dtype": "bfloat16"},
    "meta-llama/Llama-3.3-70b-Instruct": {"license": "Llama 3.3", "dtype": "bfloat16"},
    "newmindai/Qwen2.5-72b-Instruct": {"license": "Qwen", "dtype": "bfloat16"},
    "grok-3-mini-fast-beta": {"license": "Proprietary", "dtype": "Unknown"},
    "deepseek-r1-distill-llama-70b": {"license": "MIT", "dtype": "bfloat16"},
    "qwen-qwq-32b": {"license": "Apache 2.0", "dtype": "bfloat16"}
}

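# Models that are not listed in MODEL_METADATA_LOOKUP fall back to dtype="unknown" and
# license="Unknown" in the table builders below.
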
def log_model_submission(repo_id, base_model):
    """
    Logs model submission details to a dedicated log file.

    Args:
        repo_id: The repository ID of the model
        base_model: The base model used
    """
    submission_logger.info(f"SUBMISSION - REPO_ID: {repo_id}, BASE_MODEL: {base_model}")

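# Illustrative usage (not executed here):
#   log_model_submission("org/my-model", "meta-llama/Llama-3.3-70B-Instruct")
# appends a line like
#   "2025-01-01 12:00:00,000 - SUBMISSION - REPO_ID: org/my-model, BASE_MODEL: meta-llama/Llama-3.3-70B-Instruct"
# to submissions.log.
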
def restart_space():
    try:
        if API is not None:
            API.restart_space(repo_id=REPO_ID, token=TOKEN)
        else:
            print("Warning: API is None, cannot restart space")
    except Exception as e:
        print(f"Warning: Could not restart space: {e}")

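# NOTE: the helpers below reference an AutoEvalColumn class (model, model_type_symbol, dummy
# and still_on_hub column descriptors) that is not imported in this module; it is assumed to
# be provided by the leaderboard's display utilities elsewhere in this repo.
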
def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
    selected_columns = [AutoEvalColumn.model_type_symbol.name, AutoEvalColumn.model.name]

    for column in columns:
        if column in df.columns:
            selected_columns.append(column)

    selected_columns.append(AutoEvalColumn.dummy.name)

    return df[selected_columns]

def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
    if not query:
        return filtered_df

    queries = query.split(";")
    filtered_dfs = []

    for q in queries:
        q = q.strip()
        if not q:
            continue
        filtered_dfs.append(filtered_df[filtered_df[AutoEvalColumn.dummy.name].str.contains(q, case=False)])

    if not filtered_dfs:
        return filtered_df

    return pd.concat(filtered_dfs).drop_duplicates()

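# Illustrative: filter_queries("llama;qwen", df) keeps the rows whose AutoEvalColumn.dummy
# value contains "llama" or "qwen" (case-insensitive), then drops duplicate rows.
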
def filter_models(
    df: pd.DataFrame
) -> pd.DataFrame:
    filtered_df = df.copy()

    # Keep only models that are still available on the Hub
    filtered_df = filtered_df[filtered_df[AutoEvalColumn.still_on_hub.name]]

    return filtered_df

def load_benchmark_results():
    """
    Load benchmark results from local files.
    """
    results = {
        "avg": {
            "evalmix": [],
            "light_eval": [],
            "snake": [],
            "retrieval": [],
            "arena": [],
            "human_arena": []
        },
        "raw": {
            "evalmix": [],
            "light_eval": [],
            "snake": [],
            "retrieval": [],
            "arena": [],
            "human_arena": []
        }
    }

    benchmark_types = ["evalmix", "light_eval", "snake", "retrieval", "arena", "human_arena"]

    # Pre-calculate RAG scores once so that every retrieval file can reuse them
    rag_calculator = None
    rag_scores_cache = {}
    try:
        rag_calculator = RAGScoreCalculator()
        if rag_calculator.stats:
            logger.info("RAG Score calculator initialized successfully")

            for data in rag_calculator.all_data:
                run_id = data.get('run_id')
                if run_id:
                    rag_score = rag_calculator.calculate_rag_score(data)
                    rag_scores_cache[run_id] = rag_score
            logger.info(f"Pre-calculated {len(rag_scores_cache)} RAG scores")
        else:
            logger.warning("No RAG statistics available for score calculation")
    except Exception as e:
        logger.warning(f"Could not initialize RAG Score calculator: {e}")
        rag_calculator = None

    for benchmark_type in benchmark_types:
        dir_path = f"result/{benchmark_type}"

        if not os.path.exists(dir_path):
            continue

        # Load aggregated result files (avg_*.json)
        avg_files = glob.glob(f"{dir_path}/avg_*.json")

        for file in avg_files:
            try:
                with open(file, "r") as f:
                    data = json.load(f)

                if isinstance(data, list):
                    # Arena results may arrive as a list of dicts; flatten them into a single record
                    if benchmark_type == "arena" and len(data) > 0:
                        processed_data = {
                            "model_name": f"Model {os.path.basename(file).replace('avg_', '').split('.')[0]}",
                            "file": os.path.basename(file)
                        }

                        if len(data) > 0:
                            for i, item in enumerate(data):
                                if isinstance(item, dict):
                                    for key, value in item.items():
                                        processed_data[f"item_{i}_{key}"] = value

                        data = processed_data
                    else:
                        data = {"model_name": f"Model {os.path.basename(file).replace('avg_', '').split('.')[0]}"}
                else:
                    if not isinstance(data, dict):
                        data = {"model_name": f"Model {os.path.basename(file).replace('avg_', '').split('.')[0]}"}

                data["file"] = os.path.basename(file)

                if "model_name" not in data or not data["model_name"]:
                    file_name = os.path.basename(file)
                    model_id = file_name.replace("avg_", "").split(".")[0]
                    data["model_name"] = f"Model {model_id}"

                if "model_name" in data:
                    data["model_name"] = format_model_name(data["model_name"])

                # Attach the pre-calculated RAG score for retrieval runs
                if benchmark_type == "retrieval" and rag_scores_cache:
                    run_id = data.get('run_id')
                    if run_id and run_id in rag_scores_cache:
                        data["RAG_score"] = rag_scores_cache[run_id]
                        logger.debug(f"Added cached RAG_score {rag_scores_cache[run_id]} for avg file {data.get('model_name', 'unknown')}")
                    else:
                        logger.debug(f"No cached RAG_score found for run_id: {run_id}")

                results["avg"][benchmark_type].append(data)
            except Exception as e:
                print(f"Error loading {benchmark_type} avg file: {file} - {e}")

        # Load per-run detail files (detail_*.json)
        detail_files = glob.glob(f"{dir_path}/detail_*.json")

        for file in detail_files:
            try:
                with open(file, "r") as f:
                    data = json.load(f)

                if isinstance(data, list):
                    if benchmark_type == "arena" and len(data) > 0:
                        processed_data = {
                            "model_name": f"Model {os.path.basename(file).replace('detail_', '').split('.')[0]}",
                            "file": os.path.basename(file)
                        }

                        if len(data) > 0:
                            for i, item in enumerate(data):
                                if isinstance(item, dict):
                                    for key, value in item.items():
                                        processed_data[f"item_{i}_{key}"] = value

                        data = processed_data
                    else:
                        data = {"model_name": f"Model {os.path.basename(file).replace('detail_', '').split('.')[0]}"}
                else:
                    if not isinstance(data, dict):
                        data = {"model_name": f"Model {os.path.basename(file).replace('detail_', '').split('.')[0]}"}

                data["file"] = os.path.basename(file)

                if "model_name" not in data or not data["model_name"]:
                    file_name = os.path.basename(file)
                    model_id = file_name.replace("detail_", "").split(".")[0]
                    data["model_name"] = f"Model {model_id}"

                if "model_name" in data:
                    data["model_name"] = format_model_name(data["model_name"])

                if benchmark_type == "retrieval" and rag_scores_cache:
                    run_id = data.get('run_id')
                    if run_id and run_id in rag_scores_cache:
                        data["RAG_score"] = rag_scores_cache[run_id]
                        logger.debug(f"Added cached RAG_score {rag_scores_cache[run_id]} for detail file {data.get('model_name', 'unknown')}")
                    else:
                        logger.debug(f"No cached RAG_score found for detail run_id: {run_id}")

                results["raw"][benchmark_type].append(data)

                # Also expose a simplified entry in the avg results when no avg file covers this model
                simplified_data = {"model_name": data["model_name"], "file": data["file"]}

                if benchmark_type == "retrieval":
                    if "RAG_score" in data:
                        simplified_data["RAG_score"] = data["RAG_score"]
                    if "RAG_success_rate" in data:
                        simplified_data["RAG_success_rate"] = data["RAG_success_rate"]
                    if "average_judge_score" in data:
                        simplified_data["average_judge_score"] = data["average_judge_score"]

                if not any(item.get("model_name") == data["model_name"] for item in results["avg"][benchmark_type]):
                    results["avg"][benchmark_type].append(simplified_data)
            except Exception as e:
                print(f"Error loading {benchmark_type} detail file: {file} - {e}")

    return results

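# Expected on-disk layout consumed by load_benchmark_results(), as implied by the glob patterns
# above (illustrative):
#   result/<benchmark_type>/avg_<run>.json     -> aggregated scores, one record per model/run
#   result/<benchmark_type>/detail_<run>.json  -> per-run detail records
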
def format_model_name(model_name):
    """
    Formats model names for better display in leaderboards:
    - Replaces underscores with spaces
    - Preserves original casing

    Args:
        model_name: Original model name string

    Returns:
        str: Formatted model name
    """
    if not model_name:
        return model_name

    if "/" in model_name:
        org, name = model_name.split("/", 1)
        formatted_name = name.replace("_", " ")
        return f"{org}/{formatted_name}"
    else:
        return model_name.replace("_", " ")

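# Illustrative examples:
#   format_model_name("newmindai/Llama-3.3-70b_Instruct")  -> "newmindai/Llama-3.3-70b Instruct"
#   format_model_name("grok_3_mini")                       -> "grok 3 mini"
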
def create_evalmix_table(data):
    """
    Creates a table from hybrid (EvalMix) benchmark results.
    """
    if not data:
        return pd.DataFrame()

    # Normalise model names and attach dtype/license metadata
    for item in data:
        if "model_name" in item:
            raw_model_name = item["model_name"]
            item["model_name"] = format_model_name(raw_model_name)

            for field in ["dtype", "license"]:
                if raw_model_name in MODEL_METADATA_LOOKUP:
                    item[field] = MODEL_METADATA_LOOKUP[raw_model_name][field]
                else:
                    defaults = {"dtype": "unknown", "license": "Unknown"}
                    item[field] = defaults[field]

    df = pd.DataFrame(data)

    if 'file' in df.columns:
        df = df.drop(columns=['file'])

    # Drop sample-count columns that are not useful in the leaderboard
    sample_columns = ["total_samples", "Total Samples", "samples_number"]
    for col in sample_columns:
        if col in df.columns:
            df = df.drop(columns=[col])

    if "model_name" in df.columns:
        df = df.sort_values(by="model_name")

    # Derive an overall average from the individual metric columns
    if all(col in df.columns for col in ["lexical_metric", "semantic_metric"]):
        if "judge_metric" in df.columns:
            df["average_score"] = df[["lexical_metric", "semantic_metric", "judge_metric"]].mean(axis=1).round(2)
        else:
            df["average_score"] = df[["lexical_metric", "semantic_metric"]].mean(axis=1).round(2)

    for column in df.columns:
        try:
            if pd.api.types.is_float_dtype(df[column]):
                df[column] = df[column].round(2)
        except Exception:
            continue

    # Map raw column names to display names; anything unknown is title-cased
    explicit_names = {
        "model_name": "Model Name",
        "average_score": "Average Score",
        "lexical_metric": "Lexical Score",
        "semantic_metric": "Semantic Score",
        "judge_metric": "Judge Score",
        "openai_accuracy": "OpenAI Accuracy",
        "dtype": "Dtype",
        "license": "License",
    }
    column_mapping = {}
    for col in df.columns:
        if col in explicit_names:
            column_mapping[col] = explicit_names[col]
        elif "turkish_semantic" in col.lower():
            column_mapping[col] = "Turkish Semantic"
        elif "multilingual_semantic" in col.lower():
            column_mapping[col] = "Multilingual Semantic"
        else:
            column_mapping[col] = " ".join(word.capitalize() for word in col.replace("_", " ").split())

    df = df.rename(columns=column_mapping)

    if "Turkish Semantic" in df.columns:
        df = df.sort_values(by="Turkish Semantic", ascending=False)
    elif "turkish_semantic" in df.columns:
        df = df.sort_values(by="turkish_semantic", ascending=False)

    # Preferred column order, followed by any remaining columns
    desired_cols = [
        "Model Name",
        "Turkish Semantic",
        "Multilingual Semantic",
        "Average Score",
        "Lexical Score",
        "Semantic Score",
        "Judge Score",
        "OpenAI Accuracy",
        "Dtype",
        "License"
    ]

    final_cols = [col for col in desired_cols if col in df.columns]
    remaining_cols = [col for col in df.columns if col not in final_cols]
    final_cols.extend(remaining_cols)

    df = df[final_cols]

    return df

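# Illustrative: with lexical_metric=0.40, semantic_metric=0.80 and judge_metric=0.60, the derived
# average_score column above is round(mean([0.40, 0.80, 0.60]), 2) == 0.6.
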
def create_light_eval_table(data, is_detail=False):
    """
    Creates a table from Light Eval results.

    Args:
        data: Light eval data
        is_detail: If True, keep 4 decimal places for detail results
    """
    if not data:
        return pd.DataFrame()

    formatted_data = []
    for item in data:
        model_data = {"model_name": format_model_name(item.get("model_name", "Unknown Model"))}

        metrics = [
            "overall_average",
            "mmlu_average",
            "truthfulqa",
            "winogrande",
            "hellaswag",
            "gsm8k",
            "arc_challenge",
            "dtype",
            "license"
        ]

        for metric in metrics:
            try:
                if metric in ["dtype", "license"]:
                    # dtype/license come from the static lookup, not the result file
                    raw_model_name = item.get("model_name", "")
                    if raw_model_name in MODEL_METADATA_LOOKUP:
                        model_data[metric] = MODEL_METADATA_LOOKUP[raw_model_name][metric]
                    else:
                        defaults = {"dtype": "unknown", "license": "Unknown"}
                        model_data[metric] = defaults[metric]
                elif metric in item:
                    if metric == "overall_average" and item[metric] == "N/A":
                        model_data[metric] = "N/A"
                    elif isinstance(item[metric], str) and item[metric] != "N/A":
                        model_data[metric] = float(item[metric])
                    else:
                        model_data[metric] = item[metric]
                else:
                    model_data[metric] = "N/A"
            except Exception:
                if metric in ["dtype", "license"]:
                    defaults = {"dtype": "unknown", "license": "Unknown"}
                    model_data[metric] = defaults[metric]
                else:
                    model_data[metric] = item.get(metric, "N/A")

        formatted_data.append(model_data)

    df = pd.DataFrame(formatted_data)

    if 'file' in df.columns:
        df = df.drop(columns=['file'])

    # Coerce numeric-looking strings to floats where possible
    numeric_cols = ["overall_average", "mmlu_average", "truthfulqa", "winogrande", "hellaswag", "gsm8k", "arc_challenge"]
    for col in numeric_cols:
        if col in df.columns:
            try:
                df[col] = df[col].apply(lambda x: float(x) if isinstance(x, (int, float)) or (isinstance(x, str) and x != "N/A") else x)
            except Exception:
                pass

    # Sort by overall average, treating non-numeric values as the lowest
    if "overall_average" in df.columns:
        sort_col = pd.to_numeric(df["overall_average"], errors="coerce")
        df = df.iloc[sort_col.fillna(-1).argsort(kind="stable").iloc[::-1]]

    decimal_places = 4 if is_detail else 2
    for column in df.columns:
        try:
            if pd.api.types.is_float_dtype(df[column]):
                df[column] = df[column].round(decimal_places)
        except Exception:
            continue

    column_mapping = {
        "model_name": "Model Name",
        "overall_average": "Overall",
        "mmlu_average": "MMLU",
        "truthfulqa": "Truthfulqa",
        "winogrande": "Winogrande",
        "hellaswag": "Hellaswag",
        "gsm8k": "Gsm8k",
        "arc_challenge": "ARC",
        "dtype": "Dtype",
        "license": "License"
    }

    df = df.rename(columns=column_mapping)

    desired_cols = [
        "Model Name",
        "Overall",
        "MMLU",
        "Truthfulqa",
        "Winogrande",
        "Hellaswag",
        "Gsm8k",
        "ARC",
        "Dtype",
        "License"
    ]

    final_cols = [col for col in desired_cols if col in df.columns]
    remaining_cols = [col for col in df.columns if col not in final_cols]
    final_cols.extend(remaining_cols)

    df = df[final_cols]

    return df

def create_benchmark_plots(benchmark_data, data_type="avg"):
    """
    Creates plots from benchmark data.

    Args:
        benchmark_data: Benchmark results as returned by load_benchmark_results()
        data_type: Either "avg" or "raw"
    """
    plots = {}

    # Hybrid (EvalMix) bar chart. Note: create_evalmix_table() returns display column names
    # ("Model Name", "Lexical Score", ...), so those are the names checked here.
    if benchmark_data[data_type]["evalmix"]:
        df = create_evalmix_table(benchmark_data[data_type]["evalmix"])
        if not df.empty and all(col in df.columns for col in ["Model Name", "Lexical Score", "Semantic Score"]):
            metrics = ["Lexical Score", "Semantic Score"]
            if "Judge Score" in df.columns:
                metrics.append("Judge Score")

            plot_df = pd.melt(
                df,
                id_vars=["Model Name"],
                value_vars=metrics,
                var_name="Metric",
                value_name="Value"
            )

            fig = px.bar(
                plot_df,
                x="Model Name",
                y="Value",
                color="Metric",
                title="Hybrid Benchmark Results",
                labels={"Model Name": "Model", "Value": "Score"},
                barmode="group"
            )
            plots["evalmix"] = fig

    # Light Eval radar chart over the individual task scores
    if benchmark_data[data_type]["light_eval"]:
        df = create_light_eval_table(benchmark_data[data_type]["light_eval"])
        if not df.empty:
            non_metric_cols = ["Model Name", "Overall", "Dtype", "License", "file"]
            metric_cols = [col for col in df.columns
                           if col not in non_metric_cols and pd.api.types.is_numeric_dtype(df[col])]
            if metric_cols:
                fig = go.Figure()

                for _, row in df.iterrows():
                    fig.add_trace(go.Scatterpolar(
                        r=[row[col] for col in metric_cols],
                        theta=metric_cols,
                        fill='toself',
                        name=row.get("Model Name", "Unknown Model")
                    ))

                fig.update_layout(
                    polar=dict(
                        radialaxis=dict(
                            visible=True,
                            range=[0, 1]
                        )
                    ),
                    title="Light Eval Results",
                    showlegend=True
                )
                plots["light_eval"] = fig

    return plots

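# Note: the radar chart above assumes Light Eval task scores are normalised to the 0-1 range
# (radialaxis range=[0, 1]); values outside that range would not be displayed correctly.
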
def create_combined_leaderboard_table(benchmark_data):
    """
    Creates a combined leaderboard table from the per-benchmark results.
    """
    benchmark_types = ["evalmix", "light_eval", "retrieval", "arena", "human_arena"]

    all_models = {}

    for benchmark_type in benchmark_types:
        # human_arena and retrieval scores live in the raw detail data; the rest use avg data
        if benchmark_type in ["human_arena", "retrieval"]:
            data_source = benchmark_data["raw"][benchmark_type]
        else:
            data_source = benchmark_data["avg"][benchmark_type]

        if not data_source:
            continue

        for item in data_source:
            model_name = item.get("model_name", "")
            if not model_name:
                continue

            formatted_model_name = format_model_name(model_name)

            if formatted_model_name not in all_models:
                all_models[formatted_model_name] = {"model_name": formatted_model_name}

                for field in ["dtype", "license"]:
                    if model_name in MODEL_METADATA_LOOKUP:
                        all_models[formatted_model_name][field] = MODEL_METADATA_LOOKUP[model_name][field]
                    else:
                        defaults = {"dtype": "unknown", "license": "Unknown"}
                        all_models[formatted_model_name][field] = defaults[field]

            if benchmark_type == "evalmix":
                if "lexical_metric" in item:
                    all_models[formatted_model_name]["Lexical"] = round(item.get("lexical_metric", 0), 2)
                if "semantic_metric" in item:
                    all_models[formatted_model_name]["Multilingual Semantic"] = round(item.get("semantic_metric", 0), 2)

                if "turkish_semantic" in item:
                    all_models[formatted_model_name]["Turkish Semantic"] = round(item.get("turkish_semantic", 0), 2)
                elif "turkish_semantic_" in item:
                    all_models[formatted_model_name]["Turkish Semantic"] = round(item.get("turkish_semantic_", 0), 2)
                elif "nlp_metrics" in item and "cosine_similarity_turkish" in item.get("nlp_metrics", {}):
                    turkish_sim = item.get("nlp_metrics", {}).get("cosine_similarity_turkish", {}).get("mean", 0)
                    all_models[formatted_model_name]["Turkish Semantic"] = round(turkish_sim, 2)

                if "multilingual_semantic" in item:
                    all_models[formatted_model_name]["Multilingual Semantic"] = round(item.get("multilingual_semantic", 0), 2)
                elif "multilingual_semantic_" in item:
                    all_models[formatted_model_name]["Multilingual Semantic"] = round(item.get("multilingual_semantic_", 0), 2)
                elif "nlp_metrics" in item and "cosine_similarity_multilingual" in item.get("nlp_metrics", {}):
                    multi_sim = item.get("nlp_metrics", {}).get("cosine_similarity_multilingual", {}).get("mean", 0)
                    all_models[formatted_model_name]["Multilingual Semantic"] = round(multi_sim, 2)

                if "bert_score" in item and isinstance(item.get("bert_score"), dict) and "f1" in item.get("bert_score", {}):
                    bert_f1 = item.get("bert_score", {}).get("f1", {}).get("mean", 0)
                    all_models[formatted_model_name]["BERTScore F1"] = round(bert_f1, 2)
                elif "nlp_metrics" in item and "bert_score" in item.get("nlp_metrics", {}):
                    bert_f1 = item.get("nlp_metrics", {}).get("bert_score", {}).get("f1", {}).get("mean", 0)
                    all_models[formatted_model_name]["BERTScore F1"] = round(bert_f1, 2)

            elif benchmark_type == "light_eval":
                if "overall_average" in item:
                    try:
                        if isinstance(item["overall_average"], str) and item["overall_average"] != "N/A":
                            avg_value = float(item["overall_average"])
                        else:
                            avg_value = item["overall_average"]
                        all_models[formatted_model_name]["Light Eval"] = round(avg_value, 2)
                    except (ValueError, TypeError):
                        all_models[formatted_model_name]["Light Eval"] = item["overall_average"]

            elif benchmark_type == "retrieval":
                if "RAG_score" in item:
                    avg_value = item["RAG_score"]
                    all_models[formatted_model_name]["Retrieval"] = round(avg_value, 4)
                elif "RAG_success_rate" in item:
                    avg_value = item["RAG_success_rate"]
                    all_models[formatted_model_name]["Retrieval"] = round(avg_value, 2)

            elif benchmark_type == "arena":
                if "Melo Score" in item:
                    all_models[formatted_model_name]["Auto Elo Score"] = round(item.get("Melo Score", 0), 2)

            elif benchmark_type == "human_arena":
                if "elo_rating" in item:
                    all_models[formatted_model_name]["Human Elo Score"] = round(item.get("elo_rating", 0), 2)

    if all_models:
        df = pd.DataFrame(list(all_models.values()))

        if "model_name" in df.columns:
            df = df.rename(columns={"model_name": "Model Name"})

        column_mapping = {
            "dtype": "Dtype",
            "license": "License"
        }
        df = df.rename(columns=column_mapping)

        if 'file' in df.columns:
            df = df.drop(columns=['file'])

        for field in ['run_id', 'user_id', 'Run Id', 'User Id']:
            if field in df.columns:
                df = df.drop(columns=[field])

        display_cols = [
            "Auto Elo Score",
            "Human Elo Score",
            "Retrieval",
            "Light Eval",
            "Turkish Semantic",
            "Multilingual Semantic",
            "Lexical",
            "Dtype",
            "License"
        ]
        valid_display_cols = [col for col in display_cols if col in df.columns]

        # Missing scores are shown as 0 rather than NaN
        for col in valid_display_cols:
            df[col] = df[col].fillna(0)

        desired_order = ["Model Name", "Auto Elo Score", "Human Elo Score", "Retrieval", "Light Eval", "Turkish Semantic", "Multilingual Semantic", "Lexical", "Dtype", "License"]

        actual_order = [col for col in desired_order if col in df.columns]

        if len(actual_order) > 0:
            df = df[actual_order]

        if "Auto Elo Score" in df.columns:
            df = df.sort_values(by="Auto Elo Score", ascending=False)
        elif "Human Elo Score" in df.columns:
            df = df.sort_values(by="Human Elo Score", ascending=False)

        for column in df.columns:
            try:
                if pd.api.types.is_float_dtype(df[column]):
                    df[column] = df[column].round(2)
            except Exception:
                continue

        return df

    return pd.DataFrame()

def create_raw_details_table(benchmark_data, benchmark_type):
    """
    Creates a detailed table from raw JSON data for a specific benchmark type.
    """
    if not benchmark_data["raw"][benchmark_type]:
        return pd.DataFrame()

    flattened_data = []

    for item in benchmark_data["raw"][benchmark_type]:
        raw_model_name = item.get("model_name", "Unknown Model")
        flat_item = {
            "file": item.get("file", ""),
            "model_name": format_model_name(raw_model_name)
        }

        for field in ["dtype", "license"]:
            if raw_model_name in MODEL_METADATA_LOOKUP:
                flat_item[field] = MODEL_METADATA_LOOKUP[raw_model_name][field]
            else:
                defaults = {"dtype": "unknown", "license": "Unknown"}
                flat_item[field] = defaults[field]

        # Bookkeeping fields that should never be shown in the leaderboard
        excluded_fields = ["file", "job_id", "start_time", "end_time", "run_id", "user_id",
                           "total_samples", "Total Samples", "samples_number", "sample_count", "eval_samples",
                           "total_success_references", "Total Success References", "total_eval_samples",
                           "provider", "Provider"]

        if benchmark_type == "light_eval":
            excluded_fields.append("mmlu_tasks")

        # Copy simple scalar fields first
        for key, value in item.items():
            if key not in excluded_fields and key not in ["dtype", "license"] and not key.startswith("_") and not isinstance(value, (dict, list)):
                flat_item[key] = value

        # Then flatten nested structures
        for key, value in item.items():
            if key.startswith("_") or key in excluded_fields:
                continue
            elif isinstance(value, dict):
                _flatten_dict(value, flat_item, prefix=key)
            elif isinstance(value, list) and all(isinstance(x, dict) for x in value):
                for i, sub_dict in enumerate(value):
                    _flatten_dict(sub_dict, flat_item, prefix=f"{key}_{i}")

        flattened_data.append(flat_item)

    df = pd.DataFrame(flattened_data)

    # Arena CI strings arrive as "-1.65/+2.66"; show them as "+2.66/-1.65"
    if benchmark_type == "arena" and "95%(CI)" in df.columns:
        def format_confidence_interval(ci_value):
            """Convert '-1.65/+2.66' to '+2.66/-1.65' format."""
            if isinstance(ci_value, str) and "/" in ci_value:
                parts = ci_value.split("/")
                if len(parts) == 2:
                    negative_part = parts[0].strip()
                    positive_part = parts[1].strip()

                    if negative_part.startswith("-"):
                        negative_num = negative_part[1:]
                    else:
                        negative_num = negative_part

                    if positive_part.startswith("+"):
                        positive_num = positive_part[1:]
                    else:
                        positive_num = positive_part

                    return f"+{positive_num}/-{negative_num}"
            return ci_value

        df["95%(CI)"] = df["95%(CI)"].apply(format_confidence_interval)

    if "model_name" in df.columns:
        cols = ["model_name"] + [col for col in df.columns if col != "model_name"]
        df = df[cols]

    for column in df.columns:
        try:
            if pd.api.types.is_float_dtype(df[column]):
                df[column] = df[column].round(2)
        except Exception:
            continue

    if 'file' in df.columns:
        df = df.drop(columns=['file'])

    column_mapping = {
        "model_name": "Model Name",
        "dtype": "Dtype",
        "license": "License"
    }

    if benchmark_type == "arena":
        custom_columns = {
            "Melo Score": "Auto Elo Score",
            "Win Rate": "Win Rate",
            "95%(CI)": "95% CI",
            "Response Tokens Average": "Completion Tokens",
            "dtype": "Dtype",
            "Licance": "License",  # "Licance" presumably matches the key spelling in the raw arena files
        }
        column_mapping.update(custom_columns)

    elif benchmark_type == "retrieval":
        custom_columns = {
            "RAG_score": "RAG Score",
            "RAG_success_rate": "Rag Success Rate",
            "max_correct_references": "Max Correct Ref.",
            "total_false_positives": "Hallucinate Ref.",
            "total_missed_references": "Missed Ref.",
            "average_judge_score": "Legal Judge Score"
        }
        column_mapping.update(custom_columns)

    elif benchmark_type == "evalmix":
        custom_columns = {
            "turkish_semantic_mean": "Turkish Semantic",
            "turkish_semantic": "Turkish Semantic",
            "multilingual_semantic_mean": "Multilingual Semantic",
            "multilingual_semantic": "Multilingual Semantic",
            "judge_metric": "Judge Score",
            "bleu mean": "BLEU",
            "rouge1 mean": "ROUGE-1",
            "rouge2 mean": "ROUGE-2",
            "rougeL mean": "ROUGE-L",
            "bert_score f1 mean": "BERTScore F1",
            "dtype": "Dtype",
            "license": "License",
            "bert_score precision mean": "BERTScore Precision"
        }
        column_mapping.update(custom_columns)

        # Collapse the three OpenAI judge metrics into a single judge average
        if all(col in df.columns for col in ["openai_accuracy", "openai_relevance", "openai_coherence"]):
            df["judge_average_score"] = df[["openai_accuracy", "openai_relevance", "openai_coherence"]].mean(axis=1).round(2)
            column_mapping["judge_average_score"] = "Judge Score"

            columns_to_drop = ["openai_accuracy", "openai_relevance", "openai_coherence"]
            for col in columns_to_drop:
                if col in df.columns:
                    df = df.drop(columns=[col])

    elif benchmark_type == "light_eval":
        custom_columns = {
            "overall_average": "Overall",
            "mmlu_average": "MMLU",
            "truthfulqa": "Truthfulqa",
            "winogrande": "Winogrande",
            "hellaswag": "Hellaswag",
            "gsm8k": "Gsm8k",
            "arc_challenge": "ARC",
            "dtype": "Dtype",
            "license": "License"
        }
        column_mapping.update(custom_columns)

    elif benchmark_type == "snake":
        custom_columns = {
            "elo": "Elo Rating",
            "win_rate": "Win Rate",
            "draw_rate": "Draw Rate",
            "dtype": "Dtype",
            "license": "License"
        }
        column_mapping.update(custom_columns)

    # Any column without an explicit mapping gets a title-cased display name
    for col in df.columns:
        if col not in column_mapping:
            cleaned_col = col.replace(" mean", "")
            formatted_col = " ".join([word.capitalize() for word in cleaned_col.replace("_", " ").split()])
            column_mapping[col] = formatted_col

    df = df.rename(columns=column_mapping)

    if benchmark_type == "retrieval" and "Success Ref." in df.columns:
        df = df.drop(columns=["Success Ref."])

    if "Total Success References" in df.columns:
        df = df.drop(columns=["Total Success References"])

    # Per-benchmark sorting and column ordering
    if benchmark_type == "arena" and "Auto Elo Score" in df.columns:
        df = df.sort_values(by="Auto Elo Score", ascending=False)

        desired_cols = [
            "Model Name",
            "Auto Elo Score",
            "Win Rate",
            "95% CI",
            "Completion Tokens",
            "Dtype",
            "License"
        ]

        final_cols = [col for col in desired_cols if col in df.columns]
        remaining_cols = [col for col in df.columns if col not in final_cols]
        final_cols.extend(remaining_cols)

        df = df[final_cols]

    elif benchmark_type == "retrieval":
        if "RAG Score" in df.columns:
            df = df.sort_values(by="RAG Score", ascending=False)
            primary_metric = "RAG Score"
        elif "Rag Success Rate" in df.columns:
            df = df.sort_values(by="Rag Success Rate", ascending=False)
            primary_metric = "Rag Success Rate"
        else:
            primary_metric = None

        desired_cols = [
            "Model Name",
            "RAG Score",
            "Rag Success Rate",
            "Max Correct Ref.",
            "Hallucinate Ref.",
            "Missed Ref.",
            "Legal Judge Score",
            "Dtype",
            "License"
        ]

        final_cols = [col for col in desired_cols if col in df.columns]
        remaining_cols = [col for col in df.columns if col not in final_cols]
        final_cols.extend(remaining_cols)

        df = df[final_cols]

    elif benchmark_type == "evalmix":
        if "Turkish Semantic" in df.columns:
            df = df.sort_values(by="Turkish Semantic", ascending=False)

        # Only the curated evalmix columns are kept for the detail view
        desired_cols = [
            "Model Name",
            "Turkish Semantic",
            "Multilingual Semantic",
            "Judge Score",
            "BLEU",
            "ROUGE-1",
            "ROUGE-2",
            "ROUGE-L",
            "BERTScore F1",
            "BERTScore Precision",
            "BERTScore Recall",
            "Dtype",
            "License"
        ]

        final_cols = [col for col in desired_cols if col in df.columns]

        df = df[final_cols]

    elif benchmark_type == "light_eval" and "Overall" in df.columns:
        df = df.sort_values(by="Overall", ascending=False)

    elif benchmark_type == "snake":
        if "Elo Rating" in df.columns:
            df = df.sort_values(by="Elo Rating", ascending=False)
        elif "Elo" in df.columns:
            df = df.sort_values(by="Elo", ascending=False)

        desired_cols = [
            "Model Name",
            "Elo Rating",
            "Win Rate",
            "Draw Rate",
            "Wins",
            "Losses",
            "Ties",
            "Loss Rate",
            "Dtype",
            "License"
        ]

        final_cols = [col for col in desired_cols if col in df.columns]
        remaining_cols = [col for col in df.columns if col not in final_cols]
        final_cols.extend(remaining_cols)

        df = df[final_cols]

    return df

def _flatten_dict(d, target_dict, prefix=""):
    """
    Flattens nested dictionaries.

    Args:
        d: Dictionary to flatten
        target_dict: Target dictionary to add flattened values to
        prefix: Key prefix
    """
    import numpy as np

    excluded_fields = ["total_success_references", "total_eval_samples",
                       "details", "metadata", "config", "logs"]

    # Some metric keys are renamed so that different file versions flatten to the same column
    special_field_mappings = {
        "turkish_semantic_mean": "turkish_semantic",
        "turkish_semantic_ mean": "turkish_semantic",
        "multilingual_semantic_mean": "multilingual_semantic"
    }

    for key, value in d.items():
        if key in excluded_fields:
            continue

        transformed_key = special_field_mappings.get(key, key)

        new_key = f"{prefix}_{transformed_key}" if prefix else transformed_key

        if isinstance(value, dict):
            _flatten_dict(value, target_dict, new_key)
        elif isinstance(value, list) and all(isinstance(x, dict) for x in value):
            for i, sub_dict in enumerate(value):
                _flatten_dict(sub_dict, target_dict, f"{new_key}_{i}")
        elif isinstance(value, list) and len(value) > 0:
            try:
                # Numeric lists are summarised by their mean (and std when there is more than one value)
                if all(isinstance(x, (int, float)) for x in value):
                    target_dict[f"{new_key}_mean"] = round(sum(value) / len(value), 2)
                    if len(value) > 1:
                        target_dict[f"{new_key}_std"] = round(np.std(value), 2)
                else:
                    target_dict[new_key] = str(value)
            except Exception:
                target_dict[new_key] = str(value)
        else:
            if isinstance(value, float):
                target_dict[new_key] = round(value, 2)
            else:
                target_dict[new_key] = value

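# Illustrative flattening behaviour (not executed at import time):
#   out = {}
#   _flatten_dict({"bert_score": {"f1": {"mean": 0.81}}}, out)
#   # -> out == {"bert_score_f1_mean": 0.81}
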
def update_supported_base_models():
    """
    Updates the list of supported base models by querying the API.
    This function is called when the application starts to keep the base model list up to date.
    """
    try:
        load_dotenv()

        api_key = os.getenv("API_KEY")
        if not api_key:
            logger.error("API_KEY not found in environment variables")
            return None

        url = os.getenv("API_URL")
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }

        # Deliberately submit an invalid base_model: the API's error response lists the supported models
        payload = {
            "source": "FILE_ID_BURAYA_GELECEK",  # placeholder (Turkish: "FILE ID GOES HERE"); the request is expected to fail
            "base_model": "non-existent-model/fake-model-123",
            "name": "test-invalid-model",
            "description": "Trying to see the supported models"
        }

        response = requests.post(url, headers=headers, json=payload)

        if response.status_code != 200:
            error_detail = response.json().get("detail", "")

            match = re.search(r"list of supported models: \[(.*?)\]", error_detail)
            if match:
                supported_models_str = match.group(1)
                supported_models = [model.strip("'") for model in supported_models_str.split(", ")]

                from api.config import update_base_model_list
                update_base_model_list(supported_models)

                logger.info(f"Successfully updated supported base models: {supported_models}")
                return supported_models
            else:
                logger.error("Could not extract supported models from API response")
                return None
        else:
            logger.error("Unexpected successful response from API")
            return None

    except Exception as e:
        logger.error(f"Error updating supported base models: {str(e)}")
        return None

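# Note on the probe above: update_supported_base_models() intentionally submits a non-existent
# base_model so that the API rejects the request with an error message embedding the list of
# supported models, which is then parsed with a regex and pushed to api.config.update_base_model_list.
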
def create_human_arena_table(data):
    """
    Creates the Human Arena results table from detail data.
    """
    if not data:
        return pd.DataFrame()

    # Normalise model names and attach dtype/license metadata
    for item in data:
        if "model_name" in item:
            raw_model_name = item["model_name"]
            item["model_name"] = format_model_name(raw_model_name)

            for field in ["dtype", "license"]:
                if raw_model_name in MODEL_METADATA_LOOKUP:
                    item[field] = MODEL_METADATA_LOOKUP[raw_model_name][field]
                else:
                    defaults = {"dtype": "unknown", "license": "Unknown"}
                    item[field] = defaults[field]

    df = pd.DataFrame(data)

    if "model_name" in df.columns:
        cols = ["model_name"] + [col for col in df.columns if col != "model_name"]
        df = df[cols]

    column_mapping = {
        'model_name': 'Model Name',
        'elo_rating': 'Human Elo Score',
        'wins': 'Wins',
        'losses': 'Losses',
        'ties': 'Ties',
        'total_games': 'Total Games',
        'win_rate': 'Win Rate (%)',
        'votes': 'Votes',
        'dtype': 'Dtype',
        'license': 'License',
        'evaluation_date': 'Evaluation Date',
        'evaluation_type': 'Type'
    }

    df = df.rename(columns=column_mapping)

    # Drop bookkeeping columns that should not be displayed
    columns_to_remove = ['file', 'run_id', 'Evaluation Date', 'Type', 'provider', 'Provider', 'Votes']
    for col in columns_to_remove:
        if col in df.columns:
            df = df.drop(columns=[col])

    if 'Human Elo Score' in df.columns:
        df = df.sort_values(by='Human Elo Score', ascending=False)

    numeric_cols = ['Human Elo Score', 'Win Rate (%)']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').round(2)

    return df

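# Minimal manual smoke test, assuming the local "result/" directories exist; this is an
# illustrative sketch, not part of the application flow.
if __name__ == "__main__":
    benchmark_data = load_benchmark_results()
    combined = create_combined_leaderboard_table(benchmark_data)
    print(combined.head(20).to_string(index=False))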