Spaces:
Running
Running
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
docker build -t llm_eval_system:v20250724_1442 .

docker stop llm_eval_system_7862 && docker rm llm_eval_system_7862

docker run -itd \
    --name llm_eval_system_7862 \
    --restart=always \
    --network host \
    -e port=7862 \
    llm_eval_system:v20250724_1442 \
    /bin/bash

docker run -itd \
    --name llm_eval_system_7862 \
    --restart=always \
    --network host \
    -v /data/tianxing/PycharmProjects/llm_eval_system:/data/tianxing/PycharmProjects/llm_eval_system \
    python:3.12 \
    /bin/bash

nohup python3 main.py --server_port 7862 &
"""
import argparse | |
import json | |
import logging | |
from pathlib import Path | |
import platform | |
from typing import Tuple, List | |
import time | |
import gradio as gr | |
import numpy as np | |
import pandas as pd | |
from project_settings import environment, project_path, log_directory | |
from toolbox.os.command import Command | |
import log | |
from tabs.fs_tab import get_fs_tab | |
from tabs.shell_tab import get_shell_tab | |
# Install the project's size-rotating log handlers before anything logs, then
# take the logger this module writes to.
log.setup_size_rotating(log_directory=log_directory)

logger = logging.getLogger("main")
def get_args():
    """Parse the command-line options of the evaluation board server.

    Options:
        --eval_data_dir: directory holding the evaluation result files;
            defaults to ``<project_path>/data/eval_data``.
        --server_port: port to serve on; defaults to 7860.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    default_eval_data_dir = (project_path / "data/eval_data").as_posix()

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--eval_data_dir", type=str, default=default_eval_data_dir)
    arg_parser.add_argument("--server_port", type=int, default=7860)
    return arg_parser.parse_args()
# Custom stylesheet handed to gr.Blocks(css=...). It caps the width of
# individual columns of the two DataFrame components that carry the element
# ids "dataset_df" and "view_chat_df".
# The stray "| |" fragments that trailed every line of this string made the
# rules invalid CSS and have been removed; the remaining text is unchanged.
css = """
#dataset_df th:nth-child(1), #dataset_df td:nth-child(1) {
max-width: 50px !important; /* 第一列 */
}
#dataset_df th:nth-child(2), #dataset_df td:nth-child(2) {
max-width: 500px !important; /* 第二列 */
}
#dataset_df th:nth-child(3), #dataset_df td:nth-child(3) {
max-width: 50px !important; /* 第三列 */
}
#view_chat_df th:nth-child(1), #view_chat_df td:nth-child(1) {
max-width: 50px !important; /* 第一列 */
}
#view_chat_df th:nth-child(2), #view_chat_df td:nth-child(2) {
max-width: 500px !important; /* 第二列 */
}
#view_chat_df th:nth-child(3), #view_chat_df td:nth-child(3) {
max-width: 400px !important; /* 第三列 */
}
#view_chat_df th:nth-child(4), #view_chat_df td:nth-child(4) {
max-width: 400px !important; /* 第四列 */
}
#view_chat_df th:nth-child(5), #view_chat_df td:nth-child(5) {
max-width: 400px !important; /* 第五列 */
}
#view_chat_df th:nth-child(6), #view_chat_df td:nth-child(6) {
max-width: 80px !important; /* 第六列 */
}
"""
# Module-level state shared by the loaders below.

# Root directory scanned for evaluation result files; assigned in main().
eval_data_dir: Path = None

# Most recently built leaderboard; None until the first load.
llm_ranking: pd.DataFrame = None

# time.time() value of the last leaderboard rebuild; 0 makes the first
# lazy load always rebuild.
last_update_ts: float = 0

# Minimum age, in seconds, before the cached leaderboard is rebuilt (one hour).
update_interval = 60 * 60
def load_board():
    """Aggregate every evaluation result file into one leaderboard table.

    Recursively scans the module-level ``eval_data_dir`` for ``*.jsonl`` files.
    The last seven path components of each file are read, in order, as
    ``script / company / model_name / client / service / date / dataset``.
    Files whose ``date`` component ends with ``-delete`` are skipped, and so
    are files that contain no rows.

    For each remaining file the per-row score (``correct`` for ``*-choice``
    files, ``score`` for ``*-chat`` files) and ``time_cost`` are collected and
    summarised as mean score plus mean / variance / 75th / 95th / 99th
    percentile of the time cost, each rounded to 4 decimals.

    Returns:
        pd.DataFrame: one row per result file. The columns are always present,
        even when no file was found, so callers can select columns safely.

    Raises:
        AssertionError: a non-empty file whose stem ends with neither
            ``-choice`` nor ``-chat``.
        KeyError: a row lacks the score field or ``time_cost``.
        Exception: whatever ``json.loads`` raised for a malformed row
            (logged, then re-raised).
    """
    columns = [
        "company", "model_name", "dataset", "score",
        "time_cost(mean)", "time_cost(var)",
        "time_cost(75%)", "time_cost(95%)", "time_cost(99%)",
        "service", "client", "script", "version", "count",
    ]

    result = list()
    for filename in eval_data_dir.glob("**/*.jsonl"):
        name = filename.stem
        dataset = filename.parts[-1]
        date = filename.parts[-2]
        service = filename.parts[-3]
        client = filename.parts[-4]
        model_name = filename.parts[-5]
        company = filename.parts[-6]
        script = filename.parts[-7]

        if date.endswith("-delete"):
            continue

        # The file-name suffix decides which field of a row carries the score.
        if name.endswith("-choice"):
            score_key = "correct"
        elif name.endswith("-chat"):
            score_key = "score"
        else:
            score_key = None

        score_list = list()
        time_cost_list = list()
        with open(filename.as_posix(), "r", encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                try:
                    row = json.loads(line)
                except Exception as e:
                    print(f"json load row failed. error type: {type(e)}, error text: {str(e)}")
                    logger.error(f"json load row failed. error type: {type(e)}, error text: {str(e)}")
                    # Bare raise keeps the original traceback intact.
                    raise
                # Only complain about an unknown suffix once there is a row to
                # score; empty files of any name are simply skipped below.
                if score_key is None:
                    raise AssertionError(
                        f"unsupported eval file name: {filename.as_posix()}; "
                        f"the stem must end with '-choice' or '-chat'"
                    )
                score_list.append(row[score_key])
                time_cost_list.append(row["time_cost"])

        total = len(score_list)
        if total == 0:
            continue

        score = np.mean(score_list)
        time_cost_mean = np.mean(time_cost_list)
        time_cost_var = np.var(time_cost_list)
        time_cost_p75 = np.percentile(time_cost_list, 75)
        time_cost_p95 = np.percentile(time_cost_list, 95)
        time_cost_p99 = np.percentile(time_cost_list, 99)

        result.append({
            "company": company,
            "model_name": model_name,
            "dataset": dataset,
            "score": round(score, 4),
            "time_cost(mean)": round(time_cost_mean, 4),
            "time_cost(var)": round(time_cost_var, 4),
            "time_cost(75%)": round(time_cost_p75, 4),
            "time_cost(95%)": round(time_cost_p95, 4),
            "time_cost(99%)": round(time_cost_p99, 4),
            "service": service,
            "client": client,
            "script": f"{script}.py",
            "version": date,
            "count": total,
        })

    # Passing the columns explicitly keeps the schema stable for an empty scan.
    return pd.DataFrame(result, columns=columns)
def load_board_lazy():
    """Return the cached leaderboard, rebuilding it once it is stale.

    The cache lives in the module-level ``llm_ranking``. It is rebuilt with
    ``load_board()`` when more than ``update_interval`` seconds have passed
    since ``last_update_ts``; otherwise the cached value is returned as is.
    """
    global llm_ranking, last_update_ts

    current_ts = time.time()
    elapsed = current_ts - last_update_ts
    if elapsed > update_interval:
        llm_ranking = load_board()
        last_update_ts = current_ts

    return llm_ranking
def when_click_board_button(columns: List[str]):
    """Return the leaderboard restricted to the requested columns.

    Args:
        columns: column names to keep, in display order.

    Returns:
        The (lazily cached) leaderboard DataFrame with only ``columns``.

    Raises:
        gr.Error: a requested column is not in the leaderboard; the message
            lists the columns that are available.
    """
    result = load_board_lazy()
    try:
        selected = result[columns]
    except KeyError as e:
        raise gr.Error(f"{str(e)}, columns: {list(result.columns)}")
    else:
        return selected
def when_click_view_dataset_button(filename: str):
    """Load a JSON-lines dataset file into a DataFrame for display.

    Args:
        filename: path of the ``.jsonl`` file, resolved against ``project_path``.

    Returns:
        pd.DataFrame with one row per line of the file, each line parsed as JSON.
    """
    dataset_path = (project_path / filename).as_posix()

    with open(dataset_path, "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f]

    return pd.DataFrame(records)
def when_click_view_chat_button(filename: str):
    """Load a chat evaluation result file into a DataFrame for display.

    Each line of the file is parsed as JSON and reduced to six columns:
    ``idx``, ``conversation``, ``response``, ``prediction``, ``evaluate`` and
    ``score``.

    Args:
        filename: path of the ``.jsonl`` file, resolved against ``project_path``.

    Returns:
        pd.DataFrame with one row per line of the file.
    """
    chat_path = (project_path / filename).as_posix()

    def to_display_row(record: dict) -> dict:
        # Only the last blank-line-separated section of the prompt is shown
        # as the conversation.
        prompt: str = record["prompt"]
        return {
            "idx": record["idx"],
            "conversation": prompt.split("\n\n")[-1].strip(),
            "response": record["response"],
            "prediction": record["prediction"],
            # Pretty-print the evaluation payload; keep non-ASCII text readable.
            "evaluate": json.dumps(record["evaluate"], ensure_ascii=False, indent=4),
            "score": record["score"],
        }

    with open(chat_path, "r", encoding="utf-8") as f:
        display_rows = [to_display_row(json.loads(line)) for line in f]

    return pd.DataFrame(display_rows)
# Every column the board tab can show, in display order.
board_columns_choices = [
    "company",
    "model_name",
    "dataset",
    "score",
    "time_cost(mean)",
    "time_cost(var)",
    "time_cost(75%)",
    "time_cost(95%)",
    "time_cost(99%)",
    "service",
    "client",
    "script",
    "version",
    "count",
]

# Columns ticked when the page first loads; the percentile columns and the
# run metadata are opt-in.
board_columns_choices_default_value = [
    "company",
    "model_name",
    "dataset",
    "score",
    "time_cost(mean)",
    "time_cost(var)",
]
# Example rows for the "dataset" tab. Each entry is
# [name, description, path]; the path is later resolved against project_path.
dataset_examples_list = [
    [
        "arc-easy-1000-choice.jsonl",
        "ARC(AI2 推理挑战赛)\nAI2 的推理挑战赛 (ARC) 数据集是一个多项选择题问答数据集,包含 3 年级至 9 年级的科学考试题目。\n该数据集分为两个部分:简单部分和挑战部分。\n\n从简单部分取前1000条作为 arc-easy-1000-choice.jsonl",
        "data/dataset/arc-easy-1000-choice.jsonl",
    ],
    [
        "agent-lingoace-zh-400-choice.jsonl",
        "lingoace数据集。",
        "data/dataset/agent-lingoace-zh-400-choice.jsonl",
    ],
]
def main():
    """Build the Gradio UI of the evaluation board and start serving it.

    Steps:
        1. Parse the command-line options and publish the evaluation data
           directory through the module-level ``eval_data_dir``.
        2. Load the leaderboard once with the default column selection.
        3. Collect every ``*-chat.jsonl`` file under the data directory as a
           clickable example for the "view_chat" tab.
        4. Assemble the "board", "view_chat" and "dataset" tabs plus the tabs
           contributed by ``get_fs_tab()`` and ``get_shell_tab()``.
        5. Launch the server. The port comes from the ``port`` entry of
           ``environment`` and falls back to ``--server_port``.
    """
    args = get_args()

    # load_board() reads this module-level path, so it has to be assigned
    # before the first leaderboard load below.
    global eval_data_dir
    eval_data_dir = Path(args.eval_data_dir)

    llm_ranking_board = when_click_board_button(board_columns_choices_default_value)

    # chat
    chat_eval_data_examples = list()
    for filename in eval_data_dir.glob("**/*-chat.jsonl"):
        dataset = filename.parts[-1]
        model_name = filename.parts[-5]
        company = filename.parts[-6]
        chat_eval_data_examples.append([
            company, model_name, dataset, filename.as_posix()
        ])

    # ui
    # NOTE(review): the nesting of the layout contexts below was reconstructed;
    # confirm that only the "View" button belongs to the gr.Row and that the
    # fs / shell tabs are meant to sit inside gr.Tabs.
    with gr.Blocks(css=css) as blocks:
        with gr.Tabs():
            with gr.TabItem("board"):
                board_columns = gr.CheckboxGroup(
                    choices=board_columns_choices,
                    value=board_columns_choices_default_value,
                    label="columns"
                )
                board_button = gr.Button(value="View", variant="primary", visible=True)
                board_board = gr.DataFrame(
                    value=llm_ranking_board,
                    max_height=800, min_width=160,
                    label="board",
                    interactive=True, show_search="search"
                )
                board_button.click(
                    fn=when_click_board_button,
                    inputs=[board_columns],
                    outputs=[board_board],
                )

            with gr.TabItem("view_chat"):
                # The three hidden textboxes only exist so that an example
                # row can carry company / model / dataset next to the path.
                view_chat_company = gr.Textbox(label="company", visible=False)
                view_chat_model_name = gr.Textbox(label="model_name", visible=False)
                view_chat_dataset = gr.Textbox(label="dataset", visible=False)
                view_chat_filename = gr.Textbox(label="filename", visible=True)
                gr.Examples(
                    examples=chat_eval_data_examples,
                    inputs=[view_chat_company, view_chat_model_name, view_chat_dataset, view_chat_filename],
                    outputs=None,
                )
                with gr.Row():
                    view_chat_button = gr.Button(value="View", variant="primary", visible=True)
                view_chat_df = gr.DataFrame(
                    value=None,
                    max_height=1000, min_width=160,
                    label="dataset", interactive=True,
                    show_search="search",
                    elem_id="view_chat_df"
                )
                view_chat_button.click(
                    fn=when_click_view_chat_button,
                    inputs=[view_chat_filename],
                    outputs=[view_chat_df],
                )

            with gr.TabItem("dataset"):
                dataset_name = gr.Textbox(label="name")
                dataset_desc = gr.Textbox(label="desc")
                dataset_filename = gr.Textbox(label="filename")
                gr.Examples(
                    examples=dataset_examples_list,
                    inputs=[dataset_name, dataset_desc, dataset_filename],
                    outputs=None,
                )
                dataset_button = gr.Button(value="View", variant="primary", visible=True)
                dataset_df = gr.DataFrame(
                    value=None, label="dataset", interactive=True,
                    show_search="search",
                    elem_id="dataset_df"
                )
                dataset_button.click(
                    fn=when_click_view_dataset_button,
                    inputs=[dataset_filename],
                    outputs=[dataset_df],
                )

            _ = get_fs_tab()
            _ = get_shell_tab()

    # Bind to loopback only on Windows; listen on all interfaces elsewhere.
    is_windows = platform.system() == "Windows"
    blocks.queue().launch(
        # Public share links are never created, whatever the platform.
        share=False,
        server_name="127.0.0.1" if is_windows else "0.0.0.0",
        server_port=environment.get("port", default=args.server_port, dtype=int),
    )


if __name__ == "__main__":
    main()