#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
https://cloud.siliconflow.cn/sft-d1rosn8o8n4s73ftpa1g/playground/chat/17885302852
https://docs.siliconflow.cn/cn/userguide/capabilities/reasoning

Model Name:
Pro/deepseek-ai/DeepSeek-R1
Tips:
(1) thinking_budget: must be greater than or equal to 1.
(2) "The selected model requires paid balance. Your paid balance is insufficient. Please top up and try again."

Model Name:
tencent/Hunyuan-A13B-Instruct
Tips:
(1) It always thinks first and only then gives its answer. That is fine for knowledge Q&A,
    but does not fit our Agent use case, so we force it to output a single character
    from A-E (max_tokens=4) to complete the evaluation:
        max_tokens=4,
        logit_bias={
            32: 100,
            33: 100,
            34: 100,
            35: 100,
            36: 100,
            37: 100,
        },

Model Name:
deepseek-ai/DeepSeek-R1
Tips:
(1) To make it output only a single character, set max_tokens=3.

Model Name:
Qwen/Qwen3-8B
deepseek-ai/DeepSeek-R1-0528-Qwen3-8B
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
deepseek-ai/DeepSeek-V3
Tips:
(1) To make them output only a single character, set max_tokens=1.

Model Name:
baidu/ERNIE-4.5-300B-A47B
Tips:
(1) It probably uses BPE tokenization, so comment out logit_bias.
"""
import argparse
from datetime import datetime
import json
import os
from pathlib import Path
import sys
import time
from zoneinfo import ZoneInfo  # bundled with Python 3.9+, no extra install needed

pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../"))

from openai import OpenAI

from project_settings import environment, project_path


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        # default="Pro/deepseek-ai/DeepSeek-R1",
        # default="tencent/Hunyuan-A13B-Instruct",
        default="deepseek-ai/DeepSeek-V3",
        # default="Qwen/Qwen3-8B",
        # default="deepseek-ai/DeepSeek-R1",
        # default="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
        # default="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        # default="baidu/ERNIE-4.5-300B-A47B",
        type=str
    )
    parser.add_argument(
        "--eval_dataset_name",
        default="agent-bingoplus-ph-90-choice.jsonl",
        # default="agent-lingoace-zh-400-choice.jsonl",
        # default="arc-easy-1000-choice.jsonl",
        type=str
    )
    parser.add_argument(
        "--eval_dataset_dir",
        default=(project_path / "data/dataset").as_posix(),
        type=str
    )
    parser.add_argument(
        "--eval_data_dir",
        default=(project_path / "data/eval_data").as_posix(),
        type=str
    )
    parser.add_argument(
        "--client",
        default="shenzhen_sase",
        type=str
    )
    parser.add_argument(
        "--service",
        default="siliconflow_api_key",
        type=str
    )
    parser.add_argument(
        "--create_time_str",
        default="null",
        # default="20250728_113641",
        type=str
    )
    parser.add_argument(
        "--interval",
        default=1,
        type=int
    )
    args = parser.parse_args()
    return args
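
# Typical invocation (argument values shown are just the defaults above; the
# script filename is hypothetical, use whatever this file is saved as):
#
#     python3 run_siliconflow_eval.py \
#         --model_name deepseek-ai/DeepSeek-V3 \
#         --eval_dataset_name agent-bingoplus-ph-90-choice.jsonl \
#         --service siliconflow_api_key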


def main():
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
        # create_time_str = "20250724_090615"
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name
    model_name_ = args.model_name.replace("/", "#")
    output_file = eval_data_dir / f"siliconflow/siliconflow/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    api_key = environment.get(args.service, dtype=str)
    client = OpenAI(
        base_url="https://api.siliconflow.cn/v1/",
        # read the SiliconFlow API key from the project environment settings
        api_key=api_key
    )
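    # The SiliconFlow endpoint is OpenAI-compatible, so the standard OpenAI SDK
    # client is reused; only base_url and the API key differ from a default
    # OpenAI setup.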
    total = 0
    total_correct = 0

    # finished: indices already written to the output file (for resume)
    finished_idx_set = set()
    if os.path.exists(output_file.as_posix()):
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in f:
                row = json.loads(row)
                idx = row["idx"]
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(idx)
    print(f"finished count: {len(finished_idx_set)}")
    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in fin:
            row = json.loads(row)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]
            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            try:
                time.sleep(args.interval)
                print(f"sleep: {args.interval}")
                time_begin = time.time()
                completion = client.chat.completions.create(
                    model=args.model_name,
                    messages=[
                        {"role": "user", "content": prompt},
                    ],
                    stream=False,
                    # max_tokens=4096,
                    max_tokens=1,
                    temperature=0.6,
                    top_p=0.95,
                    logit_bias={
                        32: 100,
                        33: 100,
                        34: 100,
                        35: 100,
                        36: 100,
                        37: 100,
                        38: 100,
                        39: 100,
                    },
                    extra_body={
                        "thinking_budget": 1
                    }
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except Exception as e:
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            # print(f"completion: {completion}")
            prediction = completion.choices[0].message.content
            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            fout.flush()
    return


if __name__ == "__main__":
    main()