#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Evaluate a chat model served by SiliconFlow on a single-choice JSONL dataset.

Each dataset row must provide ``idx``, ``prompt`` and ``response``. For every
row the prompt is sent to the model, the returned text is compared for exact
equality with ``response``, and one JSON line holding the running totals is
appended to the output file. Rows whose ``idx`` already appears in the output
file are skipped, so an interrupted run can be resumed by passing the same
``--create_time_str``.

https://cloud.siliconflow.cn/sft-d1rosn8o8n4s73ftpa1g/playground/chat/17885302852
https://docs.siliconflow.cn/cn/userguide/capabilities/reasoning

Model Name:
Pro/deepseek-ai/DeepSeek-R1
Tips:
(1)thinking_budget: Must be greater than or equal to 1
(2)The selected model requires paid balance. Your paid balance is insufficient. Please top up and try again.

Model Name:
tencent/Hunyuan-A13B-Instruct
Tips:
(1)It always thinks first and only gives the answer at the end. That suits
knowledge Q&A but does not fit our Agent use case.
It was later forced to output a single character in A-E (max_tokens=4)
in order to complete the evaluation.
max_tokens=4,
logit_bias={
    32: 100,
    33: 100,
    34: 100,
    35: 100,
    36: 100,
    37: 100,
},

Model Name:
deepseek-ai/DeepSeek-R1
Tips:
(1)To make it output a single character, set max_tokens=3

Model Name:
Qwen/Qwen3-8B
deepseek-ai/DeepSeek-R1-0528-Qwen3-8B
deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
deepseek-ai/DeepSeek-V3
Tips:
(1)To make it output a single character, set max_tokens=1

Model Name:
baidu/ERNIE-4.5-300B-A47B
Tips:
(1)It probably uses BPE tokenization; comment out logit_bias.

"""
import argparse
from datetime import datetime
import json
import os
from pathlib import Path
import sys
import time
from zoneinfo import ZoneInfo  # bundled with Python 3.9+, no install needed

pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../"))

from openai import OpenAI

from project_settings import environment, project_path


def get_args() -> argparse.Namespace:
    """Parse the command-line options of the evaluation script.

    Returns:
        The parsed namespace with ``model_name``, ``eval_dataset_name``,
        ``eval_dataset_dir``, ``eval_data_dir``, ``client``, ``service``,
        ``create_time_str``, ``interval`` and ``max_tokens``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        # Other model names this script has been used with:
        # default="Pro/deepseek-ai/DeepSeek-R1",
        # default="tencent/Hunyuan-A13B-Instruct",
        default="deepseek-ai/DeepSeek-V3",
        # default="Qwen/Qwen3-8B",
        # default="deepseek-ai/DeepSeek-R1",
        # default="deepseek-ai/DeepSeek-R1-0528-Qwen3-8B",
        # default="deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
        # default="baidu/ERNIE-4.5-300B-A47B",
        type=str
    )
    parser.add_argument(
        "--eval_dataset_name",
        default="agent-bingoplus-ph-90-choice.jsonl",
        # default="agent-lingoace-zh-400-choice.jsonl",
        # default="arc-easy-1000-choice.jsonl",
        type=str
    )
    parser.add_argument(
        "--eval_dataset_dir",
        default=(project_path / "data/dataset").as_posix(),
        type=str
    )
    parser.add_argument(
        "--eval_data_dir",
        default=(project_path / "data/eval_data").as_posix(),
        type=str
    )
    parser.add_argument(
        "--client",
        default="shenzhen_sase",
        type=str
    )
    parser.add_argument(
        "--service",
        default="siliconflow_api_key",
        type=str
    )
    parser.add_argument(
        "--create_time_str",
        # "null" means: stamp a new run with the current time.
        # Pass an earlier stamp (e.g. "20250728_113641") to resume that run.
        default="null",
        type=str
    )
    parser.add_argument(
        "--interval",
        default=1,
        type=int
    )
    parser.add_argument(
        "--max_tokens",
        # The module docstring lists the value that worked for each model.
        default=1,
        type=int
    )
    args = parser.parse_args()
    return args


def _iter_jsonl(lines):
    """Yield one decoded JSON object per non-blank line of *lines*."""
    for line in lines:
        line = line.strip()
        if not line:
            # A blank (often trailing) line is not a record; json.loads
            # would raise on it and abort the whole run.
            continue
        yield json.loads(line)


def _load_progress(output_file: Path) -> tuple[set, int, int]:
    """Read an existing result file and return the state needed to resume.

    Args:
        output_file: Path of the JSONL result file; it may not exist yet.

    Returns:
        ``(finished_idx_set, total, total_correct)`` where the two counters
        are taken from the last record of the file, or are 0 when there is
        no record.
    """
    finished_idx_set = set()
    total = 0
    total_correct = 0
    if output_file.exists():
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for row in _iter_jsonl(f):
                finished_idx_set.add(row["idx"])
                # Every record stores running totals, so the last one wins.
                total = row["total"]
                total_correct = row["total_correct"]
    return finished_idx_set, total, total_correct


def main() -> None:
    """Run the evaluation and append one JSON result line per answered row.

    A request that raises is reported on stdout and its row is skipped
    without being written, so it is attempted again when the run is resumed.
    """
    args = get_args()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name

    # "/" in a model name would otherwise add directory levels to the path.
    model_name_ = args.model_name.replace("/", "#")
    output_file = eval_data_dir / f"siliconflow/siliconflow/{model_name_}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # The key is looked up in the project settings under the --service name.
    api_key = environment.get(args.service, dtype=str)
    client = OpenAI(
        base_url="https://api.siliconflow.cn/v1/",
        api_key=api_key
    )

    finished_idx_set, total, total_correct = _load_progress(output_file)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, \
            open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for row in _iter_jsonl(fin):
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            # Throttle before every request; kept outside the try so that
            # only the API call itself is treated as a recoverable failure.
            time.sleep(args.interval)
            print(f"sleep: {args.interval}")

            try:
                time_begin = time.time()
                completion = client.chat.completions.create(
                    model=args.model_name,
                    messages=[
                        {"role": "user", "content": prompt},
                    ],
                    stream=False,
                    max_tokens=args.max_tokens,
                    temperature=0.6,
                    top_p=0.95,
                    # Strongly favours eight specific token ids, presumably
                    # those of the option letters in the model's tokenizer
                    # (TODO confirm per model; see the module docstring).
                    logit_bias={
                        32: 100,
                        33: 100,
                        34: 100,
                        35: 100,
                        36: 100,
                        37: 100,
                        38: 100,
                        39: 100,
                    },
                    extra_body={
                        # 1 is the smallest value the service accepts.
                        "thinking_budget": 1
                    }
                )
                time_cost = time.time() - time_begin
                print(f"time_cost: {time_cost}")
            except Exception as e:
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            prediction = completion.choices[0].message.content

            # Exact string match against the reference answer.
            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per record so an interrupted run loses no finished rows.
            fout.flush()


if __name__ == "__main__":
    main()