#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Evaluate a model served through Vertex AI on a prompt/response JSONL dataset.

Each dataset row must provide ``idx``, ``prompt`` and ``response``. The model
is asked for a single output token and the returned text is compared verbatim
with ``response``. One JSON line per evaluated row is appended to the output
file, which also makes an interrupted run resumable.

Claude
https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/claude?hl=zh-cn
https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/claude/use-claude?hl=zh-cn

Llama
https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama/use-llama?hl=zh-cn
https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/llama/use-llama?hl=zh-cn#regions-quotas

Model Name

llama-4-maverick-17b-128e-instruct-maas
llama-4-scout-17b-16e-instruct-maas

Region selection
us-east5

Model Name

gemini-2.5-pro
The model does not support setting thinking_budget to 0.
Unable to submit request because thinking_budget is out of range; supported values are integers from 128 to 32768.

"""
import argparse
from datetime import datetime
import json
import os
from pathlib import Path
import sys
import time
import tempfile
from zoneinfo import ZoneInfo  # bundled with Python 3.9+, no installation needed

# Make the parent directory importable so that `project_settings` resolves
# when this file is run directly as a script.
pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../"))

from google import genai
from google.genai import types

from project_settings import environment, project_path


def get_args():
    """Parse the command-line options of the evaluation script.

    Returns:
        argparse.Namespace with ``model_name``, ``eval_dataset_name``,
        ``eval_dataset_dir``, ``eval_data_dir``, ``client``, ``service``,
        ``create_time_str`` and ``interval``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_name",
        # Alternatives:
        # default="gemini-2.5-pro",  # The model does not support setting thinking_budget to 0.
        # default="gemini-2.5-flash-lite-preview-06-17",
        # default="llama-4-maverick-17b-128e-instruct-maas",
        # default="llama-4-scout-17b-16e-instruct-maas",
        default="gemini-2.5-flash",
        type=str
    )
    parser.add_argument(
        "--eval_dataset_name",
        # Alternatives:
        # default="agent-bingoplus-ph-90-choice.jsonl",
        # default="arc-easy-1000-choice.jsonl",
        default="agent-lingoace-zh-400-choice.jsonl",
        type=str
    )
    parser.add_argument(
        "--eval_dataset_dir",
        default=(project_path / "data/dataset").as_posix(),
        type=str
    )
    parser.add_argument(
        "--eval_data_dir",
        default=(project_path / "data/eval_data").as_posix(),
        type=str
    )
    parser.add_argument(
        "--client",
        default="shenzhen_sase",
        type=str
    )
    parser.add_argument(
        "--service",
        # Alternative:
        # default="google_potent_veld_462405_t3",
        default="google_nxcloud_312303",
        type=str
    )
    parser.add_argument(
        "--create_time_str",
        # "null" means: generate a fresh timestamp in main().
        # Pass an earlier value (e.g. "20250731_162116") to resume that run.
        default="null",
        type=str
    )
    parser.add_argument(
        "--interval",
        default=1,
        type=int
    )
    args = parser.parse_args()
    return args


def _write_credentials_file(service: dict) -> Path:
    """Write the service-account dict to a temp JSON file and return its path."""
    project_id = service["project_id"]
    credentials_path = Path(tempfile.gettempdir()) / f"llm_eval_system/{project_id}.json"
    credentials_path.parent.mkdir(parents=True, exist_ok=True)

    # The file holds a private key and lives in a shared temp directory, so
    # create it readable/writable by the owner only.
    fd = os.open(
        credentials_path.as_posix(),
        os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        0o600,
    )
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        content = json.dumps(service, ensure_ascii=False, indent=4)
        f.write(f"{content}\n")

    # The mode passed to os.open() only applies when the file is created;
    # tighten a file left behind by an earlier run as well (best effort).
    try:
        os.chmod(credentials_path.as_posix(), 0o600)
    except OSError as e:
        print(f"could not restrict credentials file permissions: {str(e)}")
    return credentials_path


def _load_progress(output_file: Path):
    """Read an existing output file and return (finished idx set, total, total_correct)."""
    finished_idx_set = set()
    total = 0
    total_correct = 0
    if output_file.exists():
        with open(output_file.as_posix(), "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank lines instead of failing in json.loads.
                    continue
                row = json.loads(line)
                # The running counters of the last row win.
                total = row["total"]
                total_correct = row["total_correct"]
                finished_idx_set.add(row["idx"])
    return finished_idx_set, total, total_correct


def main():
    """Run the evaluation and append one JSON line per evaluated row.

    Rows whose ``idx`` is already present in the output file are skipped, so
    rerunning with the same ``--create_time_str`` continues a previous run.
    """
    args = get_args()

    # The service entry is a JSON document that is used both for its
    # project id and, written to disk, as the application credentials file.
    service = environment.get(args.service, dtype=json.loads)
    project_id = service["project_id"]

    google_application_credentials = _write_credentials_file(service)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = google_application_credentials.as_posix()

    eval_dataset_dir = Path(args.eval_dataset_dir)
    eval_dataset_dir.mkdir(parents=True, exist_ok=True)
    eval_data_dir = Path(args.eval_data_dir)
    eval_data_dir.mkdir(parents=True, exist_ok=True)

    if args.create_time_str == "null":
        tz = ZoneInfo("Asia/Shanghai")
        now = datetime.now(tz)
        create_time_str = now.strftime("%Y%m%d_%H%M%S")
    else:
        create_time_str = args.create_time_str

    eval_dataset = eval_dataset_dir / args.eval_dataset_name
    output_file = eval_data_dir / f"gemini_google/google/{args.model_name}/{args.client}/{args.service}/{create_time_str}/{args.eval_dataset_name}"
    output_file.parent.mkdir(parents=True, exist_ok=True)

    client = genai.Client(
        vertexai=True,
        project=project_id,
        location="global",
        # location="us-east5",
    )
    generate_content_config = types.GenerateContentConfig(
        top_p=0.95,
        temperature=0.6,
        # Only one token is requested; it is compared verbatim with the
        # reference answer below.
        max_output_tokens=1,
        response_modalities=["TEXT"],
        thinking_config=types.ThinkingConfig(
            thinking_budget=0
        )
    )

    # Resume support: pick up the counters and finished rows of an earlier run.
    finished_idx_set, total, total_correct = _load_progress(output_file)
    print(f"finished count: {len(finished_idx_set)}")

    with open(eval_dataset.as_posix(), "r", encoding="utf-8") as fin, \
            open(output_file.as_posix(), "a+", encoding="utf-8") as fout:
        for line in fin:
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            row = json.loads(line)
            idx = row["idx"]
            prompt = row["prompt"]
            response = row["response"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            contents = [
                types.Content(
                    role="user",
                    parts=[
                        types.Part.from_text(text=prompt)
                    ]
                )
            ]

            # Pause between requests; --interval is given in seconds.
            time.sleep(args.interval)
            print(f"sleep: {args.interval}")

            time_begin = time.time()
            llm_response: types.GenerateContentResponse = client.models.generate_content(
                model=args.model_name,
                contents=contents,
                config=generate_content_config,
            )
            time_cost = time.time() - time_begin
            print(f"time_cost: {time_cost}")

            try:
                prediction = llm_response.candidates[0].content.parts[0].text
            except (TypeError, AttributeError, IndexError) as e:
                # candidates/parts may be None (TypeError) or empty
                # (IndexError) and content may be None (AttributeError);
                # skip such rows instead of aborting the whole run.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            correct = 1 if prediction == response else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            row_ = {
                "idx": idx,
                "prompt": prompt,
                "response": response,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Resuming relies on this file; flush so a killed process does
            # not lose rows that were already paid for.
            fout.flush()


if __name__ == "__main__":
    main()