#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Evaluate an Azure OpenAI chat model on a single-choice question set.

Questions are read from a JSONL file, one request is sent per question, and
one JSON result row per answered question is appended to a result file. The
result file doubles as a checkpoint: ids already present in it are skipped on
the next run and the running counters are restored from its last row.
"""
import argparse
import json
import os
import sys
import time

pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../../"))

import openai
from openai import AzureOpenAI

from project_settings import environment, project_path


def get_args():
    """
    Parse the command-line options of the evaluation script.

    Example usage:
    python3 eval_openai.py --model_name gpt-4o --eval_result eval_math_result_gpt-4o.jsonl
    python3 eval_openai.py --model_name gpt-4o-mini --eval_result eval_math_result_gpt-4o-mini.jsonl

    :return: argparse.Namespace with api_key, model_name, eval_data,
        eval_result and max_total.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--api_key",
        default=environment.get(key="OPENAI_API_KEY"),
        type=str
    )
    parser.add_argument(
        "--model_name",
        default="gpt-4o",
        # default="gpt-4o-mini",
        type=str
    )
    parser.add_argument(
        "--eval_data",
        default=(project_path / "data/arc-easy.jsonl").as_posix(),
        type=str
    )
    parser.add_argument(
        "--eval_result",
        default=(project_path / "data/eval_math_result.jsonl").as_posix(),
        type=str
    )
    parser.add_argument(
        "--max_total",
        # 21 reproduces the previously hard-coded `total > 20` stop condition.
        default=21,
        type=int,
        help="stop once this many questions have been scored "
             "(questions restored from a previous run count as well)"
    )
    args = parser.parse_args()
    return args


def _load_progress(result_path):
    """Return (finished ids, total, total_correct) recorded in *result_path*.

    A missing file yields an empty set and zeroed counters. The counters are
    taken from the last row of the file because every row stores the running
    totals at the time it was written.
    """
    finished_idx_set = set()
    total = 0
    total_correct = 0
    if not os.path.exists(result_path):
        return finished_idx_set, total, total_correct

    with open(result_path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline left by an editor).
            if not line.strip():
                continue
            row = json.loads(line)
            finished_idx_set.add(row["id"])
            total = row["total"]
            total_correct = row["total_correct"]
    return finished_idx_set, total, total_correct


def _build_prompt(question, choices):
    """Build the user prompt for one single-choice question.

    :param question: question text.
    :param choices: iterable of dicts carrying a "label" and a "text" key.
    :return: the prompt string sent as the user message.
    """
    instruct = "Complete this single-choice question."
    choices_str = "".join(
        f"If you think the answer is `{choice['text']}` output: `{choice['label']}`\n"
        for choice in choices
    )
    prompt = (
        f"{instruct}\n\n"
        f"Question:\n{question}\n\n"
        f"Choices:\n{choices_str}\n"
        "Remember to output ONLY the corresponding letter.\n"
        "Your output is:"
    )
    return prompt.strip()


def _normalize_prediction(prediction):
    """Reduce a raw model reply to a bare label suitable for comparison.

    The prompt shows labels wrapped in backticks, so a reply such as
    "`A`" or "A\\n" must still count as "A". A missing reply (None)
    becomes the empty string.
    """
    if prediction is None:
        return ""
    return prediction.strip().strip("`").strip()


def main():
    """Run the evaluation loop and append one result row per scored question."""
    args = get_args()

    # Figures noted by the original author (presumably scores from earlier runs):
    # gpt-4o: 82
    # gemini: 89

    # The key comes from --api_key / the project environment so that no
    # credential is ever stored in the source tree.
    client = AzureOpenAI(
        api_key=args.api_key,
        api_version="2025-01-01-preview",
        azure_endpoint="https://west-us-chatgpt.openai.azure.com"
    )

    # Restore the checkpoint left by a previous run, if any.
    finished_idx_set, total, total_correct = _load_progress(args.eval_result)
    print(f"finished count: {len(finished_idx_set)}")

    with open(args.eval_data, "r", encoding="utf-8") as fin, \
            open(args.eval_result, "a", encoding="utf-8") as fout:
        for line in fin:
            if total >= args.max_total:
                break
            if not line.strip():
                continue

            row = json.loads(line)
            idx = row["id"]
            question = row["question"]
            choices = row["choices"]
            answer_key = row["answerkey"]

            if idx in finished_idx_set:
                continue
            # Also guards against duplicated ids inside the data file itself.
            finished_idx_set.add(idx)

            prompt = _build_prompt(question, choices)
            # print(prompt)

            try:
                time_begin = time.time()
                response = client.chat.completions.create(
                    model=args.model_name,
                    messages=[{"role": "user", "content": prompt}],
                    stream=False,
                    # max_tokens=1,
                    temperature=0.0,
                    # logit_bias={
                    #     32: 100,
                    #     33: 100,
                    #     34: 100,
                    #     35: 100,
                    #     36: 100,
                    # }
                )
                time_cost = time.time() - time_begin
                print(time_cost)
            except openai.BadRequestError as e:
                # A rejected request is skipped without writing a row, so the
                # question is attempted again on the next run.
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue

            # The raw reply is stored as-is; only the comparison is normalized.
            prediction = response.choices[0].message.content
            correct = 1 if _normalize_prediction(prediction) == answer_key else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            row_ = {
                "id": idx,
                "question": question,
                "choices": choices,
                "ground_true": answer_key,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per row: the result file is the resume checkpoint, so it
            # must survive an abrupt termination of the process.
            fout.flush()

            # print(f"score: {score}")


if __name__ == "__main__":
    main()