#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import json
import os
import sys
import time

# Put the directory two levels above this file on sys.path *before* the
# project-local import at the end of this preamble; presumably that is where
# `project_settings` lives (verify against the repository layout).
pwd = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.join(pwd, "../../"))

import openai
from openai import AzureOpenAI

from project_settings import environment, project_path
def get_args():
    """
    Parse the command-line options of the evaluation script.

    Usage examples:
    python3 eval_openai.py --model_name gpt-4o --eval_result eval_math_result_gpt-4o.jsonl
    python3 eval_openai.py --model_name gpt-4o-mini --eval_result eval_math_result_gpt-4o-mini.jsonl

    :return: argparse.Namespace with the attributes `api_key`, `model_name`,
        `eval_data` and `eval_result`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--api_key",
        # Read from the project settings so no credential lives in the source.
        default=environment.get(key="OPENAI_API_KEY"),
        type=str,
        help="API key for the model endpoint (default: OPENAI_API_KEY setting).",
    )
    parser.add_argument(
        "--model_name",
        default="gpt-4o",
        type=str,
        help="Model name sent with each chat completion request.",
    )
    parser.add_argument(
        "--eval_data",
        default=(project_path / "data/arc-easy.jsonl").as_posix(),
        type=str,
        help="Path of the JSONL file containing the evaluation questions.",
    )
    parser.add_argument(
        "--eval_result",
        default=(project_path / "data/eval_math_result.jsonl").as_posix(),
        type=str,
        help="Path of the JSONL file that results are written to.",
    )
    args = parser.parse_args()
    return args
def _load_progress(result_path):
    """
    Read an existing result file so that an interrupted run can be resumed.

    :param result_path: path of the JSONL result file; it may not exist yet.
    :return: tuple `(finished_ids, total, total_correct)`. `finished_ids` is
        the set of `id` values found in the file; the two counters are taken
        from the last row (both 0 when there is no row).
    """
    finished_ids = set()
    total = 0
    total_correct = 0
    if not os.path.exists(result_path):
        return finished_ids, total, total_correct

    with open(result_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # A stray blank line must not abort the resume.
                continue
            record = json.loads(line)
            finished_ids.add(record["id"])
            # Rows store running totals, so the last row wins.
            total = record["total"]
            total_correct = record["total_correct"]
    return finished_ids, total, total_correct


def _build_prompt(question, choices):
    """
    Render one single-choice question as the user prompt sent to the model.

    :param question: question text.
    :param choices: iterable of dicts with the keys `label` and `text`.
    :return: prompt string without leading/trailing whitespace.
    """
    instruct = "Complete this single-choice question."
    choices_str = "".join(
        f"If you think the answer is `{choice['text']}` output: `{choice['label']}`\n"
        for choice in choices
    )
    # NOTE(review): the original triple-quoted template lost its indentation in
    # this copy of the file; the lines are assumed to start at column 0.
    prompt = (
        f"{instruct}\n"
        "Question:\n"
        f"{question}\n"
        "Choices:\n"
        f"{choices_str}\n"
        "Remember to output ONLY the corresponding letter.\n"
        "Your output is:"
    )
    return prompt.strip()


def _normalize_prediction(prediction):
    """
    Reduce a raw model reply to the bare label used for scoring.

    Surrounding whitespace and quoting characters (backticks, single and
    double quotes) are removed, so replies such as "A\\n" or "`A`" compare
    equal to the answer key "A". `None` becomes the empty string.
    """
    if prediction is None:
        return ""
    return prediction.strip().strip("`'\"").strip()


def main():
    """
    Evaluate a chat model on a JSONL file of single-choice questions.

    For every question not yet present in the result file, a prompt is sent
    to the Azure OpenAI chat completion endpoint, the reply is compared with
    the answer key, and one JSON row (including running totals and the
    running score) is appended to the result file. Questions whose request
    raises `openai.BadRequestError` are reported and skipped.
    """
    args = get_args()

    # Reference numbers noted by the author (presumably accuracy in percent):
    # gpt-4o: 82
    # gemini: 89

    # The key comes from --api_key / project settings. A key used to be
    # hard-coded here; it must be treated as leaked and revoked.
    client = AzureOpenAI(
        api_key=args.api_key,
        api_version="2025-01-01-preview",
        azure_endpoint="https://west-us-chatgpt.openai.azure.com",
    )

    # Stop once the running total (including resumed rows) exceeds this value.
    max_total = 20

    finished_idx_set, total, total_correct = _load_progress(args.eval_result)
    print(f"finished count: {len(finished_idx_set)}")

    with open(args.eval_data, "r", encoding="utf-8") as fin, \
            open(args.eval_result, "a+", encoding="utf-8") as fout:
        for line in fin:
            if total > max_total:
                break

            line = line.strip()
            if not line:
                continue
            row = json.loads(line)
            idx = row["id"]
            question = row["question"]
            choices = row["choices"]
            answer_key = row["answerkey"]

            if idx in finished_idx_set:
                continue
            finished_idx_set.add(idx)

            prompt = _build_prompt(question, choices)

            time_begin = time.time()
            try:
                response = client.chat.completions.create(
                    model=args.model_name,
                    messages=[{"role": "user", "content": prompt}],
                    stream=False,
                    temperature=0.0,
                )
            except openai.BadRequestError as e:
                print(f"request failed, error type: {type(e)}, error text: {str(e)}")
                continue
            time_cost = time.time() - time_begin
            print(time_cost)

            prediction = response.choices[0].message.content
            # Score on the normalized reply; the raw reply is what gets stored.
            correct = 1 if _normalize_prediction(prediction) == answer_key else 0

            total += 1
            total_correct += correct
            score = total_correct / total

            row_ = {
                "id": idx,
                "question": question,
                "choices": choices,
                "ground_true": answer_key,
                "prediction": prediction,
                "correct": correct,
                "total": total,
                "total_correct": total_correct,
                "score": score,
                "time_cost": time_cost,
            }
            row_ = json.dumps(row_, ensure_ascii=False)
            fout.write(f"{row_}\n")
            # Flush per row so a crash does not lose progress needed to resume.
            fout.flush()
# Run the evaluation only when this file is executed as a script.
if __name__ == "__main__":
    main()