File size: 2,993 Bytes
9d5b280 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
"""
Take in a YAML, and output all other splits with this YAML
"""
import argparse
import os
import requests
import yaml
from tqdm import tqdm
from lm_eval.utils import logging
# datasets-server endpoint queried for the list of splits of facebook/belebele.
API_URL = "https://datasets-server.huggingface.co/splits?dataset=facebook/belebele"
def parse_args(argv=None):
    """Parse the command-line options of the config generator.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so
            existing ``parse_args()`` callers behave exactly as before.

    Returns:
        argparse.Namespace with the attributes ``base_yaml_path``,
        ``save_prefix_path``, ``cot_prompt_path`` and ``task_prefix``.
    """
    parser = argparse.ArgumentParser(
        description="Generate per-split task YAMLs and a group YAML for Belebele."
    )
    parser.add_argument(
        "--base_yaml_path",
        required=True,
        help="Path to the base YAML; its file name becomes the 'include' "
        "entry of every generated config.",
    )
    parser.add_argument(
        "--save_prefix_path",
        default="belebele",
        help="Path prefix used to build the names of the generated YAML files.",
    )
    parser.add_argument(
        "--cot_prompt_path",
        default=None,
        help="Optional path to a JSON prompt file.",
    )
    parser.add_argument(
        "--task_prefix",
        default="",
        help="Optional prefix inserted into the generated task and group names.",
    )
    return parser.parse_args(argv)
def query():
    """Fetch the split descriptors of the dataset behind ``API_URL``.

    Returns:
        The value stored under the ``"splits"`` key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(API_URL, timeout=30)
    # Fail with a clear HTTP error instead of a confusing KeyError further down.
    response.raise_for_status()
    return response.json()["splits"]


def build_task_name(task_prefix, lang):
    """Return the task name for ``lang``, with ``task_prefix`` inserted if non-empty."""
    if task_prefix != "":
        return f"belebele_{task_prefix}_{lang}"
    return f"belebele_{lang}"


def group_config_path(save_prefix_path, task_prefix):
    """Return the path of the group YAML: the save prefix with ``_`` before its file name.

    The underscore is prepended to the last path component only, so a prefix
    that contains a directory (e.g. ``out/belebele``) keeps its directory.
    """
    directory, base_name = os.path.split(save_prefix_path)
    return os.path.join(directory, f"_{base_name}{task_prefix}.yaml")


def write_yaml(data, path):
    """Dump ``data`` to ``path`` as UTF-8 YAML with every scalar double-quoted."""
    with open(path, "w", encoding="utf-8") as yaml_file:
        yaml.dump(
            data,
            yaml_file,
            width=float("inf"),  # never wrap long values
            allow_unicode=True,
            default_style='"',
        )


if __name__ == "__main__":
    args = parse_args()

    # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
    base_yaml_name = os.path.split(args.base_yaml_path)[-1]

    # The parsed contents are not used below; loading still fails fast on a
    # missing or malformed base file.
    # NOTE(review): full_load is more permissive than safe_load; only pass
    # trusted YAML here.
    with open(args.base_yaml_path, encoding="utf-8") as f:
        base_yaml = yaml.full_load(f)

    if args.cot_prompt_path is not None:
        import json

        # Likewise loaded but not consumed further down in this script.
        with open(args.cot_prompt_path, encoding="utf-8") as f:
            cot_file = json.load(f)

    # Query the API once and drop any split whose name contains "default".
    languages = [
        split["split"] for split in query() if "default" not in split["split"]
    ]

    for lang in tqdm(languages):
        yaml_dict = {
            "include": base_yaml_name,
            "task": build_task_name(args.task_prefix, lang),
            "test_split": lang,
            "fewshot_split": lang,
        }
        file_save_path = args.save_prefix_path + f"_{lang}.yaml"
        logging.info(f"Saving yaml for subset {lang} to {file_save_path}")
        write_yaml(yaml_dict, file_save_path)

    # write group config out
    group_yaml_dict = {
        "group": f"belebele_{args.task_prefix}"
        if args.task_prefix != ""
        else "belebele",
        "task": [build_task_name(args.task_prefix, lang) for lang in languages],
        "aggregate_metric_list": [
            {"metric": "acc", "aggregation": "mean", "weight_by_size": False},
            {"metric": "acc_norm", "aggregation": "mean", "weight_by_size": False},
        ],
        "metadata": {"version": 0.0},
    }
    write_yaml(
        group_yaml_dict,
        group_config_path(args.save_prefix_path, args.task_prefix),
    )
|