File size: 1,371 Bytes
9d5b280 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
# ruff: noqa: E731, E741
"""
Script to generate task YAMLs for the mlqa dataset.
Based on `tasks/bigbench/generate_tasks.py`.
"""
from datasets import get_dataset_config_names
# Maps each two-letter language code to its lowercase English language name.
language_dict = {
    "en": "english",
    "es": "spanish",
    "hi": "hindi",
    "vi": "vietnamese",
    "de": "german",
    "ar": "arabic",
    "zh": "chinese",
}

# Dataset config names selected by main(); empty until main() has run.
chosen_subtasks = []
def main() -> None:
    """Write one task YAML file per selected ``facebook/mlqa`` config.

    Fetches the dataset's config names, keeps every name that does not split
    into exactly two dot-separated parts, and for each kept config writes
    ``<config with dots replaced by underscores>.yaml`` into the current
    working directory.

    The module-level ``chosen_subtasks`` list is replaced in place with the
    kept configs, so calling ``main()`` more than once in the same process
    does not accumulate duplicates.

    Raises:
        AssertionError: If the number of kept configs is not 49.
    """
    expected_subtask_count = 49

    # NOTE(review): trust_remote_code=True presumably lets `datasets` execute
    # the loading script shipped with the dataset repository -- confirm this
    # is acceptable in the environment where the generator runs.
    configs = get_dataset_config_names("facebook/mlqa", trust_remote_code=True)

    # Slice-assign rather than append so repeated runs start from a clean
    # list while the module-level name keeps pointing at the same object.
    chosen_subtasks[:] = [
        config for config in configs if len(config.split(".")) != 2
    ]

    # Raised explicitly (not via `assert`) so the check survives `python -O`;
    # the exception type is kept as AssertionError for compatibility.
    if len(chosen_subtasks) != expected_subtask_count:
        raise AssertionError(
            f"Expected {expected_subtask_count} MLQA subtasks, "
            f"found {len(chosen_subtasks)}"
        )

    for task in chosen_subtasks:
        task_name = task.replace(".", "_")
        # Second dot-separated field of the config name; it selects which
        # `utils.process_results_<lang>` function the YAML refers to.
        context_lang = task.split(".")[1]

        # The file is written as plain text instead of through a YAML
        # library to avoid tagging issues with `!function`.
        lines = [
            "# Generated by generate_tasks.py\n",
            "include: mlqa_common_yaml\n",
            f"task: {task_name}\n",
            f"dataset_name: {task}\n",
            f"process_results: !function utils.process_results_{context_lang}\n",
        ]
        with open(f"{task_name}.yaml", "w", encoding="utf-8") as f:
            f.writelines(lines)
# Generate the YAML files only when this file is executed as a script.
if __name__ == "__main__":
    main()
|