# out

# File size: 1,371 Bytes

# 9d5b280

# ruff: noqa: E731, E741
"""
Script to generate task YAMLs for the mlqa dataset.
Based on `tasks/bigbench/generate_tasks.py`.
"""

from datasets import get_dataset_config_names


# Dataset config names selected for YAML generation; filled in by main().
chosen_subtasks = []

# Two-letter language code -> English language name.
# NOTE(review): nothing in the visible part of this script reads this mapping;
# confirm whether it is used elsewhere before removing it.
language_dict = {
    "en": "english",
    "es": "spanish",
    "hi": "hindi",
    "vi": "vietnamese",
    "de": "german",
    "ar": "arabic",
    "zh": "chinese",
}


def main() -> None:
    """Generate one task YAML file per selected MLQA dataset config.

    Fetches the config names of ``facebook/mlqa``, keeps every config whose
    name does not split into exactly two dot-separated parts, stores the kept
    names in the module-level ``chosen_subtasks`` list, and writes a
    ``<config name with dots replaced by underscores>.yaml`` file for each of
    them, using paths relative to the current working directory.

    Raises:
        AssertionError: If the number of selected configs is not 49.
    """
    # NOTE(review): trust_remote_code=True allows the dataset's loading script
    # to execute locally; keep it only while the dataset source is trusted.
    configs = get_dataset_config_names("facebook/mlqa", trust_remote_code=True)

    # Rebuild the shared list in place (instead of appending) so that calling
    # main() more than once does not accumulate duplicates, while the
    # module-level list object other code may hold stays the same.
    chosen_subtasks[:] = [
        config for config in configs if len(config.split(".")) != 2
    ]

    # Explicit raise rather than `assert`, so the sanity check still runs
    # under `python -O`. The exception type is unchanged.
    # NOTE(review): 49 presumably corresponds to 7 x 7 language pairs - confirm.
    expected_count = 49
    if len(chosen_subtasks) != expected_count:
        raise AssertionError(
            f"expected {expected_count} MLQA subtasks, "
            f"found {len(chosen_subtasks)}"
        )

    for task in chosen_subtasks:
        task_name = task.replace(".", "_")
        file_name = f"{task_name}.yaml"
        # The second underscore-separated field of the file name selects
        # which utils.process_results_<lang> function the task refers to.
        context_lang = file_name.split("_")[1]
        # Not using yaml to avoid tagging issues with !function:
        # the YAML-like content is written out manually instead.
        with open(file_name, "w", encoding="utf-8") as f:
            f.writelines(
                [
                    "# Generated by generate_tasks.py\n",
                    "include: mlqa_common_yaml\n",
                    f"task: {task_name}\n",
                    f"dataset_name: {task}\n",
                    f"process_results: !function utils.process_results_{context_lang}\n",
                ]
            )


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()