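# NOTE: the following lines are residue from the hosting page, not part of the
# script; they are kept verbatim as comments so the module stays importable.
#   BayesTensor's picture
#   Upload folder using huggingface_hub
#   9d5b280 verified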
import argparse
import yaml
# Languages covered by the PAWS-X tasks generated below.
# The keys correspond to dataset names (Subsets) on HuggingFace.
# A yaml file is generated by this script for each language.
#
# Each entry supplies the localized words spliced into the prompt template:
#   QUESTION_WORD - tag-question word appended after the first sentence
#   YES / NO      - affirmative / negative word that starts each answer choice
LANGUAGES = {
    "de": {  # German
        "QUESTION_WORD": "richtig",
        "YES": "Ja",
        "NO": "Nein",
    },
    "en": {  # English
        "QUESTION_WORD": "right",
        "YES": "Yes",
        "NO": "No",
    },
    "es": {  # Spanish
        "QUESTION_WORD": "verdad",
        "YES": "Sí",
        "NO": "No",
    },
    "fr": {  # French
        "QUESTION_WORD": "n'est-ce pas",
        "YES": "Oui",
        # NOTE(review): the French word for "no" is "Non"; "No" is kept here
        # so generated prompts stay unchanged -- confirm before altering.
        "NO": "No",
    },
    "ja": {  # Japanese
        "QUESTION_WORD": "ですね",
        "YES": "はい",
        "NO": "いいえ",
    },
    "ko": {  # Korean
        "QUESTION_WORD": "맞죠",
        "YES": "예",
        "NO": "아니요",
    },
    "zh": {  # Chinese
        "QUESTION_WORD": "对吧",
        "YES": "是",
        "NO": "不是",
    },
}
def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
    """
    Generate one ``paws_<lang>.yaml`` file for each entry in ``LANGUAGES``.

    Every file starts with a ``# Generated by utils.py`` header line followed
    by a YAML mapping that includes ``pawsx_template_yaml`` and sets the
    dataset name, the task name, an empty ``doc_to_text`` and a
    ``doc_to_choice`` template built from the language's question word and
    yes/no words.

    :param output_dir: The directory to output the files to.
    :param overwrite: Whether to overwrite files if they already exist. When
        false, files are opened in exclusive-creation mode (``"x"``).
    :raises FileExistsError: If ``overwrite`` is false and at least one target
        file already exists. Every language is attempted first; the error
        message lists all files that were skipped.
    """
    mode = "w" if overwrite else "x"
    skipped = []

    for lang, words in LANGUAGES.items():
        file_name = f"paws_{lang}.yaml"
        question_word = words["QUESTION_WORD"]
        yes_word = words["YES"]
        no_word = words["NO"]

        # Each choice is an expression joining both sentences around
        # '", <question word>? <yes|no>, "'.
        yes_choice = f'sentence1+", {question_word}? {yes_word}, "+sentence2'
        no_choice = f'sentence1+", {question_word}? {no_word}, "+sentence2'
        config = {
            "include": "pawsx_template_yaml",
            "dataset_name": lang,
            "task": f"paws_{lang}",
            "doc_to_text": "",
            # Quadrupled braces render as a literal "{{" / "}}" pair.
            "doc_to_choice": f"{{{{[{yes_choice}, {no_choice}]}}}}",
        }

        # Only the file creation can raise FileExistsError, so keep the try
        # body limited to the I/O.
        try:
            with open(f"{output_dir}/{file_name}", mode, encoding="utf8") as f:
                f.write("# Generated by utils.py\n")
                yaml.dump(config, f, allow_unicode=True)
        except FileExistsError:
            # Keep going so the final error reports every conflicting file.
            skipped.append(file_name)

    if skipped:
        raise FileExistsError(
            "Files were not created because they already exist (use --overwrite flag):"
            f" {', '.join(skipped)}"
        )
def main() -> None:
    """Read the command-line flags and write the per-language yaml files."""
    # (flag, add_argument keyword arguments), registered in this order.
    option_table = (
        (
            "--overwrite",
            {
                "default": False,
                "action": "store_true",
                "help": "Overwrite files if they already exist",
            },
        ),
        (
            "--output-dir",
            {
                "default": ".",
                "help": "Directory to write yaml files to",
            },
        ),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)

    options = cli.parse_args()
    gen_lang_yamls(output_dir=options.output_dir, overwrite=options.overwrite)


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()