out / lm-evaluation-harness /lm_eval /__main__.py

Upload folder using huggingface_hub

9d5b280 verified 7 months ago

17.5 kB

	import argparse
	import json
	import logging
	import os
	import sys
	from functools import partial
	from typing import Union

	from lm_eval import evaluator, utils
	from lm_eval.evaluator import request_caching_arg_to_dict
	from lm_eval.loggers import EvaluationTracker, WandbLogger
	from lm_eval.tasks import TaskManager
	from lm_eval.utils import handle_non_serializable, make_table, simple_parse_args_string


	def _int_or_none_list_arg_type(
	    min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","
	):
	    """Parse a delimited CLI value into a list of ``int`` / ``None`` entries.

	    Intended to be used as an argparse ``type=`` callable (typically through
	    ``functools.partial`` with the first three parameters bound).

	    Args:
	        min_len: Smallest number of entries accepted when more than one is given.
	        max_len: Length of the returned list, and the largest number of
	            entries accepted.
	        defaults: Delimited string supplying fallback entries for positions
	            the caller left out.
	        value: The raw string to parse.
	        split_char: Delimiter between entries in ``value`` and ``defaults``.

	    Returns:
	        A list of ``max_len`` entries, each an ``int`` or ``None``.

	    Raises:
	        argparse.ArgumentTypeError: If an entry is neither an integer nor
	            "none" (case-insensitive), or if the number of entries is not 1
	            and falls outside ``[min_len, max_len]``.
	    """

	    def to_int_or_none(token):
	        # Normalise first so " None " and "NONE" are both accepted.
	        token = token.strip().lower()
	        if token == "none":
	            return None
	        try:
	            return int(token)
	        except ValueError:
	            raise argparse.ArgumentTypeError(f"{token} is not an integer or None")

	    parsed = [to_int_or_none(token) for token in value.split(split_char)]
	    count = len(parsed)

	    if count == 1:
	        # A lone value is broadcast to every slot so downstream code always
	        # receives a full-length list.
	        return parsed * max_len

	    if not (min_len <= count <= max_len):
	        raise argparse.ArgumentTypeError(
	            f"Argument requires {max_len} integers or None, separated by '{split_char}'"
	        )

	    if count != max_len:
	        logging.warning(
	            f"Argument requires {max_len} integers or None, separated by '{split_char}'. "
	            "Missing values will be filled with defaults."
	        )
	        fallback = [to_int_or_none(token) for token in defaults.split(split_char)]
	        # Only the trailing positions the caller omitted come from the defaults.
	        parsed.extend(fallback[count:])

	    return parsed


	def check_argument_types(parser: argparse.ArgumentParser):
	    """
	    Verify that every CLI argument registered on ``parser`` declares a type.

	    The ``help`` action and any action with a truthy ``const`` (for example
	    ``store_true`` flags) are exempt from the check.

	    Raises:
	        ValueError: for the first non-exempt argument whose ``type`` is None.
	    """
	    for registered in parser._actions:
	        # Exempt actions: the built-in help entry and const-carrying actions.
	        if registered.dest == "help" or registered.const:
	            continue
	        if registered.type is None:
	            raise ValueError(
	                f"Argument '{registered.dest}' doesn't have a type specified."
	            )


	def setup_parser() -> argparse.ArgumentParser:
	    """Build the argument parser for the evaluation command-line interface.

	    Only registers arguments; nothing is parsed here. The parser uses
	    ``argparse.RawTextHelpFormatter`` so explicit newlines in help strings are
	    preserved in ``--help`` output.

	    Returns:
	        The configured ``argparse.ArgumentParser``.
	    """
	    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
	    parser.add_argument(
	        "--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
	    )
	    parser.add_argument(
	        "--tasks",
	        "-t",
	        default=None,
	        type=str,
	        metavar="task1,task2",
	        # Plain (non-f) string: braces must be single, otherwise `{{...}}` is shown verbatim.
	        help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {list_groups,list_subtasks,list_tags,list}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above",
	    )
	    parser.add_argument(
	        "--model_args",
	        "-a",
	        default="",
	        type=str,
	        help="Comma separated string arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
	    )
	    parser.add_argument(
	        "--num_fewshot",
	        "-f",
	        type=int,
	        default=None,
	        metavar="N",
	        help="Number of examples in few-shot context",
	    )
	    parser.add_argument(
	        "--batch_size",
	        "-b",
	        type=str,
	        # NOTE: argparse only applies `type` to string defaults, so the default
	        # stays the int 1 while user-supplied values arrive as str.
	        default=1,
	        metavar="auto|auto:N|N",
	        help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.",
	    )
	    parser.add_argument(
	        "--max_batch_size",
	        type=int,
	        default=None,
	        metavar="N",
	        help="Maximal batch size to try with --batch_size auto.",
	    )
	    parser.add_argument(
	        "--device",
	        type=str,
	        default=None,
	        help="Device to use (e.g. cuda, cuda:0, cpu).",
	    )
	    parser.add_argument(
	        "--output_path",
	        "-o",
	        default=None,
	        type=str,
	        metavar="DIR|DIR/file.json",
	        help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.",
	    )
	    parser.add_argument(
	        "--limit",
	        "-L",
	        type=float,
	        default=None,
	        metavar="N|0<N<1",
	        help="Limit the number of examples per task. "
	        "If <1, limit is a percentage of the total number of examples.",
	    )
	    parser.add_argument(
	        "--use_cache",
	        "-c",
	        type=str,
	        default=None,
	        metavar="DIR",
	        help="A path to a sqlite db file for caching model responses. `None` if not caching.",
	    )
	    parser.add_argument(
	        "--cache_requests",
	        type=str,
	        default=None,
	        choices=["true", "refresh", "delete"],
	        help="Speed up evaluation by caching the building of dataset requests. `None` if not caching.",
	    )
	    parser.add_argument(
	        "--check_integrity",
	        action="store_true",
	        help="Whether to run the relevant part of the test suite for the tasks.",
	    )
	    parser.add_argument(
	        "--write_out",
	        "-w",
	        action="store_true",
	        default=False,
	        help="Prints the prompt for the first few documents.",
	    )
	    parser.add_argument(
	        "--log_samples",
	        "-s",
	        action="store_true",
	        default=False,
	        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
	    )
	    parser.add_argument(
	        "--system_instruction",
	        type=str,
	        default=None,
	        help="System instruction to be used in the prompt",
	    )
	    parser.add_argument(
	        "--apply_chat_template",
	        type=str,
	        # Three states: absent -> False, bare flag -> True (const),
	        # flag with a value -> that value as a str.
	        nargs="?",
	        const=True,
	        default=False,
	        help=(
	            "If True, apply chat template to the prompt. "
	            "Providing `--apply_chat_template` without an argument will apply the default chat template to the prompt. "
	            "To apply a specific template from the available list of templates, provide the template name as an argument. "
	            "E.g. `--apply_chat_template template_name`"
	        ),
	    )
	    parser.add_argument(
	        "--fewshot_as_multiturn",
	        action="store_true",
	        default=False,
	        help="If True, uses the fewshot as a multi-turn conversation",
	    )
	    parser.add_argument(
	        "--show_config",
	        action="store_true",
	        default=False,
	        help="If True, shows the full config of all tasks at the end of the evaluation.",
	    )
	    parser.add_argument(
	        "--include_path",
	        type=str,
	        default=None,
	        metavar="DIR",
	        help="Additional path to include if there are external tasks to include.",
	    )
	    parser.add_argument(
	        "--gen_kwargs",
	        type=str,
	        default=None,
	        help=(
	            "String arguments for model generation on greedy_until tasks,"
	            " e.g. `temperature=0,top_k=0,top_p=0`."
	        ),
	    )
	    parser.add_argument(
	        "--verbosity",
	        "-v",
	        # Upper-cases the value so e.g. `-v debug` yields "DEBUG".
	        type=str.upper,
	        default="INFO",
	        metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
	        help="Controls the reported logging error level. Set to DEBUG when testing + adding new task configurations for comprehensive log output.",
	    )
	    parser.add_argument(
	        "--wandb_args",
	        type=str,
	        default="",
	        help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval`",
	    )
	    parser.add_argument(
	        "--hf_hub_log_args",
	        type=str,
	        default="",
	        help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
	    )
	    parser.add_argument(
	        "--predict_only",
	        "-x",
	        action="store_true",
	        default=False,
	        help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
	    )
	    default_seed_string = "0,1234,1234,1234"
	    parser.add_argument(
	        "--seed",
	        # Accepts 3 or 4 comma-separated values (or a single value that is
	        # broadcast); missing trailing values come from default_seed_string.
	        type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string),
	        # The default is a string, so argparse runs it through the same
	        # `type` callable and the parsed attribute is always a list.
	        default=default_seed_string,  # for backward compatibility
	        help=(
	            "Set seed for python's random, numpy, torch, and fewshot sampling.\n"
	            "Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
	            "respectively, or a single integer to set the same seed for all four.\n"
	            f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` "
	            "(for backward compatibility).\n"
	            "E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
	            "Here numpy's seed is not set since the second value is `None`.\n"
	            "E.g, `--seed 42` sets all four seeds to 42."
	        ),
	    )
	    parser.add_argument(
	        "--trust_remote_code",
	        action="store_true",
	        help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
	    )
	    parser.add_argument(
	        "--confirm_run_unsafe_code",
	        action="store_true",
	        help="Confirm that you understand the risks of running unsafe code for tasks that require it",
	    )
	    return parser


	def parse_eval_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
	    """Validate the parser's argument definitions, then parse the command line.

	    ``check_argument_types`` runs first, so a parser with an untyped argument
	    fails before any parsing happens. Parsing uses ``parser.parse_args()``
	    with no explicit argument list.

	    Returns:
	        The namespace produced by ``parser.parse_args()``.
	    """
	    check_argument_types(parser)
	    namespace = parser.parse_args()
	    return namespace


	def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
	    """Run an evaluation from CLI-style arguments and print the results.

	    Steps, in order: optionally set up W&B logging, configure the logger,
	    build the ``EvaluationTracker``, validate flag combinations, resolve the
	    requested tasks, call ``evaluator.simple_evaluate``, then log, save and
	    print the results.

	    Args:
	        args: Pre-parsed arguments. When falsy (e.g. ``None``), the arguments
	            are parsed from the command line via ``setup_parser`` and
	            ``parse_eval_args``.

	    Raises:
	        ValueError: If ``--log_samples`` / ``--predict_only`` is given without
	            ``--output_path``; if ``--fewshot_as_multiturn`` is given without
	            ``--apply_chat_template``; or if a requested task cannot be found.
	        SystemExit: With status 1 when no task is specified, or with the
	            default status after printing one of the ``list*`` task listings.
	    """
	    if not args:
	        # we allow for args to be passed externally, else we parse them ourselves
	        parser = setup_parser()
	        args = parse_eval_args(parser)

	    # wandb_logger only exists when --wandb_args is non-empty; every later use
	    # is guarded by the same condition.
	    if args.wandb_args:
	        wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args))

	    eval_logger = utils.eval_logger
	    eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
	    eval_logger.info(f"Verbosity set to {args.verbosity}")
	    os.environ["TOKENIZERS_PARALLELISM"] = "false"

	    # update the evaluation tracker args with the output path and the HF token
	    if args.output_path:
	        args.hf_hub_log_args += f",output_path={args.output_path}"
	    if os.environ.get("HF_TOKEN", None):
	        args.hf_hub_log_args += f",token={os.environ.get('HF_TOKEN')}"
	    evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args)
	    evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)

	    # --predict_only implies --log_samples.
	    if args.predict_only:
	        args.log_samples = True
	    if (args.log_samples or args.predict_only) and not args.output_path:
	        raise ValueError(
	            "Specify --output_path if providing --log_samples or --predict_only"
	        )

	    # `is False` on purpose: apply_chat_template may be True or a template name.
	    if args.fewshot_as_multiturn and args.apply_chat_template is False:
	        raise ValueError(
	            "When `fewshot_as_multiturn` is selected, `apply_chat_template` must be set (either to `True` or to the chosen template name)."
	        )

	    if args.include_path is not None:
	        eval_logger.info(f"Including path: {args.include_path}")
	    task_manager = TaskManager(args.verbosity, include_path=args.include_path)

	    if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
	        eval_logger.warning(
	            "Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
	        )

	    if args.limit:
	        eval_logger.warning(
	            " --limit SHOULD ONLY BE USED FOR TESTING."
	            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
	        )

	    if args.tasks is None:
	        eval_logger.error("Need to specify task to evaluate.")
	        # This is an error path, so report failure to the calling shell.
	        sys.exit(1)
	    elif args.tasks == "list":
	        print(task_manager.list_all_tasks())
	        sys.exit()
	    elif args.tasks == "list_groups":
	        print(task_manager.list_all_tasks(list_subtasks=False, list_tags=False))
	        sys.exit()
	    elif args.tasks == "list_tags":
	        print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False))
	        sys.exit()
	    elif args.tasks == "list_subtasks":
	        print(task_manager.list_all_tasks(list_groups=False, list_tags=False))
	        sys.exit()
	    else:
	        if os.path.isdir(args.tasks):
	            # A directory: load every top-level *.yaml file in it as a task config.
	            import glob

	            task_names = []
	            yaml_path = os.path.join(args.tasks, "*.yaml")
	            for yaml_file in glob.glob(yaml_path):
	                config = utils.load_yaml_config(yaml_file)
	                task_names.append(config)
	        else:
	            task_list = args.tasks.split(",")
	            task_names = task_manager.match_tasks(task_list)
	            # Entries that matched nothing may be paths to YAML config files.
	            # Remember which ones were loaded: the loaded config (not the
	            # path string) is what gets appended to task_names, so without
	            # this set those entries would be reported as missing below.
	            loaded_from_file = set()
	            for task in [task for task in task_list if task not in task_names]:
	                if os.path.isfile(task):
	                    config = utils.load_yaml_config(task)
	                    task_names.append(config)
	                    loaded_from_file.add(task)
	            task_missing = [
	                task
	                for task in task_list
	                if task not in task_names
	                and task not in loaded_from_file
	                and "*" not in task
	            ]  # we don't want errors if a wildcard ("*") task name was used

	            if task_missing:
	                missing = ", ".join(task_missing)
	                eval_logger.error(
	                    f"Tasks were not found: {missing}\n"
	                    f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
	                )
	                raise ValueError(
	                    f"Tasks not found: {missing}. Try `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues."
	                )

	    # When --trust_remote_code is passed, enable it inside `datasets` and
	    # forward it to the model through the comma-separated model args.
	    if args.trust_remote_code:
	        eval_logger.info(
	            "Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`"
	        )
	        # HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally,
	        # because it's already been determined based on the prior env var before launching our
	        # script--`datasets` gets imported by lm_eval internally before these lines can update the env.
	        import datasets

	        datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True

	        args.model_args = args.model_args + ",trust_remote_code=True"

	    eval_logger.info(f"Selected Tasks: {task_names}")

	    request_caching_args = request_caching_arg_to_dict(
	        cache_requests=args.cache_requests
	    )

	    results = evaluator.simple_evaluate(
	        model=args.model,
	        model_args=args.model_args,
	        tasks=task_names,
	        num_fewshot=args.num_fewshot,
	        batch_size=args.batch_size,
	        max_batch_size=args.max_batch_size,
	        device=args.device,
	        use_cache=args.use_cache,
	        limit=args.limit,
	        check_integrity=args.check_integrity,
	        write_out=args.write_out,
	        log_samples=args.log_samples,
	        evaluation_tracker=evaluation_tracker,
	        system_instruction=args.system_instruction,
	        apply_chat_template=args.apply_chat_template,
	        fewshot_as_multiturn=args.fewshot_as_multiturn,
	        gen_kwargs=args.gen_kwargs,
	        task_manager=task_manager,
	        verbosity=args.verbosity,
	        predict_only=args.predict_only,
	        # args.seed order: python random, numpy, torch, fewshot sampling.
	        random_seed=args.seed[0],
	        numpy_random_seed=args.seed[1],
	        torch_random_seed=args.seed[2],
	        fewshot_random_seed=args.seed[3],
	        confirm_run_unsafe_code=args.confirm_run_unsafe_code,
	        **request_caching_args,
	    )

	    if results is not None:
	        if args.log_samples:
	            # Detach the per-sample records so they are not part of the
	            # aggregated results that get dumped and saved below.
	            samples = results.pop("samples")
	        dumped = json.dumps(
	            results, indent=2, default=handle_non_serializable, ensure_ascii=False
	        )
	        if args.show_config:
	            print(dumped)

	        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))

	        # Add W&B logging
	        if args.wandb_args:
	            try:
	                wandb_logger.post_init(results)
	                wandb_logger.log_eval_result()
	                if args.log_samples:
	                    wandb_logger.log_eval_samples(samples)
	            except Exception as e:
	                # Best effort: a W&B failure must not abort saving/printing results.
	                eval_logger.info(f"Logging to Weights and Biases failed due to {e}")

	        evaluation_tracker.save_results_aggregated(
	            results=results, samples=samples if args.log_samples else None
	        )

	        if args.log_samples:
	            # Only the task names are needed here; the config values are unused.
	            for task_name in results["configs"]:
	                evaluation_tracker.save_results_samples(
	                    task_name=task_name, samples=samples[task_name]
	                )

	        if (
	            evaluation_tracker.push_results_to_hub
	            or evaluation_tracker.push_samples_to_hub
	        ):
	            evaluation_tracker.recreate_metadata_card()

	        print(
	            f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
	            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
	        )
	        print(make_table(results))
	        if "groups" in results:
	            print(make_table(results, "groups"))

	        if args.wandb_args:
	            # Tear down wandb run once all the logging is done.
	            wandb_logger.run.finish()


	# Script entry point: run the CLI evaluation only when this module is executed
	# directly. cli_evaluate() is called without arguments, so it parses the
	# command line itself.
	if __name__ == "__main__":
	cli_evaluate()