"""Main entry point for the distiller package."""
from typing import Annotated
import typer
app = typer.Typer(
help="Model2Vec Code-Specialized Distillation Pipeline",
no_args_is_help=True,
context_settings={"help_option_names": ["-h", "--help"]},
)
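
# Example invocations (illustrative; assumes the package is runnable as a module,
# e.g. `python -m distiller` from an environment where it is installed):
#   python -m distiller --help
#   python -m distiller distill --help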


@app.command()
def distill(
    use_beam: Annotated[bool, typer.Option(help="Use Beam for distillation")] = False,
    train: Annotated[bool, typer.Option(help="Enable advanced training (CodeSearchNet fine-tuning)")] = False,
    teacher_models: Annotated[list[str] | None, typer.Option(help="Specific teacher models to distill")] = None,
    pca_dims: Annotated[int | None, typer.Option(help="PCA dimensions (uses config default if not specified)")] = None,
    clear_cache: Annotated[
        bool, typer.Option(help="Clear HuggingFace cache for problematic models before distillation")
    ] = False,
    clear_checkpoints: Annotated[
        bool, typer.Option(help="Clear tokenlearn checkpoints to force fresh featurization and training")
    ] = False,
    use_optimized_dataset: Annotated[
        bool,
        typer.Option(
            "--use-optimized-dataset", help="Use the pre-created optimized dataset from code_model2vec/dataset"
        ),
    ] = False,
    dataset_path: Annotated[
        str | None,
        typer.Option("--dataset-path", help="Path to custom dataset directory (defaults to code_model2vec/dataset)"),
    ] = None,
) -> None:
    """Run unified Model2Vec distillation with optional training."""
    from .distill import main as distill_main

    # Call the distill main function with all arguments
    distill_main(
        use_beam,
        train,
        teacher_models,
        pca_dims,
        clear_cache,
        clear_checkpoints,
        use_optimized_dataset,
        dataset_path,
    )
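
# Illustrative distill invocation (example values only; option names follow Typer's
# default conversion of the parameter names above, and the `python -m distiller`
# entry point is an assumption):
#   python -m distiller distill --train --use-optimized-dataset --pca-dims 256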


@app.command()
def evaluate(
    use_beam: Annotated[bool, typer.Option(help="Use Beam for evaluation")] = False,
    skip_third_party: Annotated[bool, typer.Option(help="Skip third-party models")] = False,
    skip_benchmark: Annotated[bool, typer.Option(help="Skip performance benchmarking")] = False,
    max_queries: Annotated[int, typer.Option(help="Maximum queries per language")] = 100,
) -> None:
    """Run CodeSearchNet evaluation on models."""
    from .evaluate import main as evaluate_main

    # Call the evaluate main function with arguments
    evaluate_main(use_beam, skip_third_party, skip_benchmark, max_queries)
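
# Illustrative evaluate invocation (example values; entry point assumed as above):
#   python -m distiller evaluate --skip-third-party --max-queries 50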


@app.command()
def analyze(
    results_dir: Annotated[str | None, typer.Option(help="Results directory")] = None,
    model_name: Annotated[str, typer.Option(help="Model name for analysis")] = "gte_qwen2_m2v_code (Ours)",
    output: Annotated[str, typer.Option(help="Output report file")] = "REPORT.md",
    export_csv: Annotated[str | None, typer.Option(help="Export results to CSV")] = None,
) -> None:
    """Generate comprehensive analysis reports."""
    from .analyze import main as analyze_main

    # Call the analyze main function with arguments
    analyze_main(results_dir or "code_model2vec/evaluation_results", model_name, output, export_csv)
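
# Illustrative analyze invocation (the CSV filename is an arbitrary example):
#   python -m distiller analyze --output REPORT.md --export-csv results.csv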


@app.command()
def dataset(
    max_samples_per_lang: Annotated[int, typer.Option(help="Maximum samples per language")] = 50000,
    min_doc_words: Annotated[int, typer.Option(help="Minimum words in documentation")] = 3,
    max_doc_words: Annotated[int, typer.Option(help="Maximum words in documentation")] = 100,
    min_code_chars: Annotated[int, typer.Option(help="Minimum characters in code")] = 50,
    max_code_chars: Annotated[int, typer.Option(help="Maximum characters in code")] = 2000,
    output_dir: Annotated[str | None, typer.Option(help="Output directory for dataset")] = None,
    simple_format: Annotated[
        bool, typer.Option(help="Create only simple format (not multiple training formats)")
    ] = False,
) -> None:
    """Create optimized training dataset from CodeSearchNet for code search tasks."""
    from .dataset import main as dataset_main

    # Call the dataset main function with arguments
    dataset_main(
        max_samples_per_lang, min_doc_words, max_doc_words, min_code_chars, max_code_chars, output_dir, simple_format
    )
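
# Illustrative dataset invocation (example limits; entry point assumed as above):
#   python -m distiller dataset --max-samples-per-lang 10000 --simple-format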
if __name__ == "__main__":
app()