"""Main entry point for the distiller package."""

from typing import Annotated

import typer

# Top-level CLI application; each command lazily imports its implementation
# module so that `--help` stays fast and optional heavy deps load on demand.
app = typer.Typer(
    help="Model2Vec Code-Specialized Distillation Pipeline",
    no_args_is_help=True,
    context_settings={"help_option_names": ["-h", "--help"]},
)


@app.command()
def distill(
    use_beam: Annotated[bool, typer.Option(help="Use Beam for distillation")] = False,
    train: Annotated[bool, typer.Option(help="Enable advanced training (CodeSearchNet fine-tuning)")] = False,
    teacher_models: Annotated[list[str] | None, typer.Option(help="Specific teacher models to distill")] = None,
    pca_dims: Annotated[int | None, typer.Option(help="PCA dimensions (uses config default if not specified)")] = None,
    clear_cache: Annotated[
        bool, typer.Option(help="Clear HuggingFace cache for problematic models before distillation")
    ] = False,
    clear_checkpoints: Annotated[
        bool, typer.Option(help="Clear tokenlearn checkpoints to force fresh featurization and training")
    ] = False,
    use_optimized_dataset: Annotated[
        bool,
        typer.Option(
            "--use-optimized-dataset", help="Use the pre-created optimized dataset from code_model2vec/dataset"
        ),
    ] = False,
    dataset_path: Annotated[
        str | None,
        typer.Option("--dataset-path", help="Path to custom dataset directory (defaults to code_model2vec/dataset)"),
    ] = None,
) -> None:
    """Run unified Model2Vec distillation with optional training."""
    # Deferred import: distillation pulls in heavy dependencies.
    from .distill import main as distill_main

    # Forward every flag positionally, in the same order as the signature.
    distill_main(
        use_beam,
        train,
        teacher_models,
        pca_dims,
        clear_cache,
        clear_checkpoints,
        use_optimized_dataset,
        dataset_path,
    )


@app.command()
def evaluate(
    use_beam: Annotated[bool, typer.Option(help="Use Beam for evaluation")] = False,
    skip_third_party: Annotated[bool, typer.Option(help="Skip third-party models")] = False,
    skip_benchmark: Annotated[bool, typer.Option(help="Skip performance benchmarking")] = False,
    max_queries: Annotated[int, typer.Option(help="Maximum queries per language")] = 100,
) -> None:
    """Run CodeSearchNet evaluation on models."""
    # Deferred import: evaluation pulls in heavy dependencies.
    from .evaluate import main as evaluate_main

    evaluate_main(use_beam, skip_third_party, skip_benchmark, max_queries)


@app.command()
def analyze(
    results_dir: Annotated[str | None, typer.Option(help="Results directory")] = None,
    model_name: Annotated[str, typer.Option(help="Model name for analysis")] = "gte_qwen2_m2v_code (Ours)",
    output: Annotated[str, typer.Option(help="Output report file")] = "REPORT.md",
    export_csv: Annotated[str | None, typer.Option(help="Export results to CSV")] = None,
) -> None:
    """Generate comprehensive analysis reports."""
    # Deferred import: analysis pulls in heavy dependencies.
    from .analyze import main as analyze_main

    # Fall back to the default results directory when none was given.
    analyze_main(results_dir or "code_model2vec/evaluation_results", model_name, output, export_csv)


@app.command()
def dataset(
    max_samples_per_lang: Annotated[int, typer.Option(help="Maximum samples per language")] = 50000,
    min_doc_words: Annotated[int, typer.Option(help="Minimum words in documentation")] = 3,
    max_doc_words: Annotated[int, typer.Option(help="Maximum words in documentation")] = 100,
    min_code_chars: Annotated[int, typer.Option(help="Minimum characters in code")] = 50,
    max_code_chars: Annotated[int, typer.Option(help="Maximum characters in code")] = 2000,
    output_dir: Annotated[str | None, typer.Option(help="Output directory for dataset")] = None,
    simple_format: Annotated[
        bool, typer.Option(help="Create only simple format (not multiple training formats)")
    ] = False,
) -> None:
    """Create optimized training dataset from CodeSearchNet for code search tasks."""
    # Deferred import: dataset creation pulls in heavy dependencies.
    from .dataset import main as dataset_main

    dataset_main(
        max_samples_per_lang,
        min_doc_words,
        max_doc_words,
        min_code_chars,
        max_code_chars,
        output_dir,
        simple_format,
    )


if __name__ == "__main__":
    app()