"""Main entry point for the distiller package.""" |
|
|
|
from typing import Annotated |
|
|
|
import typer |
|
|
|
app = typer.Typer( |
|
help="Model2Vec Code-Specialized Distillation Pipeline", |
|
no_args_is_help=True, |
|
context_settings={"help_option_names": ["-h", "--help"]}, |
|
) |
|
|
|
|
|


@app.command()
def distill(
    use_beam: Annotated[bool, typer.Option(help="Use Beam for distillation")] = False,
    train: Annotated[bool, typer.Option(help="Enable advanced training (CodeSearchNet fine-tuning)")] = False,
    teacher_models: Annotated[list[str] | None, typer.Option(help="Specific teacher models to distill")] = None,
    pca_dims: Annotated[int | None, typer.Option(help="PCA dimensions (uses config default if not specified)")] = None,
    clear_cache: Annotated[
        bool, typer.Option(help="Clear HuggingFace cache for problematic models before distillation")
    ] = False,
    clear_checkpoints: Annotated[
        bool, typer.Option(help="Clear tokenlearn checkpoints to force fresh featurization and training")
    ] = False,
    use_optimized_dataset: Annotated[
        bool,
        typer.Option(
            "--use-optimized-dataset", help="Use the pre-created optimized dataset from code_model2vec/dataset"
        ),
    ] = False,
    dataset_path: Annotated[
        str | None,
        typer.Option("--dataset-path", help="Path to custom dataset directory (defaults to code_model2vec/dataset)"),
    ] = None,
) -> None:
    """Run unified Model2Vec distillation with optional training."""
    from .distill import main as distill_main

    distill_main(
        use_beam,
        train,
        teacher_models,
        pca_dims,
        clear_cache,
        clear_checkpoints,
        use_optimized_dataset,
        dataset_path,
    )


@app.command()
def evaluate(
    use_beam: Annotated[bool, typer.Option(help="Use Beam for evaluation")] = False,
    skip_third_party: Annotated[bool, typer.Option(help="Skip third-party models")] = False,
    skip_benchmark: Annotated[bool, typer.Option(help="Skip performance benchmarking")] = False,
    max_queries: Annotated[int, typer.Option(help="Maximum queries per language")] = 100,
) -> None:
    """Run CodeSearchNet evaluation on models."""
    from .evaluate import main as evaluate_main

    evaluate_main(use_beam, skip_third_party, skip_benchmark, max_queries)


@app.command()
def analyze(
    results_dir: Annotated[str | None, typer.Option(help="Results directory")] = None,
    model_name: Annotated[str, typer.Option(help="Model name for analysis")] = "gte_qwen2_m2v_code (Ours)",
    output: Annotated[str, typer.Option(help="Output report file")] = "REPORT.md",
    export_csv: Annotated[str | None, typer.Option(help="Export results to CSV")] = None,
) -> None:
    """Generate comprehensive analysis reports."""
    from .analyze import main as analyze_main

    analyze_main(results_dir or "code_model2vec/evaluation_results", model_name, output, export_csv)


@app.command()
def dataset(
    max_samples_per_lang: Annotated[int, typer.Option(help="Maximum samples per language")] = 50000,
    min_doc_words: Annotated[int, typer.Option(help="Minimum words in documentation")] = 3,
    max_doc_words: Annotated[int, typer.Option(help="Maximum words in documentation")] = 100,
    min_code_chars: Annotated[int, typer.Option(help="Minimum characters in code")] = 50,
    max_code_chars: Annotated[int, typer.Option(help="Maximum characters in code")] = 2000,
    output_dir: Annotated[str | None, typer.Option(help="Output directory for dataset")] = None,
    simple_format: Annotated[
        bool, typer.Option(help="Create only simple format (not multiple training formats)")
    ] = False,
) -> None:
    """Create optimized training dataset from CodeSearchNet for code search tasks."""
    from .dataset import main as dataset_main

    dataset_main(
        max_samples_per_lang, min_doc_words, max_doc_words, min_code_chars, max_code_chars, output_dir, simple_format
    )


if __name__ == "__main__":
    app()
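
# Example invocations (a sketch, assuming this module is exposed as the
# `distiller` package's ``__main__`` and that Typer's default option naming applies):
#   python -m distiller --help
#   python -m distiller distill --train --use-optimized-dataset
#   python -m distiller evaluate --max-queries 50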
|
|