"""Main entry point for the distiller package."""
from typing import Annotated
import typer
app = typer.Typer(
help="Model2Vec Code-Specialized Distillation Pipeline",
no_args_is_help=True,
context_settings={"help_option_names": ["-h", "--help"]},
)
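
# Example invocations (illustrative; assumes the package is runnable as a module,
# e.g. `python -m distiller` from an environment where it is installed):
#   python -m distiller --help
#   python -m distiller distill --help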


@app.command()
def distill(
    use_beam: Annotated[bool, typer.Option(help="Use Beam for distillation")] = False,
    train: Annotated[bool, typer.Option(help="Enable advanced training (CodeSearchNet fine-tuning)")] = False,
    teacher_models: Annotated[list[str] | None, typer.Option(help="Specific teacher models to distill")] = None,
    pca_dims: Annotated[int | None, typer.Option(help="PCA dimensions (uses config default if not specified)")] = None,
    clear_cache: Annotated[
        bool, typer.Option(help="Clear HuggingFace cache for problematic models before distillation")
    ] = False,
    clear_checkpoints: Annotated[
        bool, typer.Option(help="Clear tokenlearn checkpoints to force fresh featurization and training")
    ] = False,
    use_optimized_dataset: Annotated[
        bool,
        typer.Option(
            "--use-optimized-dataset", help="Use the pre-created optimized dataset from code_model2vec/dataset"
        ),
    ] = False,
    dataset_path: Annotated[
        str | None,
        typer.Option("--dataset-path", help="Path to custom dataset directory (defaults to code_model2vec/dataset)"),
    ] = None,
) -> None:
    """Run unified Model2Vec distillation with optional training."""
    from .distill import main as distill_main

    # Call the distill main function with all arguments
    distill_main(
        use_beam,
        train,
        teacher_models,
        pca_dims,
        clear_cache,
        clear_checkpoints,
        use_optimized_dataset,
        dataset_path,
    )
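
# Illustrative distill invocation (example values only; option names follow Typer's
# default conversion of the parameter names above, and the `python -m distiller`
# entry point is an assumption):
#   python -m distiller distill --train --use-optimized-dataset --pca-dims 256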


@app.command()
def evaluate(
    use_beam: Annotated[bool, typer.Option(help="Use Beam for evaluation")] = False,
    skip_third_party: Annotated[bool, typer.Option(help="Skip third-party models")] = False,
    skip_benchmark: Annotated[bool, typer.Option(help="Skip performance benchmarking")] = False,
    max_queries: Annotated[int, typer.Option(help="Maximum queries per language")] = 100,
) -> None:
    """Run CodeSearchNet evaluation on models."""
    from .evaluate import main as evaluate_main

    # Call the evaluate main function with arguments
    evaluate_main(use_beam, skip_third_party, skip_benchmark, max_queries)
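
# Illustrative evaluate invocation (example values; entry point assumed as above):
#   python -m distiller evaluate --skip-third-party --max-queries 50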


@app.command()
def analyze(
    results_dir: Annotated[str | None, typer.Option(help="Results directory")] = None,
    model_name: Annotated[str, typer.Option(help="Model name for analysis")] = "gte_qwen2_m2v_code (Ours)",
    output: Annotated[str, typer.Option(help="Output report file")] = "REPORT.md",
    export_csv: Annotated[str | None, typer.Option(help="Export results to CSV")] = None,
) -> None:
    """Generate comprehensive analysis reports."""
    from .analyze import main as analyze_main

    # Call the analyze main function with arguments
    analyze_main(results_dir or "code_model2vec/evaluation_results", model_name, output, export_csv)
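
# Illustrative analyze invocation (the CSV filename is an arbitrary example):
#   python -m distiller analyze --output REPORT.md --export-csv results.csv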


@app.command()
def dataset(
    max_samples_per_lang: Annotated[int, typer.Option(help="Maximum samples per language")] = 50000,
    min_doc_words: Annotated[int, typer.Option(help="Minimum words in documentation")] = 3,
    max_doc_words: Annotated[int, typer.Option(help="Maximum words in documentation")] = 100,
    min_code_chars: Annotated[int, typer.Option(help="Minimum characters in code")] = 50,
    max_code_chars: Annotated[int, typer.Option(help="Maximum characters in code")] = 2000,
    output_dir: Annotated[str | None, typer.Option(help="Output directory for dataset")] = None,
    simple_format: Annotated[
        bool, typer.Option(help="Create only simple format (not multiple training formats)")
    ] = False,
) -> None:
    """Create optimized training dataset from CodeSearchNet for code search tasks."""
    from .dataset import main as dataset_main

    # Call the dataset main function with arguments
    dataset_main(
        max_samples_per_lang, min_doc_words, max_doc_words, min_code_chars, max_code_chars, output_dir, simple_format
    )
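
# Illustrative dataset invocation (example limits; entry point assumed as above):
#   python -m distiller dataset --max-samples-per-lang 10000 --simple-format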
if __name__ == "__main__":
app()