File size: 3,946 Bytes
ea0b2a0
 
454e47c
ea0b2a0
454e47c
ea0b2a0
454e47c
 
 
 
 
ea0b2a0
 
454e47c
 
 
 
 
 
7837959
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
454e47c
 
 
ea0b2a0
7837959
 
 
 
 
 
 
 
 
 
 
ea0b2a0
 
454e47c
 
 
 
 
da2f1e0
454e47c
 
 
ea0b2a0
454e47c
 
ea0b2a0
 
454e47c
 
 
 
 
 
 
 
 
ea0b2a0
454e47c
 
ea0b2a0
 
7837959
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea0b2a0
454e47c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""Main entry point for the distiller package."""

from typing import Annotated

import typer

# Root Typer application object; functions decorated with @app.command()
# are registered on it as subcommands.
app = typer.Typer(
	# Accept the short -h flag in addition to --help.
	context_settings={"help_option_names": ["-h", "--help"]},
	# Invoking the CLI with no arguments prints the help text.
	no_args_is_help=True,
	help="Model2Vec Code-Specialized Distillation Pipeline",
)


@app.command()
def distill(
	use_beam: Annotated[
		bool,
		typer.Option(help="Use Beam for distillation"),
	] = False,
	train: Annotated[
		bool,
		typer.Option(help="Enable advanced training (CodeSearchNet fine-tuning)"),
	] = False,
	teacher_models: Annotated[
		list[str] | None,
		typer.Option(help="Specific teacher models to distill"),
	] = None,
	pca_dims: Annotated[
		int | None,
		typer.Option(help="PCA dimensions (uses config default if not specified)"),
	] = None,
	clear_cache: Annotated[
		bool,
		typer.Option(help="Clear HuggingFace cache for problematic models before distillation"),
	] = False,
	clear_checkpoints: Annotated[
		bool,
		typer.Option(help="Clear tokenlearn checkpoints to force fresh featurization and training"),
	] = False,
	use_optimized_dataset: Annotated[
		bool,
		typer.Option(
			"--use-optimized-dataset",
			help="Use the pre-created optimized dataset from code_model2vec/dataset",
		),
	] = False,
	dataset_path: Annotated[
		str | None,
		typer.Option(
			"--dataset-path",
			help="Path to custom dataset directory (defaults to code_model2vec/dataset)",
		),
	] = None,
) -> None:
	"""Run unified Model2Vec distillation with optional training."""
	# Imported inside the command so the distill module is only loaded when
	# this subcommand actually runs.
	from .distill import main as run_distillation

	# The options are handed over positionally, so their order here has to
	# match the parameter order of `main` in the distill module.
	run_distillation(
		use_beam,
		train,
		teacher_models,
		pca_dims,
		clear_cache,
		clear_checkpoints,
		use_optimized_dataset,
		dataset_path,
	)


@app.command()
def evaluate(
	use_beam: Annotated[
		bool,
		typer.Option(help="Use Beam for evaluation"),
	] = False,
	skip_third_party: Annotated[
		bool,
		typer.Option(help="Skip third-party models"),
	] = False,
	skip_benchmark: Annotated[
		bool,
		typer.Option(help="Skip performance benchmarking"),
	] = False,
	max_queries: Annotated[
		int,
		typer.Option(help="Maximum queries per language"),
	] = 100,
) -> None:
	"""Run CodeSearchNet evaluation on models."""
	# Imported inside the command so the evaluate module is only loaded when
	# this subcommand actually runs.
	from .evaluate import main as run_evaluation

	# Positional hand-off: the order must match the parameter order of
	# `main` in the evaluate module.
	run_evaluation(
		use_beam,
		skip_third_party,
		skip_benchmark,
		max_queries,
	)


@app.command()
def analyze(
	results_dir: Annotated[
		str | None,
		typer.Option(help="Results directory"),
	] = None,
	model_name: Annotated[
		str,
		typer.Option(help="Model name for analysis"),
	] = "gte_qwen2_m2v_code (Ours)",
	output: Annotated[
		str,
		typer.Option(help="Output report file"),
	] = "REPORT.md",
	export_csv: Annotated[
		str | None,
		typer.Option(help="Export results to CSV"),
	] = None,
) -> None:
	"""Generate comprehensive analysis reports."""
	# Imported inside the command so the analyze module is only loaded when
	# this subcommand actually runs.
	from .analyze import main as run_analysis

	# A missing or empty --results-dir falls back to the default results folder.
	effective_results_dir = results_dir if results_dir else "code_model2vec/evaluation_results"

	# Positional hand-off: the order must match the parameter order of
	# `main` in the analyze module.
	run_analysis(
		effective_results_dir,
		model_name,
		output,
		export_csv,
	)


@app.command()
def dataset(
	max_samples_per_lang: Annotated[
		int,
		typer.Option(help="Maximum samples per language"),
	] = 50000,
	min_doc_words: Annotated[
		int,
		typer.Option(help="Minimum words in documentation"),
	] = 3,
	max_doc_words: Annotated[
		int,
		typer.Option(help="Maximum words in documentation"),
	] = 100,
	min_code_chars: Annotated[
		int,
		typer.Option(help="Minimum characters in code"),
	] = 50,
	max_code_chars: Annotated[
		int,
		typer.Option(help="Maximum characters in code"),
	] = 2000,
	output_dir: Annotated[
		str | None,
		typer.Option(help="Output directory for dataset"),
	] = None,
	simple_format: Annotated[
		bool,
		typer.Option(help="Create only simple format (not multiple training formats)"),
	] = False,
) -> None:
	"""Create optimized training dataset from CodeSearchNet for code search tasks."""
	# Imported inside the command so the dataset module is only loaded when
	# this subcommand actually runs.
	from .dataset import main as build_dataset

	# Positional hand-off: the order must match the parameter order of
	# `main` in the dataset module.
	build_dataset(
		max_samples_per_lang,
		min_doc_words,
		max_doc_words,
		min_code_chars,
		max_code_chars,
		output_dir,
		simple_format,
	)


# Run the Typer application when this module is executed as the main program;
# importing the module only defines `app` and its commands.
if __name__ == "__main__":
	app()