|
""" |
|
Shared configuration for the distiller package. |
|
|
|
This module centralizes all configuration constants, default values, and common |
|
settings used across distillation, evaluation, and benchmarking modules. |
|
""" |
|
|
|
import logging |
|
from pathlib import Path |
|
from typing import Any |
|
|
|
from beam import GpuType, Image |
|
from pydantic import BaseModel |
|
|
|
|
|
|
|
|
|
|
|
|
|
def setup_logging(level: int = logging.INFO) -> None:
    """Configure root logging to the console and to ``logs/distiller.log``.

    The ``logs`` directory is created relative to the current working
    directory. When the root logger already has handlers, the existing logging
    setup is left untouched (the same rule ``logging.basicConfig`` applies).

    Args:
        level: Threshold installed on the root logger.
    """
    log_dir = Path("logs")
    log_dir.mkdir(parents=True, exist_ok=True)

    # basicConfig() ignores its arguments once the root logger is configured.
    # Building the handlers anyway would open another file handle on every
    # repeated call and never close it, so bail out first.
    if logging.getLogger().handlers:
        return

    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[
            logging.StreamHandler(),
            # Explicit UTF-8 so non-ASCII log text is written the same way
            # regardless of the platform's locale encoding.
            logging.FileHandler(log_dir / "distiller.log", mode="a", encoding="utf-8"),
        ],
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class BeamFunctionConfig(BaseModel):
    """Parameter bundle forwarded to Beam's ``@function`` decorator."""

    # --- Compute resources ---
    cpu: float = 2.0
    memory: int = 8192
    gpu: GpuType | list[GpuType] = GpuType.A100_40

    # --- Execution behaviour ---
    timeout: int = 12 * 3600  # presumably seconds (12 hours) - confirm against Beam docs
    retries: int = 2
    headless: bool = False

    # --- Optional decorator arguments ---
    # create_beam_function_kwargs() only forwards these when they are truthy.
    callback_url: str | None = None
    name: str | None = None
    task_policy: Any | None = None
    retry_for: list[str] | None = None

    # --- Secrets and environment ---
    secrets: list[str] = ["HF_ACCESS_TOKEN"]
    # Forwarded to the decorator as ``env`` by create_beam_function_kwargs().
    env_vars: dict[str, str] = {
        "TOKENIZERS_PARALLELISM": "false",
        "CUDA_LAUNCH_BLOCKING": "0",
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "TORCH_CUDNN_V8_API_ENABLED": "1",
        "FLASH_ATTENTION_FORCE_USE": "1",
        "TORCH_COMPILE_DISABLE": "1",
    }
|
|
|
|
|
|
|
# Per-job-type Beam presets, looked up by get_beam_config().
BEAM_CONFIGS: dict[str, BeamFunctionConfig] = {
    "distillation": BeamFunctionConfig(
        cpu=4.0,
        memory=16384,
        gpu=GpuType.A100_40,
        timeout=12 * 3600,
        retries=2,
        secrets=["HF_ACCESS_TOKEN"],
    ),
    # Training lists two GPU types instead of a single one.
    "training": BeamFunctionConfig(
        cpu=4.0,
        memory=16384,
        gpu=[GpuType.H100, GpuType.A100_40],
        timeout=12 * 3600,
        retries=2,
        secrets=["HF_ACCESS_TOKEN"],
    ),
    # Evaluation uses a smaller machine, a shorter timeout and one more retry.
    "evaluation": BeamFunctionConfig(
        cpu=2.0,
        memory=8192,
        gpu=GpuType.A100_40,
        timeout=4 * 3600,
        retries=3,
        secrets=["HF_ACCESS_TOKEN"],
    ),
}

# Preset used whenever a job type has no entry in BEAM_CONFIGS.
DEFAULT_BEAM_CONFIG = BEAM_CONFIGS["distillation"]
|
|
|
|
|
|
|
class VolumeConfig(BaseModel):
    """Describes one Beam volume: its name, mount point and a free-text note."""

    # Both values are handed to ``beam.Volume`` by create_beam_function_kwargs().
    name: str
    mount_path: str
    # Human-readable note; optional.
    description: str = ""
|
|
|
|
|
|
|
# Named volume definitions. "simplified" currently carries exactly the same
# settings as "primary" (kept as a separate instance).
VOLUMES: dict[str, VolumeConfig] = {
    "primary": VolumeConfig(
        name="code_model2vec",
        mount_path="./code_model2vec",
        description="Primary volume for all distillation models, evaluations, benchmarks, and checkpoints",
    ),
    "simplified": VolumeConfig(
        name="code_model2vec",
        mount_path="./code_model2vec",
        description="Primary volume for all distillation models, evaluations, benchmarks, and checkpoints",
    ),
}

# Key into VOLUMES naming the default volume.
DEFAULT_VOLUME = "primary"
|
|
|
|
|
# Environment variables of the default Beam preset (same dict object, not a copy).
BEAM_ENV_SETTINGS: dict[str, str] = DEFAULT_BEAM_CONFIG.env_vars

# Python requirements installed into the Beam image below.
COMMON_PACKAGES: list[str] = [
    # Modelling / embedding stack
    "torch>=2.7.0",
    "transformers>=4.40.0",
    "datasets>=3.2.0",
    "sentence-transformers>=4.1.0",
    "model2vec[train]>=0.5.0",
    "tokenlearn>=0.2.0",
    # Numerics and data handling
    "numpy>=1.26.4",
    "scikit-learn>=1.6.1",
    "pandas>=2.0.0",
    "tqdm>=4.65.0",
    # Plotting
    "plotly>=5.0.0",
    "matplotlib>=3.7.0",
    "seaborn>=0.12.0",
    # CLI, validation and build tooling
    "typer>=0.16.0",
    "pydantic>=2.11.5",
    "hatchling>=1.27.0",
]

# Container image used by every Beam function created from this module.
IMAGE = Image(python_version="python3.12").add_python_packages(COMMON_PACKAGES)
|
|
|
|
|
|
|
|
|
|
|
|
|
# Model identifiers used as the default ``code_teacher_models`` of
# DistillationConfig. Order is preserved as-is: entries are sorted by
# organisation, with the code-specific Jina model appended last.
TEACHER_MODELS: list[str] = [
    "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
    "BAAI/bge-m3",
    "jinaai/jina-embeddings-v3",
    "lightonai/Reason-ModernColBERT",
    "Linq-AI-Research/Linq-Embed-Mistral",
    # Microsoft code models
    "microsoft/codebert-base",
    "microsoft/graphcodebert-base",
    "nomic-ai/nomic-embed-text-v2-moe",
    "Qodo/Qodo-Embed-1-1.5B",
    # sentence-transformers models
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/all-mpnet-base-v2",
    "sentence-transformers/paraphrase-MiniLM-L6-v2",
    # Appended out of alphabetical order
    "jinaai/jina-embeddings-v2-base-code",
]
|
|
|
|
|
# Model identifiers evaluated by default. Contains every TEACHER_MODELS entry
# plus additional comparison models; order is preserved as-is.
DEFAULT_EVALUATION_MODELS: list[str] = [
    "Alibaba-NLP/gte-Qwen2-1.5B-instruct",
    "BAAI/bge-m3",
    "huggingface/CodeBERTa-small-v1",
    "jinaai/jina-embeddings-v3",
    "lightonai/Reason-ModernColBERT",
    "Linq-AI-Research/Linq-Embed-Mistral",
    # Microsoft code models
    "microsoft/codebert-base",
    "microsoft/graphcodebert-base",
    # minishlab potion models
    "minishlab/potion-base-8M",
    "minishlab/potion-retrieval-32M",
    "minishlab/potion-multilingual-128M",
    "nomic-ai/nomic-embed-text-v2-moe",
    "Qodo/Qodo-Embed-1-1.5B",
    "Salesforce/codet5-base",
    # sentence-transformers models
    "sentence-transformers/all-MiniLM-L12-v2",
    "sentence-transformers/all-MiniLM-L6-v2",
    "sentence-transformers/all-mpnet-base-v2",
    "sentence-transformers/paraphrase-MiniLM-L6-v2",
    # Appended out of alphabetical order
    "jinaai/jina-embeddings-v2-base-code",
]
|
|
|
|
|
|
|
class DistillationConfig(BaseModel):
    """Default parameters for Model2Vec distillation runs."""

    # Teacher models to distil from.
    code_teacher_models: list[str] = TEACHER_MODELS

    # Distillation hyper-parameters.
    optimal_pca_dims: int = 256
    sif_coefficient: float = 0.001
    apply_zipf: bool = True

    # Tokenlearn dataset selection and stage timeouts.
    tokenlearn_dataset: str = "allenai/c4"
    tokenlearn_dataset_name: str = "en"
    tokenlearn_text_key: str = "text"
    tokenlearn_timeout_featurize: int = 6 * 3600  # presumably seconds - confirm at call site
    tokenlearn_timeout_train: int = 2 * 3600  # presumably seconds - confirm at call site

    # Upper bound on the number of tokenlearn samples.
    tokenlearn_max_samples: int = 50_000

    # Dataset switches.
    use_optimized_dataset: bool = True
    custom_dataset_path: str | None = "code_model2vec/dataset"


# Shared module-level instance carrying the defaults above.
distillation_config = DistillationConfig()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class LanguagesConfig(BaseModel):
    """Programming languages covered by the evaluation."""

    # Full language list; reused as the default of CodeSearchNetConfig.evaluation_languages.
    all: list[str] = ["python", "java", "javascript", "php", "ruby", "go"]


# Shared module-level instance carrying the defaults above.
languages_config = LanguagesConfig()
|
|
|
|
|
|
|
class CodeSearchNetConfig(BaseModel):
    """Settings for the CodeSearchNet evaluation."""

    dataset_name: str = "code_search_net"
    # Defaults to every language declared in ``languages_config``.
    evaluation_languages: list[str] = languages_config.all
    max_queries_per_language: int = 1_000
    similarity_threshold: float = 0.7
    # Metric identifiers reported by the evaluation.
    evaluation_metrics: list[str] = [
        "ndcg@1",
        "ndcg@5",
        "ndcg@10",
        "mrr",
        "recall@1",
        "recall@5",
        "recall@10",
    ]


# Shared module-level instance carrying the defaults above.
codesearchnet_config = CodeSearchNetConfig()

# Dataset identifier used for training.
TRAINING_DATASET: str = "sentence-transformers/codesearchnet"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class StandardDirectories(BaseModel):
    """Canonical directory layout of the ``code_model2vec`` workspace."""

    # Workspace root; every other entry lives beneath it.
    root: str = "code_model2vec"

    # Model artefacts.
    base: str = "code_model2vec/base"
    final: str = "code_model2vec/final"
    models: str = "code_model2vec/models"

    # Result collections.
    evaluation_results: str = "code_model2vec/evaluation_results"
    benchmark_results: str = "code_model2vec/benchmark_results"
    analysis_results: str = "code_model2vec/analysis_results"

    # Working data.
    checkpoints: str = "code_model2vec/checkpoints"
    cache: str = "code_model2vec/cache"
    temp: str = "code_model2vec/temp"


# Shared module-level instance used by the directory helper functions.
directories = StandardDirectories()
|
|
|
|
|
|
|
class OutputDirs(BaseModel):
    """Sub-directory names (relative to a base path) for each kind of output."""

    base: str = "base"
    # NOTE(review): "models" resolves to the "final" directory, not "models" -
    # confirm this mapping is intentional.
    models: str = "final"
    checkpoints: str = "checkpoints"
    evaluation_results: str = "evaluation_results"
    benchmark_results: str = "benchmark_results"
    analysis_results: str = "analysis_results"
    cache: str = "cache"


# Shared module-level instance consulted by get_output_path().
output_dirs = OutputDirs()
|
|
|
|
|
|
|
class FilenamePatterns(BaseModel):
    """``str.format`` templates for evaluation, benchmark, checkpoint and model file names."""

    evaluation: str = "codesearchnet_eval_{model_name}.json"
    # "bencmark" is a historical misspelling. It stays the stored field so that
    # existing keyword arguments, attribute reads and serialized configs keep
    # working; new code should read the ``benchmark`` alias below.
    bencmark: str = "benchmark_{model_name}.json"
    checkpoint: str = "checkpoints_{stage}_step_{step}.json"
    model: str = "{teacher_model}_{dims}d"

    @property
    def benchmark(self) -> str:
        """Correctly spelled read-only alias for the ``bencmark`` template."""
        return self.bencmark


# Shared module-level instance consulted by format_filename().
filename_patterns = FilenamePatterns()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ChartConfig(BaseModel):
    """Defaults for charts produced during analysis and visualization."""

    # Figure geometry and resolution.
    figsize: tuple[int, int] = (12, 8)
    dpi: int = 300

    # Look and feel.
    style: str = "whitegrid"
    color_palette: str = "Set2"

    # File formats each chart is saved in.
    save_formats: list[str] = ["png", "pdf"]


# Shared module-level instance carrying the defaults above.
chart_config = ChartConfig()
|
|
|
|
|
|
|
class PerformanceThresholds(BaseModel):
    """Score cut-offs used to label analysis results."""

    excellent: float = 0.7
    good: float = 0.5
    fair: float = 0.3
    # "pour" is a historical misspelling of "poor". It stays the stored field so
    # that existing keyword arguments, attribute reads and serialized configs
    # keep working; new code should read the ``poor`` alias below.
    pour: float = 0.1

    @property
    def poor(self) -> float:
        """Correctly spelled read-only alias for the ``pour`` threshold."""
        return self.pour


# Shared module-level instance carrying the defaults above.
performance_thresholds = PerformanceThresholds()
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_volume_config() -> VolumeConfig:
    """Return the volume used by every workflow: the ``"primary"`` entry of ``VOLUMES``."""
    primary_volume = VOLUMES["primary"]
    return primary_volume
|
|
|
|
|
def get_output_path(base_path: str | Path, output_type: str) -> Path:
    """Return the standardized output path for ``output_type`` under ``base_path``.

    Args:
        base_path: Directory the output sub-directory is resolved against.
        output_type: Either an ``output_dirs`` entry (e.g. ``"models"``) or an
            arbitrary sub-directory name.

    Returns:
        ``base_path`` joined with the mapped directory name when ``output_type``
        is a known ``output_dirs`` entry, otherwise joined with ``output_type``
        itself.
    """
    base = Path(base_path)
    # Only public string attributes are directory names. A bare hasattr() check
    # would also match BaseModel methods (e.g. "copy"), and joining a Path with
    # a bound method raises TypeError.
    mapped = None if output_type.startswith("_") else getattr(output_dirs, output_type, None)
    if isinstance(mapped, str):
        return base / mapped
    return base / output_type
|
|
|
|
|
def get_standard_directory(dir_type: str) -> str:
    """Return the standardized directory path for ``dir_type``.

    Args:
        dir_type: Either a ``directories`` entry (e.g. ``"checkpoints"``) or an
            arbitrary directory name.

    Returns:
        The configured path when ``dir_type`` is a known ``directories`` entry,
        otherwise ``"code_model2vec/<dir_type>"``.
    """
    # Only public string attributes are directory paths. A bare hasattr() check
    # would also match BaseModel methods (e.g. "copy") and return a bound
    # method instead of a str.
    known = None if dir_type.startswith("_") else getattr(directories, dir_type, None)
    if isinstance(known, str):
        return known
    return f"code_model2vec/{dir_type}"
|
|
|
|
|
def ensure_checkpoint_directory(stage: str) -> str:
    """Create the checkpoint directory for ``stage`` if needed and return its path.

    The path is ``<directories.checkpoints>/<stage>``; missing parent
    directories are created and an existing directory is not an error.
    """
    checkpoints_root = directories.checkpoints
    stage_dir = f"{checkpoints_root}/{stage}"
    Path(stage_dir).mkdir(exist_ok=True, parents=True)
    return stage_dir
|
|
|
|
|
def format_filename(pattern_key: str, **kwargs: Any) -> str:
    """Fill in one of the predefined filename templates.

    Args:
        pattern_key: Name of a template on ``filename_patterns``
            (e.g. ``"evaluation"``).
        **kwargs: Values for the template's ``{placeholder}`` fields.

    Returns:
        The formatted filename.

    Raises:
        ValueError: If ``pattern_key`` does not name a template.
        KeyError: If a placeholder required by the template is missing from
            ``kwargs`` (raised by ``str.format``).
    """
    # Only public string attributes are templates. A bare hasattr() check would
    # also match BaseModel methods (e.g. "copy") and fail with AttributeError
    # instead of the documented ValueError.
    pattern = None if pattern_key.startswith("_") else getattr(filename_patterns, pattern_key, None)
    if not isinstance(pattern, str):
        msg = f"Unknown filename pattern: {pattern_key}"
        raise ValueError(msg)
    return pattern.format(**kwargs)
|
|
|
|
|
def get_safe_model_name(model_name: str) -> str:
    """Convert a model name into a filesystem-safe name.

    Path separators (``/``) become underscores; every other character that is
    not alphanumeric, ``-``, ``_`` or ``.`` is dropped.

    Args:
        model_name: Model identifier, e.g. ``"microsoft/codebert-base"``.

    Returns:
        The sanitized name, e.g. ``"microsoft_codebert-base"``.
    """
    allowed_punctuation = {"-", "_", "."}
    # Replace "/" *before* filtering: filtering first strips the slash, so the
    # replacement never fires and "org/model" collapses to "orgmodel".
    flattened = model_name.replace("/", "_")
    return "".join(ch for ch in flattened if ch.isalnum() or ch in allowed_punctuation)
|
|
|
|
|
def get_beam_config(job_type: str = "distillation") -> BeamFunctionConfig:
    """Look up the Beam preset for ``job_type``.

    Unknown job types fall back to ``DEFAULT_BEAM_CONFIG`` rather than raising.
    """
    return BEAM_CONFIGS.get(job_type, DEFAULT_BEAM_CONFIG)
|
|
|
|
|
def create_beam_function_kwargs(
    job_type: str = "distillation", volume_config: VolumeConfig | None = None
) -> dict[str, Any]:
    """Build the keyword arguments for Beam's ``@function`` decorator.

    Args:
        job_type: Preset name resolved through ``get_beam_config``.
        volume_config: Volume to mount; when omitted the result of
            ``get_volume_config()`` is used.

    Returns:
        A dict with the resource, image, volume, secret and environment
        settings, plus any optional decorator arguments that are set on the
        preset.
    """
    from beam import Volume

    preset = get_beam_config(job_type)
    volume = volume_config if volume_config else get_volume_config()
    mounted_volume = Volume(name=volume.name, mount_path=volume.mount_path)

    kwargs: dict[str, Any] = {
        "cpu": preset.cpu,
        "memory": preset.memory,
        "gpu": preset.gpu,
        "image": IMAGE,
        "timeout": preset.timeout,
        "retries": preset.retries,
        "headless": preset.headless,
        "volumes": [mounted_volume],
        "secrets": preset.secrets,
        "env": preset.env_vars,
    }

    # Optional arguments are forwarded only when the preset sets them to a truthy value.
    for optional_key in ("callback_url", "name", "task_policy", "retry_for"):
        optional_value = getattr(preset, optional_key)
        if optional_value:
            kwargs[optional_key] = optional_value

    return kwargs
|
|
|
|
|
def get_distillation_function_kwargs() -> dict[str, Any]:
    """Return the ``@function`` kwargs built from the ``"distillation"`` preset."""
    return create_beam_function_kwargs(job_type="distillation")
|
|
|
|
|
def get_training_function_kwargs() -> dict[str, Any]:
    """Return the ``@function`` kwargs built from the ``"training"`` preset."""
    return create_beam_function_kwargs(job_type="training")
|
|
|
|
|
def get_evaluation_function_kwargs() -> dict[str, Any]:
    """Return the ``@function`` kwargs built from the ``"evaluation"`` preset."""
    return create_beam_function_kwargs(job_type="evaluation")
|
|