"""
Unified Code-Specialized Model2Vec Distillation Script.
This script provides a unified approach for creating code-specialized embeddings
using Model2Vec distillation with optional code-specific training.
Features:
- Basic distillation (default): Simple Model2Vec distillation
- Advanced training (--train flag): Additional CodeSearchNet fine-tuning
- Checkpoint support with Beam sync utilities
- Multi-teacher model processing
- Smart resume capabilities
- Hierarchical storage: base -> final
Directory Structure:
- code_model2vec/base: Basic distilled models (first step)
- code_model2vec/final: Final models (copied from base or after training)
Usage:
distiller distill [--use-beam] [--train] # Basic distillation or with training
"""
import importlib.util
import json
import logging
import os
import time
from pathlib import Path
from typing import Annotated, Any
import torch
import typer
from beam import function
from sentence_transformers import SentenceTransformer
from distiller.model2vec.distill import distill
# Try to import flash_attn to check if it's available
from .beam_utils import (
BeamCheckpointManager,
create_beam_utilities,
download_model_from_beam,
sync_checkpoints_from_beam,
sync_checkpoints_to_beam,
upload_model_to_beam,
)
from .config import (
codesearchnet_config,
directories,
distillation_config,
get_distillation_function_kwargs,
get_training_function_kwargs,
get_volume_config,
languages_config,
)
# Check if flash_attn is available and compatible
FLASH_ATTN_AVAILABLE = importlib.util.find_spec("flash_attn") is not None
# =============================================================================
# CONFIGURATION
# =============================================================================
VOLUME_CONFIG = get_volume_config()
LOCAL_BASE_DIR = directories.base
LOCAL_FINAL_DIR = directories.final
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Teacher models for distillation
DEFAULT_TEACHER_MODELS = list(distillation_config.code_teacher_models)
# =============================================================================
# FLASH ATTENTION UTILITIES
# =============================================================================
def configure_flash_attention() -> dict[str, Any]:
"""Configure flash attention settings and return model kwargs."""
model_kwargs: dict[str, Any] = {}
if not FLASH_ATTN_AVAILABLE:
logger.info("β οΈ Flash attention not available - using standard attention")
return model_kwargs
# Set environment variables for flash attention
os.environ["FLASH_ATTENTION_FORCE_USE"] = "1"
# Disable torch compile for flash attention compatibility
os.environ["TORCH_COMPILE_DISABLE"] = "1"
# Enable flash attention in transformers
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Check if we're on a compatible GPU
try:
if torch.cuda.is_available():
device_capability = torch.cuda.get_device_capability()
# Flash attention requires compute capability >= 7.5 (Turing, Ampere, Ada, Hopper)
if device_capability[0] >= 7 and (device_capability[0] > 7 or device_capability[1] >= 5):
logger.info("β
Flash attention enabled - compatible GPU detected")
model_kwargs.update(
{
"model_kwargs": {
"attn_implementation": "flash_attention_2",
"torch_dtype": torch.float16, # Flash attention works best with fp16
"use_flash_attention_2": True,
"_attn_implementation": "flash_attention_2", # Alternative key for some models
}
}
)
else:
logger.info(f"β οΈ GPU compute capability {device_capability} < 7.5 - flash attention disabled")
else:
logger.info("β οΈ No CUDA available - flash attention disabled")
except Exception as e:
logger.warning(f"β οΈ Failed to check GPU compatibility: {e} - flash attention disabled")
return model_kwargs
def load_model_with_flash_attention(model_path: str, device: str = "auto") -> SentenceTransformer:
"""Load a SentenceTransformer model with flash attention if available."""
flash_kwargs = configure_flash_attention()
try:
# Try loading with flash attention first
if flash_kwargs and "model_kwargs" in flash_kwargs:
logger.info(f"π Loading model with flash attention: {Path(model_path).name}")
model = SentenceTransformer(model_path, device=device, trust_remote_code=True, **flash_kwargs)
logger.info("β
Model loaded successfully with flash attention")
return model
except Exception as e:
logger.warning(f"β οΈ Failed to load with flash attention: {e}")
logger.info("π Falling back to standard attention")
# Fallback to standard loading
logger.info(f"π Loading model with standard attention: {Path(model_path).name}")
model = SentenceTransformer(model_path, device=device, trust_remote_code=True)
logger.info("β
Model loaded successfully with standard attention")
return model
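# Minimal usage sketch for the loader above (the model name is an example only; any
# SentenceTransformer-compatible teacher from the configured list would work the same way):
#
#   teacher = load_model_with_flash_attention("BAAI/bge-base-en-v1.5", device="cuda")
#   vecs = teacher.encode(["def add(a, b):\n    return a + b"], normalize_embeddings=True)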
# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================
def get_current_config_hash(enable_training: bool) -> str:
"""Generate a hash of current configuration parameters for checkpoint validation."""
import hashlib
config_params = {
"pca_dims": distillation_config.optimal_pca_dims,
"sif_coefficient": distillation_config.sif_coefficient,
"apply_zipf": distillation_config.apply_zipf,
"enable_training": enable_training,
}
if enable_training:
# Add a simple hash of tokenlearn parameters for config validation
tokenlearn_hash = hash(
f"{distillation_config.tokenlearn_dataset}_{distillation_config.tokenlearn_dataset_name}_{distillation_config.tokenlearn_text_key}"
)
config_params["tokenlearn_hash"] = float(abs(tokenlearn_hash) % 1000000) # Convert to float for consistency
config_str = str(sorted(config_params.items()))
return hashlib.md5(config_str.encode()).hexdigest()[:12] # noqa: S324
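# Note: the hash above is only used to detect configuration drift between runs. For a fixed
# configuration it is deterministic, and toggling training changes it, e.g.:
#   get_current_config_hash(enable_training=False) != get_current_config_hash(enable_training=True)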
def check_existing_base_model(teacher_name: str) -> str | None:
"""Check if base distilled model already exists locally."""
base_dir = Path(LOCAL_BASE_DIR)
model_dir = base_dir / f"code_model2vec_{teacher_name}"
if model_dir.exists():
# Check for essential model files
has_config = (model_dir / "config.json").exists()
has_model_file = any(
[
(model_dir / "model.safetensors").exists(),
(model_dir / "model.bin").exists(),
(model_dir / "pytorch_model.bin").exists(),
]
)
if has_config and has_model_file:
logger.info(f"β
Found existing base model: {teacher_name}")
return str(model_dir)
return None
def check_existing_final_model(teacher_name: str, enable_training: bool = False) -> str | None:
"""Check if final model already exists locally."""
final_dir = Path(LOCAL_FINAL_DIR)
# Add suffix for trained models
model_name = f"code_model2vec_{teacher_name}"
if enable_training:
model_name += "_fine_tuned"
final_path = final_dir / model_name
if final_path.exists():
# Check for essential model files
has_config = (final_path / "config.json").exists()
has_model_file = any(
[
(final_path / "model.safetensors").exists(),
(final_path / "model.bin").exists(),
(final_path / "pytorch_model.bin").exists(),
]
)
if has_config and has_model_file:
logger.info(f"β
Found existing final model: {teacher_name}{'_fine_tuned' if enable_training else ''}")
return str(final_path)
return None
def copy_base_to_final(teacher_name: str, enable_training: bool = False) -> bool:
"""Copy base model to final directory."""
import shutil
base_path = Path(LOCAL_BASE_DIR) / f"code_model2vec_{teacher_name}"
# Add suffix for trained models
final_model_name = f"code_model2vec_{teacher_name}"
if enable_training:
final_model_name += "_fine_tuned"
final_path = Path(LOCAL_FINAL_DIR) / final_model_name
try:
final_path.parent.mkdir(parents=True, exist_ok=True)
if final_path.exists():
shutil.rmtree(final_path)
shutil.copytree(base_path, final_path)
logger.info(f"π Copied {teacher_name} from base to final{'_fine_tuned' if enable_training else ''}")
return True
except Exception:
logger.exception(f"β Failed to copy {teacher_name} to final{'_fine_tuned' if enable_training else ''}")
return False
def sync_model_from_beam(
teacher_name: str,
target_dir: str,
use_beam_utilities: bool = False,
) -> bool:
"""Sync model from Beam volume to local directory."""
if not use_beam_utilities:
return False
try:
target_path = Path(target_dir)
target_path.mkdir(parents=True, exist_ok=True)
beam_model_name = f"{teacher_name}_model"
success = download_model_from_beam(VOLUME_CONFIG.name, beam_model_name, str(target_path))
if success:
logger.info(f"π₯ Synced {teacher_name} from Beam to {target_dir}")
return True
logger.warning(f"β οΈ Failed to sync {teacher_name} from Beam")
return False
except Exception as e:
logger.warning(f"Failed to sync {teacher_name} from Beam: {e}")
return False
def sync_model_to_beam(
teacher_name: str,
source_dir: str,
use_beam_utilities: bool = False,
) -> bool:
"""Sync model from local directory to Beam volume."""
if not use_beam_utilities:
return False
try:
beam_model_name = f"{teacher_name}_model"
success = upload_model_to_beam(VOLUME_CONFIG.name, beam_model_name, source_dir)
if success:
logger.info(f"π€ Synced {teacher_name} to Beam from {source_dir}")
return True
logger.warning(f"β οΈ Failed to sync {teacher_name} to Beam")
return False
except Exception as e:
logger.warning(f"Failed to sync {teacher_name} to Beam: {e}")
return False
# =============================================================================
# DISTILLATION FUNCTIONS
# =============================================================================
def simple_distillation(
teacher_model: str,
output_dir: str,
pca_dims: int | None = None,
retry_with_cache_clear: bool = False,
) -> Any:
"""
Perform simple Model2Vec distillation without additional training.
Args:
teacher_model: Name of teacher model
output_dir: Output directory for the distilled model
pca_dims: PCA dimensions (uses config default if None)
retry_with_cache_clear: Whether this is a retry after clearing cache
Returns:
Distilled model or None if failed
"""
if pca_dims is None:
pca_dims = int(distillation_config.optimal_pca_dims)
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
retry_suffix = " (retry after cache clear)" if retry_with_cache_clear else ""
logger.info(f"π Simple distillation{retry_suffix}: {teacher_model} β {output_dir}")
logger.info(f"π PCA dims: {pca_dims}, SIF: {distillation_config.sif_coefficient}")
start_time = time.time()
try:
# Perform distillation with optimal parameters
model = distill(
model_name=teacher_model,
pca_dims=int(pca_dims),
apply_zipf=bool(distillation_config.apply_zipf),
sif_coefficient=float(distillation_config.sif_coefficient),
trust_remote_code=True,
)
logger.info("β
Core distillation completed successfully")
# Validate model before saving
if hasattr(model, "tokenizer") and hasattr(model, "embedding"):
vocab_size = len(model.tokenizer.get_vocab())
embedding_size = model.embedding.shape[0]
logger.info("π Model validation:")
logger.info(f" - Vocabulary size: {vocab_size}")
logger.info(f" - Embedding matrix size: {embedding_size}")
if vocab_size != embedding_size:
logger.warning(f"β οΈ Vocabulary size mismatch: vocab={vocab_size}, embeddings={embedding_size}")
logger.warning("β οΈ This may cause issues in downstream usage")
else:
logger.info("β
Vocabulary and embedding sizes match")
# Save the model
model.save_pretrained(str(output_path))
logger.info(f"πΎ Model saved to {output_path}")
# Log model info
logger.info(f"Model type: {type(model)}")
if hasattr(model, "embedding"):
logger.info(f"Embedding shape: {model.embedding.shape}")
logger.info(f"Embedding dtype: {model.embedding.dtype}")
total_time = time.time() - start_time
logger.info(f"π Simple distillation completed in {total_time:.2f} seconds")
return model
except ValueError as e:
if "Number of tokens" in str(e) and "does not match number of vectors" in str(e):
logger.warning(f"β οΈ Token-vector mismatch with {teacher_model} - this is a Model2Vec library issue")
logger.warning(f"Error details: {e}")
logger.warning("π‘ This model has incompatible tokenization. Skipping...")
return None
if "weight is on the meta device" in str(e):
logger.warning(f"β οΈ Device placement issue with {teacher_model} - model weights on meta device")
logger.warning(f"Error details: {e}")
logger.warning("π‘ This model has device placement issues. Skipping...")
return None
raise
except AttributeError as e:
if "backend_tokenizer" in str(e):
logger.warning(f"β οΈ Tokenizer compatibility issue with {teacher_model}")
logger.warning(f"Error details: {e}")
logger.warning("π‘ This model's tokenizer is incompatible with Model2Vec. Skipping...")
return None
raise
except FileNotFoundError as e:
if "transformers_modules" in str(e) or "xlm_padding.py" in str(e):
logger.warning(f"β οΈ Missing custom model files for {teacher_model}")
logger.warning(f"Error details: {e}")
# Try clearing cache and retrying once
if not retry_with_cache_clear:
logger.info("π§ Attempting to clear cache and retry...")
if clear_model_cache(teacher_model):
logger.info("π Retrying distillation after cache clear...")
return simple_distillation(teacher_model, output_dir, pca_dims, retry_with_cache_clear=True)
logger.warning("π‘ This model has missing dependencies. Manual intervention may be required.")
return None
raise
except Exception:
logger.exception(f"β Simple distillation failed for {teacher_model}")
return None
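# Example call for the function above (teacher model and output path are illustrative; the
# directory name follows the code_model2vec_{teacher_name} convention used elsewhere in this file):
#
#   model = simple_distillation(
#       teacher_model="BAAI/bge-base-en-v1.5",
#       output_dir="code_model2vec/base/code_model2vec_bge_base_en_v1.5",
#   )
#   if model is not None:
#       print(model.embedding.shape)  # first dimension matches the tokenizer vocabulary size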
def load_optimized_dataset(
max_samples: int | None = None,
checkpoint_manager: BeamCheckpointManager | None = None,
dataset_path: str | None = None,
) -> list[str]:
"""Load our pre-created optimized dataset for tokenlearn training."""
from .dataset import DATASET_OUTPUT_DIR
from .dataset import load_optimized_dataset as load_dataset_func
# Use configuration if not provided as parameter
if dataset_path is None:
dataset_path = distillation_config.custom_dataset_path
dataset_dir = Path(dataset_path) if dataset_path else DATASET_OUTPUT_DIR
# Use configuration default if not specified
if max_samples is None:
max_samples = distillation_config.tokenlearn_max_samples
logger.info(f"π― Loading optimized dataset from {dataset_dir}")
logger.info(f"π Target samples: {max_samples}")
try:
# Load the training split of our optimized dataset
df = load_dataset_func(output_dir=dataset_dir, split="train")
# Extract the text column (which contains our formatted query + code)
texts = df["text"].tolist()
# Shuffle for better training distribution
import random
random.seed(42)
random.shuffle(texts)
# Limit to max_samples
if len(texts) > max_samples:
texts = texts[:max_samples]
logger.info(f"β
Loaded {len(texts)} optimized training samples")
# Log language distribution
languages = df["language"].value_counts()
logger.info("π Language distribution:")
for lang, count in languages.items():
percentage = (count / len(df)) * 100
logger.info(f" {lang}: {count} samples ({percentage:.1f}%)")
return texts
except Exception as e:
logger.warning(f"β οΈ Failed to load optimized dataset: {e}")
logger.info("π Falling back to original CodeSearchNet loading...")
return load_codesearchnet_dataset(max_samples, checkpoint_manager)
def load_codesearchnet_dataset(
max_samples: int | None = None,
checkpoint_manager: BeamCheckpointManager | None = None,
) -> list[str]:
"""Load and format the CodeSearchNet dataset for token frequency computation."""
from datasets import load_dataset
# Use configuration default if not specified
if max_samples is None:
max_samples = distillation_config.tokenlearn_max_samples
logger.info(f"Loading CodeSearchNet dataset from {codesearchnet_config.dataset_name}")
logger.info(f"Limiting to {max_samples} samples for training efficiency")
logger.info(f"Languages: {', '.join(languages_config.all)}")
# Check for existing dataset checkpoint
texts = []
start_from = 0
if checkpoint_manager:
checkpoint_data = checkpoint_manager.load_checkpoint("dataset", 0)
if checkpoint_data:
cached_texts = checkpoint_data.get("data", {}).get("texts", [])
if len(cached_texts) >= max_samples:
logger.info(f"β
Resumed dataset loading: {len(cached_texts)} texts from checkpoint")
return cached_texts[:max_samples]
logger.info(f"π Partial dataset found: {len(cached_texts)} texts, continuing...")
texts = cached_texts
start_from = len(texts)
try:
# Calculate samples per language for balanced distribution
num_languages = len(languages_config.all)
samples_per_language = max_samples // num_languages
remaining_samples = max_samples % num_languages
logger.info(f"π Target distribution: {samples_per_language} samples per language")
if remaining_samples > 0:
logger.info(f"π Extra {remaining_samples} samples will be distributed to first languages")
# Load training data from each language separately for balanced distribution
language_texts: dict[str, list[str]] = {}
total_collected = len(texts)
for i, language in enumerate(languages_config.all):
if total_collected >= max_samples:
break
logger.info(f"π Loading {language} training data...")
# Determine how many samples to collect for this language
target_for_lang = samples_per_language
if i < remaining_samples: # Distribute extra samples to first languages
target_for_lang += 1
# Skip if we already have enough from this language
if language in language_texts and len(language_texts[language]) >= target_for_lang:
continue
try:
# Load training split for the specific language (same format as evaluate.py)
from datasets import load_dataset
dataset = load_dataset(
codesearchnet_config.dataset_name,
language,
split="train",
trust_remote_code=True,
)
lang_texts: list[str] = []
processed_count = 0
for processed_count, example in enumerate(dataset, 1):
if len(lang_texts) >= target_for_lang:
break
# Use same field names as evaluate.py
doc_string = example.get("func_documentation_string", "").strip()
code_string = example.get("func_code_string", "").strip()
if doc_string and code_string and len(doc_string.split()) >= 3 and len(code_string) > 50:
# Format as documentation-code pair for training (same as evaluate.py)
text = f"Documentation: {doc_string}\nCode:\n{code_string}"
# Ensure reasonable length for embedding models
if len(text) <= 2048:
lang_texts.append(text)
if processed_count % 5000 == 0:
logger.info(f" {language}: processed {processed_count}, collected {len(lang_texts)}")
language_texts[language] = lang_texts
total_collected += len(lang_texts)
logger.info(f"β
{language}: collected {len(lang_texts)} samples")
except Exception as e:
logger.warning(f"β οΈ Failed to load {language} data: {e}")
continue
# Combine all language texts in a balanced way
combined_texts = []
# Add existing texts first (from checkpoint)
if start_from > 0:
combined_texts = texts[:start_from]
# Interleave texts from different languages for better training distribution
max_lang_samples = max(len(lang_texts) for lang_texts in language_texts.values()) if language_texts else 0
for sample_idx in range(max_lang_samples):
for language in languages_config.all:
if len(combined_texts) >= max_samples:
break
if language in language_texts and sample_idx < len(language_texts[language]):
combined_texts.append(language_texts[language][sample_idx])
if len(combined_texts) >= max_samples:
break
# Truncate to exact max_samples
combined_texts = combined_texts[:max_samples]
# Log final distribution
logger.info("π Final dataset distribution:")
lang_counts: dict[str, int] = {}
for text in combined_texts:
# Simple heuristic to identify language from code patterns
if "def " in text and ":" in text:
lang_counts["python"] = lang_counts.get("python", 0) + 1
elif "function " in text and "{" in text:
lang_counts["javascript"] = lang_counts.get("javascript", 0) + 1
elif "public " in text and "class " in text:
lang_counts["java"] = lang_counts.get("java", 0) + 1
elif "<?php" in text or "$" in text:
lang_counts["php"] = lang_counts.get("php", 0) + 1
elif "func " in text and "end" in text:
lang_counts["ruby"] = lang_counts.get("ruby", 0) + 1
elif "func " in text and "}" in text:
lang_counts["go"] = lang_counts.get("go", 0) + 1
else:
lang_counts["other"] = lang_counts.get("other", 0) + 1
for lang, count in lang_counts.items():
percentage = (count / len(combined_texts)) * 100
logger.info(f" {lang}: {count} samples ({percentage:.1f}%)")
# Final checkpoint save
if checkpoint_manager:
checkpoint_data = {
"config_hash": get_current_config_hash(enable_training=True),
"stage": "dataset",
"step": 0,
"timestamp": time.time(),
"data": {"texts": combined_texts},
}
checkpoint_manager.save_checkpoint("dataset", checkpoint_data, 0)
logger.info(f"Successfully loaded {len(combined_texts)} balanced code-documentation pairs from CodeSearchNet")
return combined_texts
except Exception:
logger.exception("Error loading CodeSearchNet dataset")
return texts # Return what we have so far
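# Each training text produced above uses the same documentation/code pairing as evaluate.py;
# a synthetic example of the formatted string looks like:
#
#   Documentation: Return the sum of two integers.
#   Code:
#   def add(a, b):
#       return a + b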
def generate_teacher_embeddings(
teacher_model: SentenceTransformer,
texts: list[str],
checkpoint_manager: BeamCheckpointManager | None = None,
) -> torch.Tensor:
"""Generate teacher embeddings for code training with checkpoint support."""
logger.info(f"Generating teacher embeddings for {len(texts)} texts...")
# Check for existing embeddings checkpoint
if checkpoint_manager:
volume_path = Path(VOLUME_CONFIG.mount_path)
embeddings_path = volume_path / "embeddings_cache.pt"
config_path = volume_path / "embeddings_config.json"
if embeddings_path.exists() and config_path.exists():
try:
# Load config first to validate compatibility
with config_path.open("r") as f:
config_data = json.load(f)
current_hash = get_current_config_hash(enable_training=True)
if config_data.get("config_hash") == current_hash:
# Load the embeddings tensor
final_embeddings = torch.load(embeddings_path, map_location="cpu")
num_expected = config_data.get("num_texts", len(texts))
if final_embeddings.shape[0] >= num_expected:
logger.info(f"β
Loaded embeddings from cache ({final_embeddings.shape[0]} embeddings)")
return final_embeddings[: len(texts)]
except Exception as e:
logger.warning(f"Failed to load embeddings cache: {e}, regenerating...")
# Generate embeddings from scratch
logger.info("Generating fresh teacher embeddings...")
batch_size = 16 # Fixed batch size for teacher embedding generation
embeddings_list = []
for i in range(0, len(texts), batch_size):
batch_texts = texts[i : i + batch_size]
try:
batch_embeddings = teacher_model.encode(
batch_texts,
convert_to_tensor=True,
batch_size=batch_size,
show_progress_bar=False,
normalize_embeddings=True,
)
embeddings_list.append(batch_embeddings)
if i % (batch_size * 10) == 0:
logger.info(f"Generated embeddings for {i + len(batch_texts)}/{len(texts)} texts")
except torch.cuda.OutOfMemoryError:
logger.warning(f"GPU OOM with batch size {batch_size}, reducing...")
torch.cuda.empty_cache()
batch_size = max(1, batch_size // 2)
# Retry with smaller batch size
batch_embeddings = teacher_model.encode(
batch_texts,
convert_to_tensor=True,
batch_size=batch_size,
show_progress_bar=False,
normalize_embeddings=True,
)
embeddings_list.append(batch_embeddings)
# Combine all embeddings
teacher_embeddings = torch.cat(embeddings_list, dim=0)
# Ensure fp32 precision
if teacher_embeddings.dtype != torch.float32:
teacher_embeddings = teacher_embeddings.to(torch.float32)
logger.info(f"Generated {teacher_embeddings.shape[0]} teacher embeddings in {teacher_embeddings.dtype}")
# Save embeddings cache for future runs
if checkpoint_manager:
try:
volume_path = Path(VOLUME_CONFIG.mount_path)
embeddings_path = volume_path / "embeddings_cache.pt"
config_path = volume_path / "embeddings_config.json"
# Save embeddings tensor
torch.save(teacher_embeddings, embeddings_path)
# Save configuration
config_data = {
"config_hash": get_current_config_hash(enable_training=True),
"num_texts": len(texts),
"embedding_shape": list(teacher_embeddings.shape),
"timestamp": time.time(),
}
with config_path.open("w") as f:
json.dump(config_data, f, indent=2)
logger.info("πΎ Saved embeddings cache for future runs")
except Exception as e:
logger.warning(f"Failed to save embeddings cache: {e}")
return teacher_embeddings
def tokenlearn_training(
student_model: Any,
teacher_model: SentenceTransformer,
checkpoint_manager: BeamCheckpointManager | None = None, # noqa: ARG001
) -> Any:
"""
Perform tokenlearn training following the official POTION approach.
This follows the 4-step process:
1. Model2Vec distillation (already done - student_model)
2. Sentence transformer inference (create features)
3. Tokenlearn training
4. Loading the trained model (post-training re-regularization)
"""
from pathlib import Path
logger.info("π§ͺ Starting tokenlearn training (POTION approach)...")
# Create persistent directories for tokenlearn workflow (for checkpoint preservation)
teacher_model_name = getattr(teacher_model, "model_name", None)
if not teacher_model_name and hasattr(teacher_model, "_modules") and len(teacher_model._modules) > 0: # noqa: SLF001
# Try to extract from the first module if it's a SentenceTransformer
first_module = next(iter(teacher_model._modules.values())) # noqa: SLF001
if hasattr(first_module, "auto_model") and hasattr(first_module.auto_model, "name_or_path"):
teacher_model_name = first_module.auto_model.name_or_path
if not teacher_model_name:
teacher_model_name = "unknown_teacher"
# Use persistent directory for tokenlearn checkpoints
teacher_slug = teacher_model_name.replace("/", "_").replace("-", "_")
persistent_tokenlearn_dir = Path(directories.base).parent / "tokenlearn_cache" / teacher_slug
features_dir = persistent_tokenlearn_dir / "features"
model_dir = persistent_tokenlearn_dir / "base_model"
trained_dir = persistent_tokenlearn_dir / "trained_model"
features_dir.mkdir(parents=True, exist_ok=True)
model_dir.mkdir(parents=True, exist_ok=True)
trained_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"π Using persistent tokenlearn directory: {persistent_tokenlearn_dir}")
# Save the base distilled model for tokenlearn
student_model.save_pretrained(str(model_dir))
logger.info(f"πΎ Saved base model to {model_dir}")
# Step 2: Create features using sentence transformer
logger.info("π Step 2: Creating features using sentence transformer...")
# Get teacher model name/path for tokenlearn
teacher_model_name = getattr(teacher_model, "model_name", None)
if not teacher_model_name and hasattr(teacher_model, "_modules") and len(teacher_model._modules) > 0: # noqa: SLF001
# Try to extract from the first module if it's a SentenceTransformer
# _modules is a dict-like container, get the first module by iterating
first_module = next(iter(teacher_model._modules.values())) # noqa: SLF001
if hasattr(first_module, "auto_model") and hasattr(first_module.auto_model, "name_or_path"):
teacher_model_name = first_module.auto_model.name_or_path
logger.info(f"π Using teacher model: {teacher_model_name}")
# Prepare dataset for tokenlearn featurization
dataset_path, dataset_name, text_key = _prepare_tokenlearn_dataset(persistent_tokenlearn_dir)
# Check if featurization already completed (checkpoint detection)
featurization_complete_marker = features_dir / ".featurization_complete"
if featurization_complete_marker.exists() and verify_featurization_output(features_dir):
logger.info("β
Found existing featurization checkpoint with valid output files")
logger.info(f"π Using cached features from: {features_dir}")
# Verify marker is still valid
output_files = list(features_dir.glob("*.npy")) + list(features_dir.glob("*.json"))
logger.info(f"π Found {len(output_files)} cached feature files")
else:
if featurization_complete_marker.exists():
logger.warning("β οΈ Featurization marker exists but output files are missing - re-running featurization")
featurization_complete_marker.unlink()
logger.info("π No valid featurization checkpoint found - starting featurization...")
if not teacher_model_name:
logger.warning("β οΈ Could not determine teacher model name, using fallback")
teacher_model_name = "BAAI/bge-base-en-v1.5" # Fallback to a common model
logger.info(f"π Using teacher model: {teacher_model_name}")
try:
# Use direct function call instead of subprocess
from datasets import load_dataset
from distiller.tokenlearn.featurize import featurize
logger.info("π Running tokenlearn featurization...")
logger.info(f"π Dataset: {dataset_path} (config: {dataset_name})")
logger.info(f"π Text field: {text_key}")
# Load the dataset
if dataset_name is None:
# For local JSON files, don't pass name parameter
dataset = load_dataset(
"json",
data_files=dataset_path,
split="train",
streaming=True,
)
else:
# For remote datasets with specific configurations
dataset = load_dataset(
dataset_path,
name=dataset_name,
split="train",
streaming=True,
)
# Call featurization function directly
featurize(
dataset=iter(dataset),
model=teacher_model,
output_dir=str(features_dir),
max_means=50000, # IMPROVEMENT: Limit means to prevent overfitting
batch_size=512, # IMPROVEMENT: Smaller batch for better gradients
text_key=text_key,
)
logger.info("β
Featurization completed successfully")
# Create checkpoint marker to indicate featurization is complete
featurization_complete_marker.touch()
logger.info(f"πΎ Created featurization checkpoint: {featurization_complete_marker}")
except Exception as e:
logger.exception("π₯ Tokenlearn featurization failed")
logger.exception("π₯ Tokenlearn featurization is required for training - cannot proceed")
msg = f"Tokenlearn featurization failed: {e}"
raise RuntimeError(msg) from e
# Step 3: Train using tokenlearn-train
logger.info("π Step 3: Training using tokenlearn...")
# Check if training already completed (checkpoint detection)
training_complete_marker = trained_dir / ".training_complete"
training_fallback_marker = trained_dir / ".training_fallback"
if training_complete_marker.exists() and verify_training_output(trained_dir):
logger.info("β
Found existing training checkpoint with valid model files")
logger.info(f"π Using cached trained model from: {trained_dir}")
# Show available model files
model_files = []
for pattern in ["*.json", "*.safetensors", "*.bin"]:
model_files.extend(list(trained_dir.glob(pattern)))
for subdir in ["model", "model_weighted"]:
subdir_path = trained_dir / subdir
if subdir_path.exists():
model_files.extend(list(subdir_path.glob(pattern)))
logger.info(f"π Found {len(model_files)} cached model files")
elif training_fallback_marker.exists():
logger.warning("β οΈ Training fallback marker found - tokenlearn failed previously")
logger.info("π Proceeding with fallback to base model (simple distillation)")
# Skip training and proceed to model loading (will fallback to base model)
else:
if training_complete_marker.exists():
logger.warning("β οΈ Training marker exists but model files are missing - re-running training")
training_complete_marker.unlink()
logger.info("π No valid training checkpoint found - starting training...")
try:
# Use direct function call instead of subprocess
from distiller.tokenlearn.train import train_model
from distiller.tokenlearn.utils import collect_means_and_texts
# IMPROVED APPROACH: Try optimized parameters first
logger.info("π Attempting IMPROVED tokenlearn training with optimized parameters...")
logger.info("π Using smaller vocabulary and conservative PCA to prevent overfitting")
# Collect training data from features directory
paths = sorted(features_dir.glob("*.json"))
train_txt, train_vec = collect_means_and_texts(paths)
logger.info(f"π Collected {len(train_txt)} texts and {train_vec.shape[0]} vectors for training")
try:
# Try improved parameters first
trained_model = train_model(
model_name=str(teacher_model_name),
train_txt=train_txt,
train_vec=train_vec,
device="cuda" if torch.cuda.is_available() else "cpu",
vocab_size=25000, # IMPROVEMENT: Smaller vocabulary to prevent overfitting
pca_dims=256, # IMPROVEMENT: Conservative PCA dimensions
)
# Save the trained model
trained_model.save_pretrained(str(trained_dir))
logger.info("β
IMPROVED tokenlearn training completed successfully")
training_complete_marker.touch()
logger.info(f"πΎ Created improved training checkpoint: {training_complete_marker}")
except Exception as e:
logger.warning(f"β οΈ Improved training failed: {e}")
logger.info("π Falling back to CONSERVATIVE tokenlearn training...")
# FALLBACK: Ultra-conservative training approach
try:
trained_model = train_model(
model_name=str(teacher_model_name),
train_txt=train_txt,
train_vec=train_vec,
device="cuda" if torch.cuda.is_available() else "cpu",
vocab_size=15000, # FALLBACK: Even smaller vocabulary
pca_dims=128, # FALLBACK: Smaller PCA dimensions
)
# Save the trained model
trained_model.save_pretrained(str(trained_dir))
logger.info("β
Conservative tokenlearn training completed successfully")
training_complete_marker.touch()
logger.info(f"πΎ Created conservative training checkpoint: {training_complete_marker}")
except Exception as e2:
logger.exception("β Conservative tokenlearn training also failed")
logger.exception("π₯ All training approaches failed - check output above for details")
# Create training marker to indicate we tried but failed
training_fallback_marker = trained_dir / ".training_fallback"
training_fallback_marker.touch()
logger.exception("π₯ Tokenlearn training failed completely")
msg = f"All tokenlearn training approaches failed: {e2}"
raise RuntimeError(msg) from e2
except Exception as e:
logger.warning("π₯ All tokenlearn training approaches failed")
logger.exception("π₯ All training approaches failed completely - cannot proceed")
msg = f"All training approaches failed: {e}"
raise RuntimeError(msg) from e
# Step 4: Load the trained model and apply post-training re-regularization
logger.info("π¦ Step 4: Loading trained model and applying post-training re-regularization...")
# Check if we need to use fallback due to tokenlearn failure
training_fallback_marker = trained_dir / ".training_fallback"
if training_fallback_marker.exists():
logger.error("β Tokenlearn training failed previously - cannot return trained model")
logger.error("π₯ Training was requested but failed - this would be misleading to return base model")
msg = "Tokenlearn training failed - cannot proceed with training pipeline"
raise RuntimeError(msg)
try:
from distiller.model2vec.model import StaticModel
# Load the trained model from tokenlearn
trained_model_path = trained_dir / "model"
if not trained_model_path.exists():
# Try alternative paths
possible_paths = [
trained_dir / "model_weighted",
trained_dir,
]
for path in possible_paths:
if path.exists() and any(path.glob("*.json")):
trained_model_path = path
break
else:
logger.error(f"β Could not find trained model in {trained_dir}")
logger.error("π₯ Training was requested but no trained model found - cannot proceed")
msg = f"Trained model not found in {trained_dir} - training pipeline failed"
raise RuntimeError(msg)
# Load the model before re-regularization
logger.info("π Loading model from tokenlearn training...")
trained_model = StaticModel.from_pretrained(str(trained_model_path))
# Return the trained model directly
logger.info("β
Tokenlearn training pipeline completed successfully")
return trained_model
except ValueError as e:
if "Number of tokens" in str(e) and "does not match number of vectors" in str(e):
logger.exception("π₯ Token-vector mismatch in tokenlearn training")
logger.exception("Error details")
logger.exception("π§ This is a known issue with tokenlearn/Model2Vec integration")
logger.exception("π₯ Training was requested but failed due to token-vector mismatch")
msg = f"Tokenlearn training failed due to token-vector mismatch: {e}"
raise RuntimeError(msg) from e
logger.exception("π₯ Failed to load tokenlearn trained model")
msg = f"Failed to load tokenlearn trained model: {e}"
raise RuntimeError(msg) from e
except Exception as e:
logger.exception("π₯ Failed to load tokenlearn trained model")
logger.exception("π₯ Cannot load trained model - training failed")
msg = f"Failed to load tokenlearn trained model: {e}"
raise RuntimeError(msg) from e
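# On-disk layout used by tokenlearn_training above (the root derives from directories.base as
# in the code; marker file names are taken from the implementation):
#
#   code_model2vec/tokenlearn_cache/<teacher_slug>/
#       features/       # sentence-transformer features + .featurization_complete marker
#       base_model/     # the distilled Model2Vec student saved before training
#       trained_model/  # tokenlearn output + .training_complete / .training_fallback markers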
def distill_single_teacher(
teacher_model: str,
enable_training: bool = False,
use_beam_utilities: bool = False,
pca_dims: int | None = None,
) -> dict[str, Any]:
"""
Distill a single teacher model with optional training.
Args:
teacher_model: Name of teacher model
enable_training: Whether to enable advanced training
use_beam_utilities: Whether to use Beam utilities
pca_dims: PCA dimensions
Returns:
Dictionary with distillation results
"""
teacher_name = teacher_model.split("/")[-1].replace("-", "_")
base_dir = Path(LOCAL_BASE_DIR) / f"code_model2vec_{teacher_name}"
# Add suffix for trained models
final_model_name = f"code_model2vec_{teacher_name}"
if enable_training:
final_model_name += "_fine_tuned"
final_dir = Path(LOCAL_FINAL_DIR) / final_model_name
logger.info(f"\n{'=' * 60}")
logger.info(f"π Processing teacher model: {teacher_model}")
logger.info(f"π Teacher name: {teacher_name}")
logger.info(f"π Training enabled: {enable_training}")
logger.info(f"{'=' * 60}")
# Check model compatibility first
is_compatible, warning_msg = check_model_compatibility(teacher_model)
if not is_compatible:
logger.warning(f"β οΈ Known compatibility issue: {warning_msg}")
logger.info("π§ Attempting distillation anyway, but may fail...")
# Try model-specific workarounds
workaround_type = try_model_workarounds(teacher_model)
# Don't skip if we have a workaround - we'll use it later
start_time = time.time()
# Initialize Beam utilities if requested
checkpoint_mgr = None
if use_beam_utilities:
try:
_, checkpoint_mgr, model_mgr, _ = create_beam_utilities(VOLUME_CONFIG.name, VOLUME_CONFIG.mount_path)
except Exception as e:
logger.warning(f"Failed to initialize Beam utilities: {e}")
try:
# Step 1: Check for existing final model
existing_final = check_existing_final_model(teacher_name, enable_training)
if existing_final:
logger.info(f"β
Final model already exists: {teacher_name}{'_fine_tuned' if enable_training else ''}")
total_time = time.time() - start_time
return {
"teacher_model": teacher_model,
"teacher_name": teacher_name,
"status": "skipped_existing_final",
"final_path": existing_final,
"distillation_time": total_time,
}
# Step 1.5: Sync existing checkpoints from Beam if using Beam utilities
if use_beam_utilities and checkpoint_mgr:
logger.info(f"π Syncing existing checkpoints for {teacher_name}...")
sync_checkpoints_from_beam(VOLUME_CONFIG.name, f"distillation_{teacher_name}", directories.checkpoints)
if enable_training:
sync_checkpoints_from_beam(VOLUME_CONFIG.name, f"training_{teacher_name}", directories.checkpoints)
# Step 2: Check for existing base model or create it
existing_base = check_existing_base_model(teacher_name)
base_model = None
if existing_base:
logger.info(f"β
Found existing base model: {teacher_name}")
if enable_training:
# Load base model for training
from distiller.model2vec.model import StaticModel
base_model = StaticModel.from_pretrained(existing_base)
elif use_beam_utilities:
synced = sync_model_from_beam(teacher_name, str(base_dir), use_beam_utilities)
if synced:
existing_base = str(base_dir)
if enable_training:
from distiller.model2vec.model import StaticModel
base_model = StaticModel.from_pretrained(existing_base)
if not existing_base:
# Perform simple distillation to create base model
logger.info(f"π Creating base model for {teacher_name}")
# Check if we need specialized distillation
workaround_type = try_model_workarounds(teacher_model)
if workaround_type == "salesforce":
base_model = salesforce_model_distillation(teacher_model, str(base_dir), pca_dims)
elif workaround_type == "baai":
base_model = baai_bge_model_distillation(teacher_model, str(base_dir), pca_dims)
else:
base_model = simple_distillation(teacher_model, str(base_dir), pca_dims)
if base_model is None:
total_time = time.time() - start_time
return {
"teacher_model": teacher_model,
"teacher_name": teacher_name,
"status": "failed_base_distillation",
"error": "Simple distillation failed",
"distillation_time": total_time,
}
# Sync base model and checkpoints to Beam
if use_beam_utilities:
sync_model_to_beam(teacher_name, str(base_dir), use_beam_utilities)
if checkpoint_mgr:
sync_checkpoints_to_beam(
VOLUME_CONFIG.name, f"distillation_{teacher_name}", directories.checkpoints
)
existing_base = str(base_dir)
# Step 3: Handle final model creation
if enable_training and base_model is not None:
# Perform tokenlearn training (POTION approach)
logger.info(f"π§ͺ Starting tokenlearn training for {teacher_name}")
try:
# Load teacher model for training
device = "cuda" if torch.cuda.is_available() else "cpu"
teacher_st_model = load_model_with_flash_attention(teacher_model, device)
# Perform tokenlearn training (POTION approach)
final_model = tokenlearn_training(
base_model,
teacher_st_model,
checkpoint_mgr,
)
# Save final model
final_dir.mkdir(parents=True, exist_ok=True)
final_model.save_pretrained(str(final_dir))
# Sync final model and training checkpoints to Beam
if use_beam_utilities:
sync_model_to_beam(f"{teacher_name}_final", str(final_dir), use_beam_utilities)
if checkpoint_mgr:
sync_checkpoints_to_beam(
VOLUME_CONFIG.name, f"training_{teacher_name}", directories.checkpoints
)
del teacher_st_model
if torch.cuda.is_available():
torch.cuda.empty_cache()
except RuntimeError as e:
# Training failed - clean up and return failure
logger.exception(f"β Training failed for {teacher_name}")
# Clean up teacher model if it was loaded
if "teacher_st_model" in locals():
del teacher_st_model
if torch.cuda.is_available():
torch.cuda.empty_cache()
total_time = time.time() - start_time
return {
"teacher_model": teacher_model,
"teacher_name": teacher_name,
"status": "failed_training",
"error": f"Training failed: {e!s}",
"base_path": existing_base, # Base model was created successfully
"distillation_time": total_time,
}
else:
# Copy base to final (no training)
logger.info(f"π Copying base to final for {teacher_name}")
if not copy_base_to_final(teacher_name, enable_training):
total_time = time.time() - start_time
return {
"teacher_model": teacher_model,
"teacher_name": teacher_name,
"status": "failed_copy_to_final",
"error": "Failed to copy base to final",
"distillation_time": total_time,
}
total_time = time.time() - start_time
return {
"teacher_model": teacher_model,
"teacher_name": teacher_name,
"status": "success",
"enable_training": enable_training,
"base_path": existing_base,
"final_path": str(final_dir),
"distillation_time": total_time,
}
except Exception as e:
logger.exception(f"β Failed to process {teacher_model}")
total_time = time.time() - start_time
return {
"teacher_model": teacher_model,
"teacher_name": teacher_name,
"status": "failed",
"error": str(e),
"distillation_time": total_time,
}
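# Possible "status" values returned by distill_single_teacher (collected from the branches above):
# "success", "skipped_existing_final", "failed_base_distillation", "failed_training",
# "failed_copy_to_final", and "failed".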
# =============================================================================
# MAIN EXECUTION FUNCTIONS
# =============================================================================
def run_local_distillation(
teacher_models: list[str] | None = None,
enable_training: bool = False,
pca_dims: int | None = None,
clear_cache: bool = False,
) -> dict[str, Any]:
"""Run distillation locally."""
logger.info("π₯οΈ Running distillation locally")
if teacher_models is None:
teacher_models = DEFAULT_TEACHER_MODELS
results = {}
successful_models = []
logger.info("π Starting distillation workflow")
logger.info(f"π Processing {len(teacher_models)} teacher models")
logger.info(f"π Training enabled: {enable_training}")
# Use default models if none specified
models_to_distill = teacher_models if teacher_models else DEFAULT_TEACHER_MODELS
logger.info(f"π Teacher models to process: {len(models_to_distill)}")
for i, model in enumerate(models_to_distill, 1):
logger.info(f" {i}. {model}")
# Clear cache for problematic models if requested
if clear_cache:
logger.info("π§Ή Clearing cache for known problematic models...")
problematic_models = ["BAAI/bge-code-v1", "jinaai/jina-embeddings-v3", "Salesforce/SFR-Embedding-Code-2B_R"]
for model in problematic_models:
if model in models_to_distill:
clear_model_cache(model)
# Clear tokenlearn checkpoints if requested (for training mode)
# Note: Checkpoint clearing is handled at the main function level
# Run distillation workflow
for teacher_model in models_to_distill:
result = distill_single_teacher(
teacher_model=teacher_model,
enable_training=enable_training,
use_beam_utilities=False,
pca_dims=pca_dims,
)
teacher_name = result["teacher_name"]
results[teacher_name] = result
if result["status"] == "success" or result["status"].startswith("skipped"):
successful_models.append(teacher_name)
elif result["status"] == "failed_training":
# Note: Training failed but base model may still be available
logger.warning(f"β οΈ Training failed for {teacher_name}, but base distillation may have succeeded")
# Summary
logger.info("\nπ DISTILLATION WORKFLOW COMPLETE!")
logger.info(f"π Successful models: {len(successful_models)}")
logger.info(f"π Training mode: {'Enabled' if enable_training else 'Basic distillation only'}")
for model_name in successful_models:
result = results[model_name]
logger.info(f"β
{model_name}: {result['teacher_model']}")
# Save results summary
results_summary = {
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
"enable_training": enable_training,
"successful_models": successful_models,
"all_results": results,
"total_successful": len(successful_models),
"total_attempted": len(teacher_models or DEFAULT_TEACHER_MODELS),
}
# Save results to file
results_file = Path(LOCAL_BASE_DIR).parent / "distillation_results.json"
results_file.parent.mkdir(parents=True, exist_ok=True)
with results_file.open("w") as f:
json.dump(results_summary, f, indent=2)
logger.info(f"π Results summary saved to: {results_file}")
return results_summary
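# Illustrative usage sketch (not called anywhere in the workflow): run a basic
# local distillation for a single teacher and read the summary. The teacher model
# name below is an arbitrary example, not a project default.
def _example_run_local_distillation() -> None:
    summary = run_local_distillation(
        teacher_models=["sentence-transformers/all-MiniLM-L6-v2"],
        enable_training=False,  # basic distillation only, no tokenlearn training
    )
    print(f"{summary['total_successful']}/{summary['total_attempted']} teachers distilled")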
def _beam_distill_internal(
teacher_models: list[str] | None = None,
enable_training: bool = False,
pca_dims: int | None = None,
clear_cache: bool = False,
) -> dict[str, Any]:
"""Shared internal implementation for beam distillation."""
if teacher_models is None:
teacher_models = DEFAULT_TEACHER_MODELS
# Clear cache for problematic models if requested
if clear_cache:
logger.info("π§Ή Clearing cache for known problematic models...")
problematic_models = ["BAAI/bge-code-v1", "jinaai/jina-embeddings-v3", "Salesforce/SFR-Embedding-Code-2B_R"]
for model in problematic_models:
if model in teacher_models:
clear_model_cache(model)
results = {}
successful_models = []
logger.info("π Starting Beam distillation workflow")
logger.info(f"π Processing {len(teacher_models)} teacher models")
logger.info(f"π Training enabled: {enable_training}")
# Use default models if none specified
models_to_distill = teacher_models if teacher_models else DEFAULT_TEACHER_MODELS
logger.info(f"π Teacher models to process: {len(models_to_distill)}")
for i, model in enumerate(models_to_distill, 1):
logger.info(f" {i}. {model}")
for teacher_model in models_to_distill:
result = distill_single_teacher(
teacher_model=teacher_model,
enable_training=enable_training,
use_beam_utilities=True,
pca_dims=pca_dims,
)
teacher_name = result["teacher_name"]
results[teacher_name] = result
if result["status"] == "success" or result["status"].startswith("skipped"):
successful_models.append(teacher_name)
elif result["status"] == "failed_training":
# Note: Training failed but base model may still be available
logger.warning(f"β οΈ Training failed for {teacher_name}, but base distillation may have succeeded")
# Summary
logger.info("\nπ BEAM DISTILLATION WORKFLOW COMPLETE!")
logger.info(f"π Successful models: {len(successful_models)}")
# Save results to Beam volume
volume_path = Path(VOLUME_CONFIG.mount_path)
results_summary = {
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
"enable_training": enable_training,
"successful_models": successful_models,
"all_results": results,
"total_successful": len(successful_models),
"total_attempted": len(teacher_models or DEFAULT_TEACHER_MODELS),
}
results_file = volume_path / "distillation_results.json"
with results_file.open("w") as f:
json.dump(results_summary, f, indent=2)
logger.info(f"π Beam results saved to: {results_file}")
return results_summary
@function(**get_training_function_kwargs())
def _beam_train_models(
teacher_models: list[str] | None = None,
enable_training: bool = True,
pca_dims: int | None = None,
clear_cache: bool = False,
) -> dict[str, Any]:
"""Beam function for training (distillation + tokenlearn)."""
logger.info("βοΈ Running training on Beam")
return _beam_distill_internal(teacher_models, enable_training, pca_dims, clear_cache)
@function(**get_distillation_function_kwargs())
def _beam_distill_models(
teacher_models: list[str] | None = None,
enable_training: bool = False,
pca_dims: int | None = None,
clear_cache: bool = False,
) -> dict[str, Any]:
"""Beam function for basic distillation only."""
logger.info("βοΈ Running distillation on Beam")
return _beam_distill_internal(teacher_models, enable_training, pca_dims, clear_cache)
def run_beam_distillation(
teacher_models: list[str] | None = None,
enable_training: bool = False,
pca_dims: int | None = None,
clear_cache: bool = False,
) -> dict[str, Any]:
"""Run distillation on Beam and sync results."""
logger.info("βοΈ Running distillation on Beam with local sync")
try:
# Choose appropriate beam function based on training flag
beam_function = _beam_train_models if enable_training else _beam_distill_models
# Run distillation on Beam
results = beam_function.remote(teacher_models, enable_training, pca_dims, clear_cache)
# Check if Beam execution was successful
if not results:
logger.error("β Beam execution failed or returned no results")
return {
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
"enable_training": enable_training,
"successful_models": [],
"all_results": {},
"total_successful": 0,
"total_attempted": len(teacher_models or DEFAULT_TEACHER_MODELS),
"error": "Beam execution failed",
}
# Sync models back to local directories
if results.get("successful_models"):
logger.info("π₯ Syncing models from Beam to local directories...")
for teacher_name in results["successful_models"]:
# Sync base model
base_dir = Path(LOCAL_BASE_DIR) / f"code_model2vec_{teacher_name}"
sync_model_from_beam(teacher_name, str(base_dir), use_beam_utilities=True)
# Sync final model if training was enabled
if enable_training:
final_dir = Path(LOCAL_FINAL_DIR) / f"code_model2vec_{teacher_name}"
sync_model_from_beam(f"{teacher_name}_final", str(final_dir), use_beam_utilities=True)
else:
# Copy base to final
copy_base_to_final(teacher_name, enable_training)
logger.info("β
All models synced from Beam")
return results
except Exception as e:
logger.exception("β Beam distillation failed with exception")
return {
"timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
"enable_training": enable_training,
"successful_models": [],
"all_results": {},
"total_successful": 0,
"total_attempted": len(teacher_models or DEFAULT_TEACHER_MODELS),
"error": str(e),
}
# =============================================================================
# CLI INTERFACE
# =============================================================================
def main(
use_beam: Annotated[bool, typer.Option(help="Use Beam for distillation")] = False,
train: Annotated[bool, typer.Option(help="Enable advanced training (CodeSearchNet fine-tuning)")] = False,
teacher_models: Annotated[list[str] | None, typer.Option(help="Specific teacher models to distill")] = None,
pca_dims: Annotated[int | None, typer.Option(help="PCA dimensions (uses config default if not specified)")] = None,
clear_cache: Annotated[
bool, typer.Option(help="Clear HuggingFace cache for problematic models before distillation")
] = False,
clear_checkpoints: Annotated[
bool, typer.Option(help="Clear tokenlearn checkpoints to force fresh featurization and training")
] = False,
use_optimized_dataset: Annotated[
bool,
typer.Option(
"--use-optimized-dataset", help="Use the pre-created optimized dataset from code_model2vec/dataset"
),
] = False,
dataset_path: Annotated[
str | None,
typer.Option("--dataset-path", help="Path to custom dataset directory (defaults to code_model2vec/dataset)"),
] = None,
) -> None:
"""Unified distillation command with optional training."""
logger.info("π Starting unified Model2Vec distillation workflow")
# Set dataset configuration
distillation_config.use_optimized_dataset = use_optimized_dataset
distillation_config.custom_dataset_path = dataset_path
if use_optimized_dataset and train:
dataset_source = dataset_path or "code_model2vec/dataset"
logger.info(f"π― Using optimized dataset from: {dataset_source}")
elif train:
logger.info("π― Using C4 dataset for training (following POTION approach)")
logger.info(f"π Training mode: {'Tokenlearn (POTION) training' if train else 'Basic distillation only'}")
logger.info(f"βοΈ Execution: {'Beam' if use_beam else 'Local'}")
# Use default models if none specified
models_to_distill = teacher_models if teacher_models else DEFAULT_TEACHER_MODELS
logger.info(f"π Teacher models to process: {len(models_to_distill)}")
for i, model in enumerate(models_to_distill, 1):
logger.info(f" {i}. {model}")
# Clear cache for problematic models if requested
if clear_cache:
logger.info("π§Ή Clearing cache for known problematic models...")
problematic_models = ["BAAI/bge-code-v1", "jinaai/jina-embeddings-v3", "Salesforce/SFR-Embedding-Code-2B_R"]
for model in problematic_models:
if model in models_to_distill:
clear_model_cache(model)
# Clear tokenlearn checkpoints if requested (for training mode)
if clear_checkpoints and train:
logger.info("π§Ή Clearing tokenlearn checkpoints to force fresh featurization and training...")
for teacher_model in models_to_distill:
teacher_model.split("/")[-1].replace("-", "_")
# Use the same persistent directory structure as the training function
teacher_slug = teacher_model.replace("/", "_").replace("-", "_")
persistent_tokenlearn_dir = Path(LOCAL_BASE_DIR).parent / "tokenlearn_cache" / teacher_slug
features_dir = persistent_tokenlearn_dir / "features"
trained_dir = persistent_tokenlearn_dir / "trained_model"
# Clear persistent tokenlearn checkpoints
if features_dir.exists() or trained_dir.exists():
clear_tokenlearn_checkpoints(features_dir, trained_dir)
logger.info(f"ποΈ Cleared persistent tokenlearn checkpoints for {teacher_model}")
else:
logger.info(f"βΉοΈ No tokenlearn checkpoints found for {teacher_model}")
elif clear_checkpoints and not train:
logger.warning("β οΈ --clear-checkpoints flag is only relevant when training is enabled (--train)")
# Run distillation workflow
if use_beam:
results = run_beam_distillation(
teacher_models=models_to_distill,
enable_training=train,
pca_dims=pca_dims,
clear_cache=clear_cache,
)
else:
results = run_local_distillation(
teacher_models=models_to_distill,
enable_training=train,
pca_dims=pca_dims,
clear_cache=clear_cache,
)
# Handle case where results might be None or invalid
if not results or not isinstance(results, dict):
logger.error("β Distillation workflow failed - no valid results returned")
results = {
"total_successful": 0,
"total_attempted": len(models_to_distill),
"error": "Workflow failed",
}
# Final summary
successful_count = results.get("total_successful", 0)
total_attempted = results.get("total_attempted", 0)
logger.info("\nπ UNIFIED DISTILLATION WORKFLOW COMPLETED!")
logger.info(f"π Successfully processed: {successful_count}/{total_attempted} models")
logger.info(f"π Base models saved to: {LOCAL_BASE_DIR}")
logger.info(f"π Final models saved to: {LOCAL_FINAL_DIR}")
if train:
logger.info("π Advanced training was enabled - models include CodeSearchNet specialization")
else:
logger.info("π Basic distillation only - use --train flag to enable advanced training")
def check_model_compatibility(teacher_model: str) -> tuple[bool, str | None]:
"""
Check if a model has known compatibility issues with Model2Vec.
Returns:
Tuple of (is_compatible, warning_message)
"""
known_incompatible = {
"BAAI/bge-code-v1": "Qwen2Tokenizer lacks backend_tokenizer attribute",
"jinaai/jina-embeddings-v3": "Missing custom transformers module dependencies",
"Salesforce/SFR-Embedding-Code-2B_R": "Device placement issues with meta tensors",
}
if teacher_model in known_incompatible:
return False, known_incompatible[teacher_model]
# Check for model families that might have issues
if "qwen2" in teacher_model.lower() and "bge" in teacher_model.lower():
return False, "BGE models with Qwen2 tokenizers may have compatibility issues"
if "jina" in teacher_model.lower() and "embeddings-v3" in teacher_model.lower():
return False, "Jina embeddings v3 models may have missing dependencies"
if "salesforce" in teacher_model.lower() and "sfr-embedding" in teacher_model.lower():
return False, "Salesforce SFR embedding models may have device placement issues"
return True, None
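# Illustrative sketch only (not used by the workflow): gate a run on the
# compatibility check above instead of failing mid-distillation.
def _example_compatibility_gate(teacher_model: str) -> bool:
    is_compatible, warning = check_model_compatibility(teacher_model)
    if not is_compatible:
        logger.warning(f"Skipping {teacher_model}: {warning}")
    return is_compatible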
def clear_model_cache(model_name: str) -> bool:
"""Clear HuggingFace cache for a specific model."""
try:
import shutil
from pathlib import Path
# Get HuggingFace cache directory
cache_dir = Path.home() / ".cache" / "huggingface"
# Find model-specific cache directories
model_slug = model_name.replace("/", "--")
# Clear transformers cache
transformers_cache = cache_dir / "transformers" / model_slug
if transformers_cache.exists():
shutil.rmtree(transformers_cache)
logger.info(f"ποΈ Cleared transformers cache for {model_name}")
# Clear hub cache
hub_cache = cache_dir / "hub" / f"models--{model_slug}"
if hub_cache.exists():
shutil.rmtree(hub_cache)
logger.info(f"ποΈ Cleared hub cache for {model_name}")
# Clear modules cache
modules_cache = cache_dir / "modules" / "transformers_modules" / model_name.split("/")[0]
if modules_cache.exists():
shutil.rmtree(modules_cache)
logger.info(f"ποΈ Cleared modules cache for {model_name}")
return True
except Exception as e:
logger.warning(f"Failed to clear cache for {model_name}: {e}")
return False
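# Quick illustrative sketch of the HuggingFace hub cache layout assumed above:
# a model id "org/model-name" is cached under hub/models--org--model-name.
# This helper is for illustration only and is not used by the workflow.
def _example_hub_cache_path(model_name: str) -> Path:
    return Path.home() / ".cache" / "huggingface" / "hub" / f"models--{model_name.replace('/', '--')}"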
def try_model_workarounds(teacher_model: str) -> str | None:
"""
Try specific workarounds for problematic models.
Returns:
The type of workaround needed ("salesforce", "baai", etc.) or None if no workaround available
"""
if "salesforce" in teacher_model.lower() and "sfr-embedding" in teacher_model.lower():
logger.info("π§ Salesforce SFR model detected - will use specialized distillation")
return "salesforce"
if "baai" in teacher_model.lower() and ("bge-code" in teacher_model.lower() or "bge-m3" in teacher_model.lower()):
logger.info("π§ BAAI BGE model detected - will use specialized distillation")
return "baai"
return None
def salesforce_model_distillation(
teacher_model: str,
output_dir: str,
pca_dims: int | None = None,
) -> Any:
"""Special distillation function for Salesforce SFR models that handles device placement issues."""
if pca_dims is None:
pca_dims = int(distillation_config.optimal_pca_dims)
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
logger.info(f"π Salesforce-specific distillation: {teacher_model} β {output_dir}")
logger.info(f"π PCA dims: {pca_dims}, SIF: {distillation_config.sif_coefficient}")
start_time = time.time()
try:
import torch
from transformers import AutoModel, AutoTokenizer
# Enhanced custom model loading for Salesforce models
logger.info("π§ Loading model with enhanced device settings...")
# Method 1: Try with to_empty() for meta tensor handling
try:
logger.info("π Attempting with to_empty() method...")
# Load tokenizer first
tokenizer = AutoTokenizer.from_pretrained(teacher_model, trust_remote_code=True)
# Load model with meta device initially
model = AutoModel.from_pretrained(
teacher_model,
trust_remote_code=True,
torch_dtype=torch.float16,
device_map="meta", # Load on meta device first
)
# Move from meta to actual device using to_empty()
if torch.cuda.is_available():
device = torch.device("cuda")
# Create empty tensors on target device and copy weights
model = model.to_empty(device=device)
else:
device = torch.device("cpu")
model = model.to_empty(device=device)
# Ensure model is in the right dtype
model = model.to(torch.float16 if torch.cuda.is_available() else torch.float32)
logger.info("β
Successfully loaded with to_empty() method")
except Exception as e:
logger.warning(f"to_empty() method failed: {e}")
# Method 2: Try SentenceTransformer with specific settings
logger.info("π Falling back to SentenceTransformer method...")
sentence_model = load_model_with_flash_attention(
teacher_model,
device="cpu", # Force CPU loading first
)
# Move to GPU if available
if torch.cuda.is_available():
sentence_model = sentence_model.to("cuda")
# Extract components
model = sentence_model[0].auto_model
tokenizer = sentence_model.tokenizer
logger.info("β
Successfully loaded with SentenceTransformer method")
# Now use Model2Vec's distill_from_model function directly
from distiller.model2vec.distill.distillation import distill_from_model
distilled_model = distill_from_model(
model=model,
tokenizer=tokenizer,
pca_dims=int(pca_dims),
apply_zipf=bool(distillation_config.apply_zipf),
sif_coefficient=float(distillation_config.sif_coefficient),
)
logger.info("β
Core distillation completed successfully")
# Save the model
distilled_model.save_pretrained(str(output_path))
logger.info(f"πΎ Model saved to {output_path}")
# Log model info
logger.info(f"Model type: {type(distilled_model)}")
if hasattr(distilled_model, "embedding"):
logger.info(f"Embedding shape: {distilled_model.embedding.shape}")
logger.info(f"Embedding dtype: {distilled_model.embedding.dtype}")
total_time = time.time() - start_time
logger.info(f"π Salesforce distillation completed in {total_time:.2f} seconds")
# Clean up
if "sentence_model" in locals():
del sentence_model
del model
if torch.cuda.is_available():
torch.cuda.empty_cache()
return distilled_model
except Exception:
logger.exception(f"β Salesforce-specific distillation failed for {teacher_model}")
return None
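# The to_empty() fallback above relies on PyTorch's meta-device pattern. A minimal,
# self-contained sketch of that mechanism with plain torch modules (illustrative
# only; assumes PyTorch 2.x for the device context manager): parameters are first
# created without storage on the meta device, materialized with to_empty(), and
# only then filled from a real state dict.
def _example_meta_device_materialization() -> None:
    """Illustrative sketch only: meta device -> to_empty() -> load_state_dict."""
    import torch

    # Build the module skeleton on the meta device: no weight memory is allocated yet.
    with torch.device("meta"):
        skeleton = torch.nn.Linear(16, 4)
    # Materialize (uninitialized) storage on the target device.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    skeleton = skeleton.to_empty(device=device)
    # Only now copy real weights in (here taken from a freshly initialized module).
    reference = torch.nn.Linear(16, 4)
    skeleton.load_state_dict(reference.state_dict())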
def baai_bge_model_distillation(
teacher_model: str,
output_dir: str,
pca_dims: int | None = None,
) -> Any:
"""Special distillation function for BAAI BGE models that handles Qwen2Tokenizer compatibility issues."""
if pca_dims is None:
pca_dims = int(distillation_config.optimal_pca_dims)
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
logger.info(f"π BAAI BGE-specific distillation: {teacher_model} β {output_dir}")
logger.info(f"π PCA dims: {pca_dims}, SIF: {distillation_config.sif_coefficient}")
start_time = time.time()
try:
import torch
from transformers import AutoModel, AutoTokenizer
logger.info("π§ Loading BAAI model with tokenizer workaround...")
# Try multiple approaches for BAAI models
success = False
# Method 1: Try SentenceTransformer first (often handles tokenizer issues better)
try:
logger.info("π Attempting with SentenceTransformer wrapper...")
sentence_model = load_model_with_flash_attention(teacher_model)
# Extract components
model = sentence_model[0].auto_model
tokenizer = sentence_model.tokenizer
# Test if tokenizer works by encoding a simple text
test_encoding = tokenizer.encode("test", return_tensors="pt")
logger.info("β
SentenceTransformer method successful")
success = True
except Exception as e:
logger.warning(f"SentenceTransformer method failed: {e}")
# Method 2: Try direct loading with tokenizer replacement
try:
logger.info("π Attempting with tokenizer replacement...")
from transformers import BertTokenizerFast
# Load model directly
model = AutoModel.from_pretrained(teacher_model, trust_remote_code=True)
# Try to use a compatible tokenizer instead
try:
# First try the original tokenizer
tokenizer = AutoTokenizer.from_pretrained(teacher_model, trust_remote_code=True)
except Exception:
# Fall back to BERT tokenizer for BGE models
logger.info("π Falling back to BERT tokenizer...")
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
logger.info("β
Tokenizer replacement method successful")
success = True
except Exception as e2:
logger.warning(f"Tokenizer replacement method failed: {e2}")
if not success:
logger.error("β All BAAI model loading methods failed")
return None
# Now use Model2Vec's distill_from_model function directly
from distiller.model2vec.distill.distillation import distill_from_model
distilled_model = distill_from_model(
model=model,
tokenizer=tokenizer,
pca_dims=int(pca_dims),
apply_zipf=bool(distillation_config.apply_zipf),
sif_coefficient=float(distillation_config.sif_coefficient),
)
logger.info("β
Core distillation completed successfully")
# Save the model
distilled_model.save_pretrained(str(output_path))
logger.info(f"πΎ Model saved to {output_path}")
# Log model info
logger.info(f"Model type: {type(distilled_model)}")
if hasattr(distilled_model, "embedding"):
logger.info(f"Embedding shape: {distilled_model.embedding.shape}")
logger.info(f"Embedding dtype: {distilled_model.embedding.dtype}")
total_time = time.time() - start_time
logger.info(f"π BAAI BGE distillation completed in {total_time:.2f} seconds")
# Clean up
if "sentence_model" in locals():
del sentence_model
del model
if torch.cuda.is_available():
torch.cuda.empty_cache()
return distilled_model
except Exception:
logger.exception(f"β BAAI BGE-specific distillation failed for {teacher_model}")
return None
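# The Qwen2Tokenizer issue listed in check_model_compatibility() comes down to slow
# tokenizers not exposing a `backend_tokenizer`. A minimal sketch of detecting that
# up front (illustrative only; not used by the workflow):
def _example_has_fast_tokenizer_backend(model_name: str) -> bool:
    """Illustrative sketch only: True if the tokenizer exposes backend_tokenizer,
    i.e. it is a fast (`tokenizers`-backed) tokenizer."""
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    return hasattr(tok, "backend_tokenizer")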
def clear_tokenlearn_checkpoints(features_dir: Path, trained_dir: Path) -> None:
"""Clear tokenlearn checkpoint markers to force re-execution of steps."""
featurization_marker = features_dir / ".featurization_complete"
training_marker = trained_dir / ".training_complete"
if featurization_marker.exists():
featurization_marker.unlink()
logger.info(f"ποΈ Cleared featurization checkpoint: {featurization_marker}")
if training_marker.exists():
training_marker.unlink()
logger.info(f"ποΈ Cleared training checkpoint: {training_marker}")
def verify_featurization_output(features_dir: Path) -> bool:
"""Verify that featurization output files actually exist and are valid."""
if not features_dir.exists():
return False
# Check for expected tokenlearn output files
# Check if any expected files exist
return any(list(features_dir.glob(file_pattern)) for file_pattern in ["*.npy", "*.json", "*.pt", "*.pkl"])
def verify_training_output(trained_dir: Path) -> bool:
"""Verify that training output files actually exist and are valid."""
if not trained_dir.exists():
return False
# Check for model files
model_files = ["config.json", "model.safetensors", "modules.json", "tokenizer.json"]
for model_file in model_files:
if (trained_dir / model_file).exists():
return True
# Check for alternative model directory structure
for subdir in ["model", "model_weighted"]:
subdir_path = trained_dir / subdir
if subdir_path.exists():
for model_file in model_files:
if (subdir_path / model_file).exists():
return True
return False
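# clear_tokenlearn_checkpoints() and the two verifiers above cooperate around simple
# marker files. A minimal sketch of the write side of that protocol (illustrative
# only; the marker names mirror the ones cleared above):
def _example_mark_tokenlearn_steps_complete(features_dir: Path, trained_dir: Path) -> None:
    """Illustrative sketch only: create markers once real output is verified."""
    if verify_featurization_output(features_dir):
        (features_dir / ".featurization_complete").touch()
    if verify_training_output(trained_dir):
        (trained_dir / ".training_complete").touch()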
def _prepare_tokenlearn_dataset(tokenlearn_dir: Path) -> tuple[str, str | None, str]:
"""
Prepare dataset for tokenlearn featurization.
Returns:
Tuple of (dataset_path, dataset_name, text_key) for tokenlearn
"""
if distillation_config.use_optimized_dataset:
return _prepare_custom_dataset_for_tokenlearn(tokenlearn_dir)
return _prepare_original_dataset_for_tokenlearn()
def _prepare_custom_dataset_for_tokenlearn(tokenlearn_dir: Path) -> tuple[str, str | None, str]:
"""Prepare custom optimized dataset for tokenlearn featurization."""
logger.info("π― Preparing custom optimized dataset for tokenlearn...")
# Import the dataset module
from .dataset import create_optimized_dataset, load_optimized_dataset
# Define paths
custom_dataset_dir = (
Path(distillation_config.custom_dataset_path)
if distillation_config.custom_dataset_path
else Path("code_model2vec/dataset")
)
tokenlearn_dataset_dir = tokenlearn_dir / "custom_dataset"
# Check if we need to create the custom dataset
if not custom_dataset_dir.exists() or not (custom_dataset_dir / "train.parquet").exists():
logger.info("π Custom dataset not found - creating optimized dataset...")
create_optimized_dataset(
max_samples_per_lang=distillation_config.tokenlearn_max_samples // 6, # Divide by number of languages
output_dir=custom_dataset_dir,
create_multiple_formats=False, # Use simple format for tokenlearn
)
# Load the custom dataset
logger.info(f"π Loading custom dataset from {custom_dataset_dir}")
train_df = load_optimized_dataset(output_dir=custom_dataset_dir, split="train")
# Prepare dataset for tokenlearn (save as JSON files that load_dataset can read)
tokenlearn_dataset_dir.mkdir(parents=True, exist_ok=True)
# Save as JSON file that tokenlearn can load with load_dataset()
train_json_path = tokenlearn_dataset_dir / "train.json"
# Create JSON lines format
import json
with train_json_path.open("w") as f:
for text in train_df["text"]:
json.dump({"text": text}, f)
f.write("\n")
logger.info(f"β
Prepared custom dataset with {len(train_df)} samples for tokenlearn")
logger.info(f"πΎ Saved JSON dataset to {train_json_path}")
# Return the JSON file path directly (not directory) and no config name for JSON loading
return str(train_json_path), None, "text"
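# The train.json written above is plain JSON Lines: one {"text": ...} object per line.
# A minimal sketch of reading it back with datasets.load_dataset, the same style of
# loading tokenlearn is expected to perform (illustrative only):
def _example_load_tokenlearn_jsonl(train_json_path: Path) -> None:
    """Illustrative sketch only: load the JSON Lines dataset produced above."""
    from datasets import load_dataset

    ds = load_dataset("json", data_files=str(train_json_path), split="train")
    print(f"{len(ds)} samples; first text starts with: {ds[0]['text'][:80]!r}")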
def _prepare_original_dataset_for_tokenlearn() -> tuple[str, str | None, str]:
"""Prepare original dataset for tokenlearn featurization (uses C4 by default following POTION approach)."""
logger.info("π Using C4 dataset for tokenlearn (following POTION approach)...")
return (
str(distillation_config.tokenlearn_dataset), # "allenai/c4"
str(distillation_config.tokenlearn_dataset_name), # "en"
str(distillation_config.tokenlearn_text_key), # "text"
)
if __name__ == "__main__":
typer.run(main)