# codemalt/src/distiller/dataset.py
"""
Custom Dataset Generation for Code-Specialized Model Training.
This module creates optimized training datasets from CodeSearchNet that are specifically
designed to improve performance on code search evaluation tasks.
Features:
- High-quality doc-code pairs optimized for retrieval
- Balanced sampling across programming languages
- Multiple training formats (doc-only, code-only, combined)
- Quality filtering and data cleaning
- Train/test/eval splits with proper stratification
- Efficient parquet format output
"""
import json
import logging
import time
from pathlib import Path
from typing import Annotated, Any
import pandas as pd
import typer
from datasets import load_dataset
from tqdm import tqdm
from .config import languages_config
# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Dataset configuration
DATASET_OUTPUT_DIR = Path("code_model2vec/dataset")
DEFAULT_MAX_SAMPLES_PER_LANG = 50000
DEFAULT_MIN_DOC_WORDS = 3
DEFAULT_MAX_DOC_WORDS = 100
DEFAULT_MIN_CODE_CHARS = 50
DEFAULT_MAX_CODE_CHARS = 2000
def create_optimized_dataset(
max_samples_per_lang: int = DEFAULT_MAX_SAMPLES_PER_LANG,
min_doc_words: int = DEFAULT_MIN_DOC_WORDS,
max_doc_words: int = DEFAULT_MAX_DOC_WORDS,
min_code_chars: int = DEFAULT_MIN_CODE_CHARS,
max_code_chars: int = DEFAULT_MAX_CODE_CHARS,
output_dir: Path | None = None,
create_multiple_formats: bool = True,
) -> dict[str, Any]:
"""
Create optimized training dataset from CodeSearchNet for code search tasks.
Args:
max_samples_per_lang: Maximum samples per programming language
min_doc_words: Minimum words in documentation
max_doc_words: Maximum words in documentation
min_code_chars: Minimum characters in code
max_code_chars: Maximum characters in code
output_dir: Output directory for dataset
create_multiple_formats: Create multiple training formats
Returns:
Dictionary with dataset statistics and file paths
"""
output_dir = DATASET_OUTPUT_DIR if output_dir is None else Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
logger.info("πŸš€ Starting optimized CodeSearchNet dataset creation...")
logger.info(f"πŸ“ Output directory: {output_dir}")
logger.info(f"πŸ“Š Target: {max_samples_per_lang} samples per language")
logger.info(f"πŸ” Languages: {', '.join(languages_config.all)}")
start_time = time.time()
all_samples = []
language_stats = {}
# Process each programming language
for language in languages_config.all:
logger.info(f"\nπŸ”„ Processing {language}...")
try:
# Load CodeSearchNet dataset for this language
dataset = load_dataset("code_search_net", language, split="train", trust_remote_code=True)
language_samples = []
processed_count = 0
quality_filtered = 0
# Process examples with quality filtering
for example in tqdm(dataset, desc=f"Processing {language}", unit="examples"):
processed_count += 1
# Extract documentation and code
doc_string = example.get("func_documentation_string", "").strip()
code_string = example.get("func_code_string", "").strip()
func_name = example.get("func_name", "").strip()
# Quality filters
if not _passes_quality_filters(
doc_string, code_string, func_name, min_doc_words, max_doc_words, min_code_chars, max_code_chars
):
continue
quality_filtered += 1
# Create optimized training samples
samples = _create_training_samples(
doc_string, code_string, func_name, language, create_multiple_formats
)
language_samples.extend(samples)
# Stop if we have enough samples
if len(language_samples) >= max_samples_per_lang:
break
# Truncate to exact target size
language_samples = language_samples[:max_samples_per_lang]
all_samples.extend(language_samples)
# Track statistics
language_stats[language] = {
"processed": processed_count,
"quality_filtered": quality_filtered,
"final_samples": len(language_samples),
"quality_rate": quality_filtered / processed_count if processed_count > 0 else 0,
}
logger.info(f"βœ… {language}: {len(language_samples)} samples from {quality_filtered} quality examples")
        except Exception:
            logger.exception(f"❌ Failed to process {language}")
            language_stats[language] = {
                "processed": 0,
                "quality_filtered": 0,
                "final_samples": 0,
                "quality_rate": 0.0,
                "error": True,  # flagged so the summary printer can skip failed languages
            }
# Create DataFrame
logger.info(f"\nπŸ“Š Creating dataset with {len(all_samples)} total samples...")
df = pd.DataFrame(all_samples)
# Create stratified splits
train_df, test_df = _create_stratified_splits(df)
# Save datasets
dataset_files = _save_datasets(output_dir, train_df, test_df)
# Save metadata
metadata = {
"creation_time": time.strftime("%Y-%m-%d %H:%M:%S"),
"total_samples": len(all_samples),
"train_samples": len(train_df),
"test_samples": len(test_df),
"languages": languages_config.all,
"language_stats": language_stats,
"quality_filters": {
"min_doc_words": min_doc_words,
"max_doc_words": max_doc_words,
"min_code_chars": min_code_chars,
"max_code_chars": max_code_chars,
},
"files": dataset_files,
"processing_time": time.time() - start_time,
}
metadata_file = output_dir / "metadata.json"
with metadata_file.open("w") as f:
json.dump(metadata, f, indent=2)
logger.info(f"\nπŸŽ‰ Dataset creation completed in {metadata['processing_time']:.2f} seconds!")
logger.info("πŸ“Š Final statistics:")
logger.info(f" - Total samples: {metadata['total_samples']}")
logger.info(f" - Train: {metadata['train_samples']}")
logger.info(f" - Test: {metadata['test_samples']}")
logger.info(f"πŸ’Ύ Metadata saved to: {metadata_file}")
return metadata
def _passes_quality_filters(
doc_string: str,
code_string: str,
func_name: str,
min_doc_words: int,
max_doc_words: int,
min_code_chars: int,
max_code_chars: int,
) -> bool:
"""Apply quality filters optimized for code retrieval following RAG best practices."""
# Basic existence checks
if not doc_string or not code_string or not func_name:
return False
# Documentation quality filters for code retrieval
doc_words = len(doc_string.split())
if doc_words < min_doc_words or doc_words > max_doc_words:
return False
# Code quality filters
code_length = len(code_string)
if code_length < min_code_chars or code_length > max_code_chars:
return False
    # Content quality filters for code retrieval
    doc_lower = doc_string.lower()
# Skip low-quality documentation (expanded for code context)
skip_phrases = [
"todo",
"fixme",
"hack",
"temp",
"test",
"placeholder",
"not implemented",
"coming soon",
"tbd",
"xxx",
"broken",
"deprecated",
"legacy",
"old version",
"outdated",
]
if any(phrase in doc_lower for phrase in skip_phrases):
return False
    # Reject documentation that merely restates the function name
    if func_name.lower() in doc_lower and doc_words < 5:
        return False
# Code structure validation (more comprehensive for retrieval)
has_function = any(
pattern in code_string for pattern in ["def ", "function ", "class ", "public ", "private ", "static "]
)
if not has_function:
return False
# Skip trivial or incomplete code
trivial_code_patterns = [
"pass",
"return None",
"return;",
"throw new Error",
"# TODO",
"// TODO",
"print(",
"console.log(",
]
if any(pattern in code_string for pattern in trivial_code_patterns) and len(code_string) < 100:
return False
# Ensure documentation describes functionality (not just naming)
generic_docs = [
"returns a value",
"does something",
"helper function",
"utility method",
"this function",
"this method",
"returns the result",
"performs operation",
]
if any(generic in doc_lower for generic in generic_docs):
return False
# Ensure documentation has descriptive content for retrieval
descriptive_words = [
"parse",
"convert",
"transform",
"calculate",
"validate",
"format",
"filter",
"sort",
"search",
"find",
"create",
"generate",
"process",
"handle",
"manage",
"update",
"modify",
"remove",
"delete",
"add",
]
if not any(word in doc_lower for word in descriptive_words) and doc_words < 8:
return False
# Code-documentation alignment check (key for retrieval quality)
return _check_code_doc_alignment(doc_string, code_string, func_name)
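# Illustrative filter behavior (hypothetical inputs, not dataset samples):
#   _passes_quality_filters(
#       "Parse an ISO-8601 date string into a datetime object.",
#       "def parse_date(s):\n    return datetime.fromisoformat(s)",
#       "parse_date", 3, 100, 50, 2000,
#   )  # True: descriptive doc, structured code, aligned on "parse"
#   _passes_quality_filters("TODO fix this", "pass", "tmp", 3, 100, 50, 2000)  # False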
def _check_code_doc_alignment(doc_string: str, code_string: str, func_name: str) -> bool:
"""Check if documentation and code are well-aligned for retrieval tasks."""
doc_lower = doc_string.lower()
code_lower = code_string.lower()
# Function name should relate to documentation
func_base = func_name.lower().replace("_", " ").replace("-", " ")
# Check for obvious mismatches
doc_has_return = any(word in doc_lower for word in ["return", "returns", "gives", "outputs"])
code_has_return = "return " in code_lower
# If doc mentions returning something, code should have returns
if doc_has_return and not code_has_return and len(code_string.split("\n")) > 3:
return False
    # Basic semantic alignment: if the documentation mentions specific actions,
    # the code or the function name should reflect at least one of them
    action_words = ["sort", "parse", "convert", "validate", "format", "filter", "search", "calculate"]
    doc_actions = [word for word in action_words if word in doc_lower]
    if doc_actions and not any(action in code_lower or action in func_base for action in doc_actions):
        return False
    return True
def _create_training_samples(
doc_string: str,
code_string: str,
func_name: str,
language: str,
create_multiple_formats: bool,
) -> list[dict[str, Any]]:
"""Create optimized training samples for code retrieval with proper training schema."""
samples = []
if create_multiple_formats:
        # Format 1: Documentation query → code (direct evaluation format)
query_1 = doc_string
text_1 = _format_training_text(query_1, code_string, language)
samples.append(
{
"language": language,
"query": query_1,
"code": code_string,
"text": text_1,
}
)
# Format 2: How-to query (realistic developer search)
query_2 = _generate_how_to_query(doc_string, func_name, language)
text_2 = _format_training_text(query_2, code_string, language)
samples.append(
{
"language": language,
"query": query_2,
"code": code_string,
"text": text_2,
}
)
# Format 3: Functional requirement query
query_3 = _generate_functional_query(doc_string, func_name)
text_3 = _format_training_text(query_3, code_string, language)
samples.append(
{
"language": language,
"query": query_3,
"code": code_string,
"text": text_3,
}
)
# Format 4: Implementation-specific query
query_4 = _generate_implementation_query(doc_string, func_name, language)
text_4 = _format_training_text(query_4, code_string, language)
samples.append(
{
"language": language,
"query": query_4,
"code": code_string,
"text": text_4,
}
)
else:
# Simple format - direct documentation to code
query = doc_string
text = _format_training_text(query, code_string, language)
samples.append(
{
"language": language,
"query": query,
"code": code_string,
"text": text,
}
)
return samples
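# With create_multiple_formats=True, a single doc-code pair expands into four
# samples. For a hypothetical Python function sort_list documented as
# "Sorts a list in ascending order.", the four queries come out roughly as:
#   1. "Sorts a list in ascending order."                     (direct)
#   2. "How to sort data in python"                           (how-to)
#   3. "Implementation that sorts a list in ascending order"  (functional)
#   4. "Python function to sorts a list in ascending order. using lists"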
def _format_training_text(query: str, code: str, language: str) -> str:
"""Format query and code into a single training text chunk with markdown-style code blocks."""
# Clean up query but preserve internal code formatting
query_clean = query.strip()
code_clean = code.strip()
# Create training text with proper markdown format and newline separation
# Structure: query + empty line + markdown code block with language
return f"{query_clean}\n\n```{language}\n{code_clean}\n```"
def _generate_how_to_query(doc_string: str, func_name: str, language: str) -> str:
"""Generate realistic 'how to' queries that developers might actually search for."""
# Extract key action words from documentation
doc_lower = doc_string.lower()
func_lower = func_name.lower()
# Common developer query patterns
if "sort" in doc_lower or "sort" in func_lower:
return f"How to sort data in {language}"
if "parse" in doc_lower or "parse" in func_lower:
return f"How to parse data in {language}"
if "convert" in doc_lower or "transform" in doc_lower or "convert" in func_lower:
return f"How to convert data in {language}"
if "validate" in doc_lower or "check" in doc_lower or "validate" in func_lower:
return f"How to validate input in {language}"
if "calculate" in doc_lower or "compute" in doc_lower or "calc" in func_lower:
return f"How to calculate values in {language}"
if "format" in doc_lower or "format" in func_lower:
return f"How to format output in {language}"
if "filter" in doc_lower or "filter" in func_lower:
return f"How to filter data in {language}"
if "search" in doc_lower or "find" in doc_lower or "search" in func_lower or "find" in func_lower:
return f"How to search through data in {language}"
# Use function name for more specific queries
if func_name and len(func_name) > 2:
# Extract meaningful words from function name
func_words = func_name.replace("_", " ").replace("-", " ").strip()
if func_words:
return f"How to {func_words.lower()} in {language}"
# Fallback to more generic query
action = doc_string.split()[0] if doc_string.split() else "implement"
return f"How to {action.lower()} in {language}"
def _generate_functional_query(doc_string: str, func_name: str) -> str:
"""Generate functional requirement queries focusing on what the code accomplishes."""
# Clean up documentation to create natural query
doc_clean = doc_string.strip().rstrip(".")
    # Rephrase the doc string as a natural-language retrieval query
if doc_clean.startswith(("Returns", "Return")):
return f"Function that {doc_clean.lower()}"
if doc_clean.startswith(("Creates", "Create")):
return f"Code to {doc_clean.lower()}"
if doc_clean.startswith(("Checks", "Check")):
return f"Function to {doc_clean.lower()}"
# Use function name to enhance the query if available
if func_name and len(func_name) > 2:
func_words = func_name.replace("_", " ").replace("-", " ").strip()
if func_words and len(doc_clean) < 30: # Only for short docs
return f"Function named '{func_name}' that {doc_clean.lower()}"
return f"Implementation that {doc_clean.lower()}"
def _generate_implementation_query(doc_string: str, func_name: str, language: str) -> str:
"""Generate implementation-specific queries with technical details."""
doc_lower = doc_string.lower()
func_lower = func_name.lower() if func_name else ""
# Add language-specific implementation details
if language == "python":
if "list" in doc_lower or "array" in doc_lower or "list" in func_lower:
return f"Python function to {doc_string.lower()} using lists"
if "dict" in doc_lower or "hash" in doc_lower or "dict" in func_lower:
return f"Python function to {doc_string.lower()} using dictionaries"
# Include function name for context if available
if func_name and len(func_name) > 2:
return f"Python implementation of {func_name}: {doc_string.lower()}"
return f"Python implementation: {doc_string.lower()}"
if language == "java":
func_suffix = f" ({func_name})" if func_name and len(func_name) > 2 else ""
return f"Java method to {doc_string.lower()}{func_suffix}"
if language == "javascript":
func_suffix = f" ({func_name})" if func_name and len(func_name) > 2 else ""
return f"JavaScript function to {doc_string.lower()}{func_suffix}"
if language == "php":
func_suffix = f" ({func_name})" if func_name and len(func_name) > 2 else ""
return f"PHP function to {doc_string.lower()}{func_suffix}"
if language == "ruby":
func_suffix = f" ({func_name})" if func_name and len(func_name) > 2 else ""
return f"Ruby method to {doc_string.lower()}{func_suffix}"
if language == "go":
func_suffix = f" ({func_name})" if func_name and len(func_name) > 2 else ""
return f"Go function to {doc_string.lower()}{func_suffix}"
return f"{language} code to {doc_string.lower()}"
def _create_stratified_splits(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
"""Create stratified train/test splits preserving language distribution."""
# Define split ratios
train_ratio = 0.9
# test_ratio = 0.1 (remainder)
train_dfs = []
test_dfs = []
# Split by language to ensure balanced representation
for language in df["language"].unique():
lang_df = df[df["language"] == language].copy()
n_samples = len(lang_df)
# Calculate split sizes
n_train = int(n_samples * train_ratio)
# Remainder goes to test
# Shuffle and split
lang_df = lang_df.sample(frac=1, random_state=42).reset_index(drop=True)
train_dfs.append(lang_df[:n_train])
test_dfs.append(lang_df[n_train:])
# Combine and shuffle again
train_df = pd.concat(train_dfs, ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)
test_df = pd.concat(test_dfs, ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)
logger.info("πŸ“Š Created stratified splits:")
logger.info(f" - Train: {len(train_df)} samples")
logger.info(f" - Test: {len(test_df)} samples")
return train_df, test_df
def _save_datasets(
output_dir: Path,
train_df: pd.DataFrame,
test_df: pd.DataFrame,
) -> dict[str, str]:
"""Save datasets in parquet format with compression."""
dataset_files = {}
# Save each split
for split_name, df in [("train", train_df), ("test", test_df)]:
filepath = output_dir / f"{split_name}.parquet"
df.to_parquet(
filepath,
compression="snappy",
index=False,
)
dataset_files[split_name] = str(filepath)
logger.info(f"πŸ’Ύ Saved {split_name}: {len(df)} samples β†’ {filepath}")
# Also save a combined dataset for convenience
combined_df = pd.concat([train_df, test_df], ignore_index=True)
combined_filepath = output_dir / "combined.parquet"
combined_df.to_parquet(combined_filepath, compression="snappy", index=False)
dataset_files["combined"] = str(combined_filepath)
logger.info(f"πŸ’Ύ Saved combined: {len(combined_df)} samples β†’ {combined_filepath}")
return dataset_files
def load_optimized_dataset(
output_dir: Path | None = None,
split: str = "train",
) -> pd.DataFrame:
"""
Load a previously created optimized dataset.
Args:
output_dir: Directory containing the dataset files
split: Which split to load ('train', 'test', 'combined')
Returns:
DataFrame with the requested dataset split
"""
if output_dir is None:
output_dir = DATASET_OUTPUT_DIR
filepath = output_dir / f"{split}.parquet"
if not filepath.exists():
available_files = list(output_dir.glob("*.parquet"))
available_splits = [f.stem for f in available_files]
msg = f"Dataset split '{split}' not found at {filepath}. Available splits: {available_splits}"
raise FileNotFoundError(msg)
logger.info(f"πŸ“‚ Loading {split} dataset from {filepath}")
df = pd.read_parquet(filepath)
logger.info(f"βœ… Loaded {len(df)} samples")
return df
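# Example usage (assumes a dataset was already created in the default directory):
#   train_df = load_optimized_dataset(split="train")
#   print(train_df[["language", "query"]].head())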
def main(
max_samples_per_lang: Annotated[
int, typer.Option(help="Maximum samples per language")
] = DEFAULT_MAX_SAMPLES_PER_LANG,
min_doc_words: Annotated[int, typer.Option(help="Minimum words in documentation")] = DEFAULT_MIN_DOC_WORDS,
max_doc_words: Annotated[int, typer.Option(help="Maximum words in documentation")] = DEFAULT_MAX_DOC_WORDS,
min_code_chars: Annotated[int, typer.Option(help="Minimum characters in code")] = DEFAULT_MIN_CODE_CHARS,
max_code_chars: Annotated[int, typer.Option(help="Maximum characters in code")] = DEFAULT_MAX_CODE_CHARS,
output_dir: Annotated[str | None, typer.Option(help="Output directory for dataset")] = None,
simple_format: Annotated[
bool, typer.Option(help="Create only simple format (not multiple training formats)")
] = False,
) -> None:
"""Create optimized training dataset from CodeSearchNet for code search tasks."""
logger.info("πŸš€ Starting optimized dataset creation command...")
# Convert output_dir to Path if provided
output_path = Path(output_dir) if output_dir else None
# Create the dataset
try:
metadata = create_optimized_dataset(
max_samples_per_lang=max_samples_per_lang,
min_doc_words=min_doc_words,
max_doc_words=max_doc_words,
min_code_chars=min_code_chars,
max_code_chars=max_code_chars,
output_dir=output_path,
create_multiple_formats=not simple_format,
)
logger.info("βœ… Dataset creation completed successfully!")
logger.info(f"πŸ“ Output directory: {metadata['files']['train']}")
# Print summary statistics
print("\n" + "=" * 60)
print("πŸ“Š DATASET CREATION SUMMARY")
print("=" * 60)
print(f"Total samples created: {metadata['total_samples']:,}")
print(f"Processing time: {metadata['processing_time']:.2f} seconds")
print("\nSplit distribution:")
print(f" β€’ Train: {metadata['train_samples']:,} samples")
print(f" β€’ Test: {metadata['test_samples']:,} samples")
print("\nLanguage distribution:")
for lang, stats in metadata["language_stats"].items():
if "error" not in stats:
print(f" β€’ {lang}: {stats['final_samples']:,} samples ({stats['quality_rate']:.1%} quality rate)")
print(f"\nDataset files saved to: {output_path or DATASET_OUTPUT_DIR}")
print("=" * 60)
except Exception as e:
logger.exception("❌ Dataset creation failed")
raise typer.Exit(1) from e
if __name__ == "__main__":
typer.run(main)