"""
Parameter registry for the calibration system.

Implements the parameter registry component from calibration-system-spec.md
for centralized management of all tunable system parameters.
"""
import logging
from typing import Dict, Any, List, Optional, Union
from dataclasses import dataclass
from pathlib import Path

import yaml

logger = logging.getLogger(__name__)
@dataclass
class Parameter:
    """Represents a tunable parameter in the system.

    Fields follow the parameter schema from calibration-system-spec.md.
    NOTE: the @dataclass decorator is required — the class is constructed
    with keyword arguments throughout the registry and relies on the
    generated __init__ and on __post_init__ being invoked.
    """
    name: str                 # unique registry key, e.g. "bm25_k1"
    component: str            # owning component, e.g. "sparse_retriever"
    path: str                 # YAML path like "retriever.sparse.config.k1"
    current: Union[float, int, str, bool]               # current value
    min_value: Optional[Union[float, int]] = None       # inclusive lower bound
    max_value: Optional[Union[float, int]] = None       # inclusive upper bound
    step: Optional[Union[float, int]] = None            # search-grid step size
    param_type: str = "float"                           # "float" | "int" | "str" | "bool"
    impacts: Optional[List[str]] = None                 # metrics this parameter affects
    description: Optional[str] = None                   # human-readable summary

    def __post_init__(self):
        # None (not a mutable default) is the sentinel; normalize to a fresh
        # list per instance so instances never share an impacts list.
        if self.impacts is None:
            self.impacts = []
class ParameterRegistry:
    """
    Central registry of all tunable parameters following calibration-system-spec.md.

    Manages parameter definitions, search spaces, and validation according to the
    specification for systematic parameter optimization.
    """

    def __init__(self):
        """Initialize parameter registry and populate the default parameters."""
        self.parameters: Dict[str, Parameter] = {}
        self._initialize_default_parameters()

    def _initialize_default_parameters(self):
        """Initialize default parameter definitions from spec."""
        # BM25 parameters - critical for document ranking
        self.register_parameter(Parameter(
            name="bm25_k1",
            component="sparse_retriever",
            path="retriever.sparse.config.k1",
            current=1.2,
            min_value=0.5,
            max_value=2.5,
            step=0.1,
            param_type="float",
            impacts=["retrieval_precision", "retrieval_recall"],
            description="BM25 term frequency saturation parameter"
        ))
        self.register_parameter(Parameter(
            name="bm25_b",
            component="sparse_retriever",
            path="retriever.sparse.config.b",
            current=0.25,  # Our optimized value
            min_value=0.0,
            max_value=1.0,
            step=0.05,
            param_type="float",
            impacts=["retrieval_precision", "document_length_bias"],
            description="BM25 document length normalization parameter"
        ))
        # RRF fusion parameters - critical for score combination
        self.register_parameter(Parameter(
            name="rrf_k",
            component="fusion_strategy",
            path="retriever.fusion.config.k",
            current=30,  # Our optimized value
            min_value=10,
            max_value=100,
            step=10,
            param_type="int",
            impacts=["fusion_quality", "score_discrimination"],
            description="RRF k parameter controlling score discriminative power"
        ))
        self.register_parameter(Parameter(
            name="dense_weight",
            component="fusion_strategy",
            path="retriever.fusion.config.weights.dense",
            current=0.8,  # Our optimized value
            min_value=0.1,
            max_value=0.9,
            step=0.05,
            param_type="float",
            impacts=["fusion_balance", "semantic_vs_lexical"],
            description="Weight for dense (semantic) retrieval in fusion"
        ))
        self.register_parameter(Parameter(
            name="sparse_weight",
            component="fusion_strategy",
            path="retriever.fusion.config.weights.sparse",
            current=0.2,  # Our optimized value
            min_value=0.1,
            max_value=0.9,
            step=0.05,
            param_type="float",
            impacts=["fusion_balance", "semantic_vs_lexical"],
            description="Weight for sparse (BM25) retrieval in fusion"
        ))
        # Score-aware fusion parameters (Epic 2)
        self.register_parameter(Parameter(
            name="score_aware_score_weight",
            component="score_aware_fusion",
            path="retriever.fusion.config.score_weight",
            current=0.8,
            min_value=0.1,
            max_value=0.95,
            step=0.05,
            param_type="float",
            impacts=["score_preservation", "fusion_quality"],
            description="Score-aware fusion: weight for preserving semantic scores"
        ))
        self.register_parameter(Parameter(
            name="score_aware_rank_weight",
            component="score_aware_fusion",
            path="retriever.fusion.config.rank_weight",
            current=0.15,
            min_value=0.05,
            max_value=0.4,
            step=0.05,
            param_type="float",
            impacts=["rank_stability", "fusion_robustness"],
            description="Score-aware fusion: weight for rank-based stability"
        ))
        # Neural reranking parameters (Epic 2)
        self.register_parameter(Parameter(
            name="neural_batch_size",
            component="neural_reranker",
            path="retriever.reranker.config.batch_size",
            current=32,
            min_value=8,
            max_value=128,
            step=8,
            param_type="int",
            impacts=["neural_performance", "memory_usage"],
            description="Neural reranker batch size for cross-encoder processing"
        ))
        self.register_parameter(Parameter(
            name="neural_max_candidates",
            component="neural_reranker",
            path="retriever.reranker.config.max_candidates",
            current=100,
            min_value=20,
            max_value=200,
            step=20,
            param_type="int",
            impacts=["neural_quality", "processing_time"],
            description="Maximum candidates for neural reranking"
        ))
        # Graph enhancement parameters - for Epic 2 graph-based retrieval
        self.register_parameter(Parameter(
            name="graph_weight",
            component="fusion_strategy",
            path="retriever.fusion.config.weights.graph",
            current=0.3,
            min_value=0.1,
            max_value=0.5,
            step=0.05,
            param_type="float",
            impacts=["graph_quality", "fusion_balance"],
            description="Weight for graph-based retrieval in fusion"
        ))
        self.register_parameter(Parameter(
            name="similarity_threshold",
            component="fusion_strategy",
            path="retriever.fusion.config.similarity_threshold",
            current=0.65,
            min_value=0.5,
            max_value=0.8,
            step=0.05,
            param_type="float",
            impacts=["graph_connectivity", "graph_quality"],
            description="Similarity threshold for document connections in graph"
        ))
        self.register_parameter(Parameter(
            name="max_connections_per_document",
            component="fusion_strategy",
            path="retriever.fusion.config.max_connections_per_document",
            current=15,
            min_value=5,
            max_value=25,
            step=5,
            param_type="int",
            impacts=["graph_density", "computation_time"],
            description="Maximum connections per document in graph"
        ))
        self.register_parameter(Parameter(
            name="pagerank_damping",
            component="fusion_strategy",
            path="retriever.fusion.config.pagerank_damping",
            current=0.85,
            min_value=0.7,
            max_value=0.95,
            step=0.05,
            param_type="float",
            impacts=["pagerank_quality", "graph_convergence"],
            description="PageRank damping factor for graph ranking"
        ))
        # Confidence scoring parameters
        self.register_parameter(Parameter(
            name="confidence_threshold",
            component="answer_generator",
            path="answer_generator.config.confidence_threshold",
            current=0.85,
            min_value=0.3,
            max_value=0.95,
            step=0.05,
            param_type="float",
            impacts=["answer_quality", "refusal_rate"],
            description="Minimum confidence threshold for generating answers"
        ))

    def register_parameter(self, parameter: Parameter) -> None:
        """Register a new parameter, overwriting (with a warning) an existing one."""
        if parameter.name in self.parameters:
            logger.warning(f"Parameter {parameter.name} already exists, overwriting")
        self.parameters[parameter.name] = parameter
        logger.debug(f"Registered parameter: {parameter.name}")

    def get_parameter(self, name: str) -> Optional[Parameter]:
        """Get parameter by name, or None if it is not registered."""
        return self.parameters.get(name)

    def get_parameters_for_component(self, component: str) -> List[Parameter]:
        """Get all parameters for a specific component."""
        return [p for p in self.parameters.values() if p.component == component]

    def get_search_space(self, parameter_names: List[str]) -> Dict[str, List[Any]]:
        """
        Generate search space for specified parameters.

        Unknown names are logged and skipped. Parameters without a complete
        numeric range (min, max, step) contribute only their current value.

        Args:
            parameter_names: List of parameter names to include in search

        Returns:
            Dictionary mapping parameter names to lists of values to search
        """
        search_space = {}
        for name in parameter_names:
            param = self.parameters.get(name)
            if not param:
                logger.warning(f"Parameter {name} not found in registry")
                continue
            has_range = (param.min_value is not None
                         and param.max_value is not None
                         and param.step)
            if param.param_type in ("float", "int") and has_range:
                if param.param_type == "float":
                    # Count-based generation: accumulating `current += step`
                    # drifts with binary floats and can skip the max endpoint
                    # (e.g. with step=0.1). Computing each value from min_value
                    # keeps the grid exact; epsilon tolerates near-integral
                    # quotients like 19.999999996.
                    n_steps = int((param.max_value - param.min_value) / param.step + 1e-9)
                    search_space[name] = [
                        round(param.min_value + i * param.step, 3)
                        for i in range(n_steps + 1)
                    ]
                else:  # int
                    search_space[name] = list(
                        range(param.min_value, param.max_value + 1, param.step))
            else:
                # Non-numeric or unbounded parameters: use current value only.
                search_space[name] = [param.current]
        return search_space

    def get_parameter_impacts(self) -> Dict[str, List[str]]:
        """Get mapping of metrics to parameters that impact them."""
        impacts: Dict[str, List[str]] = {}
        for param in self.parameters.values():
            for impact in param.impacts:
                impacts.setdefault(impact, []).append(param.name)
        return impacts

    def validate_parameter_value(self, name: str, value: Any) -> bool:
        """Validate if a value is valid for the given parameter.

        Returns False for unknown parameters, unknown param_types, type
        mismatches, or numeric values outside [min_value, max_value].
        """
        param = self.parameters.get(name)
        if not param:
            return False
        # Type check. bool is a subclass of int in Python, so it must be
        # rejected explicitly for numeric parameters; ints are accepted for
        # float parameters (YAML loads "1.0" written as "1" as an int).
        if isinstance(value, bool):
            type_ok = param.param_type == "bool"
        elif param.param_type == "float":
            type_ok = isinstance(value, (int, float))
        elif param.param_type == "int":
            type_ok = isinstance(value, int)
        elif param.param_type == "str":
            type_ok = isinstance(value, str)
        else:
            # Unknown param_type: previously raised KeyError; treat as invalid.
            type_ok = False
        if not type_ok:
            return False
        # Range check for numeric types
        if param.param_type in ("float", "int"):
            if param.min_value is not None and value < param.min_value:
                return False
            if param.max_value is not None and value > param.max_value:
                return False
        return True

    def update_parameter_current_value(self, name: str, value: Any) -> bool:
        """Update the current value of a parameter; returns True on success."""
        if not self.validate_parameter_value(name, value):
            logger.error(f"Invalid value {value} for parameter {name}")
            return False
        param = self.parameters[name]
        old_value = param.current
        param.current = value
        logger.info(f"Updated parameter {name}: {old_value} -> {value}")
        return True

    def export_parameter_values(self) -> Dict[str, Any]:
        """Export current parameter values for configuration update."""
        return {name: param.current for name, param in self.parameters.items()}

    def get_parameter_summary(self) -> str:
        """Get human-readable summary of all parameters."""
        summary = ["Parameter Registry Summary:", "=" * 40]
        # Sorted so the summary is deterministic; iterating a set directly
        # produces a run-dependent component order.
        for component in sorted({p.component for p in self.parameters.values()}):
            summary.append(f"\n{component.upper()}:")
            for param in self.get_parameters_for_component(component):
                range_info = ""
                if param.min_value is not None and param.max_value is not None:
                    range_info = f" (range: {param.min_value}-{param.max_value})"
                summary.append(f"  {param.name}: {param.current}{range_info}")
                if param.description:
                    summary.append(f"    {param.description}")
        return "\n".join(summary)

    def load_from_config(self, config_path: Path) -> None:
        """Load parameter values from a configuration file.

        Best-effort: invalid individual values are rejected (and logged) by
        update_parameter_current_value; a read/parse failure is logged rather
        than raised.
        """
        try:
            with open(config_path, 'r') as f:
                config = yaml.safe_load(f)
            # Extract parameter values based on their paths
            for param in self.parameters.values():
                value = self._extract_value_from_path(config, param.path)
                if value is not None:
                    self.update_parameter_current_value(param.name, value)
            logger.info(f"Loaded parameters from {config_path}")
        except Exception as e:
            logger.error(f"Failed to load parameters from {config_path}: {e}")

    def _extract_value_from_path(self, config: Dict[str, Any], path: str) -> Any:
        """Extract value from nested config using dot-separated path.

        Returns None when any segment is missing or a non-mapping is indexed.
        """
        parts = path.split('.')
        current = config
        try:
            for part in parts:
                current = current[part]
            return current
        except (KeyError, TypeError):
            return None
if __name__ == "__main__":
    # Smoke-test the registry when executed as a script: build it with the
    # default parameter set and show the human-readable summary.
    registry = ParameterRegistry()
    print(registry.get_parameter_summary())

    # Demonstrate search-space generation for a few retrieval parameters.
    demo_space = registry.get_search_space(["bm25_k1", "bm25_b", "rrf_k"])
    print(f"\nSearch space example: {demo_space}")