"""
Graph analytics for Epic 2 Week 2.
This module provides analytics capabilities for knowledge graphs including
metrics collection, performance monitoring, and optional visualization
of graph structures and retrieval patterns.
"""
import logging
import time
from typing import List, Dict, Any, Optional, Set, Tuple
from dataclasses import dataclass, field
from collections import defaultdict, Counter
import json
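# Optional third-party dependencies: networkx/numpy are needed for graph metrics,
# Plotly for visualization; features degrade gracefully when either is missing.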
try:
import networkx as nx
import numpy as np
except ImportError:
nx = None
np = None
try:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
PLOTLY_AVAILABLE = True
except ImportError:
PLOTLY_AVAILABLE = False
from .config.graph_config import GraphAnalyticsConfig
from .document_graph_builder import DocumentGraphBuilder
from .graph_retriever import GraphRetriever
logger = logging.getLogger(__name__)
@dataclass
class GraphMetrics:
"""Graph structure metrics."""
nodes: int = 0
edges: int = 0
density: float = 0.0
avg_degree: float = 0.0
connected_components: int = 0
diameter: Optional[int] = None
clustering_coefficient: float = 0.0
node_type_distribution: Dict[str, int] = field(default_factory=dict)
edge_type_distribution: Dict[str, int] = field(default_factory=dict)
@dataclass
class RetrievalMetrics:
"""Graph retrieval performance metrics."""
total_queries: int = 0
avg_latency_ms: float = 0.0
cache_hit_rate: float = 0.0
algorithm_usage: Dict[str, int] = field(default_factory=dict)
avg_results_per_query: float = 0.0
query_patterns: Dict[str, int] = field(default_factory=dict)
@dataclass
class AnalyticsSnapshot:
"""Complete analytics snapshot."""
timestamp: float
graph_metrics: GraphMetrics
retrieval_metrics: RetrievalMetrics
memory_usage_mb: float = 0.0
processing_stats: Dict[str, Any] = field(default_factory=dict)
class GraphAnalyticsError(Exception):
"""Raised when graph analytics operations fail."""
pass
class GraphAnalytics:
"""
Analytics and monitoring for graph-based retrieval.
This class provides comprehensive analytics capabilities including:
- Graph structure analysis and metrics
- Retrieval performance monitoring
- Query pattern analysis
- Optional visualization of graphs and metrics
- Time-series tracking of performance
Features:
- Real-time metrics collection
- Historical performance tracking
- Graph visualization (when Plotly is available)
- Performance trend analysis
- Memory usage monitoring
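Example:
    Illustrative sketch only; assumes pre-built config, graph_builder, and
    graph_retriever instances of the types imported above, and an arbitrary
    algorithm label:

        analytics = GraphAnalytics(config)
        snapshot = analytics.create_snapshot(graph_builder, graph_retriever)
        analytics.track_query(
            query="RISC-V vector extension encoding",
            results_count=5,
            latency_ms=12.4,
            algorithm_used="subgraph_expansion",  # hypothetical algorithm label
        )
        report = analytics.generate_report()
        dashboard_html = analytics.visualize_metrics()  # None unless Plotly is available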
"""
def __init__(self, config: GraphAnalyticsConfig):
"""
Initialize graph analytics.
Args:
config: Analytics configuration
"""
self.config = config
# Metrics storage
self.snapshots: List[AnalyticsSnapshot] = []
self.current_metrics = {
"graph": GraphMetrics(),
"retrieval": RetrievalMetrics()
}
# Query tracking
self.query_history: List[Dict[str, Any]] = []
self.performance_history: List[Dict[str, Any]] = []
# Statistics
self.stats = {
"analytics_started": time.time(),
"snapshots_created": 0,
"metrics_collected": 0,
"visualizations_generated": 0
}
logger.info(f"GraphAnalytics initialized (visualization: {PLOTLY_AVAILABLE})")
def collect_graph_metrics(self, graph_builder: DocumentGraphBuilder) -> GraphMetrics:
"""
Collect comprehensive graph structure metrics.
Args:
graph_builder: Document graph builder
Returns:
Graph metrics object
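Example (illustrative; assumes an existing GraphAnalytics instance and a populated graph_builder):
    metrics = analytics.collect_graph_metrics(graph_builder)
    print(metrics.nodes, metrics.edges, round(metrics.density, 3))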
"""
if not self.config.collect_graph_metrics:
return GraphMetrics()
try:
graph = graph_builder.get_graph()
if not graph or graph.number_of_nodes() == 0:
return GraphMetrics()
# Basic metrics
num_nodes = graph.number_of_nodes()
num_edges = graph.number_of_edges()
metrics = GraphMetrics(
nodes=num_nodes,
edges=num_edges
)
# Calculate density
if num_nodes > 1:
metrics.density = nx.density(graph)
# Calculate average degree
if num_nodes > 0:
degrees = dict(graph.degree())
metrics.avg_degree = sum(degrees.values()) / num_nodes
# Connected components
if graph.is_directed():
metrics.connected_components = nx.number_weakly_connected_components(graph)
else:
metrics.connected_components = nx.number_connected_components(graph)
# Diameter (for smaller graphs)
if num_nodes < 1000 and nx.is_connected(graph.to_undirected()):
try:
metrics.diameter = nx.diameter(graph.to_undirected())
except nx.NetworkXError:
metrics.diameter = None
# Clustering coefficient
if num_nodes > 2:
try:
metrics.clustering_coefficient = nx.average_clustering(graph.to_undirected())
except (nx.NetworkXError, ZeroDivisionError):
metrics.clustering_coefficient = 0.0
# Node type distribution
node_types = defaultdict(int)
for node_id in graph.nodes():
node_data = graph.nodes[node_id]
node_type = node_data.get("node_type", "unknown")
node_types[node_type] += 1
metrics.node_type_distribution = dict(node_types)
# Edge type distribution
edge_types = defaultdict(int)
for source, target, edge_data in graph.edges(data=True):
edge_type = edge_data.get("edge_type", "unknown")
edge_types[edge_type] += 1
metrics.edge_type_distribution = dict(edge_types)
self.current_metrics["graph"] = metrics
self.stats["metrics_collected"] += 1
logger.debug(f"Collected graph metrics: {num_nodes} nodes, {num_edges} edges")
return metrics
except Exception as e:
logger.error(f"Failed to collect graph metrics: {str(e)}")
return GraphMetrics()
def collect_retrieval_metrics(self, graph_retriever: GraphRetriever) -> RetrievalMetrics:
"""
Collect retrieval performance metrics.
Args:
graph_retriever: Graph retriever component
Returns:
Retrieval metrics object
"""
if not self.config.collect_retrieval_metrics:
return RetrievalMetrics()
try:
retriever_stats = graph_retriever.get_statistics()
metrics = RetrievalMetrics(
total_queries=retriever_stats.get("queries_processed", 0),
avg_latency_ms=retriever_stats.get("avg_search_time", 0.0) * 1000,
cache_hit_rate=retriever_stats.get("cache_hit_rate", 0.0),
algorithm_usage=dict(retriever_stats.get("algorithm_usage", {})),
avg_results_per_query=(
retriever_stats.get("total_results_returned", 0) /
max(retriever_stats.get("queries_processed", 1), 1)
)
)
self.current_metrics["retrieval"] = metrics
self.stats["metrics_collected"] += 1
return metrics
except Exception as e:
logger.error(f"Failed to collect retrieval metrics: {str(e)}")
return RetrievalMetrics()
def create_snapshot(self, graph_builder: DocumentGraphBuilder,
graph_retriever: GraphRetriever) -> AnalyticsSnapshot:
"""
Create a complete analytics snapshot.
Args:
graph_builder: Document graph builder
graph_retriever: Graph retriever component
Returns:
Analytics snapshot
"""
timestamp = time.time()
# Collect metrics
graph_metrics = self.collect_graph_metrics(graph_builder)
retrieval_metrics = self.collect_retrieval_metrics(graph_retriever)
# Get memory usage
memory_usage = self._estimate_memory_usage(graph_builder, graph_retriever)
# Get processing stats
processing_stats = {
"graph_builder": graph_builder.get_graph_statistics(),
"graph_retriever": graph_retriever.get_statistics()
}
snapshot = AnalyticsSnapshot(
timestamp=timestamp,
graph_metrics=graph_metrics,
retrieval_metrics=retrieval_metrics,
memory_usage_mb=memory_usage,
processing_stats=processing_stats
)
# Store snapshot
self.snapshots.append(snapshot)
self.stats["snapshots_created"] += 1
# Clean old snapshots based on retention policy
self._clean_old_snapshots()
logger.info(f"Created analytics snapshot ({len(self.snapshots)} total)")
return snapshot
def track_query(self, query: str, results_count: int, latency_ms: float,
algorithm_used: str, success: bool = True) -> None:
"""
Track an individual query for analysis.
Args:
query: Query string
results_count: Number of results returned
latency_ms: Query latency in milliseconds
algorithm_used: Algorithm used for retrieval
success: Whether query was successful
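Example (illustrative; the algorithm label here is arbitrary):
    analytics.track_query("RV32I base ISA overview", results_count=8,
                          latency_ms=23.1, algorithm_used="pagerank")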
"""
query_record = {
"timestamp": time.time(),
"query": query,
"results_count": results_count,
"latency_ms": latency_ms,
"algorithm": algorithm_used,
"success": success,
"query_length": len(query),
"query_words": len(query.split())
}
self.query_history.append(query_record)
# Update query patterns
if hasattr(self.current_metrics["retrieval"], "query_patterns"):
query_type = self._classify_query(query)
self.current_metrics["retrieval"].query_patterns[query_type] = (
self.current_metrics["retrieval"].query_patterns.get(query_type, 0) + 1
)
# Limit history size
max_history = 10000
if len(self.query_history) > max_history:
self.query_history = self.query_history[-max_history//2:]
def generate_report(self) -> Dict[str, Any]:
"""
Generate comprehensive analytics report.
Returns:
Dictionary with analytics report
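Example (illustrative):
    report = analytics.generate_report()
    if "error" not in report:
        print(report["graph_metrics"]["nodes"],
              report["retrieval_metrics"]["avg_latency_ms"])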
"""
if not self.snapshots:
return {"error": "No analytics data available"}
latest_snapshot = self.snapshots[-1]
# Basic metrics
report = {
"timestamp": latest_snapshot.timestamp,
"graph_metrics": {
"nodes": latest_snapshot.graph_metrics.nodes,
"edges": latest_snapshot.graph_metrics.edges,
"density": latest_snapshot.graph_metrics.density,
"avg_degree": latest_snapshot.graph_metrics.avg_degree,
"connected_components": latest_snapshot.graph_metrics.connected_components,
"clustering_coefficient": latest_snapshot.graph_metrics.clustering_coefficient,
"node_type_distribution": latest_snapshot.graph_metrics.node_type_distribution,
"edge_type_distribution": latest_snapshot.graph_metrics.edge_type_distribution
},
"retrieval_metrics": {
"total_queries": latest_snapshot.retrieval_metrics.total_queries,
"avg_latency_ms": latest_snapshot.retrieval_metrics.avg_latency_ms,
"cache_hit_rate": latest_snapshot.retrieval_metrics.cache_hit_rate,
"algorithm_usage": latest_snapshot.retrieval_metrics.algorithm_usage,
"avg_results_per_query": latest_snapshot.retrieval_metrics.avg_results_per_query
},
"performance": {
"memory_usage_mb": latest_snapshot.memory_usage_mb,
"snapshots_count": len(self.snapshots),
"queries_tracked": len(self.query_history)
}
}
# Historical trends
if len(self.snapshots) > 1:
report["trends"] = self._calculate_trends()
# Query analysis
if self.query_history:
report["query_analysis"] = self._analyze_queries()
return report
def visualize_graph(self, graph_builder: DocumentGraphBuilder,
layout: str = "spring", max_nodes: Optional[int] = None) -> Optional[str]:
"""
Generate graph visualization.
Args:
graph_builder: Document graph builder
layout: Layout algorithm ("spring", "circular", or "kamada_kawai"; unknown values fall back to "spring")
max_nodes: Maximum number of nodes to include (defaults to config.visualization_max_nodes)
Returns:
HTML string of visualization or None if disabled/failed
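Example (illustrative; requires Plotly and config.enable_visualization):
    html = analytics.visualize_graph(graph_builder, layout="kamada_kawai", max_nodes=200)
    if html:
        with open("graph.html", "w") as f:
            f.write(html)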
"""
if not self.config.enable_visualization or not PLOTLY_AVAILABLE:
return None
try:
graph = graph_builder.get_graph()
if not graph or graph.number_of_nodes() == 0:
return None
# Limit graph size for visualization
max_viz_nodes = max_nodes or self.config.visualization_max_nodes
if graph.number_of_nodes() > max_viz_nodes:
# Sample most connected nodes
node_degrees = dict(graph.degree())
top_nodes = sorted(node_degrees.items(), key=lambda x: x[1], reverse=True)
nodes_to_keep = [node for node, _ in top_nodes[:max_viz_nodes]]
graph = graph.subgraph(nodes_to_keep)
# Convert to undirected for layout
layout_graph = graph.to_undirected()
# Generate layout
if layout == "spring":
pos = nx.spring_layout(layout_graph)
elif layout == "circular":
pos = nx.circular_layout(layout_graph)
elif layout == "kamada_kawai":
pos = nx.kamada_kawai_layout(layout_graph)
else:
pos = nx.spring_layout(layout_graph)
# Create visualization
fig = self._create_plotly_graph(graph, pos)
self.stats["visualizations_generated"] += 1
return fig.to_html()
except Exception as e:
logger.error(f"Graph visualization failed: {str(e)}")
return None
def visualize_metrics(self) -> Optional[str]:
"""
Generate metrics visualization.
Returns:
HTML string of metrics visualization or None if disabled/failed
"""
if not self.config.enable_visualization or not PLOTLY_AVAILABLE or not self.snapshots:
return None
try:
# Create subplots
fig = make_subplots(
rows=2, cols=2,
subplot_titles=("Graph Growth", "Retrieval Latency", "Node Types", "Algorithm Usage"),
specs=[[{"secondary_y": True}, {"secondary_y": False}],
[{"type": "pie"}, {"type": "pie"}]]
)
# Extract time series data
timestamps = [s.timestamp for s in self.snapshots]
nodes = [s.graph_metrics.nodes for s in self.snapshots]
edges = [s.graph_metrics.edges for s in self.snapshots]
latencies = [s.retrieval_metrics.avg_latency_ms for s in self.snapshots]
# Graph growth
fig.add_trace(
go.Scatter(x=timestamps, y=nodes, name="Nodes", line=dict(color="blue")),
row=1, col=1
)
fig.add_trace(
go.Scatter(x=timestamps, y=edges, name="Edges", line=dict(color="red")),
row=1, col=1, secondary_y=True
)
# Retrieval latency
fig.add_trace(
go.Scatter(x=timestamps, y=latencies, name="Latency (ms)", line=dict(color="green")),
row=1, col=2
)
# Latest snapshot data for pie charts
latest = self.snapshots[-1]
# Node types pie chart
if latest.graph_metrics.node_type_distribution:
fig.add_trace(
go.Pie(
labels=list(latest.graph_metrics.node_type_distribution.keys()),
values=list(latest.graph_metrics.node_type_distribution.values()),
name="Node Types"
),
row=2, col=1
)
# Algorithm usage pie chart
if latest.retrieval_metrics.algorithm_usage:
fig.add_trace(
go.Pie(
labels=list(latest.retrieval_metrics.algorithm_usage.keys()),
values=list(latest.retrieval_metrics.algorithm_usage.values()),
name="Algorithm Usage"
),
row=2, col=2
)
fig.update_layout(
title="Graph Analytics Dashboard",
height=800
)
self.stats["visualizations_generated"] += 1
return fig.to_html()
except Exception as e:
logger.error(f"Metrics visualization failed: {str(e)}")
return None
def _estimate_memory_usage(self, graph_builder: DocumentGraphBuilder,
graph_retriever: GraphRetriever) -> float:
"""Estimate memory usage in MB."""
try:
# Get graph builder stats
builder_stats = graph_builder.get_graph_statistics()
builder_memory = builder_stats.get("memory_usage_mb", 0.0)
# Estimate retriever memory from its query cache size
cache_entries = len(graph_retriever.query_cache) if hasattr(graph_retriever, 'query_cache') else 0
retriever_memory = cache_entries * 0.01  # Rough estimate: 10KB per cache entry
# Analytics memory
analytics_memory = len(self.snapshots) * 0.001 # Rough estimate: 1KB per snapshot
return builder_memory + retriever_memory + analytics_memory
except Exception:
return 0.0
def _clean_old_snapshots(self) -> None:
"""Clean old snapshots based on retention policy."""
if not self.snapshots:
return
current_time = time.time()
retention_seconds = self.config.metrics_retention_hours * 3600
# Remove snapshots older than retention period
self.snapshots = [
s for s in self.snapshots
if current_time - s.timestamp <= retention_seconds
]
def _classify_query(self, query: str) -> str:
"""Classify query type for pattern analysis."""
query_lower = query.lower()
if any(word in query_lower for word in ["risc-v", "riscv", "isa"]):
return "architecture"
elif any(word in query_lower for word in ["extension", "implement", "support"]):
return "extension"
elif any(word in query_lower for word in ["protocol", "interface", "communication"]):
return "protocol"
elif len(query.split()) <= 2:
return "short"
elif len(query.split()) > 10:
return "long"
else:
return "general"
def _calculate_trends(self) -> Dict[str, Any]:
"""Calculate performance trends from historical data."""
if len(self.snapshots) < 2:
return {}
# Calculate growth rates
first = self.snapshots[0]
last = self.snapshots[-1]
time_diff = last.timestamp - first.timestamp
if time_diff == 0:
return {}
node_growth_rate = (last.graph_metrics.nodes - first.graph_metrics.nodes) / time_diff
edge_growth_rate = (last.graph_metrics.edges - first.graph_metrics.edges) / time_diff
# Average performance metrics
recent_snapshots = self.snapshots[-5:] # Last 5 snapshots
avg_latency = sum(s.retrieval_metrics.avg_latency_ms for s in recent_snapshots) / len(recent_snapshots)
avg_memory = sum(s.memory_usage_mb for s in recent_snapshots) / len(recent_snapshots)
return {
"node_growth_rate_per_second": node_growth_rate,
"edge_growth_rate_per_second": edge_growth_rate,
"avg_recent_latency_ms": avg_latency,
"avg_recent_memory_mb": avg_memory,
"total_time_span_hours": time_diff / 3600
}
def _analyze_queries(self) -> Dict[str, Any]:
"""Analyze query history for patterns."""
if not self.query_history:
return {}
# Query statistics
latencies = [q["latency_ms"] for q in self.query_history if q["success"]]
analysis = {
"total_queries": len(self.query_history),
"successful_queries": sum(1 for q in self.query_history if q["success"]),
"avg_latency_ms": sum(latencies) / len(latencies) if latencies else 0,
"max_latency_ms": max(latencies) if latencies else 0,
"min_latency_ms": min(latencies) if latencies else 0
}
# Query length distribution
lengths = [q["query_length"] for q in self.query_history]
analysis["avg_query_length"] = sum(lengths) / len(lengths) if lengths else 0
# Algorithm usage
algorithms = [q["algorithm"] for q in self.query_history]
analysis["algorithm_distribution"] = dict(Counter(algorithms))
return analysis
def _create_plotly_graph(self, graph: nx.DiGraph, pos: Dict[str, Tuple[float, float]]) -> go.Figure:
"""Create Plotly graph visualization."""
# Extract edges
edge_x = []
edge_y = []
for edge in graph.edges():
x0, y0 = pos[edge[0]]
x1, y1 = pos[edge[1]]
edge_x.extend([x0, x1, None])
edge_y.extend([y0, y1, None])
# Create edge trace
edge_trace = go.Scatter(
x=edge_x, y=edge_y,
line=dict(width=0.5, color='#888'),
hoverinfo='none',
mode='lines'
)
# Extract nodes
node_x = []
node_y = []
node_text = []
node_colors = []
color_map = {
"concept": "blue",
"protocol": "red",
"architecture": "green",
"extension": "purple"
}
for node in graph.nodes():
x, y = pos[node]
node_x.append(x)
node_y.append(y)
node_data = graph.nodes[node]
node_text.append(node_data.get("text", node))
node_type = node_data.get("node_type", "concept")
node_colors.append(color_map.get(node_type, "gray"))
# Create node trace
node_trace = go.Scatter(
x=node_x, y=node_y,
mode='markers+text',
hoverinfo='text',
text=node_text,
textposition="middle center",
marker=dict(
size=10,
color=node_colors,
line=dict(width=2, color="black")
)
)
# Create figure
fig = go.Figure(
data=[edge_trace, node_trace],
layout=go.Layout(
title="Knowledge Graph Visualization",
titlefont_size=16,
showlegend=False,
hovermode='closest',
margin=dict(b=20,l=5,r=5,t=40),
annotations=[ dict(
text="Graph Visualization",
showarrow=False,
xref="paper", yref="paper",
x=0.005, y=-0.002,
xanchor="left", yanchor="bottom",
font=dict(color="gray", size=12)
)],
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
)
)
return fig
def export_data(self, format: str = "json") -> str:
"""
Export analytics data.
Args:
format: Export format; only "json" is currently supported (other values return an error string)
Returns:
Exported data as string
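Example (illustrative):
    payload = analytics.export_data("json")
    with open("graph_analytics_export.json", "w") as f:
        f.write(payload)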
"""
if format == "json":
export_data = {
"snapshots": [
{
"timestamp": s.timestamp,
"graph_metrics": {
"nodes": s.graph_metrics.nodes,
"edges": s.graph_metrics.edges,
"density": s.graph_metrics.density,
"avg_degree": s.graph_metrics.avg_degree
},
"retrieval_metrics": {
"total_queries": s.retrieval_metrics.total_queries,
"avg_latency_ms": s.retrieval_metrics.avg_latency_ms,
"cache_hit_rate": s.retrieval_metrics.cache_hit_rate
},
"memory_usage_mb": s.memory_usage_mb
}
for s in self.snapshots
],
"query_history": self.query_history,
"stats": self.stats
}
return json.dumps(export_data, indent=2)
else:
return f"Unsupported export format: {format}"
def get_statistics(self) -> Dict[str, Any]:
"""Get analytics statistics."""
return {
**self.stats,
"snapshots_count": len(self.snapshots),
"queries_tracked": len(self.query_history),
"current_memory_estimate_mb": (
self.snapshots[-1].memory_usage_mb if self.snapshots else 0.0
)
}