import asyncio
import logging
import time
from typing import Any, Dict, List, Optional

import config
from core.models import SearchResult
from services.document_store_service import DocumentStoreService
from services.embedding_service import EmbeddingService
from services.vector_store_service import VectorStoreService

logger = logging.getLogger(__name__)


class SearchTool:
    """Semantic search over embedded document chunks, with optional
    document-metadata enrichment when a document store is available."""

    def __init__(self, vector_store: VectorStoreService, embedding_service: EmbeddingService,
                 document_store: Optional[DocumentStoreService] = None):
        self.vector_store = vector_store
        self.embedding_service = embedding_service
        self.document_store = document_store
        self.config = config.config
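
    # Example wiring (illustrative sketch -- the constructor arguments of the
    # services are assumptions, not defined in this module):
    #
    #     tool = SearchTool(VectorStoreService(...), EmbeddingService(...),
    #                       document_store=DocumentStoreService(...))
    #     results = await tool.search("quarterly revenue", top_k=10)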

    async def search(self, query: str, top_k: int = 5, filters: Optional[Dict[str, Any]] = None,
                     similarity_threshold: Optional[float] = None) -> List[SearchResult]:
        """Perform semantic search."""
        try:
            if not query.strip():
                logger.warning("Empty search query provided")
                return []

            # Fall back to the configured default threshold.
            if similarity_threshold is None:
                similarity_threshold = self.config.SIMILARITY_THRESHOLD

            logger.info(f"Performing semantic search for: '{query}' (top_k={top_k})")

            # Embed the query, then retrieve the nearest chunks from the vector store.
            query_embedding = await self.embedding_service.generate_single_embedding(query)

            if not query_embedding:
                logger.error("Failed to generate query embedding")
                return []

            results = await self.vector_store.search(
                query_embedding=query_embedding,
                top_k=top_k,
                filters=filters
            )

            # Drop low-confidence matches.
            filtered_results = [
                result for result in results
                if result.score >= similarity_threshold
            ]

            logger.info(f"Found {len(filtered_results)} results above threshold {similarity_threshold}")

            # Optionally attach document-level metadata to each result.
            if self.document_store:
                return await self._enhance_results_with_metadata(filtered_results)

            return filtered_results

        except Exception as e:
            logger.error(f"Error performing semantic search: {str(e)}")
            return []
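
    # Typical call (sketch; the supported filter keys depend on the
    # VectorStoreService implementation and are an assumption here):
    #
    #     results = await tool.search("refund policy", top_k=5,
    #                                 filters={"document_id": "doc-123"},
    #                                 similarity_threshold=0.75)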

    async def _enhance_results_with_metadata(self, results: List[SearchResult]) -> List[SearchResult]:
        """Enhance search results with document metadata."""
        try:
            enhanced_results = []

            for result in results:
                try:
                    document = await self.document_store.get_document(result.document_id)

                    if document:
                        # Merge document-level fields into the chunk's metadata.
                        enhanced_metadata = {
                            **result.metadata,
                            "document_filename": document.filename,
                            "document_type": document.doc_type.value,
                            "document_tags": document.tags,
                            "document_category": document.category,
                            "document_created_at": document.created_at.isoformat(),
                            "document_summary": document.summary
                        }

                        enhanced_results.append(SearchResult(
                            chunk_id=result.chunk_id,
                            document_id=result.document_id,
                            content=result.content,
                            score=result.score,
                            metadata=enhanced_metadata
                        ))
                    else:
                        # Document missing from the store; keep the bare result.
                        enhanced_results.append(result)

                except Exception as e:
                    # A single failed lookup should not sink the whole batch.
                    logger.warning(f"Error enhancing result {result.chunk_id}: {str(e)}")
                    enhanced_results.append(result)

            return enhanced_results

        except Exception as e:
            logger.error(f"Error enhancing results: {str(e)}")
            return results
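
    # After enhancement, a result's metadata carries both the original chunk
    # fields and the document-level fields above, e.g. (values illustrative):
    #
    #     {"document_filename": "report.pdf", "document_type": "pdf",
    #      "document_tags": ["finance"], "document_category": "reports", ...}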

    async def multi_query_search(self, queries: List[str], top_k: int = 5,
                                 aggregate_method: str = "merge") -> List[SearchResult]:
        """Perform search with multiple queries and aggregate the results."""
        try:
            all_results = []

            for query in queries:
                if query.strip():
                    query_results = await self.search(query, top_k)
                    all_results.extend(query_results)

            if not all_results:
                return []

            if aggregate_method == "merge":
                return await self._merge_results(all_results, top_k)
            elif aggregate_method == "intersect":
                return await self._intersect_results(all_results, top_k)
            elif aggregate_method == "average":
                return await self._average_results(all_results, top_k)
            else:
                logger.warning(f"Unknown aggregate_method '{aggregate_method}', defaulting to merge")
                return await self._merge_results(all_results, top_k)

        except Exception as e:
            logger.error(f"Error in multi-query search: {str(e)}")
            return []
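
    # Example (sketch): aggregate paraphrased queries; "intersect" keeps only
    # chunks returned by more than one query.
    #
    #     hits = await tool.multi_query_search(
    #         ["reset password", "recover account"], top_k=5,
    #         aggregate_method="intersect")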

    async def _merge_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
        """Merge results and remove duplicates, keeping the highest score per chunk."""
        try:
            # Keep, for each chunk, the single result with the best score.
            best_by_chunk = {}

            for result in results:
                current = best_by_chunk.get(result.chunk_id)
                if current is None or result.score > current.score:
                    best_by_chunk[result.chunk_id] = result

            merged_results = sorted(best_by_chunk.values(), key=lambda x: x.score, reverse=True)
            return merged_results[:top_k]

        except Exception as e:
            logger.error(f"Error merging results: {str(e)}")
            return results[:top_k]
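
    # E.g. if two queries both return chunk "c1", scored 0.81 and 0.77, merge
    # keeps the 0.81 result and drops the duplicate.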

    async def _intersect_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
        """Find chunks that appear in the results of multiple queries."""
        try:
            chunk_counts = {}
            chunk_results = {}

            for result in results:
                chunk_id = result.chunk_id
                chunk_counts[chunk_id] = chunk_counts.get(chunk_id, 0) + 1

                if chunk_id not in chunk_results or result.score > chunk_results[chunk_id].score:
                    chunk_results[chunk_id] = result

            # Keep only chunks returned by more than one query.
            intersect_results = [
                result for chunk_id, result in chunk_results.items()
                if chunk_counts[chunk_id] > 1
            ]

            intersect_results.sort(key=lambda x: x.score, reverse=True)
            return intersect_results[:top_k]

        except Exception as e:
            logger.error(f"Error intersecting results: {str(e)}")
            return []

    async def _average_results(self, results: List[SearchResult], top_k: int) -> List[SearchResult]:
        """Average scores for chunks that appear multiple times."""
        try:
            # Group results by chunk so repeated hits can be averaged.
            chunk_groups = {}

            for result in results:
                chunk_groups.setdefault(result.chunk_id, []).append(result)

            averaged_results = []
            for chunk_id, group in chunk_groups.items():
                avg_score = sum(r.score for r in group) / len(group)

                # Reuse the best-scoring hit's content and metadata, but report the mean score.
                best_result = max(group, key=lambda x: x.score)
                averaged_results.append(SearchResult(
                    chunk_id=best_result.chunk_id,
                    document_id=best_result.document_id,
                    content=best_result.content,
                    score=avg_score,
                    metadata={
                        **best_result.metadata,
                        "query_count": len(group),
                        "score_range": f"{min(r.score for r in group):.3f}-{max(r.score for r in group):.3f}"
                    }
                ))

            averaged_results.sort(key=lambda x: x.score, reverse=True)
            return averaged_results[:top_k]

        except Exception as e:
            logger.error(f"Error averaging results: {str(e)}")
            return results[:top_k]
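
    # E.g. a chunk scored 0.9 by one query and 0.7 by another is reported at
    # 0.8, with query_count=2 and score_range "0.700-0.900".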

    async def search_by_document(self, document_id: str, query: str, top_k: int = 5) -> List[SearchResult]:
        """Search within a specific document."""
        try:
            filters = {"document_id": document_id}
            return await self.search(query, top_k, filters)

        except Exception as e:
            logger.error(f"Error searching within document {document_id}: {str(e)}")
            return []

    async def search_by_category(self, category: str, query: str, top_k: int = 5) -> List[SearchResult]:
        """Search within documents of a specific category."""
        try:
            if not self.document_store:
                logger.warning("Document store not available for category search")
                return await self.search(query, top_k)

            # Resolve the category to a set of document IDs first.
            documents = await self.document_store.list_documents(
                limit=1000,
                filters={"category": category}
            )

            if not documents:
                logger.info(f"No documents found in category '{category}'")
                return []

            document_ids = [doc.id for doc in documents]

            filters = {"document_ids": document_ids}
            return await self.search(query, top_k, filters)

        except Exception as e:
            logger.error(f"Error searching by category {category}: {str(e)}")
            return []
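
    # Note: list_documents is capped at 1000 documents here, so very large
    # categories are only partially covered, and the "document_ids" filter
    # must be a shape the underlying VectorStoreService understands.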

    async def search_with_date_range(self, query: str, start_date, end_date, top_k: int = 5) -> List[SearchResult]:
        """Search documents created within a date range."""
        try:
            if not self.document_store:
                logger.warning("Document store not available for date range search")
                return await self.search(query, top_k)

            documents = await self.document_store.list_documents(
                limit=1000,
                filters={
                    "created_after": start_date,
                    "created_before": end_date
                }
            )

            if not documents:
                logger.info(f"No documents found between {start_date} and {end_date}")
                return []

            document_ids = [doc.id for doc in documents]

            filters = {"document_ids": document_ids}
            return await self.search(query, top_k, filters)

        except Exception as e:
            logger.error(f"Error searching with date range: {str(e)}")
            return []

    async def get_search_suggestions(self, partial_query: str, limit: int = 5) -> List[str]:
        """Get search suggestions based on a partial query."""
        try:
            if len(partial_query) < 2:
                return []

            # Over-fetch so there is enough content to mine phrases from.
            results = await self.search(partial_query, top_k=20)

            suggestions = set()

            for result in results:
                content_words = result.content.lower().split()
                for i, word in enumerate(content_words):
                    if partial_query.lower() in word:
                        suggestions.add(word.strip('.,!?;:'))

                        # Also propose two-word phrases around the match.
                        if i > 0:
                            phrase = f"{content_words[i-1]} {word}".strip('.,!?;:')
                            suggestions.add(phrase)
                        if i < len(content_words) - 1:
                            phrase = f"{word} {content_words[i+1]}".strip('.,!?;:')
                            suggestions.add(phrase)

            # Keep only suggestions that extend the partial query.
            filtered_suggestions = [
                s for s in suggestions
                if len(s) > len(partial_query) and s.startswith(partial_query.lower())
            ]

            return sorted(filtered_suggestions)[:limit]

        except Exception as e:
            logger.error(f"Error getting search suggestions: {str(e)}")
            return []
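
    # E.g. for partial_query "proc", a chunk containing "processing procedures
    # manual" can yield suggestions like "processing" and "processing procedures".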

    async def explain_search(self, query: str, top_k: int = 3) -> Dict[str, Any]:
        """Provide a detailed explanation of the search process and results."""
        try:
            explanation = {
                "query": query,
                "steps": [],
                "results_analysis": {},
                "performance_metrics": {}
            }

            explanation["steps"].append({
                "step": "query_processing",
                "description": "Processing and normalizing the search query",
                "details": {
                    "original_query": query,
                    "cleaned_query": query.strip(),
                    "query_length": len(query)
                }
            })

            start_time = time.time()
            query_embedding = await self.embedding_service.generate_single_embedding(query)
            embedding_time = time.time() - start_time

            explanation["steps"].append({
                "step": "embedding_generation",
                "description": "Converting query to vector embedding",
                "details": {
                    "embedding_dimension": len(query_embedding) if query_embedding else 0,
                    "generation_time_ms": round(embedding_time * 1000, 2)
                }
            })

            # Bail out before the vector search if embedding failed.
            if not query_embedding:
                explanation["error"] = "Failed to generate query embedding"
                return explanation

            start_time = time.time()
            results = await self.vector_store.search(query_embedding, top_k)
            search_time = time.time() - start_time

            explanation["steps"].append({
                "step": "vector_search",
                "description": "Searching vector database for similar content",
                "details": {
                    "search_time_ms": round(search_time * 1000, 2),
                    "results_found": len(results),
                    "top_score": results[0].score if results else 0,
                    "score_range": f"{min(r.score for r in results):.3f}-{max(r.score for r in results):.3f}" if results else "N/A"
                }
            })

            if results:
                explanation["results_analysis"] = {
                    "total_results": len(results),
                    "average_score": sum(r.score for r in results) / len(results),
                    "unique_documents": len(set(r.document_id for r in results)),
                    "content_lengths": [len(r.content) for r in results]
                }

            explanation["performance_metrics"] = {
                "total_time_ms": round((embedding_time + search_time) * 1000, 2),
                "embedding_time_ms": round(embedding_time * 1000, 2),
                "search_time_ms": round(search_time * 1000, 2)
            }

            return explanation

        except Exception as e:
            logger.error(f"Error explaining search: {str(e)}")
            return {"error": str(e)}