Document_intelligence / src /retriever.py
Abhinav Gavireddi
[fix]: optimized the entire pipeline
6c61722
import os
from typing import List, Dict, Any
import numpy as np
from src import RetrieverConfig, logger, get_chroma_client, get_embedder
class Retriever:
"""
Retrieves documents from a ChromaDB collection.
"""
def __init__(self, collection_name: str, config: RetrieverConfig):
self.collection_name = collection_name
self.config = config
self.client = get_chroma_client()
self.embedder = get_embedder()
self.collection = self.client.get_or_create_collection(name=self.collection_name)
def retrieve(self, query: str, top_k: int = None) -> List[Dict[str, Any]]:
"""
Embeds a query and retrieves the top_k most similar documents from ChromaDB.
"""
if top_k is None:
top_k = self.config.TOP_K
if self.collection.count() == 0:
logger.warning(f"Chroma collection '{self.collection_name}' is empty. Cannot retrieve.")
return []
try:
# 1. Embed the query
query_embedding = self.embedder.embed([query])[0]
# 2. Query ChromaDB
results = self.collection.query(
query_embeddings=[query_embedding],
n_results=top_k,
include=["metadatas", "documents"]
)
# 3. Format results into chunks
# Chroma returns lists of lists, so we access the first element.
if not results or not results.get('ids', [[]])[0]:
return []
ids = results['ids'][0]
documents = results['documents'][0]
metadatas = results['metadatas'][0]
retrieved_chunks = []
for i, doc_id in enumerate(ids):
chunk = {
'id': doc_id,
'narration': documents[i],
**metadatas[i] # Add all other metadata from Chroma
}
retrieved_chunks.append(chunk)
return retrieved_chunks
except Exception as e:
logger.error(f"ChromaDB retrieval failed for collection '{self.collection_name}': {e}")
return []