Spaces:

AbhinavGavireddi
/

Document_intelligence

Running

File size: 2,201 Bytes

import os
from typing import Any, Dict, List, Optional

import numpy as np

from src import RetrieverConfig, logger, get_chroma_client, get_embedder

class Retriever:
    """
    Retrieves documents from a ChromaDB collection.

    The collection is looked up (or created if missing) once at construction
    time; queries are embedded with the shared embedder and matched against
    that collection.
    """
    def __init__(self, collection_name: str, config: "RetrieverConfig"):
        """
        Bind the retriever to a named Chroma collection.

        Args:
            collection_name: Name of the Chroma collection to query. The
                collection is created if it does not exist yet.
            config: Retriever settings; ``config.TOP_K`` is the default
                number of results returned by :meth:`retrieve`.
        """
        self.collection_name = collection_name
        self.config = config
        self.client = get_chroma_client()
        self.embedder = get_embedder()
        self.collection = self.client.get_or_create_collection(name=self.collection_name)

    def retrieve(self, query: str, top_k: Optional[int] = None) -> List[Dict[str, Any]]:
        """
        Embeds a query and retrieves the top_k most similar documents from ChromaDB.

        Args:
            query: Free-text query to embed and search with.
            top_k: Maximum number of chunks to return. Defaults to
                ``self.config.TOP_K`` when ``None``. Values larger than the
                collection size are clamped to the collection size.

        Returns:
            A list of chunk dicts, one per hit, each holding ``'id'`` (the
            Chroma id), ``'narration'`` (the stored document text) and every
            metadata key stored alongside the document. Metadata is merged
            last, so a metadata key named ``'id'`` or ``'narration'``
            overrides the corresponding field. Returns an empty list when
            the collection is empty, when nothing matches, or when
            embedding/querying fails (the failure is logged, not raised).
        """
        if top_k is None:
            top_k = self.config.TOP_K

        available = self.collection.count()
        if available == 0:
            logger.warning(f"Chroma collection '{self.collection_name}' is empty. Cannot retrieve.")
            return []

        try:
            # Never ask for more neighbours than the collection holds:
            # depending on the Chroma version that either raises or emits a
            # warning on every query.
            n_results = min(top_k, available)

            # 1. Embed the query
            query_embedding = self.embedder.embed([query])[0]

            # 2. Query ChromaDB
            results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=n_results,
                include=["metadatas", "documents"],
            )

            # 3. Format results into chunks
            # Chroma returns lists of lists (one inner list per query), so we
            # access the first element.
            id_batches = results.get('ids') if results else None
            if not id_batches or not id_batches[0]:
                return []

            ids = id_batches[0]
            documents = results['documents'][0]
            metadatas = results['metadatas'][0]

            # A record stored without metadata comes back as None; treat it
            # as "no extra fields" instead of letting the dict-unpacking fail
            # and discard every hit.
            return [
                {
                    'id': doc_id,
                    'narration': document,
                    **(metadata or {}),  # Add all other metadata from Chroma
                }
                for doc_id, document, metadata in zip(ids, documents, metadatas)
            ]

        except Exception as e:
            # Best-effort contract: callers get an empty result rather than
            # an exception when the embedder or Chroma fails.
            logger.error(f"ChromaDB retrieval failed for collection '{self.collection_name}': {e}")
            return []