import os
import uuid
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_qdrant import Qdrant
from qdrant_client import QdrantClient, models
from dotenv import load_dotenv

load_dotenv()

os.environ["GOOGLE_API_KEY"] = os.getenv("GEMINI_API_KEY")
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME")

class VectorDatabaseSearch:
    def __init__(self, collection_name=QDRANT_COLLECTION_NAME):
        self.collection_name = collection_name
        self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        self.client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
        self._initialize_collection()
        
        self.vectorstore = Qdrant(
            client=self.client,
            collection_name=collection_name,
            embeddings=self.embeddings
        )

    def _initialize_collection(self):
        """Initialize Qdrant collection if it doesn't exist"""
        try:
            collections = self.client.get_collections()
            if not any(c.name == self.collection_name for c in collections.collections):
                self.client.create_collection(
                    collection_name=self.collection_name,
                    vectors_config=models.VectorParams(
                        size=768,  # models/embedding-001 produces 768-dimensional vectors
                        distance=models.Distance.COSINE
                    )
                )
                print(f"Created collection: {self.collection_name}")
        except Exception as e:
            print(f"Error initializing collection: {e}")

    def add_pdf(self, pdf_path):
        """Add PDF to vector database"""
        try:
            loader = PyPDFLoader(pdf_path)
            docs = loader.load()
            splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            split_docs = splitter.split_documents(docs)
            
            book_name = os.path.splitext(os.path.basename(pdf_path))[0]
            print(f"Processing {book_name} with {len(split_docs)} chunks")
            
            for doc in split_docs:
                # Ensure metadata is stored in a consistent way
                doc.metadata = {
                    "source": book_name,
                    "page": doc.metadata.get('page', 1),
                    "id": str(uuid.uuid4())
                }
            
            # Add documents to vector store
            self.vectorstore.add_documents(split_docs)
            print(f"Added {len(split_docs)} chunks from {book_name}")
            return True
        except Exception as e:
            print(f"Error adding PDF: {e}")
            return False

    def search(self, query, top_k=5):
        """Search documents based on query"""
        try:
            results = self.vectorstore.similarity_search_with_score(query, k=top_k)
            
            formatted = []
            for doc, score in results:
                formatted.append({
                    "source": doc.metadata['source'],
                    "page": doc.metadata['page'],
                    "content": doc.page_content[:500],
                    "confidence": round(score * 100, 2)
                })
            return formatted
        except Exception as e:
            print(f"Search error: {e}")
            return []

    def get_book_info(self):
        """Retrieve list of unique book sources in the collection"""
        try:
            # First check if the collection exists
            collections = self.client.get_collections()
            if not any(c.name == self.collection_name for c in collections.collections):
                print(f"Collection {self.collection_name} does not exist yet")
                return []
                
            # Fetch up to the first 1,000 points (payload only); larger collections would need pagination
            points = self.client.scroll(
                collection_name=self.collection_name,
                limit=1000,
                with_payload=True,
                with_vectors=False  # We don't need vector data
            )[0]
            
            # Debug information
            print(f"Retrieved {len(points)} points from collection")
            
            # Extract unique book sources from payloads
            books = set()
            for point in points:
                # Check if payload exists and has 'metadata' field with 'source'
                if hasattr(point, 'payload') and point.payload:
                    # Check different possible payload structures
                    if 'metadata' in point.payload and 'source' in point.payload['metadata']:
                        books.add(point.payload['metadata']['source'])
                    elif 'source' in point.payload:
                        books.add(point.payload['source'])
                    
            print(f"Found {len(books)} unique books")
            return list(books)
        except Exception as e:
            print(f"Error retrieving book info: {e}")
            return []
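

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module: assumes QDRANT_URL,
    # QDRANT_API_KEY, QDRANT_COLLECTION_NAME and GEMINI_API_KEY are set in the
    # environment/.env, and that "sample.pdf" is a placeholder path.
    db = VectorDatabaseSearch()
    db.add_pdf("sample.pdf")
    for hit in db.search("What is this document about?", top_k=3):
        print(f"{hit['source']} (p. {hit['page']}, {hit['confidence']}%): {hit['content'][:80]}")
    print("Indexed books:", db.get_book_info())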