muhammadnoman76 committed on
Commit
d2d11be
·
1 Parent(s): b3b4203
app/services/vector_database_search.py CHANGED
@@ -52,13 +52,17 @@ class VectorDatabaseSearch:
52
  split_docs = splitter.split_documents(docs)
53
 
54
  book_name = os.path.splitext(os.path.basename(pdf_path))[0]
 
 
55
  for doc in split_docs:
 
56
  doc.metadata = {
57
  "source": book_name,
58
  "page": doc.metadata.get('page', 1),
59
  "id": str(uuid.uuid4())
60
  }
61
 
 
62
  self.vectorstore.add_documents(split_docs)
63
  print(f"Added {len(split_docs)} chunks from {book_name}")
64
  return True
@@ -87,13 +91,35 @@ class VectorDatabaseSearch:
87
  def get_book_info(self):
88
  """Retrieve list of unique book sources in the collection"""
89
  try:
 
 
 
 
 
 
 
90
  points = self.client.scroll(
91
  collection_name=self.collection_name,
92
  limit=1000,
93
- with_payload=True
 
94
  )[0]
95
 
96
- books = set(point.payload.get('source', '') for point in points if point.payload)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  return list(books)
98
  except Exception as e:
99
  print(f"Error retrieving book info: {e}")
 
52
  split_docs = splitter.split_documents(docs)
53
 
54
  book_name = os.path.splitext(os.path.basename(pdf_path))[0]
55
+ print(f"Processing {book_name} with {len(split_docs)} chunks")
56
+
57
  for doc in split_docs:
58
+ # Ensure metadata is stored in a consistent way
59
  doc.metadata = {
60
  "source": book_name,
61
  "page": doc.metadata.get('page', 1),
62
  "id": str(uuid.uuid4())
63
  }
64
 
65
+ # Add documents to vector store
66
  self.vectorstore.add_documents(split_docs)
67
  print(f"Added {len(split_docs)} chunks from {book_name}")
68
  return True
 
91
  def get_book_info(self):
92
  """Retrieve list of unique book sources in the collection"""
93
  try:
94
+ # First check if the collection exists
95
+ collections = self.client.get_collections()
96
+ if not any(c.name == self.collection_name for c in collections.collections):
97
+ print(f"Collection {self.collection_name} does not exist yet")
98
+ return []
99
+
100
+ # Get all points with payload from the collection
101
  points = self.client.scroll(
102
  collection_name=self.collection_name,
103
  limit=1000,
104
+ with_payload=True,
105
+ with_vectors=False # We don't need vector data
106
  )[0]
107
 
108
+ # Debug information
109
+ print(f"Retrieved {len(points)} points from collection")
110
+
111
+ # Extract unique book sources from payloads
112
+ books = set()
113
+ for point in points:
114
+ # Check if payload exists and has 'metadata' field with 'source'
115
+ if hasattr(point, 'payload') and point.payload:
116
+ # Check different possible payload structures
117
+ if 'metadata' in point.payload and 'source' in point.payload['metadata']:
118
+ books.add(point.payload['metadata']['source'])
119
+ elif 'source' in point.payload:
120
+ books.add(point.payload['source'])
121
+
122
+ print(f"Found {len(books)} unique books")
123
  return list(books)
124
  except Exception as e:
125
  print(f"Error retrieving book info: {e}")