from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import hashlib

# Load the embedding model once at module import so every VectorStore shares it.
embedder = SentenceTransformer('all-MiniLM-L6-v2')


class VectorStore:
    """In-memory semantic search store backed by a FAISS flat L2 index.

    Texts are deduplicated by the MD5 hash of their UTF-8 bytes before
    being embedded and indexed. Stored texts and FAISS index rows stay
    aligned: row i of the index corresponds to ``self.texts[i]``.
    """

    def __init__(self):
        self.texts = []           # stored texts, aligned with index rows
        self.embeddings = []      # one float32 vector per stored text
        self.index = None         # faiss.IndexFlatL2, created lazily on first add
        self.text_hashes = set()  # MD5 hex digests of stored texts (dedup)

    def add_texts(self, texts):
        """Embed and index any texts not already present in the store.

        Duplicates (by MD5 of the UTF-8 encoding) are skipped. No-op when
        every input text is already stored.

        Args:
            texts: iterable of strings to add.
        """
        new_texts = []
        for text in texts:
            # MD5 is used only as a cheap dedup fingerprint, not for security.
            text_hash = hashlib.md5(text.encode()).hexdigest()
            if text_hash not in self.text_hashes:
                new_texts.append(text)
                self.text_hashes.add(text_hash)
        if not new_texts:
            return

        # FAISS requires contiguous float32 input.
        new_embeds = np.asarray(embedder.encode(new_texts), dtype='float32')
        self.texts.extend(new_texts)
        self.embeddings.extend(new_embeds)

        if self.index is None:
            # Dimension comes from the embedding width (shape[1] of the batch).
            self.index = faiss.IndexFlatL2(new_embeds.shape[1])
        # BUGFIX: append only the NEW vectors. The original reset() the index
        # and re-added every stored embedding on each call, making each insert
        # O(total vectors) instead of O(new vectors).
        self.index.add(new_embeds)

    def retrieve(self, query, top_k=3):
        """Return the top-k most similar stored texts for a query.

        Args:
            query: query string to embed and search with.
            top_k: maximum number of results (capped at the store size).

        Returns:
            (texts, indices) — matching texts and their row indices, both
            ordered by ascending L2 distance. Empty lists when the store
            is empty.
        """
        # BUGFIX: explicit None-check; truthiness of a FAISS index object
        # is not a reliable "does the index exist" test.
        if self.index is None or not self.texts:
            return [], []

        query_array = np.asarray(embedder.encode([query]), dtype='float32')
        k = min(top_k, len(self.texts))
        distances, indices = self.index.search(query_array, k=k)

        # BUGFIX: FAISS pads missing results with -1; filter defensively so
        # we never index self.texts[-1] by accident. Cast to plain int for
        # clean serialization downstream.
        hits = [int(i) for i in indices[0] if i >= 0]
        return [self.texts[i] for i in hits], hits

    def clear(self):
        """Reset the store to its empty initial state."""
        self.texts = []
        self.embeddings = []
        self.index = None
        self.text_hashes = set()