""" Tests for vector store module. """ import pytest import numpy as np from unittest.mock import Mock import tempfile from pathlib import Path import sys sys.path.append(str(Path(__file__).parent.parent)) from src.vector_store import VectorStore, VectorEntry from src.document_processor import DocumentChunk @pytest.fixture def sample_config(): """Sample configuration for testing.""" return { "cache": { "cache_dir": "./test_cache" } } @pytest.fixture def vector_store(sample_config): """Vector store instance.""" return VectorStore(sample_config, embedding_dim=384) @pytest.fixture def sample_chunks(): """Sample document chunks for testing.""" chunks = [] for i in range(3): content = f"This is test document content number {i}. It contains some text for testing." metadata = { "source": f"test_doc_{i}.txt", "chunk_index": i, "content": content } chunk = DocumentChunk(content, metadata, f"chunk_{i}") chunks.append(chunk) return chunks @pytest.fixture def sample_embeddings(): """Sample embeddings for testing.""" np.random.seed(42) # For reproducible results return np.random.rand(3, 384).astype(np.float32) class TestVectorStore: """Test vector store functionality.""" def test_init(self, sample_config): """Test vector store initialization.""" vs = VectorStore(sample_config, embedding_dim=384) assert vs.embedding_dim == 384 assert len(vs._vectors) == 0 assert len(vs._id_to_index) == 0 def test_add_documents(self, vector_store, sample_chunks, sample_embeddings): """Test adding documents to vector store.""" added_ids = vector_store.add_documents(sample_chunks, sample_embeddings) assert len(added_ids) == 3 assert len(vector_store._vectors) == 3 assert len(vector_store._id_to_index) == 3 # Check that all IDs are in the index for chunk_id in added_ids: assert chunk_id in vector_store._id_to_index def test_search_similar_vectors(self, vector_store, sample_chunks, sample_embeddings): """Test similarity search.""" # Add documents first vector_store.add_documents(sample_chunks, sample_embeddings) # Search with first embedding (should return itself as most similar) query_embedding = sample_embeddings[0] results = vector_store.search(query_embedding, k=2) assert len(results) > 0 assert len(results) <= 2 # Results should be tuples of (id, similarity, metadata) for result in results: assert len(result) == 3 vector_id, similarity, metadata = result assert isinstance(vector_id, str) assert isinstance(similarity, float) assert isinstance(metadata, dict) assert 0 <= similarity <= 1 def test_search_with_metadata_filter(self, vector_store, sample_chunks, sample_embeddings): """Test search with metadata filtering.""" # Add documents vector_store.add_documents(sample_chunks, sample_embeddings) # Search with metadata filter query_embedding = sample_embeddings[0] metadata_filter = {"source": "test_doc_1.txt"} results = vector_store.search(query_embedding, k=5, metadata_filter=metadata_filter) # Should only return documents matching the filter for result in results: _, _, metadata = result assert metadata["source"] == "test_doc_1.txt" def test_get_by_id(self, vector_store, sample_chunks, sample_embeddings): """Test retrieving vectors by ID.""" added_ids = vector_store.add_documents(sample_chunks, sample_embeddings) # Get first document first_id = added_ids[0] result = vector_store.get_by_id(first_id) assert result is not None vector, metadata = result assert isinstance(vector, np.ndarray) assert vector.shape == (384,) assert isinstance(metadata, dict) assert metadata["source"] == "test_doc_0.txt" def test_delete_by_id(self, vector_store, sample_chunks, sample_embeddings): """Test deleting vectors by ID.""" added_ids = vector_store.add_documents(sample_chunks, sample_embeddings) original_count = len(vector_store._vectors) # Delete first document first_id = added_ids[0] success = vector_store.delete_by_id(first_id) assert success assert len(vector_store._vectors) == original_count - 1 assert first_id not in vector_store._id_to_index # Should not be able to retrieve deleted document result = vector_store.get_by_id(first_id) assert result is None def test_clear(self, vector_store, sample_chunks, sample_embeddings): """Test clearing all vectors.""" vector_store.add_documents(sample_chunks, sample_embeddings) assert len(vector_store._vectors) > 0 vector_store.clear() assert len(vector_store._vectors) == 0 assert len(vector_store._id_to_index) == 0 assert vector_store._vector_matrix is None def test_get_stats(self, vector_store, sample_chunks, sample_embeddings): """Test getting statistics.""" # Test empty store stats = vector_store.get_stats() assert stats["total_vectors"] == 0 assert stats["searches_performed"] == 0 # Add documents and test again vector_store.add_documents(sample_chunks, sample_embeddings) stats = vector_store.get_stats() assert stats["total_vectors"] == 3 assert stats["embedding_dimension"] == 384 def test_save_and_load(self, vector_store, sample_chunks, sample_embeddings): """Test saving and loading vector store.""" # Add documents original_ids = vector_store.add_documents(sample_chunks, sample_embeddings) original_count = len(vector_store._vectors) # Save to temporary file with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as f: temp_file = f.name try: saved_path = vector_store.save_to_disk(temp_file) assert Path(saved_path).exists() # Create new vector store and load new_vector_store = VectorStore(vector_store.config) new_vector_store.load_from_disk(saved_path) # Check that data was loaded correctly assert len(new_vector_store._vectors) == original_count assert new_vector_store.embedding_dim == 384 # Check that we can retrieve the same documents for vector_id in original_ids: result = new_vector_store.get_by_id(vector_id) assert result is not None finally: if Path(temp_file).exists(): Path(temp_file).unlink() def test_embedding_dimension_validation(self, sample_config): """Test embedding dimension validation.""" vs = VectorStore(sample_config, embedding_dim=256) # Create chunks and embeddings with wrong dimension chunks = [DocumentChunk("test", {"source": "test.txt"}, "test_id")] wrong_embeddings = np.random.rand(1, 384) # Wrong dimension with pytest.raises(ValueError, match="Embedding dimension"): vs.add_documents(chunks, wrong_embeddings) class TestVectorEntry: """Test VectorEntry functionality.""" def test_vector_entry_creation(self): """Test creating vector entries.""" vector = np.random.rand(384) metadata = {"source": "test.txt"} entry = VectorEntry("test_id", vector, metadata) assert entry.id == "test_id" assert np.array_equal(entry.vector, vector) assert entry.metadata == metadata assert entry.timestamp is not None def test_vector_entry_to_dict(self): """Test vector entry serialization.""" vector = np.random.rand(384) metadata = {"source": "test.txt"} entry = VectorEntry("test_id", vector, metadata) entry_dict = entry.to_dict() assert entry_dict["id"] == "test_id" assert entry_dict["metadata"] == metadata assert "timestamp" in entry_dict assert "vector_shape" in entry_dict assert entry_dict["vector_shape"] == (384,) if __name__ == "__main__": pytest.main([__file__])