""" | |
Tests for vector store module. | |
""" | |
import pytest | |
import numpy as np | |
from unittest.mock import Mock | |
import tempfile | |
from pathlib import Path | |
import sys | |
sys.path.append(str(Path(__file__).parent.parent)) | |
from src.vector_store import VectorStore, VectorEntry | |
from src.document_processor import DocumentChunk | |


@pytest.fixture
def sample_config():
    """Sample configuration for testing."""
    return {
        "cache": {
            "cache_dir": "./test_cache"
        }
    }


@pytest.fixture
def vector_store(sample_config):
    """Vector store instance."""
    return VectorStore(sample_config, embedding_dim=384)


@pytest.fixture
def sample_chunks():
    """Sample document chunks for testing."""
    chunks = []
    for i in range(3):
        content = f"This is test document content number {i}. It contains some text for testing."
        metadata = {
            "source": f"test_doc_{i}.txt",
            "chunk_index": i,
            "content": content
        }
        chunk = DocumentChunk(content, metadata, f"chunk_{i}")
        chunks.append(chunk)
    return chunks


@pytest.fixture
def sample_embeddings():
    """Sample embeddings for testing."""
    np.random.seed(42)  # For reproducible results
    return np.random.rand(3, 384).astype(np.float32)


class TestVectorStore:
    """Test vector store functionality."""

    def test_init(self, sample_config):
        """Test vector store initialization."""
        vs = VectorStore(sample_config, embedding_dim=384)

        assert vs.embedding_dim == 384
        assert len(vs._vectors) == 0
        assert len(vs._id_to_index) == 0
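
    def test_search_empty_store(self, vector_store):
        """Test searching before any documents are added.

        A minimal sketch, assuming search on an empty store returns an
        empty list rather than raising; adjust if the implementation
        raises instead.
        """
        query_embedding = np.random.rand(384).astype(np.float32)
        results = vector_store.search(query_embedding, k=5)

        assert results == []  # assumed behavior: no results, no exception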

    def test_add_documents(self, vector_store, sample_chunks, sample_embeddings):
        """Test adding documents to vector store."""
        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)

        assert len(added_ids) == 3
        assert len(vector_store._vectors) == 3
        assert len(vector_store._id_to_index) == 3

        # Check that all IDs are in the index
        for chunk_id in added_ids:
            assert chunk_id in vector_store._id_to_index

    def test_search_similar_vectors(self, vector_store, sample_chunks, sample_embeddings):
        """Test similarity search."""
        # Add documents first
        vector_store.add_documents(sample_chunks, sample_embeddings)

        # Search with first embedding (should return itself as most similar)
        query_embedding = sample_embeddings[0]
        results = vector_store.search(query_embedding, k=2)

        assert len(results) > 0
        assert len(results) <= 2

        # Results should be tuples of (id, similarity, metadata)
        for result in results:
            assert len(result) == 3
            vector_id, similarity, metadata = result
            assert isinstance(vector_id, str)
            assert isinstance(similarity, float)
            assert isinstance(metadata, dict)
            assert 0 <= similarity <= 1

    def test_search_with_metadata_filter(self, vector_store, sample_chunks, sample_embeddings):
        """Test search with metadata filtering."""
        # Add documents
        vector_store.add_documents(sample_chunks, sample_embeddings)

        # Search with metadata filter
        query_embedding = sample_embeddings[0]
        metadata_filter = {"source": "test_doc_1.txt"}
        results = vector_store.search(query_embedding, k=5, metadata_filter=metadata_filter)

        # Should only return documents matching the filter
        for result in results:
            _, _, metadata = result
            assert metadata["source"] == "test_doc_1.txt"

    def test_get_by_id(self, vector_store, sample_chunks, sample_embeddings):
        """Test retrieving vectors by ID."""
        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)

        # Get first document
        first_id = added_ids[0]
        result = vector_store.get_by_id(first_id)

        assert result is not None
        vector, metadata = result
        assert isinstance(vector, np.ndarray)
        assert vector.shape == (384,)
        assert isinstance(metadata, dict)
        assert metadata["source"] == "test_doc_0.txt"

    def test_delete_by_id(self, vector_store, sample_chunks, sample_embeddings):
        """Test deleting vectors by ID."""
        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)
        original_count = len(vector_store._vectors)

        # Delete first document
        first_id = added_ids[0]
        success = vector_store.delete_by_id(first_id)

        assert success
        assert len(vector_store._vectors) == original_count - 1
        assert first_id not in vector_store._id_to_index

        # Should not be able to retrieve deleted document
        result = vector_store.get_by_id(first_id)
        assert result is None
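
    def test_delete_unknown_id(self, vector_store, sample_chunks, sample_embeddings):
        """Test deleting an ID that does not exist.

        A sketch, assuming delete_by_id returns False for unknown IDs
        instead of raising, and leaves the store untouched.
        """
        vector_store.add_documents(sample_chunks, sample_embeddings)
        original_count = len(vector_store._vectors)

        success = vector_store.delete_by_id("no_such_id")  # hypothetical missing ID

        assert success is False  # assumed behavior
        assert len(vector_store._vectors) == original_count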

    def test_clear(self, vector_store, sample_chunks, sample_embeddings):
        """Test clearing all vectors."""
        vector_store.add_documents(sample_chunks, sample_embeddings)
        assert len(vector_store._vectors) > 0

        vector_store.clear()

        assert len(vector_store._vectors) == 0
        assert len(vector_store._id_to_index) == 0
        assert vector_store._vector_matrix is None

    def test_get_stats(self, vector_store, sample_chunks, sample_embeddings):
        """Test getting statistics."""
        # Test empty store
        stats = vector_store.get_stats()
        assert stats["total_vectors"] == 0
        assert stats["searches_performed"] == 0

        # Add documents and test again
        vector_store.add_documents(sample_chunks, sample_embeddings)
        stats = vector_store.get_stats()
        assert stats["total_vectors"] == 3
        assert stats["embedding_dimension"] == 384

    def test_save_and_load(self, vector_store, sample_chunks, sample_embeddings):
        """Test saving and loading vector store."""
        # Add documents
        original_ids = vector_store.add_documents(sample_chunks, sample_embeddings)
        original_count = len(vector_store._vectors)

        # Save to temporary file
        with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as f:
            temp_file = f.name

        try:
            saved_path = vector_store.save_to_disk(temp_file)
            assert Path(saved_path).exists()

            # Create new vector store and load
            new_vector_store = VectorStore(vector_store.config)
            new_vector_store.load_from_disk(saved_path)

            # Check that data was loaded correctly
            assert len(new_vector_store._vectors) == original_count
            assert new_vector_store.embedding_dim == 384

            # Check that we can retrieve the same documents
            for vector_id in original_ids:
                result = new_vector_store.get_by_id(vector_id)
                assert result is not None
        finally:
            if Path(temp_file).exists():
                Path(temp_file).unlink()
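
    def test_search_after_load(self, vector_store, sample_chunks, sample_embeddings, tmp_path):
        """Test that search works on a reloaded store.

        A sketch, assuming save_to_disk/load_from_disk round-trip the
        vectors exactly, so a reloaded store ranks the same IDs for the
        same query; uses pytest's built-in tmp_path fixture.
        """
        vector_store.add_documents(sample_chunks, sample_embeddings)
        query_embedding = sample_embeddings[0]
        original_ids = [r[0] for r in vector_store.search(query_embedding, k=3)]

        saved_path = vector_store.save_to_disk(str(tmp_path / "store.pkl"))
        reloaded = VectorStore(vector_store.config)
        reloaded.load_from_disk(saved_path)

        reloaded_ids = [r[0] for r in reloaded.search(query_embedding, k=3)]
        assert reloaded_ids == original_ids  # assumed: deterministic ranking survives reload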

    def test_embedding_dimension_validation(self, sample_config):
        """Test embedding dimension validation."""
        vs = VectorStore(sample_config, embedding_dim=256)

        # Create chunks and embeddings with wrong dimension
        chunks = [DocumentChunk("test", {"source": "test.txt"}, "test_id")]
        wrong_embeddings = np.random.rand(1, 384)  # Wrong dimension

        with pytest.raises(ValueError, match="Embedding dimension"):
            vs.add_documents(chunks, wrong_embeddings)


class TestVectorEntry:
    """Test VectorEntry functionality."""

    def test_vector_entry_creation(self):
        """Test creating vector entries."""
        vector = np.random.rand(384)
        metadata = {"source": "test.txt"}

        entry = VectorEntry("test_id", vector, metadata)

        assert entry.id == "test_id"
        assert np.array_equal(entry.vector, vector)
        assert entry.metadata == metadata
        assert entry.timestamp is not None

    def test_vector_entry_to_dict(self):
        """Test vector entry serialization."""
        vector = np.random.rand(384)
        metadata = {"source": "test.txt"}
        entry = VectorEntry("test_id", vector, metadata)

        entry_dict = entry.to_dict()

        assert entry_dict["id"] == "test_id"
        assert entry_dict["metadata"] == metadata
        assert "timestamp" in entry_dict
        assert "vector_shape" in entry_dict
        assert entry_dict["vector_shape"] == (384,)


if __name__ == "__main__":
    pytest.main([__file__])