|
""" |
|
Tests for vector store module. |
|
""" |
|
|
|
import pytest |
|
import numpy as np |
|
from unittest.mock import Mock |
|
import tempfile |
|
from pathlib import Path |
|
|
|
import sys |
|
sys.path.append(str(Path(__file__).parent.parent)) |
|
|
|
from src.vector_store import VectorStore, VectorEntry |
|
from src.document_processor import DocumentChunk |
|
|
|
|
|
@pytest.fixture
def sample_config(tmp_path):
    """Return a minimal configuration dict for building a VectorStore in tests.

    The cache directory is placed under pytest's per-test ``tmp_path`` instead
    of a fixed ``./test_cache`` relative to the current working directory, so
    anything written there cannot leak into the source tree or be shared
    between tests and runs.

    NOTE(review): whether VectorStore creates or writes to ``cache_dir`` is
    not visible from this file -- confirm against the vector store module.

    Args:
        tmp_path: pytest's built-in per-test temporary directory fixture.

    Returns:
        A dict with a ``"cache"`` section holding ``"cache_dir"`` as a string.
    """
    cache_dir = tmp_path / "test_cache"
    return {
        "cache": {
            # Kept as a plain string, matching the type of the original value.
            "cache_dir": str(cache_dir),
        },
    }
|
|
|
|
|
@pytest.fixture
def vector_store(sample_config):
    """Provide a VectorStore built from ``sample_config`` with 384-dim embeddings."""
    store = VectorStore(sample_config, embedding_dim=384)
    return store
|
|
|
|
|
@pytest.fixture
def sample_chunks():
    """Build three DocumentChunk objects with ids ``chunk_0`` .. ``chunk_2``.

    Each chunk's metadata records its source file name, its index, and a copy
    of the chunk text under the ``"content"`` key.
    """

    def _make_chunk(index):
        # One chunk per index; the text is also mirrored into the metadata.
        text = f"This is test document content number {index}. It contains some text for testing."
        info = {
            "source": f"test_doc_{index}.txt",
            "chunk_index": index,
            "content": text,
        }
        return DocumentChunk(text, info, f"chunk_{index}")

    return [_make_chunk(index) for index in range(3)]
|
|
|
|
|
@pytest.fixture
def sample_embeddings():
    """Return a deterministic ``(3, 384)`` float32 embedding matrix for tests.

    A private ``RandomState`` seeded with 42 is used instead of calling
    ``np.random.seed(42)``. It yields the same values as seeding the global
    generator and then calling ``np.random.rand(3, 384)``, but requesting the
    fixture no longer reseeds NumPy's process-wide random state as a side
    effect.

    Returns:
        A ``numpy.ndarray`` of shape ``(3, 384)`` and dtype ``float32`` with
        values drawn uniformly from ``[0, 1)``.
    """
    rng = np.random.RandomState(42)
    return rng.rand(3, 384).astype(np.float32)
|
|
|
|
|
class TestVectorStore:
    """Tests for VectorStore: insertion, search, lookup, deletion, stats, persistence."""

    def test_init(self, sample_config):
        """A freshly constructed store keeps its dimension and holds no vectors."""
        vs = VectorStore(sample_config, embedding_dim=384)
        assert vs.embedding_dim == 384
        assert len(vs._vectors) == 0
        assert len(vs._id_to_index) == 0

    def test_add_documents(self, vector_store, sample_chunks, sample_embeddings):
        """Adding three chunks returns three ids, all registered in the index."""
        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)

        assert len(added_ids) == 3
        assert len(vector_store._vectors) == 3
        assert len(vector_store._id_to_index) == 3

        # Every returned id must be resolvable through the id -> index map.
        for chunk_id in added_ids:
            assert chunk_id in vector_store._id_to_index

    def test_search_similar_vectors(self, vector_store, sample_chunks, sample_embeddings):
        """Search returns between 1 and k results shaped (id, similarity, metadata)."""
        vector_store.add_documents(sample_chunks, sample_embeddings)

        # Query with a stored embedding so at least one result is expected.
        query_embedding = sample_embeddings[0]
        results = vector_store.search(query_embedding, k=2)

        assert len(results) > 0
        assert len(results) <= 2

        for result in results:
            assert len(result) == 3
            vector_id, similarity, metadata = result
            assert isinstance(vector_id, str)
            assert isinstance(similarity, float)
            assert isinstance(metadata, dict)
            assert 0 <= similarity <= 1

    def test_search_with_metadata_filter(self, vector_store, sample_chunks, sample_embeddings):
        """Every result of a filtered search carries the requested source."""
        vector_store.add_documents(sample_chunks, sample_embeddings)

        query_embedding = sample_embeddings[0]
        metadata_filter = {"source": "test_doc_1.txt"}
        results = vector_store.search(query_embedding, k=5, metadata_filter=metadata_filter)

        # NOTE: this only constrains the results that are returned; an empty
        # result list passes without checking anything.
        for result in results:
            _, _, metadata = result
            assert metadata["source"] == "test_doc_1.txt"

    def test_get_by_id(self, vector_store, sample_chunks, sample_embeddings):
        """Lookup by id yields the stored vector and its metadata."""
        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)

        first_id = added_ids[0]
        result = vector_store.get_by_id(first_id)

        assert result is not None
        vector, metadata = result
        assert isinstance(vector, np.ndarray)
        assert vector.shape == (384,)
        assert isinstance(metadata, dict)
        assert metadata["source"] == "test_doc_0.txt"

    def test_delete_by_id(self, vector_store, sample_chunks, sample_embeddings):
        """Deleting an id removes exactly one vector and makes it unreachable."""
        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)
        original_count = len(vector_store._vectors)

        first_id = added_ids[0]
        success = vector_store.delete_by_id(first_id)

        assert success
        assert len(vector_store._vectors) == original_count - 1
        assert first_id not in vector_store._id_to_index

        # A deleted id must no longer resolve through the public lookup.
        result = vector_store.get_by_id(first_id)
        assert result is None

    def test_clear(self, vector_store, sample_chunks, sample_embeddings):
        """clear() empties the vectors, the id index, and the cached matrix."""
        vector_store.add_documents(sample_chunks, sample_embeddings)
        assert len(vector_store._vectors) > 0

        vector_store.clear()

        assert len(vector_store._vectors) == 0
        assert len(vector_store._id_to_index) == 0
        assert vector_store._vector_matrix is None

    def test_get_stats(self, vector_store, sample_chunks, sample_embeddings):
        """Stats start at zero and reflect the vectors added afterwards."""
        # Baseline on the empty store.
        stats = vector_store.get_stats()
        assert stats["total_vectors"] == 0
        assert stats["searches_performed"] == 0

        # After inserting the three sample chunks.
        vector_store.add_documents(sample_chunks, sample_embeddings)
        stats = vector_store.get_stats()
        assert stats["total_vectors"] == 3
        assert stats["embedding_dimension"] == 384

    def test_save_and_load(self, vector_store, sample_chunks, sample_embeddings, tmp_path):
        """A store saved to disk can be reloaded with the same ids and dimension.

        The target is a path under pytest's ``tmp_path`` that does not exist
        before saving. A pre-created temporary file would make the existence
        check below pass even if nothing had been written, and manual cleanup
        would miss the file whenever ``save_to_disk`` returns a different path
        than the one it was given.
        """
        original_ids = vector_store.add_documents(sample_chunks, sample_embeddings)
        original_count = len(vector_store._vectors)

        # Passed as a string, as in the original call.
        target_file = str(tmp_path / "vector_store.pkl")
        assert not Path(target_file).exists()

        saved_path = vector_store.save_to_disk(target_file)
        assert Path(saved_path).exists()

        # Load into a brand-new store that shares only the configuration.
        new_vector_store = VectorStore(vector_store.config)
        new_vector_store.load_from_disk(saved_path)

        assert len(new_vector_store._vectors) == original_count
        assert new_vector_store.embedding_dim == 384

        # Every id added before saving must be retrievable after loading.
        for vector_id in original_ids:
            result = new_vector_store.get_by_id(vector_id)
            assert result is not None

    def test_embedding_dimension_validation(self, sample_config):
        """Adding embeddings of the wrong width raises ValueError."""
        vs = VectorStore(sample_config, embedding_dim=256)

        # The store expects 256-dim vectors; supply 384-dim ones.
        chunks = [DocumentChunk("test", {"source": "test.txt"}, "test_id")]
        wrong_embeddings = np.random.rand(1, 384)

        with pytest.raises(ValueError, match="Embedding dimension"):
            vs.add_documents(chunks, wrong_embeddings)
|
|
|
|
|
class TestVectorEntry:
    """Checks for the VectorEntry record type."""

    def test_vector_entry_creation(self):
        """Constructor arguments are exposed as attributes and a timestamp is set."""
        meta = {"source": "test.txt"}
        vec = np.random.rand(384)

        created = VectorEntry("test_id", vec, meta)

        assert created.id == "test_id"
        assert np.array_equal(created.vector, vec)
        assert created.metadata == meta
        assert created.timestamp is not None

    def test_vector_entry_to_dict(self):
        """to_dict() reports id, metadata, a timestamp and the vector's shape."""
        vec = np.random.rand(384)
        meta = {"source": "test.txt"}

        serialized = VectorEntry("test_id", vec, meta).to_dict()

        assert serialized["id"] == "test_id"
        assert serialized["metadata"] == meta
        # Both bookkeeping keys must be present in the serialized form.
        for expected_key in ("timestamp", "vector_shape"):
            assert expected_key in serialized
        assert serialized["vector_shape"] == (384,)
|
|
|
|
|
if __name__ == "__main__":
    # Propagate pytest's exit status so that running this file directly exits
    # non-zero when a test fails, instead of discarding the result and
    # always exiting with 0.
    sys.exit(pytest.main([__file__]))