"""
Tests for vector store module.
"""
import pytest
import numpy as np
from unittest.mock import Mock
import tempfile
from pathlib import Path
import sys
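# Make the project root importable so the `src` package resolves when this
# file is run on its own (path assumption: this file sits one level below
# the project root, e.g. in tests/).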
sys.path.append(str(Path(__file__).parent.parent))

from src.vector_store import VectorStore, VectorEntry
from src.document_processor import DocumentChunk


@pytest.fixture
def sample_config():
    """Sample configuration for testing."""
    return {
        "cache": {
            "cache_dir": "./test_cache"
        }
    }


@pytest.fixture
def vector_store(sample_config):
    """Vector store instance."""
    return VectorStore(sample_config, embedding_dim=384)


@pytest.fixture
def sample_chunks():
    """Sample document chunks for testing."""
    chunks = []
    for i in range(3):
        content = f"This is test document content number {i}. It contains some text for testing."
        metadata = {
            "source": f"test_doc_{i}.txt",
            "chunk_index": i,
            "content": content
        }
        chunk = DocumentChunk(content, metadata, f"chunk_{i}")
        chunks.append(chunk)
    return chunks


@pytest.fixture
def sample_embeddings():
    """Sample embeddings for testing."""
    np.random.seed(42)  # For reproducible results
    return np.random.rand(3, 384).astype(np.float32)


class TestVectorStore:
    """Test vector store functionality."""

    def test_init(self, sample_config):
        """Test vector store initialization."""
        vs = VectorStore(sample_config, embedding_dim=384)

        assert vs.embedding_dim == 384
        assert len(vs._vectors) == 0
        assert len(vs._id_to_index) == 0

    def test_add_documents(self, vector_store, sample_chunks, sample_embeddings):
        """Test adding documents to vector store."""
        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)

        assert len(added_ids) == 3
        assert len(vector_store._vectors) == 3
        assert len(vector_store._id_to_index) == 3

        # Check that all IDs are in the index
        for chunk_id in added_ids:
            assert chunk_id in vector_store._id_to_index

    def test_search_similar_vectors(self, vector_store, sample_chunks, sample_embeddings):
        """Test similarity search."""
        # Add documents first
        vector_store.add_documents(sample_chunks, sample_embeddings)

        # Search with first embedding (should return itself as most similar)
        query_embedding = sample_embeddings[0]
        results = vector_store.search(query_embedding, k=2)

        assert len(results) > 0
        assert len(results) <= 2

        # Results should be tuples of (id, similarity, metadata)
        for result in results:
            assert len(result) == 3
            vector_id, similarity, metadata = result
            assert isinstance(vector_id, str)
            assert isinstance(similarity, float)
            assert isinstance(metadata, dict)
            assert 0 <= similarity <= 1

    def test_search_with_metadata_filter(self, vector_store, sample_chunks, sample_embeddings):
        """Test search with metadata filtering."""
        # Add documents
        vector_store.add_documents(sample_chunks, sample_embeddings)

        # Search with metadata filter
        query_embedding = sample_embeddings[0]
        metadata_filter = {"source": "test_doc_1.txt"}
        results = vector_store.search(query_embedding, k=5, metadata_filter=metadata_filter)

        # Should only return documents matching the filter
        for result in results:
            _, _, metadata = result
            assert metadata["source"] == "test_doc_1.txt"

    def test_get_by_id(self, vector_store, sample_chunks, sample_embeddings):
        """Test retrieving vectors by ID."""
        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)

        # Get first document
        first_id = added_ids[0]
        result = vector_store.get_by_id(first_id)

        assert result is not None
        vector, metadata = result
        assert isinstance(vector, np.ndarray)
        assert vector.shape == (384,)
        assert isinstance(metadata, dict)
        assert metadata["source"] == "test_doc_0.txt"

    def test_delete_by_id(self, vector_store, sample_chunks, sample_embeddings):
        """Test deleting vectors by ID."""
        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)
        original_count = len(vector_store._vectors)

        # Delete first document
        first_id = added_ids[0]
        success = vector_store.delete_by_id(first_id)

        assert success
        assert len(vector_store._vectors) == original_count - 1
        assert first_id not in vector_store._id_to_index

        # Should not be able to retrieve deleted document
        result = vector_store.get_by_id(first_id)
        assert result is None

    def test_clear(self, vector_store, sample_chunks, sample_embeddings):
        """Test clearing all vectors."""
        vector_store.add_documents(sample_chunks, sample_embeddings)
        assert len(vector_store._vectors) > 0

        vector_store.clear()

        assert len(vector_store._vectors) == 0
        assert len(vector_store._id_to_index) == 0
        assert vector_store._vector_matrix is None

    def test_get_stats(self, vector_store, sample_chunks, sample_embeddings):
        """Test getting statistics."""
        # Test empty store
        stats = vector_store.get_stats()
        assert stats["total_vectors"] == 0
        assert stats["searches_performed"] == 0

        # Add documents and test again
        vector_store.add_documents(sample_chunks, sample_embeddings)
        stats = vector_store.get_stats()
        assert stats["total_vectors"] == 3
        assert stats["embedding_dimension"] == 384

    def test_save_and_load(self, vector_store, sample_chunks, sample_embeddings):
        """Test saving and loading vector store."""
        # Add documents
        original_ids = vector_store.add_documents(sample_chunks, sample_embeddings)
        original_count = len(vector_store._vectors)

        # Save to temporary file
        with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as f:
            temp_file = f.name

        try:
            saved_path = vector_store.save_to_disk(temp_file)
            assert Path(saved_path).exists()

            # Create new vector store and load
            new_vector_store = VectorStore(vector_store.config)
            new_vector_store.load_from_disk(saved_path)

            # Check that data was loaded correctly
            assert len(new_vector_store._vectors) == original_count
            assert new_vector_store.embedding_dim == 384

            # Check that we can retrieve the same documents
            for vector_id in original_ids:
                result = new_vector_store.get_by_id(vector_id)
                assert result is not None
        finally:
            if Path(temp_file).exists():
                Path(temp_file).unlink()

    def test_embedding_dimension_validation(self, sample_config):
        """Test embedding dimension validation."""
        vs = VectorStore(sample_config, embedding_dim=256)

        # Create chunks and embeddings with wrong dimension
        chunks = [DocumentChunk("test", {"source": "test.txt"}, "test_id")]
        wrong_embeddings = np.random.rand(1, 384)  # Wrong dimension

        with pytest.raises(ValueError, match="Embedding dimension"):
            vs.add_documents(chunks, wrong_embeddings)


class TestVectorEntry:
    """Test VectorEntry functionality."""

    def test_vector_entry_creation(self):
        """Test creating vector entries."""
        vector = np.random.rand(384)
        metadata = {"source": "test.txt"}

        entry = VectorEntry("test_id", vector, metadata)

        assert entry.id == "test_id"
        assert np.array_equal(entry.vector, vector)
        assert entry.metadata == metadata
        assert entry.timestamp is not None

    def test_vector_entry_to_dict(self):
        """Test vector entry serialization."""
        vector = np.random.rand(384)
        metadata = {"source": "test.txt"}
        entry = VectorEntry("test_id", vector, metadata)

        entry_dict = entry.to_dict()

        assert entry_dict["id"] == "test_id"
        assert entry_dict["metadata"] == metadata
        assert "timestamp" in entry_dict
        assert "vector_shape" in entry_dict
        assert entry_dict["vector_shape"] == (384,)


if __name__ == "__main__":
    pytest.main([__file__])
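
# Note: running this file directly invokes pytest on itself via the __main__
# guard above. The usual entry point is still plain pytest from the project
# root, e.g. `pytest -v` (exact invocation depends on the project layout).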