File size: 8,785 Bytes
11d9dfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
"""
Tests for vector store module.
"""

import pytest
import numpy as np
from unittest.mock import Mock
import tempfile
from pathlib import Path

import sys
sys.path.append(str(Path(__file__).parent.parent))

from src.vector_store import VectorStore, VectorEntry
from src.document_processor import DocumentChunk


@pytest.fixture
def sample_config():
    """Minimal configuration dict for the vector store (cache dir only)."""
    config = {"cache": {"cache_dir": "./test_cache"}}
    return config


@pytest.fixture
def vector_store(sample_config):
    """Fresh VectorStore built from the sample config with 384-dim embeddings."""
    store = VectorStore(sample_config, embedding_dim=384)
    return store


@pytest.fixture
def sample_chunks():
    """Three small DocumentChunk objects with distinct sources and chunk ids."""
    def _make_chunk(index):
        # The chunk text is duplicated into metadata under "content",
        # mirroring how the document processor populates it.
        text = f"This is test document content number {index}. It contains some text for testing."
        meta = {
            "source": f"test_doc_{index}.txt",
            "chunk_index": index,
            "content": text,
        }
        return DocumentChunk(text, meta, f"chunk_{index}")

    return [_make_chunk(i) for i in range(3)]


@pytest.fixture
def sample_embeddings():
    """Deterministic (3, 384) float32 embedding matrix.

    Uses a local ``numpy.random.Generator`` seeded with 42 instead of
    ``np.random.seed``: the legacy call mutates NumPy's *global* RNG state
    as a test side effect, which can couple unrelated tests that also use
    the global RNG. A local generator is just as reproducible without
    leaking state.
    """
    rng = np.random.default_rng(42)
    return rng.random((3, 384), dtype=np.float32)


class TestVectorStore:
    """Test vector store functionality.

    NOTE(review): several tests assert against private attributes
    (``_vectors``, ``_id_to_index``, ``_vector_matrix``) to verify internal
    state; they will need updating if VectorStore's internals change.
    """
    
    def test_init(self, sample_config):
        """Test vector store initialization."""
        vs = VectorStore(sample_config, embedding_dim=384)
        assert vs.embedding_dim == 384
        # A freshly constructed store must hold no vectors and no id index.
        assert len(vs._vectors) == 0
        assert len(vs._id_to_index) == 0
    
    def test_add_documents(self, vector_store, sample_chunks, sample_embeddings):
        """Test adding documents to vector store."""
        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)
        
        # One stored vector and one index entry per input chunk.
        assert len(added_ids) == 3
        assert len(vector_store._vectors) == 3
        assert len(vector_store._id_to_index) == 3
        
        # Check that all IDs are in the index
        for chunk_id in added_ids:
            assert chunk_id in vector_store._id_to_index
    
    def test_search_similar_vectors(self, vector_store, sample_chunks, sample_embeddings):
        """Test similarity search result shape and score bounds."""
        # Add documents first
        vector_store.add_documents(sample_chunks, sample_embeddings)
        
        # Search with first embedding (should return itself as most similar)
        query_embedding = sample_embeddings[0]
        results = vector_store.search(query_embedding, k=2)
        
        # k caps the result count but does not guarantee exactly k hits.
        assert len(results) > 0
        assert len(results) <= 2
        
        # Results should be tuples of (id, similarity, metadata)
        for result in results:
            assert len(result) == 3
            vector_id, similarity, metadata = result
            assert isinstance(vector_id, str)
            assert isinstance(similarity, float)
            assert isinstance(metadata, dict)
            # Similarity is expected to be a normalized score in [0, 1].
            assert 0 <= similarity <= 1
    
    def test_search_with_metadata_filter(self, vector_store, sample_chunks, sample_embeddings):
        """Test search with metadata filtering."""
        # Add documents
        vector_store.add_documents(sample_chunks, sample_embeddings)
        
        # Search with metadata filter
        query_embedding = sample_embeddings[0]
        metadata_filter = {"source": "test_doc_1.txt"}
        results = vector_store.search(query_embedding, k=5, metadata_filter=metadata_filter)
        
        # Should only return documents matching the filter
        for result in results:
            _, _, metadata = result
            assert metadata["source"] == "test_doc_1.txt"
    
    def test_get_by_id(self, vector_store, sample_chunks, sample_embeddings):
        """Test retrieving vectors by ID."""
        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)
        
        # Get first document
        first_id = added_ids[0]
        result = vector_store.get_by_id(first_id)
        
        # get_by_id returns a (vector, metadata) pair for known ids.
        assert result is not None
        vector, metadata = result
        assert isinstance(vector, np.ndarray)
        assert vector.shape == (384,)
        assert isinstance(metadata, dict)
        assert metadata["source"] == "test_doc_0.txt"
    
    def test_delete_by_id(self, vector_store, sample_chunks, sample_embeddings):
        """Test deleting vectors by ID."""
        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)
        original_count = len(vector_store._vectors)
        
        # Delete first document
        first_id = added_ids[0]
        success = vector_store.delete_by_id(first_id)
        
        # Deletion reports success and removes both the vector and its index entry.
        assert success
        assert len(vector_store._vectors) == original_count - 1
        assert first_id not in vector_store._id_to_index
        
        # Should not be able to retrieve deleted document
        result = vector_store.get_by_id(first_id)
        assert result is None
    
    def test_clear(self, vector_store, sample_chunks, sample_embeddings):
        """Test clearing all vectors."""
        vector_store.add_documents(sample_chunks, sample_embeddings)
        assert len(vector_store._vectors) > 0
        
        vector_store.clear()
        
        # clear() empties the store and drops the cached matrix entirely.
        assert len(vector_store._vectors) == 0
        assert len(vector_store._id_to_index) == 0
        assert vector_store._vector_matrix is None
    
    def test_get_stats(self, vector_store, sample_chunks, sample_embeddings):
        """Test getting statistics."""
        # Test empty store
        stats = vector_store.get_stats()
        assert stats["total_vectors"] == 0
        assert stats["searches_performed"] == 0
        
        # Add documents and test again
        vector_store.add_documents(sample_chunks, sample_embeddings)
        stats = vector_store.get_stats()
        assert stats["total_vectors"] == 3
        assert stats["embedding_dimension"] == 384
    
    def test_save_and_load(self, vector_store, sample_chunks, sample_embeddings):
        """Test saving and loading vector store round-trips all entries."""
        # Add documents
        original_ids = vector_store.add_documents(sample_chunks, sample_embeddings)
        original_count = len(vector_store._vectors)
        
        # Save to temporary file; delete=False so save_to_disk can reopen it,
        # cleanup happens in the finally block below.
        with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as f:
            temp_file = f.name
        
        try:
            saved_path = vector_store.save_to_disk(temp_file)
            assert Path(saved_path).exists()
            
            # Create new vector store and load
            # NOTE(review): assumes VectorStore exposes a public `.config`
            # attribute mirroring its constructor arg — confirm in src.
            new_vector_store = VectorStore(vector_store.config)
            new_vector_store.load_from_disk(saved_path)
            
            # Check that data was loaded correctly
            assert len(new_vector_store._vectors) == original_count
            assert new_vector_store.embedding_dim == 384
            
            # Check that we can retrieve the same documents
            for vector_id in original_ids:
                result = new_vector_store.get_by_id(vector_id)
                assert result is not None
            
        finally:
            if Path(temp_file).exists():
                Path(temp_file).unlink()
    
    def test_embedding_dimension_validation(self, sample_config):
        """Test embedding dimension validation."""
        vs = VectorStore(sample_config, embedding_dim=256)
        
        # Create chunks and embeddings with wrong dimension
        chunks = [DocumentChunk("test", {"source": "test.txt"}, "test_id")]
        wrong_embeddings = np.random.rand(1, 384)  # Wrong dimension
        
        # Mismatched dimensions must raise rather than silently store bad data.
        with pytest.raises(ValueError, match="Embedding dimension"):
            vs.add_documents(chunks, wrong_embeddings)


class TestVectorEntry:
    """Tests for the VectorEntry record type."""

    def test_vector_entry_creation(self):
        """A new entry stores its id, vector, metadata, and a timestamp."""
        vec = np.random.rand(384)
        meta = {"source": "test.txt"}

        entry = VectorEntry("test_id", vec, meta)

        assert entry.id == "test_id"
        assert np.array_equal(entry.vector, vec)
        assert entry.metadata == meta
        assert entry.timestamp is not None

    def test_vector_entry_to_dict(self):
        """Serialization exposes id, metadata, timestamp, and the vector's shape."""
        vec = np.random.rand(384)
        meta = {"source": "test.txt"}

        serialized = VectorEntry("test_id", vec, meta).to_dict()

        assert serialized["id"] == "test_id"
        assert serialized["metadata"] == meta
        assert "timestamp" in serialized
        assert "vector_shape" in serialized
        assert serialized["vector_shape"] == (384,)


# Allow running this file directly (python test_vector_store.py) by
# delegating to pytest's CLI runner on this module only.
if __name__ == "__main__":
    pytest.main([__file__])