"""
Tests for vector store module.
"""
import pytest
import numpy as np
from unittest.mock import Mock
import tempfile
from pathlib import Path
import sys
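# Make the project root importable so the `src` package resolves when this
# file is run on its own (path assumption: this file sits one level below
# the project root, e.g. in tests/).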
sys.path.append(str(Path(__file__).parent.parent))

from src.vector_store import VectorStore, VectorEntry
from src.document_processor import DocumentChunk


@pytest.fixture
def sample_config():
    """Sample configuration for testing."""
    return {
        "cache": {
            "cache_dir": "./test_cache"
        }
    }


@pytest.fixture
def vector_store(sample_config):
    """Vector store instance."""
    return VectorStore(sample_config, embedding_dim=384)


@pytest.fixture
def sample_chunks():
    """Sample document chunks for testing."""
    chunks = []
    for i in range(3):
        content = f"This is test document content number {i}. It contains some text for testing."
        metadata = {
            "source": f"test_doc_{i}.txt",
            "chunk_index": i,
            "content": content
        }
        chunk = DocumentChunk(content, metadata, f"chunk_{i}")
        chunks.append(chunk)
    return chunks


@pytest.fixture
def sample_embeddings():
    """Sample embeddings for testing."""
    np.random.seed(42)  # For reproducible results
    return np.random.rand(3, 384).astype(np.float32)


class TestVectorStore:
    """Test vector store functionality."""

    def test_init(self, sample_config):
        """Test vector store initialization."""
        vs = VectorStore(sample_config, embedding_dim=384)

        assert vs.embedding_dim == 384
        assert len(vs._vectors) == 0
        assert len(vs._id_to_index) == 0

    def test_add_documents(self, vector_store, sample_chunks, sample_embeddings):
        """Test adding documents to vector store."""
        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)

        assert len(added_ids) == 3
        assert len(vector_store._vectors) == 3
        assert len(vector_store._id_to_index) == 3

        # Check that all IDs are in the index
        for chunk_id in added_ids:
            assert chunk_id in vector_store._id_to_index

    def test_search_similar_vectors(self, vector_store, sample_chunks, sample_embeddings):
        """Test similarity search."""
        # Add documents first
        vector_store.add_documents(sample_chunks, sample_embeddings)

        # Search with first embedding (should return itself as most similar)
        query_embedding = sample_embeddings[0]
        results = vector_store.search(query_embedding, k=2)

        assert len(results) > 0
        assert len(results) <= 2

        # Results should be tuples of (id, similarity, metadata)
        for result in results:
            assert len(result) == 3
            vector_id, similarity, metadata = result
            assert isinstance(vector_id, str)
            assert isinstance(similarity, float)
            assert isinstance(metadata, dict)
            assert 0 <= similarity <= 1

    def test_search_with_metadata_filter(self, vector_store, sample_chunks, sample_embeddings):
        """Test search with metadata filtering."""
        # Add documents
        vector_store.add_documents(sample_chunks, sample_embeddings)

        # Search with metadata filter
        query_embedding = sample_embeddings[0]
        metadata_filter = {"source": "test_doc_1.txt"}
        results = vector_store.search(query_embedding, k=5, metadata_filter=metadata_filter)

        # Should only return documents matching the filter
        for result in results:
            _, _, metadata = result
            assert metadata["source"] == "test_doc_1.txt"

    def test_get_by_id(self, vector_store, sample_chunks, sample_embeddings):
        """Test retrieving vectors by ID."""
        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)

        # Get first document
        first_id = added_ids[0]
        result = vector_store.get_by_id(first_id)

        assert result is not None
        vector, metadata = result
        assert isinstance(vector, np.ndarray)
        assert vector.shape == (384,)
        assert isinstance(metadata, dict)
        assert metadata["source"] == "test_doc_0.txt"

    def test_delete_by_id(self, vector_store, sample_chunks, sample_embeddings):
        """Test deleting vectors by ID."""
        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)
        original_count = len(vector_store._vectors)

        # Delete first document
        first_id = added_ids[0]
        success = vector_store.delete_by_id(first_id)

        assert success
        assert len(vector_store._vectors) == original_count - 1
        assert first_id not in vector_store._id_to_index

        # Should not be able to retrieve deleted document
        result = vector_store.get_by_id(first_id)
        assert result is None

    def test_clear(self, vector_store, sample_chunks, sample_embeddings):
        """Test clearing all vectors."""
        vector_store.add_documents(sample_chunks, sample_embeddings)
        assert len(vector_store._vectors) > 0

        vector_store.clear()

        assert len(vector_store._vectors) == 0
        assert len(vector_store._id_to_index) == 0
        assert vector_store._vector_matrix is None

    def test_get_stats(self, vector_store, sample_chunks, sample_embeddings):
        """Test getting statistics."""
        # Test empty store
        stats = vector_store.get_stats()
        assert stats["total_vectors"] == 0
        assert stats["searches_performed"] == 0

        # Add documents and test again
        vector_store.add_documents(sample_chunks, sample_embeddings)
        stats = vector_store.get_stats()
        assert stats["total_vectors"] == 3
        assert stats["embedding_dimension"] == 384

    def test_save_and_load(self, vector_store, sample_chunks, sample_embeddings):
        """Test saving and loading vector store."""
        # Add documents
        original_ids = vector_store.add_documents(sample_chunks, sample_embeddings)
        original_count = len(vector_store._vectors)

        # Save to temporary file
        with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as f:
            temp_file = f.name

        try:
            saved_path = vector_store.save_to_disk(temp_file)
            assert Path(saved_path).exists()

            # Create new vector store and load
            new_vector_store = VectorStore(vector_store.config)
            new_vector_store.load_from_disk(saved_path)

            # Check that data was loaded correctly
            assert len(new_vector_store._vectors) == original_count
            assert new_vector_store.embedding_dim == 384

            # Check that we can retrieve the same documents
            for vector_id in original_ids:
                result = new_vector_store.get_by_id(vector_id)
                assert result is not None
        finally:
            if Path(temp_file).exists():
                Path(temp_file).unlink()

    def test_embedding_dimension_validation(self, sample_config):
        """Test embedding dimension validation."""
        vs = VectorStore(sample_config, embedding_dim=256)

        # Create chunks and embeddings with wrong dimension
        chunks = [DocumentChunk("test", {"source": "test.txt"}, "test_id")]
        wrong_embeddings = np.random.rand(1, 384)  # Wrong dimension

        with pytest.raises(ValueError, match="Embedding dimension"):
            vs.add_documents(chunks, wrong_embeddings)


class TestVectorEntry:
    """Test VectorEntry functionality."""

    def test_vector_entry_creation(self):
        """Test creating vector entries."""
        vector = np.random.rand(384)
        metadata = {"source": "test.txt"}

        entry = VectorEntry("test_id", vector, metadata)

        assert entry.id == "test_id"
        assert np.array_equal(entry.vector, vector)
        assert entry.metadata == metadata
        assert entry.timestamp is not None

    def test_vector_entry_to_dict(self):
        """Test vector entry serialization."""
        vector = np.random.rand(384)
        metadata = {"source": "test.txt"}
        entry = VectorEntry("test_id", vector, metadata)

        entry_dict = entry.to_dict()

        assert entry_dict["id"] == "test_id"
        assert entry_dict["metadata"] == metadata
        assert "timestamp" in entry_dict
        assert "vector_shape" in entry_dict
        assert entry_dict["vector_shape"] == (384,)


if __name__ == "__main__":
    pytest.main([__file__])
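
# Note: running this file directly invokes pytest on itself via the __main__
# guard above. The usual entry point is still plain pytest from the project
# root, e.g. `pytest -v` (exact invocation depends on the project layout).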