Spaces:

jlnh
/

RAG

Sleeping

RAG / tests /test_vector_store.py

Jialun He

1st version

11d9dfb 13 days ago

8.79 kB

	"""
	Tests for vector store module.
	"""

	import pytest
	import numpy as np
	from unittest.mock import Mock
	import tempfile
	from pathlib import Path

	import sys
	sys.path.append(str(Path(__file__).parent.parent))

	from src.vector_store import VectorStore, VectorEntry
	from src.document_processor import DocumentChunk


	@pytest.fixture
	def sample_config():
	    """Provide the minimal configuration handed to ``VectorStore`` in these tests.

	    Returns:
	        A dict with a single ``cache`` section whose ``cache_dir`` entry is
	        the relative path ``./test_cache``.
	    """
	    # NOTE(review): the path is relative to the working directory; whether
	    # VectorStore creates it on disk is not visible here -- confirm.
	    cache_section = {"cache_dir": "./test_cache"}
	    return {"cache": cache_section}


	@pytest.fixture
	def vector_store(sample_config):
	    """Build a ``VectorStore`` from ``sample_config`` with 384-dimensional embeddings."""
	    store = VectorStore(sample_config, embedding_dim=384)
	    return store


	@pytest.fixture
	def sample_chunks():
	    """Create three ``DocumentChunk`` objects with predictable contents.

	    For each ``i`` in ``0..2`` the chunk is built from a short sentence that
	    mentions ``i``, a metadata dict (``source`` = ``test_doc_{i}.txt``,
	    ``chunk_index`` = ``i``, ``content`` = the same sentence) and the string
	    ``chunk_{i}`` as third positional argument (presumably the chunk id).

	    Returns:
	        A list of the three chunks, in index order.
	    """

	    def _build(index):
	        # One chunk per index; the text is duplicated into the metadata.
	        text = f"This is test document content number {index}. It contains some text for testing."
	        details = {
	            "source": f"test_doc_{index}.txt",
	            "chunk_index": index,
	            "content": text,
	        }
	        return DocumentChunk(text, details, f"chunk_{index}")

	    return [_build(index) for index in range(3)]


	@pytest.fixture
	def sample_embeddings():
	    """Return a reproducible ``(3, 384)`` float32 matrix of values in ``[0, 1)``.

	    A private ``RandomState`` seeded with 42 is used rather than seeding
	    NumPy's global generator, so requesting this fixture does not change the
	    random stream seen by any other code in the test session. The values are
	    identical to those of ``np.random.seed(42)`` followed by
	    ``np.random.rand(3, 384)``.

	    Returns:
	        ``numpy.ndarray`` of shape ``(3, 384)`` and dtype ``float32``.
	    """
	    rng = np.random.RandomState(42)  # local generator: no global side effect
	    return rng.rand(3, 384).astype(np.float32)


	class TestVectorStore:
	    """Exercise the behaviour of ``VectorStore``.

	    Covers construction, adding documents, similarity search (with and
	    without a metadata filter), lookup and deletion by id, clearing,
	    statistics, persistence to disk and embedding-dimension validation.
	    Several tests inspect the private ``_vectors``, ``_id_to_index`` and
	    ``_vector_matrix`` attributes directly.
	    """

	    def test_init(self, sample_config):
	        """A new store reports the requested dimension and starts empty."""
	        vs = VectorStore(sample_config, embedding_dim=384)
	        assert vs.embedding_dim == 384
	        assert len(vs._vectors) == 0
	        assert len(vs._id_to_index) == 0

	    def test_add_documents(self, vector_store, sample_chunks, sample_embeddings):
	        """Adding three chunks yields three ids, all present in the id index."""
	        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)

	        assert len(added_ids) == 3
	        assert len(vector_store._vectors) == 3
	        assert len(vector_store._id_to_index) == 3

	        # Check that all IDs are in the index
	        for chunk_id in added_ids:
	            assert chunk_id in vector_store._id_to_index

	    def test_search_similar_vectors(self, vector_store, sample_chunks, sample_embeddings):
	        """Searching with a stored embedding returns well-formed results, itself first."""
	        # Add documents first
	        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)

	        # Search with first embedding (should return itself as most similar)
	        query_embedding = sample_embeddings[0]
	        results = vector_store.search(query_embedding, k=2)

	        assert len(results) > 0
	        assert len(results) <= 2

	        # The query is identical to the first stored vector, so that entry
	        # must be the top hit.
	        # NOTE(review): assumes search() orders results best-first -- confirm.
	        assert results[0][0] == added_ids[0]

	        # Results should be tuples of (id, similarity, metadata)
	        for result in results:
	            assert len(result) == 3
	            vector_id, similarity, metadata = result
	            assert isinstance(vector_id, str)
	            assert isinstance(similarity, float)
	            assert isinstance(metadata, dict)
	            # An exact self-match can land a hair above 1.0 through
	            # floating-point rounding, so allow a tiny margin on the upper bound.
	            assert 0 <= similarity <= 1 + 1e-5

	    def test_search_with_metadata_filter(self, vector_store, sample_chunks, sample_embeddings):
	        """A metadata filter restricts results to the matching document."""
	        # Add documents
	        vector_store.add_documents(sample_chunks, sample_embeddings)

	        # Query with the embedding of the document the filter selects, so a
	        # hit is expected whatever similarity cut-off search() may apply.
	        query_embedding = sample_embeddings[1]
	        metadata_filter = {"source": "test_doc_1.txt"}
	        results = vector_store.search(query_embedding, k=5, metadata_filter=metadata_filter)

	        # Exactly one chunk carries this source. Without this check the loop
	        # below would pass vacuously on an empty result list.
	        assert len(results) == 1

	        # Should only return documents matching the filter
	        for result in results:
	            _, _, metadata = result
	            assert metadata["source"] == "test_doc_1.txt"

	    def test_get_by_id(self, vector_store, sample_chunks, sample_embeddings):
	        """``get_by_id`` returns the ``(vector, metadata)`` pair of a stored entry."""
	        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)

	        # Get first document
	        first_id = added_ids[0]
	        result = vector_store.get_by_id(first_id)

	        assert result is not None
	        vector, metadata = result
	        assert isinstance(vector, np.ndarray)
	        assert vector.shape == (384,)
	        assert isinstance(metadata, dict)
	        assert metadata["source"] == "test_doc_0.txt"

	    def test_delete_by_id(self, vector_store, sample_chunks, sample_embeddings):
	        """Deleting an entry removes it from the store and from the id index."""
	        added_ids = vector_store.add_documents(sample_chunks, sample_embeddings)
	        original_count = len(vector_store._vectors)

	        # Delete first document
	        first_id = added_ids[0]
	        success = vector_store.delete_by_id(first_id)

	        assert success
	        assert len(vector_store._vectors) == original_count - 1
	        assert first_id not in vector_store._id_to_index

	        # Should not be able to retrieve deleted document
	        result = vector_store.get_by_id(first_id)
	        assert result is None

	    def test_clear(self, vector_store, sample_chunks, sample_embeddings):
	        """``clear`` empties the vectors, the id index and the cached matrix."""
	        vector_store.add_documents(sample_chunks, sample_embeddings)
	        assert len(vector_store._vectors) > 0

	        vector_store.clear()

	        assert len(vector_store._vectors) == 0
	        assert len(vector_store._id_to_index) == 0
	        assert vector_store._vector_matrix is None

	    def test_get_stats(self, vector_store, sample_chunks, sample_embeddings):
	        """``get_stats`` reflects the store before and after documents are added."""
	        # Test empty store
	        stats = vector_store.get_stats()
	        assert stats["total_vectors"] == 0
	        assert stats["searches_performed"] == 0

	        # Add documents and test again
	        vector_store.add_documents(sample_chunks, sample_embeddings)
	        stats = vector_store.get_stats()
	        assert stats["total_vectors"] == 3
	        assert stats["embedding_dimension"] == 384

	    def test_save_and_load(self, vector_store, sample_chunks, sample_embeddings):
	        """A store saved to disk can be loaded into a new instance with the same entries."""
	        # Add documents
	        original_ids = vector_store.add_documents(sample_chunks, sample_embeddings)
	        original_count = len(vector_store._vectors)

	        # Reserve a unique .pkl path; delete=False keeps the file once the
	        # handle is closed so it can be written again by name.
	        with tempfile.NamedTemporaryFile(suffix='.pkl', delete=False) as f:
	            temp_file = f.name

	        # Every path to remove afterwards. The path returned by save_to_disk
	        # is tracked as well, in case it differs from the one requested.
	        cleanup_paths = {Path(temp_file)}

	        try:
	            saved_path = vector_store.save_to_disk(temp_file)
	            cleanup_paths.add(Path(saved_path))
	            assert Path(saved_path).exists()

	            # Create new vector store and load.
	            # NOTE(review): no embedding_dim is passed here, so the 384 check
	            # below relies on the default or on the loaded value -- confirm.
	            new_vector_store = VectorStore(vector_store.config)
	            new_vector_store.load_from_disk(saved_path)

	            # Check that data was loaded correctly
	            assert len(new_vector_store._vectors) == original_count
	            assert new_vector_store.embedding_dim == 384

	            # Check that we can retrieve the same documents
	            for vector_id in original_ids:
	                result = new_vector_store.get_by_id(vector_id)
	                assert result is not None

	        finally:
	            for path in cleanup_paths:
	                if path.exists():
	                    path.unlink()

	    def test_embedding_dimension_validation(self, sample_config):
	        """Adding embeddings of the wrong width raises ``ValueError``."""
	        vs = VectorStore(sample_config, embedding_dim=256)

	        # Create chunks and embeddings with wrong dimension
	        chunks = [DocumentChunk("test", {"source": "test.txt"}, "test_id")]
	        wrong_embeddings = np.random.rand(1, 384)  # Wrong dimension

	        with pytest.raises(ValueError, match="Embedding dimension"):
	            vs.add_documents(chunks, wrong_embeddings)


	class TestVectorEntry:
	    """Checks for the ``VectorEntry`` type: construction and ``to_dict``."""

	    def test_vector_entry_creation(self):
	        """Constructor arguments are exposed as attributes and a timestamp is set."""
	        raw_vector = np.random.rand(384)
	        source_info = {"source": "test.txt"}

	        item = VectorEntry("test_id", raw_vector, source_info)

	        assert item.id == "test_id"
	        assert np.array_equal(item.vector, raw_vector)
	        assert item.metadata == source_info
	        assert item.timestamp is not None

	    def test_vector_entry_to_dict(self):
	        """``to_dict`` exposes id, metadata, a timestamp and the vector's shape."""
	        raw_vector = np.random.rand(384)
	        source_info = {"source": "test.txt"}

	        serialized = VectorEntry("test_id", raw_vector, source_info).to_dict()

	        assert serialized["id"] == "test_id"
	        assert serialized["metadata"] == source_info
	        # Both bookkeeping keys must be present before the shape is compared.
	        for required_key in ("timestamp", "vector_shape"):
	            assert required_key in serialized
	        assert serialized["vector_shape"] == (384,)


	if __name__ == "__main__":
	    # Run this file's tests when executed directly, and hand pytest's exit
	    # code to the process so a failing run ends with a non-zero status.
	    raise SystemExit(pytest.main([__file__]))