Spaces:

ArthyP
/

technical-rag-assistant

Running

technical-rag-assistant / tests /test_basic_rag.py

Arthur Passuello

Initial commit

9f5e57c 2 months ago

7.83 kB

	"""
	BasicRAG System - Core Integration Tests

	This test suite validates the complete BasicRAG system functionality, including
	document indexing, semantic search, and edge case handling. Tests use real PDF
	documents to ensure production-like behavior.

	Test Strategy:
	- Unit tests for individual component behaviors
	- Integration tests for end-to-end workflows
	- Real document testing with RISC-V technical documentation
	- Edge case validation (empty index, missing files, etc.)

	Test Data:
	- Primary test document: riscv-base-instructions.pdf
	- Document characteristics: 97 pages of technical documentation
	- Expected behaviors: High-quality semantic search results

	Performance Expectations:
	- Document indexing: <60 seconds for test PDF
	- Query response: <100ms for indexed documents
	- Memory usage: <500MB during testing

	Coverage Areas:
	1. System initialization and state management
	2. Document indexing pipeline validation
	3. Semantic search accuracy and relevance
	4. Error handling and edge cases

	Author: Arthur Passuello
	Date: June 2025
	Project: RAG Portfolio - Technical Documentation System
	"""

	import pytest
	import sys
	from pathlib import Path

	# Extend the import search path with the directory two levels above this
	# file and with that directory's parent. The `src.basic_rag` import below
	# presumably relies on one of these entries - verify against repo layout.
	project_root = Path(__file__).parent.parent
	sys.path.extend(str(entry) for entry in (project_root, project_root.parent))

	from src.basic_rag import BasicRAG


	def test_basic_rag_initialization():
	    """
	    Test BasicRAG system initialization and default state.

	    A freshly constructed ``BasicRAG`` is expected to expose:
	    - ``index`` set to ``None`` (no FAISS index before the first document)
	    - an empty ``chunks`` collection
	    - ``embedding_dim`` equal to 768, the size checked here for
	      all-mpnet-base-v2 embeddings

	    Test Rationale:
	    - Ensures clean slate for document indexing
	    - Validates lazy initialization of the index
	    - Confirms the configured dimension matches the embedding model
	    """
	    # Create new RAG instance
	    rag = BasicRAG()

	    # Validate initial state
	    assert rag.index is None, "FAISS index should not be initialized until first document"
	    assert len(rag.chunks) == 0, "Chunk storage should be empty initially"
	    # 768 is the all-mpnet-base-v2 vector size; all-MiniLM-L6-v2 would be 384.
	    assert rag.embedding_dim == 768, "Should match all-mpnet-base-v2 dimensions"


	def test_basic_rag_index_document():
	    """
	    Index the RISC-V sample PDF and verify the resulting index state.

	    The test is skipped when ``data/test/riscv-base-instructions.pdf``
	    (a path relative to the current working directory) does not exist.

	    Checks performed after ``index_document`` returns:
	    - the reported chunk count is positive
	    - a FAISS index exists and ``index.ntotal`` equals the chunk count
	    - ``chunks`` holds exactly that many entries
	    - the first chunk carries ``text``, ``source``, ``chunk_id``,
	      ``start_char`` and ``end_char``; its ``source`` equals the PDF path
	      as a string and its ``chunk_id`` is 0
	    """
	    engine = BasicRAG()
	    document = Path("data/test/riscv-base-instructions.pdf")

	    if not document.exists():
	        # Sample data may be absent (e.g. in CI): skip rather than fail.
	        pytest.skip("Test PDF not found - skipping integration test")

	    # Run the indexing pipeline on the sample document.
	    indexed = engine.index_document(document)

	    # Returned count, chunk storage and FAISS index must all agree.
	    assert indexed > 0, "Should generate multiple chunks from 97-page PDF"
	    assert engine.index is not None, "FAISS index should be initialized after first document"
	    assert len(engine.chunks) == indexed, "Chunk storage count should match returned count"
	    assert engine.index.ntotal == indexed, "FAISS index size should match chunk count"

	    # Core metadata fields of the first stored chunk.
	    head = engine.chunks[0]
	    for field, reason in (
	        ("text", "Chunk should contain text content"),
	        ("source", "Chunk should track source document"),
	        ("chunk_id", "Chunk should have unique identifier"),
	    ):
	        assert field in head, reason
	    assert str(document) == head["source"], "Source path should be preserved"

	    # Positional metadata and numbering of the first chunk.
	    for field, reason in (
	        ("start_char", "Should track chunk position"),
	        ("end_char", "Should track chunk end position"),
	    ):
	        assert field in head, reason
	    assert head["chunk_id"] == 0, "First chunk should have ID 0"


	def test_basic_rag_query():
	    """
	    Test semantic search against the indexed RISC-V sample PDF.

	    The test indexes ``data/test/riscv-base-instructions.pdf`` (skipping
	    when the file is absent), runs the query "What is RISC-V?" with
	    ``top_k=3`` and validates:
	    - result structure: ``question``, ``chunks`` and ``sources`` keys
	    - the question is echoed back unchanged
	    - at most ``top_k`` chunks and at least one source are returned
	    - when chunks are returned, the top chunk has ``text``,
	      ``similarity_score`` and ``source``, a score within [0, 1], and
	      text that mentions "risc" (case-insensitive)
	    """
	    question = "What is RISC-V?"
	    top_k = 3

	    # Initialize and prepare system
	    rag = BasicRAG()
	    pdf_path = Path("data/test/riscv-base-instructions.pdf")

	    # Skip if test data unavailable
	    if not pdf_path.exists():
	        pytest.skip("Test PDF not found - skipping query test")

	    # Index document for searching
	    rag.index_document(pdf_path)

	    # Execute semantic search query
	    result = rag.query(question, top_k=top_k)

	    # Validate result structure
	    assert "question" in result, "Result should echo the question"
	    assert "chunks" in result, "Result should contain chunks list"
	    assert "sources" in result, "Result should contain sources list"
	    assert result["question"] == question, "Question should be preserved exactly"
	    assert len(result["chunks"]) <= top_k, "Should respect top_k limit"
	    assert len(result["sources"]) > 0, "Should identify source documents"

	    # Validate chunk quality and metadata.
	    # NOTE(review): these checks are silently skipped when no chunks are returned.
	    if result["chunks"]:
	        chunk = result["chunks"][0]
	        assert "text" in chunk, "Chunk should contain text content"
	        assert "similarity_score" in chunk, "Chunk should have similarity score"
	        assert "source" in chunk, "Chunk should track source document"

	        # Validate score range
	        assert 0 <= chunk["similarity_score"] <= 1, "Cosine similarity should be in [0,1]"

	        # Relevance check: a case-insensitive match already covers "RISC",
	        # so a single lowered comparison is sufficient.
	        assert "risc" in chunk["text"].lower(), \
	            "Top result should be semantically relevant to RISC-V query"


	def test_basic_rag_empty_query():
	    """
	    Query a BasicRAG instance that has no indexed documents.

	    Expected Behavior:
	    - ``query`` returns a result instead of raising
	    - the question is echoed back unchanged
	    - ``chunks`` and ``sources`` are empty lists
	    - passing an explicit ``top_k`` still yields an empty ``chunks`` list
	    """
	    # Nothing is indexed on a freshly constructed instance.
	    empty_system = BasicRAG()

	    prompt = "test question"
	    outcome = empty_system.query(prompt)

	    # The result keeps its shape, just with nothing retrieved.
	    assert outcome["question"] == prompt, "Should preserve original question"
	    for field, reason in (
	        ("chunks", "Should return empty chunks list"),
	        ("sources", "Should return empty sources list"),
	    ):
	        assert outcome[field] == [], reason

	    # An explicit top_k must not change the empty-index behavior.
	    outcome_top10 = empty_system.query("another test", top_k=10)
	    assert outcome_top10["chunks"] == [], "Should handle top_k on empty index"