RAG_ChatBot

Running

RAG_ChatBot / tests / test_document_processor.py

Jialun He

1st version

11d9dfb 12 days ago

6.01 kB

	"""
	Tests for document processor module.
	"""

	import pytest
	import tempfile
	from pathlib import Path
	from unittest.mock import Mock, patch

	import sys
	sys.path.append(str(Path(__file__).parent.parent))

	from src.document_processor import DocumentProcessor, DocumentChunk
	from src.error_handler import DocumentProcessingError


	@pytest.fixture
	def sample_config():
	    """Provide the configuration dictionary shared by the tests.

	    The mapping has an ``"app"`` section and a ``"processing"`` section;
	    it is passed unchanged to ``DocumentProcessor`` by the tests below.
	    """
	    app_settings = dict(max_upload_size=50)
	    processing_settings = dict(
	        chunk_size=512,
	        chunk_overlap=50,
	        min_chunk_size=100,
	        max_chunks_per_doc=1000,
	        supported_formats=["pdf", "docx", "txt"],
	    )
	    return dict(app=app_settings, processing=processing_settings)


	@pytest.fixture
	def doc_processor(sample_config):
	    """Build a ``DocumentProcessor`` from the ``sample_config`` fixture."""
	    processor = DocumentProcessor(sample_config)
	    return processor


	class TestDocumentProcessor:
	    """Test document processor functionality.

	    Scratch files are created under pytest's ``tmp_path`` fixture, which
	    gives each test its own directory and cleans it up automatically, so
	    no manual ``try``/``finally`` unlinking is needed.
	    """

	    @staticmethod
	    def _write_upload(tmp_path, content, suffix=".txt"):
	        """Write *content* to a scratch file and return its path as ``str``.

	        Args:
	            tmp_path: Per-test temporary directory supplied by pytest.
	            content: ``str`` (written as UTF-8 text) or ``bytes`` payload.
	            suffix: File extension, including the leading dot.

	        Returns:
	            The file's path as a string, matching what the tests passed to
	            ``process_document`` before (``NamedTemporaryFile().name``).
	        """
	        # The on-disk name deliberately differs from the logical filename
	        # handed to process_document, so the metadata assertions still
	        # distinguish "name taken from the argument" from "name taken
	        # from the path".
	        upload = tmp_path / ("upload" + suffix)
	        if isinstance(content, bytes):
	            upload.write_bytes(content)
	        else:
	            upload.write_text(content, encoding="utf-8")
	        return str(upload)

	    def test_init(self, sample_config):
	        """Test processor initialization."""
	        processor = DocumentProcessor(sample_config)
	        assert processor.chunk_size == 512
	        assert processor.chunk_overlap == 50
	        assert processor.min_chunk_size == 100

	    def test_process_text_file(self, doc_processor, tmp_path):
	        """Test processing a simple text file."""
	        # Repeat the sentence so the document is long enough to chunk.
	        temp_file = self._write_upload(tmp_path, "This is a test document. " * 100)

	        chunks = doc_processor.process_document(temp_file, "test.txt")

	        assert len(chunks) > 0
	        assert isinstance(chunks[0], DocumentChunk)
	        assert chunks[0].content
	        assert chunks[0].metadata["filename"] == "test.txt"
	        assert chunks[0].metadata["file_type"] == ".txt"

	    def test_empty_file_error(self, doc_processor, tmp_path):
	        """Test error handling for empty files."""
	        temp_file = self._write_upload(tmp_path, "")

	        with pytest.raises(DocumentProcessingError):
	            doc_processor.process_document(temp_file, "empty.txt")

	    def test_unsupported_file_type(self, doc_processor, tmp_path):
	        """Test error for unsupported file types."""
	        temp_file = self._write_upload(tmp_path, b"test content", suffix=".xyz")

	        with pytest.raises(DocumentProcessingError):
	            doc_processor.process_document(temp_file, "test.xyz")

	    def test_chunk_creation(self, doc_processor, tmp_path):
	        """Test that every produced chunk carries an index and an ID."""
	        # A longer text, intended to span multiple chunks.
	        long_text = "This is sentence one. This is sentence two. " * 50
	        temp_file = self._write_upload(tmp_path, long_text)

	        chunks = doc_processor.process_document(temp_file, "long.txt")

	        # Previously these checks sat behind ``if len(chunks) > 1`` and were
	        # silently skipped when only one chunk came back, letting the test
	        # pass without asserting anything. They now run for any count.
	        # NOTE(review): the exact chunk count is not asserted because the
	        # unit of ``chunk_size`` (characters vs. tokens) is not visible
	        # from this file -- confirm and tighten to ``> 1`` if appropriate.
	        assert len(chunks) > 0
	        for i, chunk in enumerate(chunks):
	            assert chunk.metadata["chunk_index"] == i
	            assert chunk.chunk_id is not None

	    def test_document_stats(self, doc_processor, tmp_path):
	        """Test document statistics generation."""
	        temp_file = self._write_upload(tmp_path, "Test document content. " * 20)

	        chunks = doc_processor.process_document(temp_file, "stats_test.txt")
	        stats = doc_processor.get_document_stats(chunks)

	        assert stats["chunk_count"] == len(chunks)
	        assert stats["total_chars"] > 0
	        assert stats["avg_chunk_size"] > 0
	        assert stats["source_file"] == "stats_test.txt"


	class TestDocumentChunk:
	    """Exercise construction, serialization and IDs of ``DocumentChunk``."""

	    def test_chunk_creation(self):
	        """A chunk keeps its content/metadata and receives a non-empty ID."""
	        text = "This is test content"
	        meta = {"source": "test.txt", "page": 1}

	        built = DocumentChunk(text, meta)

	        assert built.content == text
	        assert built.metadata == meta
	        assert built.chunk_id is not None
	        assert len(built.chunk_id) > 0

	    def test_chunk_to_dict(self):
	        """``to_dict`` exposes the explicit ID, the content and the metadata."""
	        text = "Test content"
	        meta = {"source": "test.txt"}

	        serialized = DocumentChunk(text, meta, "custom_id").to_dict()

	        assert serialized["chunk_id"] == "custom_id"
	        assert serialized["content"] == text
	        assert serialized["metadata"] == meta

	    def test_chunk_id_generation(self):
	        """Auto-generated IDs match for equal input and differ otherwise."""

	        def make_chunk(text):
	            # A fresh metadata dict per chunk, as in three separate literals.
	            return DocumentChunk(text, {"source": "file1.txt"})

	        first = make_chunk("Same content")
	        duplicate = make_chunk("Same content")
	        other = make_chunk("Different content")

	        # Identical content and metadata yield the same ID.
	        assert first.chunk_id == duplicate.chunk_id
	        # Changing the content changes the ID.
	        assert first.chunk_id != other.chunk_id


	if __name__ == "__main__":
	    # pytest.main() returns the run's exit code; propagate it so that
	    # running this file directly exits non-zero when a test fails
	    # (previously the code was discarded and the script always exited 0).
	    raise SystemExit(pytest.main([__file__]))