""" Tests for document processor module. """ import pytest import tempfile from pathlib import Path from unittest.mock import Mock, patch import sys sys.path.append(str(Path(__file__).parent.parent)) from src.document_processor import DocumentProcessor, DocumentChunk from src.error_handler import DocumentProcessingError @pytest.fixture def sample_config(): """Sample configuration for testing.""" return { "app": {"max_upload_size": 50}, "processing": { "chunk_size": 512, "chunk_overlap": 50, "min_chunk_size": 100, "max_chunks_per_doc": 1000, "supported_formats": ["pdf", "docx", "txt"] } } @pytest.fixture def doc_processor(sample_config): """Document processor instance.""" return DocumentProcessor(sample_config) class TestDocumentProcessor: """Test document processor functionality.""" def test_init(self, sample_config): """Test processor initialization.""" processor = DocumentProcessor(sample_config) assert processor.chunk_size == 512 assert processor.chunk_overlap == 50 assert processor.min_chunk_size == 100 def test_process_text_file(self, doc_processor): """Test processing a simple text file.""" # Create temporary text file with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f: f.write("This is a test document. " * 100) # Make it long enough to chunk temp_file = f.name try: chunks = doc_processor.process_document(temp_file, "test.txt") assert len(chunks) > 0 assert isinstance(chunks[0], DocumentChunk) assert chunks[0].content assert chunks[0].metadata["filename"] == "test.txt" assert chunks[0].metadata["file_type"] == ".txt" finally: Path(temp_file).unlink() def test_empty_file_error(self, doc_processor): """Test error handling for empty files.""" with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f: f.write("") # Empty file temp_file = f.name try: with pytest.raises(DocumentProcessingError): doc_processor.process_document(temp_file, "empty.txt") finally: Path(temp_file).unlink() def test_unsupported_file_type(self, doc_processor): """Test error for unsupported file types.""" with tempfile.NamedTemporaryFile(suffix='.xyz', delete=False) as f: f.write(b"test content") temp_file = f.name try: with pytest.raises(DocumentProcessingError): doc_processor.process_document(temp_file, "test.xyz") finally: Path(temp_file).unlink() def test_chunk_creation(self, doc_processor): """Test chunk creation with overlaps.""" # Create a longer text to ensure multiple chunks long_text = "This is sentence one. This is sentence two. " * 50 with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f: f.write(long_text) temp_file = f.name try: chunks = doc_processor.process_document(temp_file, "long.txt") # Should create multiple chunks for long text if len(chunks) > 1: # Check that chunks have proper metadata for i, chunk in enumerate(chunks): assert chunk.metadata["chunk_index"] == i assert chunk.chunk_id is not None finally: Path(temp_file).unlink() def test_document_stats(self, doc_processor): """Test document statistics generation.""" with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f: f.write("Test document content. " * 20) temp_file = f.name try: chunks = doc_processor.process_document(temp_file, "stats_test.txt") stats = doc_processor.get_document_stats(chunks) assert stats["chunk_count"] == len(chunks) assert stats["total_chars"] > 0 assert stats["avg_chunk_size"] > 0 assert stats["source_file"] == "stats_test.txt" finally: Path(temp_file).unlink() class TestDocumentChunk: """Test DocumentChunk functionality.""" def test_chunk_creation(self): """Test chunk creation and ID generation.""" content = "This is test content" metadata = {"source": "test.txt", "page": 1} chunk = DocumentChunk(content, metadata) assert chunk.content == content assert chunk.metadata == metadata assert chunk.chunk_id is not None assert len(chunk.chunk_id) > 0 def test_chunk_to_dict(self): """Test chunk serialization.""" content = "Test content" metadata = {"source": "test.txt"} chunk = DocumentChunk(content, metadata, "custom_id") chunk_dict = chunk.to_dict() assert chunk_dict["chunk_id"] == "custom_id" assert chunk_dict["content"] == content assert chunk_dict["metadata"] == metadata def test_chunk_id_generation(self): """Test automatic chunk ID generation.""" chunk1 = DocumentChunk("Same content", {"source": "file1.txt"}) chunk2 = DocumentChunk("Same content", {"source": "file1.txt"}) chunk3 = DocumentChunk("Different content", {"source": "file1.txt"}) # Same content should generate same ID assert chunk1.chunk_id == chunk2.chunk_id # Different content should generate different ID assert chunk1.chunk_id != chunk3.chunk_id if __name__ == "__main__": pytest.main([__file__])