"""
Tests for the document processor module.
"""
|
|
|
import pytest |
|
import tempfile |
|
from pathlib import Path |
|
from unittest.mock import Mock, patch |
|
|
|
import sys |
|
sys.path.append(str(Path(__file__).parent.parent)) |
|
|
|
from src.document_processor import DocumentProcessor, DocumentChunk |
|
from src.error_handler import DocumentProcessingError |
|
|
|
|
|
@pytest.fixture
def sample_config():
    """Provide the configuration mapping shared by the processor tests.

    Returns:
        A dict with an ``app`` section (holding ``max_upload_size``) and a
        ``processing`` section (chunk sizing values, the per-document chunk
        cap, and the list of supported formats).
    """
    app_section = {"max_upload_size": 50}
    processing_section = {
        "chunk_size": 512,
        "chunk_overlap": 50,
        "min_chunk_size": 100,
        "max_chunks_per_doc": 1000,
        "supported_formats": ["pdf", "docx", "txt"],
    }
    return {"app": app_section, "processing": processing_section}
|
|
|
|
|
@pytest.fixture
def doc_processor(sample_config):
    """Build a ``DocumentProcessor`` from the ``sample_config`` fixture."""
    processor = DocumentProcessor(sample_config)
    return processor
|
|
|
|
|
class TestDocumentProcessor:
    """Tests for ``DocumentProcessor`` initialization, processing and stats."""

    @staticmethod
    def _write_temp_file(payload, suffix, **open_options):
        """Write ``payload`` to a persistent temporary file and return its path.

        The file is created with ``delete=False``, so the caller is
        responsible for removing it. Extra keyword arguments (such as
        ``mode``) are forwarded to ``tempfile.NamedTemporaryFile``.
        """
        with tempfile.NamedTemporaryFile(
            suffix=suffix, delete=False, **open_options
        ) as handle:
            handle.write(payload)
            return handle.name

    def test_init(self, sample_config):
        """The processor exposes the chunking values from the configuration."""
        processor = DocumentProcessor(sample_config)

        expected_attributes = (
            ("chunk_size", 512),
            ("chunk_overlap", 50),
            ("min_chunk_size", 100),
        )
        for attribute, value in expected_attributes:
            assert getattr(processor, attribute) == value

    def test_process_text_file(self, doc_processor):
        """Processing a text file yields chunks carrying the file metadata."""
        path = self._write_temp_file(
            "This is a test document. " * 100, ".txt", mode="w"
        )
        try:
            chunks = doc_processor.process_document(path, "test.txt")

            assert len(chunks) > 0
            first = chunks[0]
            assert isinstance(first, DocumentChunk)
            assert first.content
            assert first.metadata["filename"] == "test.txt"
            assert first.metadata["file_type"] == ".txt"
        finally:
            Path(path).unlink()

    def test_empty_file_error(self, doc_processor):
        """Processing an empty file raises ``DocumentProcessingError``."""
        path = self._write_temp_file("", ".txt", mode="w")
        try:
            with pytest.raises(DocumentProcessingError):
                doc_processor.process_document(path, "empty.txt")
        finally:
            Path(path).unlink()

    def test_unsupported_file_type(self, doc_processor):
        """Processing a ``.xyz`` file raises ``DocumentProcessingError``."""
        # No mode is given, so the file is opened in the default binary mode
        # and the payload must be bytes.
        path = self._write_temp_file(b"test content", ".xyz")
        try:
            with pytest.raises(DocumentProcessingError):
                doc_processor.process_document(path, "test.xyz")
        finally:
            Path(path).unlink()

    def test_chunk_creation(self, doc_processor):
        """Chunks of a longer document are indexed in order and have IDs."""
        long_text = "This is sentence one. This is sentence two. " * 50
        path = self._write_temp_file(long_text, ".txt", mode="w")
        try:
            chunks = doc_processor.process_document(path, "long.txt")

            # The per-chunk checks only apply when the document was actually
            # split into more than one chunk.
            if len(chunks) <= 1:
                return

            for position, chunk in enumerate(chunks):
                assert chunk.metadata["chunk_index"] == position
                assert chunk.chunk_id is not None
        finally:
            Path(path).unlink()

    def test_document_stats(self, doc_processor):
        """Statistics report the chunk count, positive sizes and the source."""
        path = self._write_temp_file(
            "Test document content. " * 20, ".txt", mode="w"
        )
        try:
            chunks = doc_processor.process_document(path, "stats_test.txt")
            stats = doc_processor.get_document_stats(chunks)

            assert stats["chunk_count"] == len(chunks)
            for positive_key in ("total_chars", "avg_chunk_size"):
                assert stats[positive_key] > 0
            assert stats["source_file"] == "stats_test.txt"
        finally:
            Path(path).unlink()
|
|
|
|
|
class TestDocumentChunk:
    """Tests for ``DocumentChunk`` construction, serialization and IDs."""

    def test_chunk_creation(self):
        """A new chunk keeps its content and metadata and has a non-empty ID."""
        text = "This is test content"
        meta = {"source": "test.txt", "page": 1}

        chunk = DocumentChunk(text, meta)

        assert chunk.content == text
        assert chunk.metadata == meta
        assert chunk.chunk_id is not None
        assert len(chunk.chunk_id) > 0

    def test_chunk_to_dict(self):
        """``to_dict`` returns the explicit ID, the content and the metadata."""
        text = "Test content"
        meta = {"source": "test.txt"}

        serialized = DocumentChunk(text, meta, "custom_id").to_dict()

        expected_fields = (
            ("chunk_id", "custom_id"),
            ("content", text),
            ("metadata", meta),
        )
        for key, value in expected_fields:
            assert serialized[key] == value

    def test_chunk_id_generation(self):
        """Generated IDs match for equal inputs and differ for other content."""
        original = DocumentChunk("Same content", {"source": "file1.txt"})
        duplicate = DocumentChunk("Same content", {"source": "file1.txt"})
        different = DocumentChunk("Different content", {"source": "file1.txt"})

        # Equal content and metadata are expected to produce the same ID.
        assert original.chunk_id == duplicate.chunk_id

        # Changing the content is expected to produce a different ID.
        assert original.chunk_id != different.chunk_id
|
|
|
|
|
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file directly
    # reports test failures to the shell instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__]))