""" | |
Tests for document processor module. | |
""" | |
import pytest | |
import tempfile | |
from pathlib import Path | |
from unittest.mock import Mock, patch | |
import sys | |
sys.path.append(str(Path(__file__).parent.parent)) | |
from src.document_processor import DocumentProcessor, DocumentChunk | |
from src.error_handler import DocumentProcessingError | |


@pytest.fixture
def sample_config():
    """Sample configuration for testing."""
    return {
        "app": {"max_upload_size": 50},
        "processing": {
            "chunk_size": 512,
            "chunk_overlap": 50,
            "min_chunk_size": 100,
            "max_chunks_per_doc": 1000,
            "supported_formats": ["pdf", "docx", "txt"]
        }
    }


@pytest.fixture
def doc_processor(sample_config):
    """Document processor instance."""
    return DocumentProcessor(sample_config)


class TestDocumentProcessor:
    """Test document processor functionality."""

    def test_init(self, sample_config):
        """Test processor initialization."""
        processor = DocumentProcessor(sample_config)
        assert processor.chunk_size == 512
        assert processor.chunk_overlap == 50
        assert processor.min_chunk_size == 100

    def test_process_text_file(self, doc_processor):
        """Test processing a simple text file."""
        # Create temporary text file
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
            f.write("This is a test document. " * 100)  # Make it long enough to chunk
            temp_file = f.name
        try:
            chunks = doc_processor.process_document(temp_file, "test.txt")
            assert len(chunks) > 0
            assert isinstance(chunks[0], DocumentChunk)
            assert chunks[0].content
            assert chunks[0].metadata["filename"] == "test.txt"
            assert chunks[0].metadata["file_type"] == ".txt"
        finally:
            Path(temp_file).unlink()

    def test_empty_file_error(self, doc_processor):
        """Test error handling for empty files."""
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
            f.write("")  # Empty file
            temp_file = f.name
        try:
            with pytest.raises(DocumentProcessingError):
                doc_processor.process_document(temp_file, "empty.txt")
        finally:
            Path(temp_file).unlink()

    def test_unsupported_file_type(self, doc_processor):
        """Test error for unsupported file types."""
        with tempfile.NamedTemporaryFile(suffix='.xyz', delete=False) as f:
            f.write(b"test content")
            temp_file = f.name
        try:
            with pytest.raises(DocumentProcessingError):
                doc_processor.process_document(temp_file, "test.xyz")
        finally:
            Path(temp_file).unlink()

    def test_chunk_creation(self, doc_processor):
        """Test chunk creation with overlaps."""
        # Create a longer text to ensure multiple chunks
        long_text = "This is sentence one. This is sentence two. " * 50
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
            f.write(long_text)
            temp_file = f.name
        try:
            chunks = doc_processor.process_document(temp_file, "long.txt")
            # Should create multiple chunks for long text
            if len(chunks) > 1:
                # Check that chunks have proper metadata
                for i, chunk in enumerate(chunks):
                    assert chunk.metadata["chunk_index"] == i
                    assert chunk.chunk_id is not None
        finally:
            Path(temp_file).unlink()

    def test_document_stats(self, doc_processor):
        """Test document statistics generation."""
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
            f.write("Test document content. " * 20)
            temp_file = f.name
        try:
            chunks = doc_processor.process_document(temp_file, "stats_test.txt")
            stats = doc_processor.get_document_stats(chunks)
            assert stats["chunk_count"] == len(chunks)
            assert stats["total_chars"] > 0
            assert stats["avg_chunk_size"] > 0
            assert stats["source_file"] == "stats_test.txt"
        finally:
            Path(temp_file).unlink()
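
    def test_chunk_count_within_configured_limit(self, doc_processor, sample_config):
        """Hedged sketch: chunk count should stay within max_chunks_per_doc.

        This sketch assumes DocumentProcessor honours the
        processing.max_chunks_per_doc value from the config; adjust or drop
        it if the implementation does not enforce that cap.
        """
        with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
            f.write("Limit check sentence. " * 200)
            temp_file = f.name
        try:
            chunks = doc_processor.process_document(temp_file, "limit.txt")
            max_chunks = sample_config["processing"]["max_chunks_per_doc"]
            # At least one chunk, and never more than the configured maximum
            assert 0 < len(chunks) <= max_chunks
        finally:
            Path(temp_file).unlink()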


class TestDocumentChunk:
    """Test DocumentChunk functionality."""

    def test_chunk_creation(self):
        """Test chunk creation and ID generation."""
        content = "This is test content"
        metadata = {"source": "test.txt", "page": 1}
        chunk = DocumentChunk(content, metadata)
        assert chunk.content == content
        assert chunk.metadata == metadata
        assert chunk.chunk_id is not None
        assert len(chunk.chunk_id) > 0

    def test_chunk_to_dict(self):
        """Test chunk serialization."""
        content = "Test content"
        metadata = {"source": "test.txt"}
        chunk = DocumentChunk(content, metadata, "custom_id")
        chunk_dict = chunk.to_dict()
        assert chunk_dict["chunk_id"] == "custom_id"
        assert chunk_dict["content"] == content
        assert chunk_dict["metadata"] == metadata

    def test_chunk_id_generation(self):
        """Test automatic chunk ID generation."""
        chunk1 = DocumentChunk("Same content", {"source": "file1.txt"})
        chunk2 = DocumentChunk("Same content", {"source": "file1.txt"})
        chunk3 = DocumentChunk("Different content", {"source": "file1.txt"})
        # Same content should generate the same ID
        assert chunk1.chunk_id == chunk2.chunk_id
        # Different content should generate a different ID
        assert chunk1.chunk_id != chunk3.chunk_id
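
    def test_chunk_to_dict_is_json_serializable(self):
        """Hedged sketch: to_dict output should be JSON-serializable.

        Assumes to_dict returns only primitive values (string chunk_id and
        content, plain-dict metadata), as the assertions above suggest.
        """
        import json

        chunk = DocumentChunk("Serializable content", {"source": "test.txt", "page": 2})
        # json.dumps raises TypeError if any value is not a JSON-friendly type
        serialized = json.dumps(chunk.to_dict())
        assert "Serializable content" in serialized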


if __name__ == "__main__":
    pytest.main([__file__])