Spaces:
Running
Running
File size: 6,006 Bytes
11d9dfb |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
"""
Tests for document processor module.
"""
import pytest
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch
import sys
sys.path.append(str(Path(__file__).parent.parent))
from src.document_processor import DocumentProcessor, DocumentChunk
from src.error_handler import DocumentProcessingError
@pytest.fixture
def sample_config():
    """Build the configuration dictionary used by the tests in this module."""
    # Settings grouped under the "processing" key of the configuration.
    processing_settings = {
        "chunk_size": 512,
        "chunk_overlap": 50,
        "min_chunk_size": 100,
        "max_chunks_per_doc": 1000,
        "supported_formats": ["pdf", "docx", "txt"],
    }
    # Settings grouped under the "app" key of the configuration.
    app_settings = {"max_upload_size": 50}
    return {"app": app_settings, "processing": processing_settings}
@pytest.fixture
def doc_processor(sample_config):
    """Construct a DocumentProcessor from the ``sample_config`` fixture."""
    processor = DocumentProcessor(sample_config)
    return processor
class TestDocumentProcessor:
    """Tests for DocumentProcessor: initialization, processing, errors, stats."""

    @staticmethod
    def _write_temp_file(content, suffix=".txt"):
        """Write *content* to a new temporary file and return its path.

        The file is created with ``delete=False`` so it still exists after it
        is closed; the caller is responsible for removing it.

        Args:
            content: Text to write into the file (may be empty).
            suffix: File name suffix, including the leading dot.

        Returns:
            The path of the created file as a string.
        """
        with tempfile.NamedTemporaryFile(mode="w", suffix=suffix, delete=False) as handle:
            handle.write(content)
        return handle.name

    def test_init(self, sample_config):
        """The processor exposes the chunking settings from the configuration."""
        processor = DocumentProcessor(sample_config)
        assert processor.chunk_size == 512
        assert processor.chunk_overlap == 50
        assert processor.min_chunk_size == 100

    def test_process_text_file(self, doc_processor):
        """Processing a text file yields DocumentChunk objects with file metadata."""
        # Repeat the sentence so the document is long enough to chunk.
        temp_file = self._write_temp_file("This is a test document. " * 100)
        try:
            chunks = doc_processor.process_document(temp_file, "test.txt")
            assert len(chunks) > 0
            assert isinstance(chunks[0], DocumentChunk)
            assert chunks[0].content
            assert chunks[0].metadata["filename"] == "test.txt"
            assert chunks[0].metadata["file_type"] == ".txt"
        finally:
            Path(temp_file).unlink()

    def test_empty_file_error(self, doc_processor):
        """Processing an empty file is expected to raise DocumentProcessingError."""
        temp_file = self._write_temp_file("")
        try:
            with pytest.raises(DocumentProcessingError):
                doc_processor.process_document(temp_file, "empty.txt")
        finally:
            Path(temp_file).unlink()

    def test_unsupported_file_type(self, doc_processor):
        """An extension outside the supported formats is expected to raise."""
        temp_file = self._write_temp_file("test content", suffix=".xyz")
        try:
            with pytest.raises(DocumentProcessingError):
                doc_processor.process_document(temp_file, "test.xyz")
        finally:
            Path(temp_file).unlink()

    def test_chunk_creation(self, doc_processor):
        """Every produced chunk carries a sequential chunk_index and a chunk ID."""
        # A longer text, intended to be split into several chunks.
        long_text = "This is sentence one. This is sentence two. " * 50
        temp_file = self._write_temp_file(long_text)
        try:
            chunks = doc_processor.process_document(temp_file, "long.txt")
            # Check unconditionally: guarding these assertions behind
            # ``len(chunks) > 1`` would let the test pass without verifying
            # anything when only one chunk (or none) comes back.
            assert len(chunks) > 0
            for i, chunk in enumerate(chunks):
                assert chunk.metadata["chunk_index"] == i
                assert chunk.chunk_id is not None
        finally:
            Path(temp_file).unlink()

    def test_document_stats(self, doc_processor):
        """get_document_stats reports counts, sizes, and the source file name."""
        temp_file = self._write_temp_file("Test document content. " * 20)
        try:
            chunks = doc_processor.process_document(temp_file, "stats_test.txt")
            stats = doc_processor.get_document_stats(chunks)
            assert stats["chunk_count"] == len(chunks)
            assert stats["total_chars"] > 0
            assert stats["avg_chunk_size"] > 0
            assert stats["source_file"] == "stats_test.txt"
        finally:
            Path(temp_file).unlink()
class TestDocumentChunk:
    """Checks for DocumentChunk construction, serialization, and ID generation."""

    def test_chunk_creation(self):
        """A new chunk keeps its content and metadata and gets a non-empty ID."""
        text = "This is test content"
        meta = {"source": "test.txt", "page": 1}

        created = DocumentChunk(text, meta)

        assert created.content == text
        assert created.metadata == meta
        assert created.chunk_id is not None
        assert len(created.chunk_id) > 0

    def test_chunk_to_dict(self):
        """to_dict returns the explicit ID, the content, and the metadata."""
        text = "Test content"
        meta = {"source": "test.txt"}
        explicit_chunk = DocumentChunk(text, meta, "custom_id")

        serialized = explicit_chunk.to_dict()

        assert serialized["chunk_id"] == "custom_id"
        assert serialized["content"] == text
        assert serialized["metadata"] == meta

    def test_chunk_id_generation(self):
        """Generated IDs match for equal content and differ for different content."""
        first = DocumentChunk("Same content", {"source": "file1.txt"})
        duplicate = DocumentChunk("Same content", {"source": "file1.txt"})
        different = DocumentChunk("Different content", {"source": "file1.txt"})

        # Identical content and metadata must map to the same generated ID.
        assert first.chunk_id == duplicate.chunk_id
        # Changing the content must change the generated ID.
        assert first.chunk_id != different.chunk_id
if __name__ == "__main__":
    # Run this file's tests when executed directly, and hand pytest's exit
    # code to the shell so a failing run does not exit with status 0.
    sys.exit(pytest.main([__file__]))