# RAG_ChatBot/tests/test_document_processor.py
# Jialun He
# 1st version
# 11d9dfb
"""
Tests for document processor module.
"""
import pytest
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch
import sys
sys.path.append(str(Path(__file__).parent.parent))
from src.document_processor import DocumentProcessor, DocumentChunk
from src.error_handler import DocumentProcessingError
@pytest.fixture
def sample_config():
    """Build the configuration dictionary shared by the tests in this module."""
    # Chunking parameters and the list of accepted file formats.
    processing_section = {
        "chunk_size": 512,
        "chunk_overlap": 50,
        "min_chunk_size": 100,
        "max_chunks_per_doc": 1000,
        "supported_formats": ["pdf", "docx", "txt"],
    }
    app_section = {"max_upload_size": 50}
    return {"app": app_section, "processing": processing_section}
@pytest.fixture
def doc_processor(sample_config):
    """Provide a DocumentProcessor constructed from the ``sample_config`` fixture."""
    processor = DocumentProcessor(sample_config)
    return processor
class TestDocumentProcessor:
    """Test document processor functionality."""

    @staticmethod
    def _write_temp_file(content, suffix=".txt"):
        """Write ``content`` to a new temporary file and return its path.

        The file is created with ``delete=False`` and is closed before this
        helper returns, so the caller must remove it when done.

        Args:
            content: Text to write into the file (may be empty).
            suffix: File-name suffix, including the leading dot.

        Returns:
            The path of the created file as a string.
        """
        with tempfile.NamedTemporaryFile(
            mode="w", suffix=suffix, delete=False, encoding="utf-8"
        ) as handle:
            handle.write(content)
            return handle.name

    def test_init(self, sample_config):
        """Test processor initialization."""
        processor = DocumentProcessor(sample_config)
        assert processor.chunk_size == 512
        assert processor.chunk_overlap == 50
        assert processor.min_chunk_size == 100

    def test_process_text_file(self, doc_processor):
        """Test processing a simple text file."""
        # Repeat the sentence so the document is long enough to chunk.
        temp_file = self._write_temp_file("This is a test document. " * 100)
        try:
            chunks = doc_processor.process_document(temp_file, "test.txt")
            assert len(chunks) > 0
            assert isinstance(chunks[0], DocumentChunk)
            assert chunks[0].content
            assert chunks[0].metadata["filename"] == "test.txt"
            assert chunks[0].metadata["file_type"] == ".txt"
        finally:
            Path(temp_file).unlink()

    def test_empty_file_error(self, doc_processor):
        """Test error handling for empty files."""
        temp_file = self._write_temp_file("")  # Empty file
        try:
            with pytest.raises(DocumentProcessingError):
                doc_processor.process_document(temp_file, "empty.txt")
        finally:
            Path(temp_file).unlink()

    def test_unsupported_file_type(self, doc_processor):
        """Test error for unsupported file types."""
        temp_file = self._write_temp_file("test content", suffix=".xyz")
        try:
            with pytest.raises(DocumentProcessingError):
                doc_processor.process_document(temp_file, "test.xyz")
        finally:
            Path(temp_file).unlink()

    def test_chunk_creation(self, doc_processor):
        """Test chunk creation with overlaps."""
        # Create a longer text to ensure multiple chunks
        long_text = "This is sentence one. This is sentence two. " * 50
        temp_file = self._write_temp_file(long_text)
        try:
            chunks = doc_processor.process_document(temp_file, "long.txt")
            # Fail loudly instead of passing vacuously when nothing comes back.
            # NOTE(review): the text is expected to span several chunks, but the
            # unit of ``chunk_size`` is not visible from this test, so only
            # non-emptiness is enforced here.
            assert len(chunks) > 0
            # Check that every chunk has proper metadata, whatever the count.
            for i, chunk in enumerate(chunks):
                assert chunk.metadata["chunk_index"] == i
                assert chunk.chunk_id is not None
        finally:
            Path(temp_file).unlink()

    def test_document_stats(self, doc_processor):
        """Test document statistics generation."""
        temp_file = self._write_temp_file("Test document content. " * 20)
        try:
            chunks = doc_processor.process_document(temp_file, "stats_test.txt")
            stats = doc_processor.get_document_stats(chunks)
            assert stats["chunk_count"] == len(chunks)
            assert stats["total_chars"] > 0
            assert stats["avg_chunk_size"] > 0
            assert stats["source_file"] == "stats_test.txt"
        finally:
            Path(temp_file).unlink()
class TestDocumentChunk:
    """Exercise DocumentChunk construction, serialization and ID generation."""

    def test_chunk_creation(self):
        """A new chunk keeps its content and metadata and has a non-empty ID."""
        text = "This is test content"
        meta = {"source": "test.txt", "page": 1}

        created = DocumentChunk(text, meta)

        assert created.content == text
        assert created.metadata == meta
        assert created.chunk_id is not None
        assert len(created.chunk_id) > 0

    def test_chunk_to_dict(self):
        """``to_dict`` exposes the explicit ID, the content and the metadata."""
        text, meta = "Test content", {"source": "test.txt"}

        serialized = DocumentChunk(text, meta, "custom_id").to_dict()

        assert serialized["chunk_id"] == "custom_id"
        assert serialized["content"] == text
        assert serialized["metadata"] == meta

    def test_chunk_id_generation(self):
        """Auto-generated IDs match for equal inputs and differ otherwise."""
        first = DocumentChunk("Same content", {"source": "file1.txt"})
        duplicate = DocumentChunk("Same content", {"source": "file1.txt"})
        other = DocumentChunk("Different content", {"source": "file1.txt"})

        # Identical content and metadata must yield identical IDs.
        assert first.chunk_id == duplicate.chunk_id
        # Changing the content must change the ID.
        assert first.chunk_id != other.chunk_id
if __name__ == "__main__":
    # Propagate pytest's exit code so that running this file as a script
    # reports test failures to the shell / CI instead of always exiting 0.
    raise SystemExit(pytest.main([__file__]))