File size: 6,006 Bytes
11d9dfb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
"""
Tests for document processor module.
"""

import pytest
import tempfile
from pathlib import Path
from unittest.mock import Mock, patch

import sys
sys.path.append(str(Path(__file__).parent.parent))

from src.document_processor import DocumentProcessor, DocumentChunk
from src.error_handler import DocumentProcessingError


@pytest.fixture
def sample_config():
    """Provide the configuration dictionary shared by the tests.

    The mapping has two sections: ``app`` (upload size limit) and
    ``processing`` (chunking parameters and the accepted file formats).
    """
    app_section = {"max_upload_size": 50}
    processing_section = {
        "chunk_size": 512,
        "chunk_overlap": 50,
        "min_chunk_size": 100,
        "max_chunks_per_doc": 1000,
        "supported_formats": ["pdf", "docx", "txt"],
    }
    return {"app": app_section, "processing": processing_section}


@pytest.fixture
def doc_processor(sample_config):
    """Build a DocumentProcessor from the ``sample_config`` fixture."""
    processor = DocumentProcessor(sample_config)
    return processor


class TestDocumentProcessor:
    """Test document processor functionality against temporary files on disk."""

    @staticmethod
    def _write_temp_file(content, suffix=".txt"):
        """Write *content* to a new temporary file and return its path.

        ``str`` content is written in text mode and ``bytes`` content in
        binary mode. The file is created with ``delete=False`` so that it
        still exists after being closed; the caller must unlink it.

        Args:
            content: Text or bytes to store in the file.
            suffix: File name suffix (extension) for the temporary file.

        Returns:
            The path of the closed temporary file, as a string.
        """
        mode = "w+b" if isinstance(content, bytes) else "w"
        with tempfile.NamedTemporaryFile(mode=mode, suffix=suffix, delete=False) as handle:
            handle.write(content)
            path = handle.name
        return path

    def test_init(self, sample_config):
        """Test that the processor picks up the chunking values from the config."""
        processor = DocumentProcessor(sample_config)
        assert processor.chunk_size == 512
        assert processor.chunk_overlap == 50
        assert processor.min_chunk_size == 100

    def test_process_text_file(self, doc_processor):
        """Test processing a simple text file."""
        # Repeat the sentence so the document is long enough to chunk.
        temp_file = self._write_temp_file("This is a test document. " * 100)

        try:
            chunks = doc_processor.process_document(temp_file, "test.txt")

            assert len(chunks) > 0
            assert isinstance(chunks[0], DocumentChunk)
            assert chunks[0].content
            assert chunks[0].metadata["filename"] == "test.txt"
            assert chunks[0].metadata["file_type"] == ".txt"
        finally:
            Path(temp_file).unlink()

    def test_empty_file_error(self, doc_processor):
        """Test error handling for empty files."""
        temp_file = self._write_temp_file("")  # Empty file

        try:
            with pytest.raises(DocumentProcessingError):
                doc_processor.process_document(temp_file, "empty.txt")
        finally:
            Path(temp_file).unlink()

    def test_unsupported_file_type(self, doc_processor):
        """Test error for unsupported file types."""
        temp_file = self._write_temp_file(b"test content", suffix=".xyz")

        try:
            with pytest.raises(DocumentProcessingError):
                doc_processor.process_document(temp_file, "test.xyz")
        finally:
            Path(temp_file).unlink()

    def test_chunk_creation(self, doc_processor):
        """Test that every produced chunk carries its index and an ID."""
        # Use a longer text so the processor has enough material to split.
        long_text = "This is sentence one. This is sentence two. " * 50
        temp_file = self._write_temp_file(long_text)

        try:
            chunks = doc_processor.process_document(temp_file, "long.txt")

            # Fail loudly instead of silently skipping the checks below when
            # nothing is produced.
            assert len(chunks) > 0

            # Validate metadata on every chunk regardless of how many were
            # created, so the test can never pass without asserting anything.
            for i, chunk in enumerate(chunks):
                assert chunk.metadata["chunk_index"] == i
                assert chunk.chunk_id is not None
        finally:
            Path(temp_file).unlink()

    def test_document_stats(self, doc_processor):
        """Test document statistics generation."""
        temp_file = self._write_temp_file("Test document content. " * 20)

        try:
            chunks = doc_processor.process_document(temp_file, "stats_test.txt")
            stats = doc_processor.get_document_stats(chunks)

            assert stats["chunk_count"] == len(chunks)
            assert stats["total_chars"] > 0
            assert stats["avg_chunk_size"] > 0
            assert stats["source_file"] == "stats_test.txt"
        finally:
            Path(temp_file).unlink()


class TestDocumentChunk:
    """Unit tests for DocumentChunk construction, serialization and IDs."""

    def test_chunk_creation(self):
        """A new chunk keeps its content and metadata and gets a non-empty ID."""
        meta = {"source": "test.txt", "page": 1}
        text = "This is test content"

        created = DocumentChunk(text, meta)

        assert created.content == text
        assert created.metadata == meta
        assert created.chunk_id is not None
        assert len(created.chunk_id) > 0

    def test_chunk_to_dict(self):
        """``to_dict`` exposes the explicit ID, the content and the metadata."""
        text, meta = "Test content", {"source": "test.txt"}

        serialized = DocumentChunk(text, meta, "custom_id").to_dict()

        assert serialized["chunk_id"] == "custom_id"
        assert serialized["content"] == text
        assert serialized["metadata"] == meta

    def test_chunk_id_generation(self):
        """Automatically generated IDs match for equal content and differ otherwise."""
        # Each chunk receives its own metadata dict, built in this order.
        first, twin, other = (
            DocumentChunk(text, {"source": "file1.txt"})
            for text in ("Same content", "Same content", "Different content")
        )

        # Identical content yields an identical ID.
        assert first.chunk_id == twin.chunk_id
        # Changing the content changes the ID.
        assert first.chunk_id != other.chunk_id


if __name__ == "__main__":
    # Propagate pytest's exit code so running this file as a script reports
    # test failures to the shell/CI instead of always exiting with status 0.
    raise SystemExit(pytest.main([__file__]))