File size: 6,610 Bytes
19aaa42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/usr/bin/env python3
"""
Test script for Enhanced PDF Processing Pipeline
Tests with sample maternal health documents
"""

import sys
from pathlib import Path
import json

# Add this script's own directory to sys.path so the local import below resolves
sys.path.append(str(Path(__file__).parent))

from enhanced_pdf_processor import EnhancedMedicalPDFProcessor

def test_single_document():
    """Process the first available sample PDF and print a short report.

    The candidate paths are tried in order. Paths that do not exist are
    skipped silently; a candidate whose processing raises is reported and
    the next one is tried.

    Returns:
        True as soon as one document has been processed and reported,
        False when no candidate exists or every attempt raised.
    """
    print("πŸ§ͺ Testing Enhanced PDF Processor with sample document...")

    # Results of this run are written under "test_output".
    pdf_processor = EnhancedMedicalPDFProcessor(output_dir="test_output")

    # Candidate documents; the paths are relative to the working directory.
    candidates = (
        "../Obs/RhESUS.pdf",
        "../Obs/puerperal-sepsis.pdf",
        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf",
    )

    for candidate in candidates:
        pdf_path = Path(candidate)
        if not pdf_path.exists():
            continue

        print(f"\nπŸ“„ Testing with: {pdf_path.name}")

        try:
            outcome = pdf_processor.process_single_pdf(pdf_path)

            # Headline numbers for the processed document.
            print("βœ… Processing successful!")
            print(f"   πŸ“Š Pages: {outcome.summary['total_pages']}")
            print(f"   πŸ“‹ Tables: {outcome.summary['total_tables']}")
            print(f"   πŸ“ Words: {outcome.summary['total_words']}")
            print(f"   πŸ’Ύ Size: {outcome.file_info['size_mb']:.2f} MB")

            # Classification scores of the first page, positive ones only.
            if outcome.content:
                opening_page = outcome.content[0]
                scores = opening_page.metadata['content_classification']
                print("   🏷️  Content Classification:")
                for label, value in scores.items():
                    if not value > 0:
                        continue
                    print(f"      - {label}: {value:.4f}")

            # One working document is enough for this test.
            return True

        except Exception as exc:
            # Any failure (processing or reporting) moves on to the next file.
            print(f"❌ Processing failed: {exc}")

    print("❌ No test files could be processed successfully")
    return False

def test_table_extraction():
    """Check that tables are extracted from at least one sample PDF.

    Each existing candidate is processed in turn. The first document whose
    summary reports a positive table count is described page by page and
    ends the test; documents without tables or whose processing raises are
    reported and skipped.

    Returns:
        True if some document yielded at least one table, otherwise False.
    """
    print("\nπŸ” Testing table extraction capabilities...")

    pdf_processor = EnhancedMedicalPDFProcessor()

    # Documents expected to contain tables; paths are relative to the
    # working directory.
    candidates = (
        "../Obs/Management-of-Normal-Labourchart.pdf",
        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf",
    )

    for candidate in candidates:
        pdf_path = Path(candidate)
        if not pdf_path.exists():
            continue

        print(f"\nπŸ“Š Testing table extraction with: {pdf_path.name}")

        try:
            outcome = pdf_processor.process_single_pdf(pdf_path)
            table_count = outcome.summary['total_tables']

            if not table_count > 0:
                print(f"⚠️  No tables found in {pdf_path.name}")
                continue

            print(f"βœ… Found {table_count} tables!")

            # Per-page breakdown of the extracted tables.
            for page in outcome.content:
                if not page.tables:
                    continue
                print(f"   Page {page.page_number}: {len(page.tables)} table(s)")
                for number, frame in enumerate(page.tables, start=1):
                    print(f"      Table {number}: {frame.shape[0]} rows Γ— {frame.shape[1]} columns")
                    if not frame.empty:
                        print(f"      Columns: {list(frame.columns)}")
            return True

        except Exception as exc:
            # Report the failure and fall through to the next candidate.
            print(f"❌ Table extraction test failed: {exc}")

    return False

def validate_output_structure():
    """Check that every per-document folder under ``test_output`` is complete.

    ``test_output`` is resolved against the current working directory. Each
    of its subdirectories is expected to contain ``summary.json`` and
    ``extracted_text.txt``. ``summary.json`` must decode as a JSON object
    that has both a ``summary`` and a ``file_info`` key.

    Returns:
        bool: True only when at least one subdirectory exists and every
        expected file is present and well-formed. False when the output
        directory is missing (or is not a directory), when it holds no
        subdirectories, or when any expected file is missing or malformed.
    """
    print("\nπŸ“ Validating output structure...")

    test_output_dir = Path("test_output")
    # is_dir() also rejects a regular file named "test_output", which would
    # otherwise make iterdir() raise.
    if not test_output_dir.is_dir():
        print("❌ Test output directory not found")
        return False

    # Sorted so the report order does not depend on directory listing order.
    document_dirs = sorted(
        entry for entry in test_output_dir.iterdir() if entry.is_dir()
    )
    if not document_dirs:
        # Having nothing to inspect must not be reported as a pass.
        print("❌ No document folders found in test output directory")
        return False

    all_valid = True
    for document_dir in document_dirs:
        for file_name in ("summary.json", "extracted_text.txt"):
            expected_file = document_dir / file_name

            if not expected_file.exists():
                print(f"❌ Missing: {expected_file}")
                all_valid = False
                continue

            print(f"βœ… Found: {expected_file}")
            if file_name != "summary.json":
                continue

            # Decode from raw bytes so the result does not depend on the
            # platform's locale encoding.
            try:
                data = json.loads(expected_file.read_bytes())
            except (json.JSONDecodeError, UnicodeDecodeError):
                print("   ❌ Invalid JSON format")
                all_valid = False
                continue

            # Only a JSON object can carry the required keys; a list such as
            # ["summary", "file_info"] or a bare scalar is not acceptable.
            if isinstance(data, dict) and 'summary' in data and 'file_info' in data:
                print("   πŸ“‹ Valid JSON structure")
            else:
                print("   ⚠️  Missing keys in JSON")
                all_valid = False

    return all_valid

def main():
    """Run the three checks in order and print a pass/fail summary.

    Returns:
        True when all three checks returned a truthy result, else False.
    """
    separator = "=" * 60

    print("πŸš€ Starting Enhanced PDF Processor Tests")
    print(separator)

    # The checks run in this order: processing, table extraction, then
    # validation of the files found under the output directory.
    single_ok = test_single_document()
    tables_ok = test_table_extraction()
    structure_ok = validate_output_structure()

    report = (
        ("Single Document Processing", single_ok),
        ("Table Extraction", tables_ok),
        ("Output Structure", structure_ok),
    )

    print("\n" + separator)
    print("πŸ“Š TEST SUMMARY")
    for title, passed in report:
        print(f"{title}: {'βœ… PASS' if passed else '❌ FAIL'}")

    overall_success = all(passed for _, passed in report)
    print(f"\n🎯 OVERALL: {'βœ… ALL TESTS PASSED' if overall_success else '❌ SOME TESTS FAILED'}")

    closing_note = (
        "\nπŸš€ Ready to process all maternal health documents!"
        if overall_success
        else "\n⚠️  Please fix issues before processing all documents"
    )
    print(closing_note)

    return overall_success

if __name__ == "__main__":
    # Exit status mirrors the overall result: 0 when every test passed, 1 otherwise.
    sys.exit(0 if main() else 1)