Spaces:
Sleeping
Sleeping
File size: 6,610 Bytes
19aaa42 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
#!/usr/bin/env python3
"""
Test script for Enhanced PDF Processing Pipeline
Tests with sample maternal health documents
"""
import sys
from pathlib import Path
import json
# Add src to path for imports
sys.path.append(str(Path(__file__).parent))
from enhanced_pdf_processor import EnhancedMedicalPDFProcessor
def test_single_document():
    """Process the first available sample PDF and print a short report.

    The candidate paths are tried in order and any that does not exist is
    skipped. For the first one that ``process_single_pdf`` handles without
    raising, the page/table/word counts, the file size and the non-zero
    content-classification scores of the first page are printed.

    Returns:
        bool: True as soon as one document has been processed and reported,
        False if no candidate exists or every attempt raised.
    """
    # NOTE(review): the non-ASCII glyphs in the messages below look like
    # mis-decoded emoji; they are kept exactly as found - confirm the file's
    # encoding before changing them.
    print("π§ͺ Testing Enhanced PDF Processor with sample document...")

    # Initialize processor
    processor = EnhancedMedicalPDFProcessor(output_dir="test_output")

    # Test with a smaller document first
    test_files = [
        "../Obs/RhESUS.pdf",
        "../Obs/puerperal-sepsis.pdf",
        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf",
    ]

    for test_file in test_files:
        test_path = Path(test_file)
        if not test_path.exists():
            continue

        print(f"\nπ Testing with: {test_path.name}")
        try:
            # Process single document
            result = processor.process_single_pdf(test_path)

            # Display results
            print("β Processing successful!")
            print(f" π Pages: {result.summary['total_pages']}")
            print(f" π Tables: {result.summary['total_tables']}")
            print(f" π Words: {result.summary['total_words']}")
            print(f" πΎ Size: {result.file_info['size_mb']:.2f} MB")

            # Show content classification for first page
            if result.content:
                first_page = result.content[0]
                classification = first_page.metadata['content_classification']
                print(" π·οΈ Content Classification:")
                for category, score in classification.items():
                    if score > 0:
                        print(f" - {category}: {score:.4f}")

            # One working file is enough; stop at the first success.
            return True
        except Exception as e:
            # Deliberately broad: a failure on one sample (including a
            # missing key while reporting) must not stop the next candidate
            # from being tried.
            print(f"β Processing failed: {e}")

    print("β No test files could be processed successfully")
    return False
def test_table_extraction():
    """Look for tables in the sample PDFs and print what was found.

    The candidate paths are tried in order and any that does not exist is
    skipped. For the first document whose summary reports at least one
    table, the per-page table count plus each table's shape and column
    labels are printed.

    Returns:
        bool: True as soon as one document yields at least one table,
        False if no candidate exists, none contains tables, or every
        attempt raised.
    """
    # NOTE(review): the non-ASCII glyphs in the messages below look like
    # mis-decoded emoji; they are kept exactly as found - confirm the file's
    # encoding before changing them.
    print("\nπ Testing table extraction capabilities...")
    processor = EnhancedMedicalPDFProcessor()

    # Test with documents likely to have tables
    table_test_files = [
        "../Obs/Management-of-Normal-Labourchart.pdf",
        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf",
    ]

    for test_file in table_test_files:
        test_path = Path(test_file)
        if not test_path.exists():
            continue

        print(f"\nπ Testing table extraction with: {test_path.name}")
        try:
            result = processor.process_single_pdf(test_path)
            total_tables = result.summary['total_tables']
            if total_tables > 0:
                print(f"β Found {total_tables} tables!")

                # Show table details
                for content in result.content:
                    if not content.tables:
                        continue
                    print(f" Page {content.page_number}: {len(content.tables)} table(s)")
                    # Tables are numbered from 1 in the report.
                    for number, table in enumerate(content.tables, start=1):
                        print(f" Table {number}: {table.shape[0]} rows Γ {table.shape[1]} columns")
                        if not table.empty:
                            print(f" Columns: {list(table.columns)}")
                return True
            else:
                print(f"β οΈ No tables found in {test_path.name}")
        except Exception as e:
            # Deliberately broad: report the failure and move on to the
            # next candidate document.
            print(f"β Table extraction test failed: {e}")

    return False
def validate_output_structure():
    """Check that each document folder under ``test_output`` is complete.

    Every sub-directory of ``test_output`` (relative to the current working
    directory) must contain ``summary.json`` and ``extracted_text.txt``.
    Each ``summary.json`` must parse as a JSON object that has both a
    ``summary`` and a ``file_info`` key. One line is printed per file.

    Returns:
        bool: False if ``test_output`` does not exist, or if any expected
        file is missing or any ``summary.json`` is malformed; True
        otherwise.
    """
    # NOTE(review): the non-ASCII glyphs in the messages below look like
    # mis-decoded emoji; they are kept exactly as found - confirm the file's
    # encoding before changing them.
    print("\nπ Validating output structure...")

    test_output_dir = Path("test_output")
    if not test_output_dir.exists():
        print("β Test output directory not found")
        return False

    # Check for expected files
    # NOTE(review): when the directory holds no sub-directories nothing is
    # checked and the result is True - confirm that this is intended.
    expected_files = []
    for item in test_output_dir.iterdir():
        if item.is_dir():
            expected_files.extend([
                item / "summary.json",
                item / "extracted_text.txt",
            ])

    all_valid = True
    for expected_file in expected_files:
        if not expected_file.exists():
            print(f"β Missing: {expected_file}")
            all_valid = False
            continue

        print(f"β Found: {expected_file}")
        if expected_file.name != "summary.json":
            continue

        # Validate JSON structure
        try:
            # Explicit encoding so the result does not depend on the
            # platform's default locale.
            with open(expected_file, encoding="utf-8") as f:
                data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            print(" β Invalid JSON format")
            all_valid = False
            continue

        # The isinstance guard keeps a scalar JSON value (e.g. ``5``) from
        # raising TypeError on the membership tests.
        if isinstance(data, dict) and 'summary' in data and 'file_info' in data:
            print(" π Valid JSON structure")
        else:
            print(" β οΈ Missing keys in JSON")
            all_valid = False

    return all_valid
def main():
    """Run every check in sequence and print a pass/fail summary.

    Returns:
        bool: True only if all three checks returned a truthy result.
    """
    # NOTE(review): the non-ASCII glyphs in the messages below look like
    # mis-decoded emoji; they are kept exactly as found - confirm the file's
    # encoding before changing them.
    print("π Starting Enhanced PDF Processor Tests")
    print("=" * 60)

    # All three checks always run so the summary is complete. The order is
    # kept: the output check reads the "test_output" directory that the
    # single-document test passes to its processor as output_dir.
    # Test 1: Single document processing
    test1_result = test_single_document()
    # Test 2: Table extraction
    test2_result = test_table_extraction()
    # Test 3: Output validation
    test3_result = validate_output_structure()

    # Summary
    print("\n" + "=" * 60)
    print("π TEST SUMMARY")
    summary_rows = (
        ("Single Document Processing", test1_result),
        ("Table Extraction", test2_result),
        ("Output Structure", test3_result),
    )
    for label, passed in summary_rows:
        print(f"{label}: {'β PASS' if passed else 'β FAIL'}")

    overall_success = all(passed for _, passed in summary_rows)
    print(f"\nπ― OVERALL: {'β ALL TESTS PASSED' if overall_success else 'β SOME TESTS FAILED'}")
    if overall_success:
        print("\nπ Ready to process all maternal health documents!")
    else:
        print("\nβ οΈ Please fix issues before processing all documents")
    return overall_success
if __name__ == "__main__":
    # Exit status mirrors the overall result: 0 when main() reports success,
    # 1 otherwise, so a shell or CI job can detect a failed run.
    success = main()
    sys.exit(0 if success else 1)