vedaMD / src /test_pdf_processor.py
sniro23's picture
Initial commit without binary files
19aaa42
#!/usr/bin/env python3
"""
Test script for Enhanced PDF Processing Pipeline
Tests with sample maternal health documents
"""
import sys
from pathlib import Path
import json
# Add src to path for imports
sys.path.append(str(Path(__file__).parent))
from enhanced_pdf_processor import EnhancedMedicalPDFProcessor
def test_single_document():
    """Process the first available sample PDF and print a short report.

    Candidate paths are relative to the current working directory and are
    tried in order. Paths that do not exist are skipped silently. The
    function stops at the first document that is processed and reported
    without raising.

    Returns:
        bool: True once one candidate was processed successfully, False if
        every candidate was missing or raised during processing/reporting.
    """
    print("πŸ§ͺ Testing Enhanced PDF Processor with sample document...")

    # Results are written under "test_output" (passed as output_dir).
    pdf_processor = EnhancedMedicalPDFProcessor(output_dir="test_output")

    # Smaller documents first.
    candidates = (
        "../Obs/RhESUS.pdf",
        "../Obs/puerperal-sepsis.pdf",
        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf",
    )

    for candidate in candidates:
        pdf_path = Path(candidate)
        if not pdf_path.exists():
            continue

        print(f"\nπŸ“„ Testing with: {pdf_path.name}")
        try:
            outcome = pdf_processor.process_single_pdf(pdf_path)

            # Headline figures for the processed document.
            print("βœ… Processing successful!")
            print(f" πŸ“Š Pages: {outcome.summary['total_pages']}")
            print(f" πŸ“‹ Tables: {outcome.summary['total_tables']}")
            print(f" πŸ“ Words: {outcome.summary['total_words']}")
            print(f" πŸ’Ύ Size: {outcome.file_info['size_mb']:.2f} MB")

            # Content classification of the first page, positive scores only.
            if outcome.content:
                page_scores = outcome.content[0].metadata['content_classification']
                print(" 🏷️ Content Classification:")
                positive_scores = (
                    (label, value)
                    for label, value in page_scores.items()
                    if value > 0
                )
                for label, value in positive_scores:
                    print(f" - {label}: {value:.4f}")

            # One working file is enough for this smoke test.
            return True
        except Exception as exc:
            # Report the failure and fall through to the next candidate.
            print(f"❌ Processing failed: {exc}")

    print("❌ No test files could be processed successfully")
    return False
def test_table_extraction():
    """Check that at least one sample PDF yields extracted tables.

    Candidate paths are relative to the current working directory; missing
    files are skipped. For the first document whose summary reports a
    positive table count, per-page table details are printed.

    Returns:
        bool: True as soon as one document reports tables and its details
        were printed without raising; False otherwise.
    """
    print("\nπŸ” Testing table extraction capabilities...")

    pdf_processor = EnhancedMedicalPDFProcessor()

    # Documents expected to contain tables.
    candidates = (
        "../Obs/Management-of-Normal-Labourchart.pdf",
        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf",
    )

    for candidate in candidates:
        pdf_path = Path(candidate)
        if not pdf_path.exists():
            continue

        print(f"\nπŸ“Š Testing table extraction with: {pdf_path.name}")
        try:
            outcome = pdf_processor.process_single_pdf(pdf_path)
            table_count = outcome.summary['total_tables']

            if not table_count > 0:
                print(f"⚠️ No tables found in {pdf_path.name}")
                continue

            print(f"βœ… Found {table_count} tables!")

            # Per-page breakdown; tables expose .shape/.empty/.columns
            # (presumably pandas DataFrames -- confirm against the processor).
            for page in outcome.content:
                if not page.tables:
                    continue
                print(f" Page {page.page_number}: {len(page.tables)} table(s)")
                for number, table in enumerate(page.tables, start=1):
                    print(f" Table {number}: {table.shape[0]} rows Γ— {table.shape[1]} columns")
                    if not table.empty:
                        print(f" Columns: {list(table.columns)}")
            return True
        except Exception as exc:
            print(f"❌ Table extraction test failed: {exc}")

    return False
def validate_output_structure():
    """Check that every document folder under ``test_output`` is complete.

    ``test_output`` is looked up relative to the current working directory.
    Each of its sub-directories must contain ``summary.json`` and
    ``extracted_text.txt``, and each ``summary.json`` must decode to a JSON
    object holding the ``summary`` and ``file_info`` keys.

    Returns:
        bool: True only if at least one sub-directory exists and every
        expected file is present and well-formed. False if ``test_output``
        is missing or not a directory, contains no sub-directories, or any
        expected file is missing or malformed.
    """
    print("\nπŸ“ Validating output structure...")
    test_output_dir = Path("test_output")

    # is_dir() also rejects a regular file with that name, which would
    # otherwise make iterdir() raise.
    if not test_output_dir.is_dir():
        print("❌ Test output directory not found")
        return False

    # Sorted so the report order does not depend on the filesystem.
    document_dirs = sorted(
        entry for entry in test_output_dir.iterdir() if entry.is_dir()
    )

    # Without any document folder nothing would be checked at all; treat
    # that as a failure instead of a vacuous pass.
    if not document_dirs:
        print("❌ No processed document directories found")
        return False

    all_valid = True
    for document_dir in document_dirs:
        expected_files = (
            document_dir / "summary.json",
            document_dir / "extracted_text.txt",
        )
        for expected_file in expected_files:
            if not expected_file.exists():
                print(f"❌ Missing: {expected_file}")
                all_valid = False
                continue

            print(f"βœ… Found: {expected_file}")

            # Only the summary file has a structure to validate.
            if expected_file.name != "summary.json":
                continue

            try:
                # Explicit UTF-8 so the result does not depend on the
                # platform's default locale encoding.
                with open(expected_file, encoding="utf-8") as f:
                    data = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                print(" ❌ Invalid JSON format")
                all_valid = False
                continue

            # The top-level value must be an object; a bare number would
            # make the `in` test raise and a list could match by accident.
            if isinstance(data, dict) and 'summary' in data and 'file_info' in data:
                print(" πŸ“‹ Valid JSON structure")
            else:
                print(" ⚠️ Missing keys in JSON")
                all_valid = False

    return all_valid
def main():
    """Run the three checks in order, print a summary, and report the outcome.

    The checks run unconditionally, one after another: single-document
    processing, table extraction, then output-structure validation.

    Returns:
        bool: True only when all three checks returned a truthy value.
    """
    separator = "=" * 60

    def verdict(passed):
        # Label used on each line of the summary table.
        return 'βœ… PASS' if passed else '❌ FAIL'

    print("πŸš€ Starting Enhanced PDF Processor Tests")
    print(separator)

    # Test 1: single document processing
    single_ok = test_single_document()
    # Test 2: table extraction
    tables_ok = test_table_extraction()
    # Test 3: output validation
    structure_ok = validate_output_structure()

    # Summary
    print("\n" + separator)
    print("πŸ“Š TEST SUMMARY")
    print(f"Single Document Processing: {verdict(single_ok)}")
    print(f"Table Extraction: {verdict(tables_ok)}")
    print(f"Output Structure: {verdict(structure_ok)}")

    overall_success = all((single_ok, tables_ok, structure_ok))
    headline = 'βœ… ALL TESTS PASSED' if overall_success else '❌ SOME TESTS FAILED'
    print(f"\n🎯 OVERALL: {headline}")

    closing_note = (
        "\nπŸš€ Ready to process all maternal health documents!"
        if overall_success
        else "\n⚠️ Please fix issues before processing all documents"
    )
    print(closing_note)

    return overall_success
if __name__ == "__main__":
    # Map the overall result onto the process exit status:
    # 0 when every check passed, 1 otherwise.
    exit_status = 0 if main() else 1
    sys.exit(exit_status)