vedaMD

Sleeping

File size: 6,610 Bytes

19aaa42

#!/usr/bin/env python3
"""
Test script for Enhanced PDF Processing Pipeline
Tests with sample maternal health documents
"""

import sys
from pathlib import Path
import json

# Add src to path for imports
sys.path.append(str(Path(__file__).parent))

from enhanced_pdf_processor import EnhancedMedicalPDFProcessor

def test_single_document() -> bool:
    """Process the first usable sample PDF and print a short report.

    The candidate files are tried in order; their paths are relative to the
    current working directory. The function stops at the first document that
    is processed and reported without raising.

    Returns:
        bool: True as soon as one document is processed and reported,
        False when every candidate is missing or fails.
    """
    print("🧪 Testing Enhanced PDF Processor with sample document...")

    # Initialize processor
    processor = EnhancedMedicalPDFProcessor(output_dir="test_output")

    # Candidates, tried in this order
    test_files = [
        "../Obs/RhESUS.pdf",
        "../Obs/puerperal-sepsis.pdf",
        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf",
    ]

    for test_file in test_files:
        test_path = Path(test_file)
        if not test_path.exists():
            # Report the skip explicitly: otherwise running from the wrong
            # directory only produces the generic failure message below.
            print(f"\n⚠️  Sample file not found, skipping: {test_path}")
            continue

        print(f"\n📄 Testing with: {test_path.name}")

        # The report is inside the try as well, so a result lacking an
        # expected field is reported as a failure for this file and the
        # next candidate is tried.
        try:
            # Process single document
            result = processor.process_single_pdf(test_path)

            # Display results
            print("✅ Processing successful!")
            print(f"   📊 Pages: {result.summary['total_pages']}")
            print(f"   📋 Tables: {result.summary['total_tables']}")
            print(f"   📝 Words: {result.summary['total_words']}")
            print(f"   💾 Size: {result.file_info['size_mb']:.2f} MB")

            # Show content classification for first page
            if result.content:
                first_page = result.content[0]
                classification = first_page.metadata['content_classification']
                print("   🏷️  Content Classification:")
                for category, score in classification.items():
                    if score > 0:
                        print(f"      - {category}: {score:.4f}")

            # One working document is enough for this check
            return True

        except Exception as e:
            # Top-level boundary of a test script: report and try the next file
            print(f"❌ Processing failed: {e}")

    print("❌ No test files could be processed successfully")
    return False

def test_table_extraction() -> bool:
    """Check that at least one sample PDF yields extracted tables.

    The candidate files are tried in order; their paths are relative to the
    current working directory. For the first document whose summary reports
    one or more tables, the per-page table counts and dimensions are printed.

    Returns:
        bool: True once a document with at least one table has been
        reported, False when every candidate is missing, fails, or
        contains no tables.
    """
    print("\n🔍 Testing table extraction capabilities...")

    # NOTE(review): no output_dir is passed here, unlike in
    # test_single_document, so the processor's default applies — confirm
    # that this is intended.
    processor = EnhancedMedicalPDFProcessor()

    # Test with documents likely to have tables
    table_test_files = [
        "../Obs/Management-of-Normal-Labourchart.pdf",
        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf",
    ]

    for test_file in table_test_files:
        test_path = Path(test_file)
        if not test_path.exists():
            # Report the skip explicitly: otherwise a wrong working
            # directory makes this test fail without any explanation.
            print(f"\n⚠️  Sample file not found, skipping: {test_path}")
            continue

        print(f"\n📊 Testing table extraction with: {test_path.name}")

        try:
            result = processor.process_single_pdf(test_path)
            total_tables = result.summary['total_tables']

            if total_tables > 0:
                print(f"✅ Found {total_tables} tables!")

                # Show table details
                for content in result.content:
                    if content.tables:
                        print(f"   Page {content.page_number}: {len(content.tables)} table(s)")
                        # Tables expose .shape/.empty/.columns — presumably
                        # pandas DataFrames; verify against the processor.
                        for i, table in enumerate(content.tables, start=1):
                            print(f"      Table {i}: {table.shape[0]} rows × {table.shape[1]} columns")
                            if not table.empty:
                                print(f"      Columns: {list(table.columns)}")
                return True

            print(f"⚠️  No tables found in {test_path.name}")

        except Exception as e:
            # Top-level boundary of a test script: report and try the next file
            print(f"❌ Table extraction test failed: {e}")

    return False

def validate_output_structure() -> bool:
    """Check that every document folder in ``test_output`` holds the expected files.

    Each sub-directory of ``test_output`` (relative to the current working
    directory) must contain ``summary.json`` and ``extracted_text.txt``, and
    ``summary.json`` must be a JSON object with the keys ``summary`` and
    ``file_info``. Every finding is printed as it is checked.

    Returns:
        bool: True only when at least one document folder exists and all of
        its expected files are present and valid, False otherwise.
    """
    print("\n📁 Validating output structure...")

    test_output_dir = Path("test_output")
    # is_dir() rather than exists(): a plain file with that name cannot be
    # iterated and is treated like a missing directory.
    if not test_output_dir.is_dir():
        print("❌ Test output directory not found")
        return False

    # Sorted so the report order does not depend on the file system.
    document_dirs = sorted(item for item in test_output_dir.iterdir() if item.is_dir())
    if not document_dirs:
        # Nothing to validate must not count as a pass.
        print("❌ No processed document folders found in test output directory")
        return False

    all_valid = True
    for document_dir in document_dirs:
        expected_files = (
            document_dir / "summary.json",
            document_dir / "extracted_text.txt",
        )
        for expected_file in expected_files:
            if not expected_file.exists():
                print(f"❌ Missing: {expected_file}")
                all_valid = False
                continue

            print(f"✅ Found: {expected_file}")

            # Only the summary has a structure to validate
            if expected_file.name != "summary.json":
                continue

            try:
                # Explicit encoding so the result does not depend on the
                # platform default.
                with open(expected_file, encoding="utf-8") as f:
                    data = json.load(f)
            except (json.JSONDecodeError, UnicodeDecodeError):
                print("   ❌ Invalid JSON format")
                all_valid = False
                continue

            # A top-level list or scalar is valid JSON but not a summary
            if isinstance(data, dict) and 'summary' in data and 'file_info' in data:
                print("   📋 Valid JSON structure")
            else:
                print("   ⚠️  Missing keys in JSON")
                all_valid = False

    return all_valid

def main():
    """Execute the three checks in sequence and print a pass/fail report.

    Returns True only if every check passed, False otherwise.
    """
    divider = "=" * 60
    print("🚀 Starting Enhanced PDF Processor Tests")
    print(divider)

    # Label for the report paired with the check to run, in execution order:
    # single document processing, table extraction, output validation.
    checks = (
        ("Single Document Processing", test_single_document),
        ("Table Extraction", test_table_extraction),
        ("Output Structure", validate_output_structure),
    )
    outcomes = [(label, check()) for label, check in checks]

    # Summary
    print("\n" + divider)
    print("📊 TEST SUMMARY")
    for label, passed in outcomes:
        status = '✅ PASS' if passed else '❌ FAIL'
        print(f"{label}: {status}")

    overall_success = all(passed for _, passed in outcomes)
    verdict = '✅ ALL TESTS PASSED' if overall_success else '❌ SOME TESTS FAILED'
    print(f"\n🎯 OVERALL: {verdict}")

    if not overall_success:
        print("\n⚠️  Please fix issues before processing all documents")
    else:
        print("\n🚀 Ready to process all maternal health documents!")

    return overall_success

if __name__ == "__main__":
    # Map the overall outcome onto the process exit status (0 means success).
    success = main()
    exit_status = 1 if not success else 0
    sys.exit(exit_status)