#!/usr/bin/env python3
"""
Test script for Enhanced PDF Processing Pipeline
Tests with sample maternal health documents
"""

import json
import sys
from pathlib import Path

# Add this script's directory to the import path so the processor module
# next to it can be imported regardless of where the script is launched from.
sys.path.append(str(Path(__file__).parent))

from enhanced_pdf_processor import EnhancedMedicalPDFProcessor  # noqa: E402

# Directory the single-document test asks the processor to write to, and
# that validate_output_structure() inspects afterwards. Relative to the
# current working directory.
TEST_OUTPUT_DIR = "test_output"


def test_single_document() -> bool:
    """Process the first available sample PDF and print a short report.

    Candidate files are tried in order; files that do not exist are skipped.
    The candidate paths are relative to the current working directory.

    Returns:
        True as soon as one document is processed and reported without an
        exception, False if no candidate could be processed.
    """
    print("🧪 Testing Enhanced PDF Processor with sample document...")

    # Initialize processor
    processor = EnhancedMedicalPDFProcessor(output_dir=TEST_OUTPUT_DIR)

    # Test with a smaller document first
    test_files = [
        "../Obs/RhESUS.pdf",
        "../Obs/puerperal-sepsis.pdf",
        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf",
    ]

    for test_file in test_files:
        test_path = Path(test_file)
        if not test_path.exists():
            continue

        print(f"\n📄 Testing with: {test_path.name}")
        try:
            # Process single document
            result = processor.process_single_pdf(test_path)

            # Display results
            print("✅ Processing successful!")
            print(f" 📊 Pages: {result.summary['total_pages']}")
            print(f" 📋 Tables: {result.summary['total_tables']}")
            print(f" 📝 Words: {result.summary['total_words']}")
            print(f" 💾 Size: {result.file_info['size_mb']:.2f} MB")

            # Show content classification for first page
            if result.content:
                first_page = result.content[0]
                classification = first_page.metadata['content_classification']
                print(" 🏷️ Content Classification:")
                for category, score in classification.items():
                    if score > 0:
                        print(f" - {category}: {score:.4f}")

            # Test successful, stop after first working file
            return True
        except Exception as e:
            # Deliberately broad: this is the test boundary and any failure
            # for one file should be reported before trying the next one.
            print(f"❌ Processing failed: {e}")

    print("❌ No test files could be processed successfully")
    return False


def test_table_extraction() -> bool:
    """Check that at least one sample PDF yields extracted tables.

    Candidate files are tried in order; files that do not exist are skipped.
    For the first document that reports tables, the per-page table counts,
    dimensions and column names are printed.

    Returns:
        True if some document reports ``total_tables > 0``, False otherwise.
    """
    print("\n🔍 Testing table extraction capabilities...")

    processor = EnhancedMedicalPDFProcessor()

    # Test with documents likely to have tables
    table_test_files = [
        "../Obs/Management-of-Normal-Labourchart.pdf",
        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf",
    ]

    for test_file in table_test_files:
        test_path = Path(test_file)
        if not test_path.exists():
            continue

        print(f"\n📊 Testing table extraction with: {test_path.name}")
        try:
            result = processor.process_single_pdf(test_path)
            total_tables = result.summary['total_tables']

            if total_tables <= 0:
                print(f"⚠️ No tables found in {test_path.name}")
                continue

            print(f"✅ Found {total_tables} tables!")

            # Show table details
            # NOTE(review): tables are used via .shape/.empty/.columns, so
            # they are presumably pandas DataFrames - confirm in the processor.
            for content in result.content:
                if not content.tables:
                    continue
                print(f" Page {content.page_number}: {len(content.tables)} table(s)")
                for i, table in enumerate(content.tables):
                    print(f" Table {i+1}: {table.shape[0]} rows × {table.shape[1]} columns")
                    if not table.empty:
                        print(f" Columns: {list(table.columns)}")

            return True
        except Exception as e:
            # Deliberately broad: report the failure and try the next file.
            print(f"❌ Table extraction test failed: {e}")

    return False


def _check_summary_json(summary_path: Path) -> bool:
    """Return True if *summary_path* is a JSON object with the required keys.

    The required top-level keys are ``summary`` and ``file_info``. A message
    describing the outcome is printed in every case.
    """
    try:
        # JSON is read as UTF-8 explicitly rather than relying on the
        # platform's default encoding.
        with open(summary_path, encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        print(" ❌ Invalid JSON format")
        return False

    if isinstance(data, dict) and 'summary' in data and 'file_info' in data:
        print(" 📋 Valid JSON structure")
        return True

    print(" ⚠️ Missing keys in JSON")
    return False


def validate_output_structure() -> bool:
    """Validate the files written under the test output directory.

    Every sub-directory of the test output directory is expected to contain
    ``summary.json`` and ``extracted_text.txt``; each ``summary.json`` must
    also be a JSON object with ``summary`` and ``file_info`` keys.

    Returns:
        True only if at least one sub-directory exists and every expected
        file is present and valid. False if the output directory is missing,
        contains no sub-directories, or any check fails.
    """
    print("\n📁 Validating output structure...")

    test_output_dir = Path(TEST_OUTPUT_DIR)
    if not test_output_dir.is_dir():
        print("❌ Test output directory not found")
        return False

    # Check for expected files (sorted so the report order is stable)
    expected_files = [
        item / file_name
        for item in sorted(test_output_dir.iterdir())
        if item.is_dir()
        for file_name in ("summary.json", "extracted_text.txt")
    ]

    # With nothing to check, the validation must not pass vacuously.
    if not expected_files:
        print("❌ No document output directories found")
        return False

    all_valid = True
    for expected_file in expected_files:
        if not expected_file.exists():
            print(f"❌ Missing: {expected_file}")
            all_valid = False
            continue

        print(f"✅ Found: {expected_file}")

        # Validate JSON structure
        if expected_file.name == "summary.json" and not _check_summary_json(expected_file):
            all_valid = False

    return all_valid


def _status(passed: bool) -> str:
    """Return the PASS/FAIL label used in the test summary."""
    return '✅ PASS' if passed else '❌ FAIL'


def main() -> bool:
    """Run all tests, print a summary, and return True if every test passed."""
    print("🚀 Starting Enhanced PDF Processor Tests")
    print("=" * 60)

    # Test 1: Single document processing
    test1_result = test_single_document()

    # Test 2: Table extraction
    test2_result = test_table_extraction()

    # Test 3: Output validation (inspects what test 1 wrote)
    test3_result = validate_output_structure()

    # Summary
    print("\n" + "=" * 60)
    print("📊 TEST SUMMARY")
    print(f"Single Document Processing: {_status(test1_result)}")
    print(f"Table Extraction: {_status(test2_result)}")
    print(f"Output Structure: {_status(test3_result)}")

    overall_success = all([test1_result, test2_result, test3_result])
    print(f"\n🎯 OVERALL: {'✅ ALL TESTS PASSED' if overall_success else '❌ SOME TESTS FAILED'}")

    if overall_success:
        print("\n🚀 Ready to process all maternal health documents!")
    else:
        print("\n⚠️ Please fix issues before processing all documents")

    return overall_success


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)