Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3 | |
""" | |
Test script for Enhanced PDF Processing Pipeline | |
Tests with sample maternal health documents | |
""" | |
import sys | |
from pathlib import Path | |
import json | |
# Add src to path for imports | |
sys.path.append(str(Path(__file__).parent)) | |
from enhanced_pdf_processor import EnhancedMedicalPDFProcessor | |
def test_single_document():
    """Process the first available sample PDF and print a short report.

    Candidate files are tried in order; files that do not exist are skipped.
    For the first file that processes without raising, the page/table/word
    counts, the file size and the positive content-classification scores of
    the first page are printed.

    Returns:
        bool: True as soon as one candidate is processed successfully,
        False if every candidate is missing or raises.
    """
    print("π§ͺ Testing Enhanced PDF Processor with sample document...")

    # Processor writes into a dedicated directory for this test run.
    processor = EnhancedMedicalPDFProcessor(output_dir="test_output")

    # Candidate documents, tried in order.
    candidates = (
        "../Obs/RhESUS.pdf",
        "../Obs/puerperal-sepsis.pdf",
        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf",
    )

    for candidate in candidates:
        pdf_path = Path(candidate)
        if not pdf_path.exists():
            # Missing samples are skipped without comment.
            continue

        print(f"\nπ Testing with: {pdf_path.name}")
        try:
            result = processor.process_single_pdf(pdf_path)

            # Headline numbers for the processed document.
            print(f"β Processing successful!")
            print(f" π Pages: {result.summary['total_pages']}")
            print(f" π Tables: {result.summary['total_tables']}")
            print(f" π Words: {result.summary['total_words']}")
            print(f" πΎ Size: {result.file_info['size_mb']:.2f} MB")

            # Classification scores of the first page, positive ones only.
            if result.content:
                scores = result.content[0].metadata['content_classification']
                print(f" π·οΈ Content Classification:")
                for category, score in scores.items():
                    if not score > 0:
                        continue
                    print(f" - {category}: {score:.4f}")

            # One working document is enough for this smoke test.
            return True
        except Exception as e:
            # Report the failure and fall through to the next candidate.
            print(f"β Processing failed: {e}")

    print("β No test files could be processed successfully")
    return False
def test_table_extraction():
    """Check that at least one table is extracted from a sample PDF.

    Candidate files are tried in order; files that do not exist are skipped.
    For the first file that yields one or more tables, the per-page table
    count and each table's dimensions (and columns, when non-empty) are
    printed.

    Returns:
        bool: True once a document with at least one table has been
        reported, False otherwise.
    """
    print("\nπ Testing table extraction capabilities...")
    processor = EnhancedMedicalPDFProcessor()

    # Documents expected to contain tables.
    candidates = (
        "../Obs/Management-of-Normal-Labourchart.pdf",
        "../Obs/Management-of-thrombocytopaenia-in-pregnancy-Sept-5.pdf",
    )

    for candidate in candidates:
        pdf_path = Path(candidate)
        if not pdf_path.exists():
            continue

        print(f"\nπ Testing table extraction with: {pdf_path.name}")
        try:
            result = processor.process_single_pdf(pdf_path)
            total_tables = result.summary['total_tables']

            if not total_tables > 0:
                print(f"β οΈ No tables found in {pdf_path.name}")
                continue

            print(f"β Found {total_tables} tables!")
            # Break the total down per page and per table.
            for page in result.content:
                if not page.tables:
                    continue
                print(f" Page {page.page_number}: {len(page.tables)} table(s)")
                for number, table in enumerate(page.tables, start=1):
                    rows, cols = table.shape[0], table.shape[1]
                    print(f" Table {number}: {rows} rows Γ {cols} columns")
                    if not table.empty:
                        print(f" Columns: {list(table.columns)}")
            return True
        except Exception as e:
            # Report the failure and move on to the next candidate.
            print(f"β Table extraction test failed: {e}")

    return False
def _summary_json_is_valid(summary_path):
    """Return True if *summary_path* is a JSON object with the required keys.

    The file must decode as UTF-8, parse as JSON, and have a top-level
    object containing both ``summary`` and ``file_info``. The outcome is
    printed as a side effect.
    """
    try:
        with open(summary_path, encoding="utf-8") as f:
            data = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # Undecodable bytes are reported the same way as malformed JSON.
        print(f" β Invalid JSON format")
        return False

    # A non-object top level (list, number, string, ...) cannot carry the
    # required keys; reject it instead of failing on the membership test.
    if isinstance(data, dict) and 'summary' in data and 'file_info' in data:
        print(f" π Valid JSON structure")
        return True

    print(f" β οΈ Missing keys in JSON")
    return False


def validate_output_structure():
    """Validate the per-document output written under ``test_output``.

    Every sub-directory of ``test_output`` (relative to the current working
    directory) must contain ``summary.json`` and ``extracted_text.txt``, and
    each ``summary.json`` must be a JSON object with ``summary`` and
    ``file_info`` keys. Non-directory entries are ignored.

    Returns:
        bool: True only if at least one document directory exists and every
        expected file is present and valid; False if ``test_output`` is
        missing, contains no document directories, or any check fails.
    """
    print("\nπ Validating output structure...")
    test_output_dir = Path("test_output")
    if not test_output_dir.exists():
        print("β Test output directory not found")
        return False

    # Sorted so the report order does not depend on the filesystem.
    document_dirs = sorted(
        item for item in test_output_dir.iterdir() if item.is_dir()
    )
    if not document_dirs:
        # Nothing to validate must not count as a successful validation.
        print("No processed document directories found in test_output")
        return False

    expected_files = [
        doc_dir / file_name
        for doc_dir in document_dirs
        for file_name in ("summary.json", "extracted_text.txt")
    ]

    all_valid = True
    for expected_file in expected_files:
        if not expected_file.exists():
            print(f"β Missing: {expected_file}")
            all_valid = False
            continue

        print(f"β Found: {expected_file}")
        # Only the summary file has a structure that can be checked.
        if expected_file.name == "summary.json":
            if not _summary_json_is_valid(expected_file):
                all_valid = False

    return all_valid
def main():
    """Run the three checks in order and print a pass/fail summary.

    Returns:
        bool: True if single-document processing, table extraction and
        output validation all succeeded, False otherwise.
    """
    banner = "=" * 60

    def mark(ok):
        # Label used on each line of the summary.
        return 'β PASS' if ok else 'β FAIL'

    print("π Starting Enhanced PDF Processor Tests")
    print(banner)

    # All three checks always run, regardless of earlier failures.
    single_doc_ok = test_single_document()
    tables_ok = test_table_extraction()
    output_ok = validate_output_structure()

    print("\n" + banner)
    print("π TEST SUMMARY")
    print(f"Single Document Processing: {mark(single_doc_ok)}")
    print(f"Table Extraction: {mark(tables_ok)}")
    print(f"Output Structure: {mark(output_ok)}")

    overall_success = all((single_doc_ok, tables_ok, output_ok))
    verdict = 'β ALL TESTS PASSED' if overall_success else 'β SOME TESTS FAILED'
    print(f"\nπ― OVERALL: {verdict}")

    print(
        "\nπ Ready to process all maternal health documents!"
        if overall_success
        else "\nβ οΈ Please fix issues before processing all documents"
    )
    return overall_success
if __name__ == "__main__":
    # Exit status 0 when every test passed, 1 otherwise.
    success = main()
    exit_code = 1 if not success else 0
    sys.exit(exit_code)