"""
Enhanced OCR Engine for GAIA Agent - Phase 6
Handles multi-orientation text recognition, rotated/distorted text, and advanced OCR
"""

import logging
import os
import re
import tempfile
from pathlib import Path
from typing import Dict, Any, Iterator, List, Optional, Tuple

import numpy as np

# Image processing
try:
    from PIL import Image, ImageEnhance, ImageFilter, ImageOps
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False

# OCR engine
try:
    import pytesseract
    PYTESSERACT_AVAILABLE = True
except ImportError:
    PYTESSERACT_AVAILABLE = False

# Computer vision for advanced processing
try:
    import cv2
    CV2_AVAILABLE = True
except ImportError:
    CV2_AVAILABLE = False

logger = logging.getLogger(__name__)

# Digits that OCR commonly emits in place of a letter, as (lowercase, uppercase).
_DIGIT_LOOKALIKES = {
    '0': ('o', 'O'),
    '1': ('l', 'I'),
    '5': ('s', 'S'),
    '8': ('b', 'B'),
}

# A single look-alike digit with a letter directly on both sides ("w0rld").
_DIGIT_IN_WORD_RE = re.compile(r'(?<=[A-Za-z])[0158](?=[A-Za-z])')


def _replace_lookalike_digit(match: "re.Match") -> str:
    """Return the letter that replaces a look-alike digit matched inside a word."""
    source = match.string
    pos = match.start()
    lower, upper = _DIGIT_LOOKALIKES[match.group()]
    # The lookbehind/lookahead guarantee both neighbours exist and are letters.
    both_upper = source[pos - 1].isupper() and source[pos + 1].isupper()
    return upper if both_upper else lower


class EnhancedOCREngine:
    """
    Enhanced OCR engine for complex text recognition scenarios.

    Features:
    - Multi-orientation text recognition (0°, 90°, 180°, 270°)
    - Rotated and distorted text handling
    - Multi-language OCR support
    - Text quality enhancement and preprocessing
    - Confidence scoring for OCR results
    - Advanced text extraction from complex layouts
    """

    def __init__(self):
        """Initialize the enhanced OCR engine.

        Sets ``self.available`` to False (and logs a warning) when Pillow,
        pytesseract, or the tesseract binary is missing. The configuration
        attributes are always populated so the instance is never half-built.
        """
        self.name = "enhanced_ocr_engine"
        self.description = "Enhanced OCR for multi-orientation text, rotated/distorted text, and complex layouts"

        # OCR configurations for different scenarios.
        # Tesseract page segmentation modes: 7 = single text line,
        # 8 = single word, 10 = single character, 11 = sparse text.
        self.ocr_configs = {
            'default': '--oem 3 --psm 6',
            'single_line': '--oem 3 --psm 7',
            'single_word': '--oem 3 --psm 8',
            'sparse_text': '--oem 3 --psm 11',
            'single_char': '--oem 3 --psm 10',
            'vertical_text': '--oem 3 --psm 5',
            'uniform_block': '--oem 3 --psm 6'
        }

        # Supported orientations
        self.orientations = [0, 90, 180, 270]

        # Language codes for multi-language support
        self.supported_languages = [
            'eng', 'ara', 'chi_sim', 'chi_tra', 'fra', 'deu', 'spa', 'rus',
            'jpn', 'kor', 'hin', 'tha', 'vie', 'heb', 'tur', 'pol',
            'nld', 'ita', 'por', 'swe', 'dan', 'nor', 'fin', 'ces',
            'hun', 'ron'
        ]

        # Check dependencies
        self.available = PIL_AVAILABLE and PYTESSERACT_AVAILABLE
        if not self.available:
            missing = []
            if not PIL_AVAILABLE:
                missing.append("PIL/Pillow")
            if not PYTESSERACT_AVAILABLE:
                missing.append("pytesseract")
            logger.warning(f"⚠️ Enhanced OCR Engine not available - missing: {', '.join(missing)}")
            return

        # Test tesseract installation
        try:
            pytesseract.get_tesseract_version()
            logger.info("✅ Tesseract OCR engine detected")
        except Exception as e:
            logger.warning(f"⚠️ Tesseract not properly installed: {e}")
            self.available = False
            return

        logger.info("✅ Enhanced OCR Engine initialized")

    @staticmethod
    def _to_confidence(value: Any) -> int:
        """Convert a raw confidence cell to int, returning -1 when unparsable.

        Accepts ints, floats and numeric strings such as ``'95.5'``.
        """
        try:
            return int(float(value))
        except (TypeError, ValueError, OverflowError):
            return -1

    @classmethod
    def _iter_words(cls, data: Dict[str, Any]) -> Iterator[Tuple[int, str, int]]:
        """Yield ``(index, text, confidence)`` for non-empty tokens with confidence > 0."""
        for index, (raw_text, raw_conf) in enumerate(zip(data['text'], data['conf'])):
            text = str(raw_text).strip()
            conf = cls._to_confidence(raw_conf)
            if text and conf > 0:
                yield index, text, conf

    def preprocess_image(self, image: "Image.Image", enhancement_level: str = 'medium') -> "Image.Image":
        """
        Preprocess image for better OCR results.

        Args:
            image: PIL Image object
            enhancement_level: 'light', 'medium', 'heavy'

        Returns:
            Preprocessed PIL Image. The input is returned unchanged when it is
            not a PIL image (or Pillow is not installed); if a step fails, the
            image as processed up to that point is returned.
        """
        if not PIL_AVAILABLE or not isinstance(image, Image.Image):
            return image

        try:
            # Convert to RGB if necessary
            if image.mode != 'RGB':
                image = image.convert('RGB')

            # Apply enhancements based on level
            if enhancement_level in ('medium', 'heavy'):
                image = ImageEnhance.Contrast(image).enhance(1.2)
                image = ImageEnhance.Sharpness(image).enhance(1.1)

            if enhancement_level == 'heavy':
                # NOTE(review): the source's indentation was lost; the grayscale
                # and extra-contrast steps are assumed to belong to the 'heavy'
                # level only - confirm against the original file.
                # Reduce noise
                image = image.filter(ImageFilter.MedianFilter(size=3))
                # Enhance brightness slightly
                image = ImageEnhance.Brightness(image).enhance(1.05)
                # Convert to grayscale for better OCR
                image = ImageOps.grayscale(image)
                # Increase contrast for text
                image = ImageEnhance.Contrast(image).enhance(1.3)

            return image

        except Exception as e:
            logger.warning(f"Image preprocessing failed: {e}")
            return image

    def rotate_image(self, image: "Image.Image", angle: int) -> "Image.Image":
        """
        Rotate image by specified angle.

        Args:
            image: PIL Image object
            angle: Rotation angle in degrees (applied clockwise)

        Returns:
            Rotated PIL Image, or the input image when no rotation is needed
            or the rotation fails.
        """
        try:
            # Any multiple of a full turn is a no-op.
            if angle % 360 == 0:
                return image

            # PIL rotates counter-clockwise, hence the negated angle.
            return image.rotate(-angle, expand=True, fillcolor='white')

        except Exception as e:
            logger.warning(f"Image rotation failed: {e}")
            return image

    def detect_text_orientation(self, image: "Image.Image") -> Dict[str, Any]:
        """
        Detect the orientation of text in the image.

        Runs OCR at every angle in ``self.orientations`` and picks the angle
        with the highest average word confidence.

        Args:
            image: PIL Image object

        Returns:
            Dictionary with 'best_orientation', 'confidence',
            'orientations_tested' and 'method'. Defaults are returned when the
            engine is unavailable.
        """
        result = {
            'best_orientation': 0,
            'confidence': 0.0,
            'orientations_tested': [],
            'method': 'ocr_confidence'
        }

        if not self.available:
            return result

        try:
            best_confidence = 0.0
            best_orientation = 0
            orientation_results = []

            # Test each orientation
            for angle in self.orientations:
                rotated_image = self.rotate_image(image, angle)
                preprocessed = self.preprocess_image(rotated_image, 'light')

                try:
                    data = pytesseract.image_to_data(
                        preprocessed,
                        config=self.ocr_configs['default'],
                        output_type=pytesseract.Output.DICT
                    )

                    # Average confidence over real (non-blank) words only.
                    confidences = [conf for _, _, conf in self._iter_words(data)]
                    avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0

                    orientation_results.append({
                        'angle': angle,
                        'confidence': avg_confidence,
                        'text_blocks': len(confidences)
                    })

                    if avg_confidence > best_confidence:
                        best_confidence = avg_confidence
                        best_orientation = angle

                except Exception as e:
                    # One failing orientation must not abort the others.
                    logger.warning(f"OCR failed for orientation {angle}: {e}")
                    orientation_results.append({
                        'angle': angle,
                        'confidence': 0,
                        'text_blocks': 0
                    })

            result['best_orientation'] = best_orientation
            result['confidence'] = best_confidence
            result['orientations_tested'] = orientation_results

        except Exception as e:
            logger.warning(f"Orientation detection failed: {e}")

        return result

    def extract_text_with_confidence(self, image: "Image.Image", config: str = 'default',
                                     languages: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Extract text from image with confidence scores.

        Args:
            image: PIL Image object
            config: OCR configuration key (unknown keys fall back to 'default')
            languages: List of language codes to use (defaults to ['eng'])

        Returns:
            Dictionary with 'text', 'confidence', 'word_confidences',
            'bounding_boxes' and 'languages_used'. On failure the empty
            defaults are returned and a warning is logged.
        """
        languages_used = list(languages) if languages else ['eng']
        result = {
            'text': '',
            'confidence': 0.0,
            'word_confidences': [],
            'bounding_boxes': [],
            'languages_used': languages_used
        }

        if not self.available:
            return result

        try:
            # Tesseract expects multiple languages joined with '+'.
            lang_string = '+'.join(languages_used)
            ocr_config = self.ocr_configs.get(config, self.ocr_configs['default'])

            # Extract text with detailed data; the language goes through the
            # dedicated parameter rather than being spliced into the config.
            data = pytesseract.image_to_data(
                image,
                lang=lang_string,
                config=ocr_config,
                output_type=pytesseract.Output.DICT
            )

            # Process results
            words = []
            confidences = []
            boxes = []
            for i, text, conf in self._iter_words(data):
                words.append(text)
                confidences.append(conf)
                boxes.append({
                    'x': data['left'][i],
                    'y': data['top'][i],
                    'width': data['width'][i],
                    'height': data['height'][i],
                    'text': text,
                    'confidence': conf
                })

            # Combine results
            result['text'] = ' '.join(words)
            result['confidence'] = sum(confidences) / len(confidences) if confidences else 0.0
            result['word_confidences'] = confidences
            result['bounding_boxes'] = boxes

        except Exception as e:
            logger.warning(f"Text extraction failed: {e}")

        return result

    def process_multi_orientation_ocr(self, image: "Image.Image",
                                      auto_detect_orientation: bool = True) -> Dict[str, Any]:
        """
        Process OCR with multiple orientations and return best result.

        Args:
            image: PIL Image object
            auto_detect_orientation: When True, detect the best orientation
                first and OCR only that one; when False, OCR every orientation
                and keep the most confident result.

        Returns:
            Dictionary with best OCR results plus 'best_orientation' and
            'orientation_results'.
        """
        result = {
            'text': '',
            'confidence': 0.0,
            'best_orientation': 0,
            'orientation_results': [],
            'preprocessing_applied': True
        }

        if not self.available:
            return result

        try:
            # Preprocess image
            preprocessed = self.preprocess_image(image, 'medium')

            if auto_detect_orientation:
                # Detect best orientation first
                orientation_info = self.detect_text_orientation(preprocessed)
                best_angle = orientation_info['best_orientation']

                # Process with best orientation
                rotated = self.rotate_image(preprocessed, best_angle)
                result.update(self.extract_text_with_confidence(rotated))
                result['best_orientation'] = best_angle
                result['orientation_results'] = orientation_info['orientations_tested']
            else:
                # Try all orientations and pick best
                best_confidence = 0.0
                best_result = None
                best_angle = 0
                orientation_results = []

                for angle in self.orientations:
                    rotated = self.rotate_image(preprocessed, angle)
                    ocr_result = self.extract_text_with_confidence(rotated)

                    orientation_results.append({
                        'angle': angle,
                        'confidence': ocr_result['confidence'],
                        'text_length': len(ocr_result['text']),
                        'word_count': len(ocr_result['text'].split())
                    })

                    if ocr_result['confidence'] > best_confidence:
                        best_confidence = ocr_result['confidence']
                        best_result = ocr_result
                        best_angle = angle

                if best_result:
                    result.update(best_result)
                    result['best_orientation'] = best_angle
                # Report per-orientation diagnostics even when nothing was recognized.
                result['orientation_results'] = orientation_results

        except Exception as e:
            logger.error(f"Multi-orientation OCR failed: {e}")

        return result

    def process_image_file(self, image_path: str, **kwargs) -> Dict[str, Any]:
        """
        Process an image file with enhanced OCR.

        Args:
            image_path: Path to image file
            **kwargs: Additional arguments forwarded to
                ``process_multi_orientation_ocr``

        Returns:
            Dictionary with 'success', 'error', 'text', 'confidence' and the
            remaining OCR result fields. Errors are reported through 'error'
            rather than raised.
        """
        result = {
            'success': False,
            'error': '',
            'text': '',
            'confidence': 0.0
        }

        if not self.available:
            result['error'] = 'OCR engine not available'
            return result

        try:
            # The context manager closes the underlying file handle once OCR is done.
            with Image.open(image_path) as image:
                ocr_result = self.process_multi_orientation_ocr(image, **kwargs)

            result['success'] = True
            result.update(ocr_result)

        except Exception as e:
            result['error'] = str(e)
            logger.error(f"Image file processing failed: {e}")

        return result

    def enhance_text_quality(self, text: str) -> str:
        """
        Enhance OCR text quality by fixing common errors.

        Applied corrections:
        - runs of whitespace (including newlines) collapse to a single space;
        - a space before ``, . ! ?`` is removed;
        - a single ``0``/``1``/``5``/``8`` with a letter on both sides is
          replaced by its look-alike letter (heuristic; standalone numbers
          are never touched).

        Multi-character substitutions such as 'rn' -> 'm' are deliberately not
        applied: without a dictionary they corrupt valid words ("modern").

        Args:
            text: Raw OCR text

        Returns:
            Enhanced text (empty or None input is returned unchanged)
        """
        if not text:
            return text

        # Collapse whitespace first so "word  ," is fixed as well.
        enhanced = ' '.join(text.split())

        # Punctuation fixes
        for mark in ',.!?':
            enhanced = enhanced.replace(f' {mark}', mark)

        # Context-dependent number/letter confusions
        return _DIGIT_IN_WORD_RE.sub(_replace_lookalike_digit, enhanced)


def get_enhanced_ocr_tools() -> List[EnhancedOCREngine]:
    """Get list of enhanced OCR tools.

    Returns:
        A one-element list with a ready engine, or an empty list when the
        engine is unavailable or could not be created.
    """
    try:
        ocr_engine = EnhancedOCREngine()
        if ocr_engine.available:
            return [ocr_engine]
        else:
            logger.warning("⚠️ Enhanced OCR engine not available")
            return []
    except Exception as e:
        logger.error(f"❌ Failed to create enhanced OCR engine: {e}")
        return []