File size: 16,216 Bytes
9a6a4dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
"""
Enhanced OCR Engine for GAIA Agent - Phase 6
Handles multi-orientation text recognition, rotated/distorted text, and advanced OCR
"""

import logging
import numpy as np
from typing import Dict, Any, List, Optional, Tuple
from pathlib import Path
import tempfile
import os

# Image processing
try:
    from PIL import Image, ImageEnhance, ImageFilter, ImageOps
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False

# OCR engine
try:
    import pytesseract
    PYTESSERACT_AVAILABLE = True
except ImportError:
    PYTESSERACT_AVAILABLE = False

# Computer vision for advanced processing
try:
    import cv2
    CV2_AVAILABLE = True
except ImportError:
    CV2_AVAILABLE = False

logger = logging.getLogger(__name__)


class EnhancedOCREngine:
    """
    Enhanced OCR engine for complex text recognition scenarios.

    Features:
    - Multi-orientation text recognition (0°, 90°, 180°, 270°)
    - Rotated and distorted text handling
    - Multi-language OCR support
    - Text quality enhancement and preprocessing
    - Confidence scoring for OCR results
    - Advanced text extraction from complex layouts

    PIL types are referenced through string annotations so that the class can
    still be defined (and report ``available = False``) when Pillow is not
    installed; evaluating ``Image.Image`` eagerly would raise ``NameError``.
    """

    # Tesseract command-line presets keyed by scenario. Per the Tesseract
    # page-segmentation modes, PSM 7 is "single text line" and PSM 8 is
    # "single word".
    DEFAULT_OCR_CONFIGS = {
        'default': '--oem 3 --psm 6',
        'single_line': '--oem 3 --psm 7',
        'single_word': '--oem 3 --psm 8',
        'sparse_text': '--oem 3 --psm 11',
        'single_char': '--oem 3 --psm 10',
        'vertical_text': '--oem 3 --psm 5',
        'uniform_block': '--oem 3 --psm 6',
    }

    # Rotation angles (degrees) tried when searching for upright text.
    DEFAULT_ORIENTATIONS = (0, 90, 180, 270)

    # Language codes for multi-language support.
    DEFAULT_LANGUAGES = (
        'eng', 'ara', 'chi_sim', 'chi_tra', 'fra', 'deu', 'spa', 'rus',
        'jpn', 'kor', 'hin', 'tha', 'vie', 'heb', 'tur', 'pol', 'nld',
        'ita', 'por', 'swe', 'dan', 'nor', 'fin', 'ces', 'hun', 'ron',
    )

    # Digits that OCR commonly confuses with letters (stored lower-case; the
    # case actually used is taken from the surrounding text).
    _DIGIT_TO_LETTER = {'0': 'o', '1': 'l', '5': 's', '8': 'b'}

    def __init__(self):
        """Initialize the enhanced OCR engine and probe its dependencies."""
        self.name = "enhanced_ocr_engine"
        self.description = "Enhanced OCR for multi-orientation text, rotated/distorted text, and complex layouts"

        # Configuration is assigned before the dependency checks so these
        # attributes exist even when the engine turns out to be unavailable.
        self.ocr_configs = dict(self.DEFAULT_OCR_CONFIGS)
        self.orientations = list(self.DEFAULT_ORIENTATIONS)
        self.supported_languages = list(self.DEFAULT_LANGUAGES)

        # Check dependencies
        self.available = PIL_AVAILABLE and PYTESSERACT_AVAILABLE

        if not self.available:
            missing = []
            if not PIL_AVAILABLE:
                missing.append("PIL/Pillow")
            if not PYTESSERACT_AVAILABLE:
                missing.append("pytesseract")
            logger.warning(f"⚠️ Enhanced OCR Engine not available - missing: {', '.join(missing)}")
            return

        # Test tesseract installation
        try:
            pytesseract.get_tesseract_version()
            logger.info("✅ Tesseract OCR engine detected")
        except Exception as e:
            logger.warning(f"⚠️ Tesseract not properly installed: {e}")
            self.available = False
            return

        logger.info("✅ Enhanced OCR Engine initialized")

    @staticmethod
    def _parse_confidence(raw: Any) -> float:
        """
        Convert a raw confidence cell from ``image_to_data`` into a float.

        The value may arrive as an int, a float, or a string such as
        ``'96.06'``; ``int('96.06')`` would raise, so parsing goes through
        ``float``. Unparsable values yield ``-1.0`` so that callers' ``> 0``
        filters drop them.

        Args:
            raw: Confidence value as reported by pytesseract

        Returns:
            Confidence as a float, or -1.0 when it cannot be parsed
        """
        try:
            return float(raw)
        except (TypeError, ValueError):
            return -1.0

    def preprocess_image(self, image: 'Image.Image', enhancement_level: str = 'medium') -> 'Image.Image':
        """
        Preprocess image for better OCR results.

        Args:
            image: PIL Image object
            enhancement_level: 'light', 'medium', 'heavy'

        Returns:
            Preprocessed grayscale PIL Image. The input is returned unchanged
            when Pillow is unavailable or the argument is not a PIL image; if
            a processing step fails, the image as processed so far is returned.
        """
        # Guard on PIL_AVAILABLE first: without Pillow the name ``Image`` is
        # undefined and the isinstance check itself would raise.
        if not PIL_AVAILABLE or not isinstance(image, Image.Image):
            return image

        try:
            # Convert to RGB if necessary
            if image.mode != 'RGB':
                image = image.convert('RGB')

            # Apply enhancements based on level
            if enhancement_level in ('medium', 'heavy'):
                image = ImageEnhance.Contrast(image).enhance(1.2)
                image = ImageEnhance.Sharpness(image).enhance(1.1)

            if enhancement_level == 'heavy':
                # Reduce noise, then lift brightness slightly
                image = image.filter(ImageFilter.MedianFilter(size=3))
                image = ImageEnhance.Brightness(image).enhance(1.05)

            # Convert to grayscale for better OCR
            image = ImageOps.grayscale(image)

            # Increase contrast for text
            image = ImageEnhance.Contrast(image).enhance(1.3)

            return image

        except Exception as e:
            logger.warning(f"Image preprocessing failed: {e}")
            return image

    def rotate_image(self, image: 'Image.Image', angle: int) -> 'Image.Image':
        """
        Rotate image by specified angle.

        Args:
            image: PIL Image object
            angle: Rotation angle in degrees

        Returns:
            Rotated PIL Image, or the input image when ``angle`` is 0 or the
            rotation fails
        """
        try:
            if angle == 0:
                return image

            # The angle is negated before being handed to PIL; expand=True
            # grows the canvas so nothing is cropped, and the exposed corners
            # are filled with white.
            return image.rotate(-angle, expand=True, fillcolor='white')

        except Exception as e:
            logger.warning(f"Image rotation failed: {e}")
            return image

    def detect_text_orientation(self, image: 'Image.Image') -> Dict[str, Any]:
        """
        Detect the orientation of text in the image.

        Each configured orientation is OCR'd after light preprocessing and the
        one with the highest mean word confidence wins.

        Args:
            image: PIL Image object

        Returns:
            Dictionary with orientation detection results
            ('best_orientation', 'confidence', 'orientations_tested', 'method')
        """
        result = {
            'best_orientation': 0,
            'confidence': 0.0,
            'orientations_tested': [],
            'method': 'ocr_confidence'
        }

        if not self.available:
            return result

        try:
            best_confidence = 0
            best_orientation = 0
            orientation_results = []

            # Test each orientation
            for angle in self.orientations:
                rotated_image = self.rotate_image(image, angle)
                preprocessed = self.preprocess_image(rotated_image, 'light')

                # Get OCR data with confidence
                try:
                    data = pytesseract.image_to_data(
                        preprocessed,
                        config=self.ocr_configs['default'],
                        output_type=pytesseract.Output.DICT
                    )

                    # Average confidence over entries with a positive score
                    confidences = [
                        score
                        for score in map(self._parse_confidence, data['conf'])
                        if score > 0
                    ]
                    avg_confidence = sum(confidences) / len(confidences) if confidences else 0

                    orientation_results.append({
                        'angle': angle,
                        'confidence': avg_confidence,
                        'text_blocks': len(confidences)
                    })

                    if avg_confidence > best_confidence:
                        best_confidence = avg_confidence
                        best_orientation = angle

                except Exception as e:
                    logger.warning(f"OCR failed for orientation {angle}: {e}")
                    orientation_results.append({
                        'angle': angle,
                        'confidence': 0,
                        'text_blocks': 0
                    })

            result['best_orientation'] = best_orientation
            result['confidence'] = best_confidence
            result['orientations_tested'] = orientation_results

        except Exception as e:
            logger.warning(f"Orientation detection failed: {e}")

        return result

    def extract_text_with_confidence(self, image: 'Image.Image', config: str = 'default',
                                   languages: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Extract text from image with confidence scores.

        Args:
            image: PIL Image object
            config: OCR configuration key (unknown keys fall back to 'default')
            languages: List of language codes to use (defaults to English)

        Returns:
            Dictionary with text extraction results ('text', 'confidence',
            'word_confidences', 'bounding_boxes', 'languages_used'). On failure
            the empty defaults are returned and a warning is logged.
        """
        result = {
            'text': '',
            'confidence': 0.0,
            'word_confidences': [],
            'bounding_boxes': [],
            'languages_used': languages or ['eng']
        }

        if not self.available:
            return result

        try:
            # Prepare language string
            lang_string = '+'.join(languages) if languages else 'eng'

            # Get OCR configuration
            ocr_config = self.ocr_configs.get(config, self.ocr_configs['default'])
            ocr_config += f' -l {lang_string}'

            # Extract text with detailed data
            data = pytesseract.image_to_data(
                image,
                config=ocr_config,
                output_type=pytesseract.Output.DICT
            )

            # Process results
            words = []
            confidences = []
            boxes = []

            for i, raw_text in enumerate(data['text']):
                text = str(raw_text).strip()
                conf = self._parse_confidence(data['conf'][i])

                # Keep only non-empty entries with a positive confidence
                if text and conf > 0:
                    words.append(text)
                    confidences.append(conf)
                    boxes.append({
                        'x': data['left'][i],
                        'y': data['top'][i],
                        'width': data['width'][i],
                        'height': data['height'][i],
                        'text': text,
                        'confidence': conf
                    })

            # Combine results
            result['text'] = ' '.join(words)
            result['confidence'] = sum(confidences) / len(confidences) if confidences else 0.0
            result['word_confidences'] = confidences
            result['bounding_boxes'] = boxes

        except Exception as e:
            logger.warning(f"Text extraction failed: {e}")

        return result

    def process_multi_orientation_ocr(self, image: 'Image.Image',
                                    auto_detect_orientation: bool = True) -> Dict[str, Any]:
        """
        Process OCR with multiple orientations and return best result.

        Args:
            image: PIL Image object
            auto_detect_orientation: When True, run orientation detection first
                and extract text once at the winning angle; when False, run
                full extraction at every orientation and keep the most
                confident result

        Returns:
            Dictionary with best OCR results
        """
        result = {
            'text': '',
            'confidence': 0.0,
            'best_orientation': 0,
            'orientation_results': [],
            'preprocessing_applied': True
        }

        if not self.available:
            return result

        try:
            # Preprocess image
            preprocessed = self.preprocess_image(image, 'medium')

            if auto_detect_orientation:
                # Detect best orientation first (detection applies its own
                # light preprocessing to each rotated candidate)
                orientation_info = self.detect_text_orientation(preprocessed)
                best_angle = orientation_info['best_orientation']

                # Process with best orientation
                rotated = self.rotate_image(preprocessed, best_angle)
                ocr_result = self.extract_text_with_confidence(rotated)

                result.update(ocr_result)
                result['best_orientation'] = best_angle
                result['orientation_results'] = orientation_info['orientations_tested']
            else:
                # Try all orientations and pick best
                best_confidence = 0
                best_result = None
                best_angle = 0
                orientation_results = []

                for angle in self.orientations:
                    rotated = self.rotate_image(preprocessed, angle)
                    ocr_result = self.extract_text_with_confidence(rotated)

                    orientation_results.append({
                        'angle': angle,
                        'confidence': ocr_result['confidence'],
                        'text_length': len(ocr_result['text']),
                        'word_count': len(ocr_result['text'].split())
                    })

                    if ocr_result['confidence'] > best_confidence:
                        best_confidence = ocr_result['confidence']
                        best_result = ocr_result
                        best_angle = angle

                # Report what was tried even when no orientation produced
                # any confident text.
                result['orientation_results'] = orientation_results
                if best_result is not None:
                    result.update(best_result)
                    result['best_orientation'] = best_angle

        except Exception as e:
            logger.error(f"Multi-orientation OCR failed: {e}")

        return result

    def process_image_file(self, image_path: str, **kwargs) -> Dict[str, Any]:
        """
        Process an image file with enhanced OCR.

        Args:
            image_path: Path to image file
            **kwargs: Additional arguments forwarded to
                process_multi_orientation_ocr

        Returns:
            Dictionary with OCR results; 'success' is False and 'error' holds
            the message when the engine is unavailable or processing raises
        """
        result = {
            'success': False,
            'error': '',
            'text': '',
            'confidence': 0.0
        }

        if not self.available:
            result['error'] = 'OCR engine not available'
            return result

        try:
            # The context manager releases the underlying file handle once
            # OCR has finished, instead of leaving it to garbage collection.
            with Image.open(image_path) as image:
                ocr_result = self.process_multi_orientation_ocr(image, **kwargs)

            result['success'] = True
            result.update(ocr_result)

        except Exception as e:
            result['error'] = str(e)
            logger.error(f"Image file processing failed: {e}")

        return result

    def enhance_text_quality(self, text: str) -> str:
        """
        Enhance OCR text quality by fixing common errors.

        Corrections applied:
        - A digit commonly confused with a letter (0/O, 1/l, 5/S, 8/B) is
          replaced only when it sits directly between two letters, so genuine
          numbers such as "101" are preserved. The replacement takes the case
          of the preceding letter. This is a heuristic: identifiers that
          legitimately embed such a digit between letters are also rewritten.
        - Runs of whitespace are collapsed to single spaces.
        - A space before ',', '.', '!' or '?' is removed.

        Letter-pair rewrites (e.g. "rn" -> "m") are intentionally not applied:
        without a dictionary they corrupt valid words such as "turn".

        Args:
            text: Raw OCR text

        Returns:
            Enhanced text
        """
        if not text:
            return text

        # Neighbours are inspected in the ORIGINAL text so that runs of
        # digits (e.g. "a100b") are never partially rewritten.
        chars = list(text)
        for idx in range(1, len(text) - 1):
            letter = self._DIGIT_TO_LETTER.get(text[idx])
            if letter is None:
                continue
            before, after = text[idx - 1], text[idx + 1]
            if before.isalpha() and after.isalpha():
                chars[idx] = letter.upper() if before.isupper() else letter

        # Clean up extra spaces first so the punctuation fix sees at most one
        # space before each mark.
        enhanced = ' '.join(''.join(chars).split())

        # Punctuation fixes
        for mark in (',', '.', '!', '?'):
            enhanced = enhanced.replace(' ' + mark, mark)

        return enhanced


def get_enhanced_ocr_tools() -> List[EnhancedOCREngine]:
    """
    Build the list of enhanced OCR tools.

    Returns:
        A one-element list holding a freshly constructed EnhancedOCREngine
        when it reports itself available; an empty list when it is not
        available or when construction raises (the failure is logged).
    """
    try:
        engine = EnhancedOCREngine()
        if not engine.available:
            logger.warning("⚠️ Enhanced OCR engine not available")
            return []
        return [engine]
    except Exception as exc:
        logger.error(f"❌ Failed to create enhanced OCR engine: {exc}")
        return []