| """ | |
| Enhanced OCR Engine for GAIA Agent - Phase 6 | |
| Handles multi-orientation text recognition, rotated/distorted text, and advanced OCR | |
| """ | |
| import logging | |
| import numpy as np | |
| from typing import Dict, Any, List, Optional, Tuple | |
| from pathlib import Path | |
| import tempfile | |
| import os | |
| # Image processing | |
| try: | |
| from PIL import Image, ImageEnhance, ImageFilter, ImageOps | |
| PIL_AVAILABLE = True | |
| except ImportError: | |
| PIL_AVAILABLE = False | |
| # OCR engine | |
| try: | |
| import pytesseract | |
| PYTESSERACT_AVAILABLE = True | |
| except ImportError: | |
| PYTESSERACT_AVAILABLE = False | |
| # Computer vision for advanced processing | |
| try: | |
| import cv2 | |
| CV2_AVAILABLE = True | |
| except ImportError: | |
| CV2_AVAILABLE = False | |
| logger = logging.getLogger(__name__) | |


class EnhancedOCREngine:
    """
    Enhanced OCR engine for complex text recognition scenarios.

    Features:
    - Multi-orientation text recognition (0°, 90°, 180°, 270°)
    - Rotated and distorted text handling
    - Multi-language OCR support
    - Text quality enhancement and preprocessing
    - Confidence scoring for OCR results
    - Advanced text extraction from complex layouts
    """

    def __init__(self):
        """Initialize the enhanced OCR engine."""
        self.name = "enhanced_ocr_engine"
        self.description = "Enhanced OCR for multi-orientation text, rotated/distorted text, and complex layouts"

        # Check dependencies
        self.available = PIL_AVAILABLE and PYTESSERACT_AVAILABLE
        if not self.available:
            missing = []
            if not PIL_AVAILABLE:
                missing.append("PIL/Pillow")
            if not PYTESSERACT_AVAILABLE:
                missing.append("pytesseract")
            logger.warning(f"⚠️ Enhanced OCR Engine not available - missing: {', '.join(missing)}")
            return

        # Test tesseract installation
        try:
            pytesseract.get_tesseract_version()
            logger.info("✅ Tesseract OCR engine detected")
        except Exception as e:
            logger.warning(f"⚠️ Tesseract not properly installed: {e}")
            self.available = False
            return

        # OCR configurations for different scenarios
        self.ocr_configs = {
            'default': '--oem 3 --psm 6',
            'single_line': '--oem 3 --psm 7',
            'single_word': '--oem 3 --psm 8',
            'sparse_text': '--oem 3 --psm 11',
            'single_char': '--oem 3 --psm 10',
            'vertical_text': '--oem 3 --psm 5',
            'uniform_block': '--oem 3 --psm 6'
        }
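        # For reference: --oem 3 selects Tesseract's default engine mode, and the
        # page segmentation modes used above map to 5 = vertical text block,
        # 6 = uniform text block, 7 = single line, 8 = single word,
        # 10 = single character, and 11 = sparse text.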

        # Supported orientations
        self.orientations = [0, 90, 180, 270]

        # Language codes for multi-language support
        self.supported_languages = [
            'eng', 'ara', 'chi_sim', 'chi_tra', 'fra', 'deu', 'spa', 'rus',
            'jpn', 'kor', 'hin', 'tha', 'vie', 'heb', 'tur', 'pol', 'nld',
            'ita', 'por', 'swe', 'dan', 'nor', 'fin', 'ces', 'hun', 'ron'
        ]

        logger.info("✅ Enhanced OCR Engine initialized")

    def preprocess_image(self, image: Image.Image, enhancement_level: str = 'medium') -> Image.Image:
        """
        Preprocess image for better OCR results.

        Args:
            image: PIL Image object
            enhancement_level: 'light', 'medium', or 'heavy'

        Returns:
            Preprocessed PIL Image
        """
        if not isinstance(image, Image.Image):
            return image

        try:
            # Convert to RGB if necessary
            if image.mode != 'RGB':
                image = image.convert('RGB')

            # Apply enhancements based on level
            if enhancement_level in ['medium', 'heavy']:
                # Enhance contrast
                enhancer = ImageEnhance.Contrast(image)
                image = enhancer.enhance(1.2)

                # Enhance sharpness
                enhancer = ImageEnhance.Sharpness(image)
                image = enhancer.enhance(1.1)

            if enhancement_level == 'heavy':
                # Additional heavy processing
                # Reduce noise
                image = image.filter(ImageFilter.MedianFilter(size=3))

                # Enhance brightness slightly
                enhancer = ImageEnhance.Brightness(image)
                image = enhancer.enhance(1.05)

            # Convert to grayscale for better OCR
            image = ImageOps.grayscale(image)

            # Increase contrast for text
            enhancer = ImageEnhance.Contrast(image)
            image = enhancer.enhance(1.3)

            return image

        except Exception as e:
            logger.warning(f"Image preprocessing failed: {e}")
            return image

    def rotate_image(self, image: Image.Image, angle: int) -> Image.Image:
        """
        Rotate image by the specified angle.

        Args:
            image: PIL Image object
            angle: Rotation angle in degrees

        Returns:
            Rotated PIL Image
        """
        try:
            if angle == 0:
                return image

            # Rotate image
            rotated = image.rotate(-angle, expand=True, fillcolor='white')
            return rotated

        except Exception as e:
            logger.warning(f"Image rotation failed: {e}")
            return image

    def detect_text_orientation(self, image: Image.Image) -> Dict[str, Any]:
        """
        Detect the orientation of text in the image.

        Args:
            image: PIL Image object

        Returns:
            Dictionary with orientation detection results
        """
        result = {
            'best_orientation': 0,
            'confidence': 0.0,
            'orientations_tested': [],
            'method': 'ocr_confidence'
        }

        if not self.available:
            return result

        try:
            best_confidence = 0
            best_orientation = 0
            orientation_results = []

            # Test each orientation
            for angle in self.orientations:
                rotated_image = self.rotate_image(image, angle)
                preprocessed = self.preprocess_image(rotated_image, 'light')

                # Get OCR data with confidence
                try:
                    data = pytesseract.image_to_data(
                        preprocessed,
                        config=self.ocr_configs['default'],
                        output_type=pytesseract.Output.DICT
                    )

                    # Average confidence for detected text; conf values may come back as
                    # ints, floats, or strings depending on the Tesseract/pytesseract
                    # version, so normalize through float() first
                    confidences = [int(float(conf)) for conf in data['conf'] if float(conf) > 0]
                    avg_confidence = sum(confidences) / len(confidences) if confidences else 0

                    orientation_results.append({
                        'angle': angle,
                        'confidence': avg_confidence,
                        'text_blocks': len(confidences)
                    })

                    if avg_confidence > best_confidence:
                        best_confidence = avg_confidence
                        best_orientation = angle

                except Exception as e:
                    logger.warning(f"OCR failed for orientation {angle}: {e}")
                    orientation_results.append({
                        'angle': angle,
                        'confidence': 0,
                        'text_blocks': 0
                    })

            result['best_orientation'] = best_orientation
            result['confidence'] = best_confidence
            result['orientations_tested'] = orientation_results

        except Exception as e:
            logger.warning(f"Orientation detection failed: {e}")

        return result
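
    # Note: Tesseract also ships a dedicated orientation/script detector,
    # pytesseract.image_to_osd(image), whose output includes a "Rotate: <angle>" line.
    # It could serve as a faster alternative or cross-check for the confidence scan
    # above, with the caveat that it requires the osd traineddata and can fail on
    # small or sparse images.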

    def extract_text_with_confidence(self, image: Image.Image, config: str = 'default',
                                     languages: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Extract text from image with confidence scores.

        Args:
            image: PIL Image object
            config: OCR configuration key
            languages: List of language codes to use

        Returns:
            Dictionary with text extraction results
        """
        result = {
            'text': '',
            'confidence': 0.0,
            'word_confidences': [],
            'bounding_boxes': [],
            'languages_used': languages or ['eng']
        }

        if not self.available:
            return result

        try:
            # Prepare language string
            lang_string = '+'.join(languages) if languages else 'eng'

            # Get OCR configuration
            ocr_config = self.ocr_configs.get(config, self.ocr_configs['default'])
            ocr_config += f' -l {lang_string}'

            # Extract text with detailed data
            data = pytesseract.image_to_data(
                image,
                config=ocr_config,
                output_type=pytesseract.Output.DICT
            )

            # Process results
            words = []
            confidences = []
            boxes = []

            for i in range(len(data['text'])):
                text = data['text'][i].strip()
                # Normalize confidence values that may be ints, floats, or strings
                conf = int(float(data['conf'][i]))

                if text and conf > 0:
                    words.append(text)
                    confidences.append(conf)
                    boxes.append({
                        'x': data['left'][i],
                        'y': data['top'][i],
                        'width': data['width'][i],
                        'height': data['height'][i],
                        'text': text,
                        'confidence': conf
                    })

            # Combine results
            result['text'] = ' '.join(words)
            result['confidence'] = sum(confidences) / len(confidences) if confidences else 0
            result['word_confidences'] = confidences
            result['bounding_boxes'] = boxes

        except Exception as e:
            logger.warning(f"Text extraction failed: {e}")

        return result

    def process_multi_orientation_ocr(self, image: Image.Image,
                                      auto_detect_orientation: bool = True) -> Dict[str, Any]:
        """
        Process OCR with multiple orientations and return the best result.

        Args:
            image: PIL Image object
            auto_detect_orientation: Whether to auto-detect the best orientation

        Returns:
            Dictionary with best OCR results
        """
        result = {
            'text': '',
            'confidence': 0.0,
            'best_orientation': 0,
            'orientation_results': [],
            'preprocessing_applied': True
        }

        if not self.available:
            return result

        try:
            # Preprocess image
            preprocessed = self.preprocess_image(image, 'medium')

            if auto_detect_orientation:
                # Detect best orientation first
                orientation_info = self.detect_text_orientation(preprocessed)
                best_angle = orientation_info['best_orientation']

                # Process with best orientation
                rotated = self.rotate_image(preprocessed, best_angle)
                ocr_result = self.extract_text_with_confidence(rotated)

                result.update(ocr_result)
                result['best_orientation'] = best_angle
                result['orientation_results'] = orientation_info['orientations_tested']
            else:
                # Try all orientations and pick the best
                best_confidence = 0
                best_result = None
                best_angle = 0
                orientation_results = []

                for angle in self.orientations:
                    rotated = self.rotate_image(preprocessed, angle)
                    ocr_result = self.extract_text_with_confidence(rotated)

                    orientation_results.append({
                        'angle': angle,
                        'confidence': ocr_result['confidence'],
                        'text_length': len(ocr_result['text']),
                        'word_count': len(ocr_result['text'].split())
                    })

                    if ocr_result['confidence'] > best_confidence:
                        best_confidence = ocr_result['confidence']
                        best_result = ocr_result
                        best_angle = angle

                if best_result:
                    result.update(best_result)
                    result['best_orientation'] = best_angle
                    result['orientation_results'] = orientation_results

        except Exception as e:
            logger.error(f"Multi-orientation OCR failed: {e}")

        return result

    def process_image_file(self, image_path: str, **kwargs) -> Dict[str, Any]:
        """
        Process an image file with enhanced OCR.

        Args:
            image_path: Path to image file
            **kwargs: Additional arguments for OCR processing

        Returns:
            Dictionary with OCR results
        """
        result = {
            'success': False,
            'error': '',
            'text': '',
            'confidence': 0.0
        }

        if not self.available:
            result['error'] = 'OCR engine not available'
            return result

        try:
            # Load image
            image = Image.open(image_path)

            # Process with multi-orientation OCR
            ocr_result = self.process_multi_orientation_ocr(image, **kwargs)

            result['success'] = True
            result.update(ocr_result)

        except Exception as e:
            result['error'] = str(e)
            logger.error(f"Image file processing failed: {e}")

        return result

    def enhance_text_quality(self, text: str) -> str:
        """
        Enhance OCR text quality by fixing common errors.

        Args:
            text: Raw OCR text

        Returns:
            Enhanced text
        """
        if not text:
            return text

        # Number/letter confusions (0/O, 1/l, 5/S, 8/B) are context-dependent and are
        # intentionally not applied globally, since that would corrupt legitimate digits.
        # The corrections below are heuristic but safe enough to apply across the text.
        corrections = {
            # Common character mistakes
            'rn': 'm',
            'cl': 'd',
            'vv': 'w',
            # Punctuation fixes
            ' ,': ',',
            ' .': '.',
            ' !': '!',
            ' ?': '?',
        }

        enhanced = text

        # Apply basic corrections
        for wrong, right in corrections.items():
            if wrong in enhanced:
                enhanced = enhanced.replace(wrong, right)

        # Clean up extra spaces
        enhanced = ' '.join(enhanced.split())

        return enhanced


def get_enhanced_ocr_tools() -> List[EnhancedOCREngine]:
    """Get a list of enhanced OCR tools."""
    try:
        ocr_engine = EnhancedOCREngine()
        if ocr_engine.available:
            return [ocr_engine]
        else:
            logger.warning("⚠️ Enhanced OCR engine not available")
            return []
    except Exception as e:
        logger.error(f"❌ Failed to create enhanced OCR engine: {e}")
        return []
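

# Illustrative usage sketch: runs the multi-orientation pipeline end to end on a local
# image. The file name "sample.png" is a placeholder, not part of the engine's API.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    engine = EnhancedOCREngine()
    if engine.available:
        ocr = engine.process_image_file("sample.png", auto_detect_orientation=True)
        if ocr.get("success"):
            print(f"Best orientation: {ocr.get('best_orientation', 0)}°")
            print(f"Mean confidence:  {ocr.get('confidence', 0.0):.1f}")
            print(engine.enhance_text_quality(ocr.get("text", "")))
        else:
            print(f"OCR failed: {ocr.get('error', 'unknown error')}")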