import logging
import re
from typing import Any, Dict, List, Tuple

logger = logging.getLogger(__name__)

# Any character outside plain ASCII letters, digits, whitespace and basic
# sentence punctuation is treated as a sign of OCR noise.
_SPECIAL_CHAR_RE = re.compile(r'[^a-zA-Z0-9\s.,!?-]')
# Two or more consecutive whitespace characters.
_MULTI_SPACE_RE = re.compile(r'\s{2,}')


def _has_irregular_casing(word: str) -> bool:
    """Return True when *word* mixes cases in a way normal text does not.

    A word is irregular when it contains at least one lowercase letter and
    at least one uppercase letter after its first letter (e.g. ``"tHe"``).
    Ordinary capitalized words (``"Hello"``), all-caps words, all-lowercase
    words and tokens without letters (``"2024"``) are not irregular.
    """
    letters = [ch for ch in word if ch.isalpha()]
    has_lower = any(ch.islower() for ch in letters)
    has_inner_upper = any(ch.isupper() for ch in letters[1:])
    return has_lower and has_inner_upper


class OCRAccuracyAnalyzer:
    """Heuristic quality scoring for OCR output.

    Combines the per-word confidence reported by the OCR engine with a few
    text-shape heuristics into scores between 0 and 1, and maps those scores
    to human-readable status labels.
    """

    def __init__(self):
        """Initialize the OCR Accuracy Analyzer."""
        # Lower bounds (inclusive) for each quality tier.
        self.confidence_thresholds = {
            'high': 0.8,
            'medium': 0.6,
            'low': 0.4
        }

    def analyze_ocr_quality(self, full_text_annotation: Any, text: str) -> Dict[str, float]:
        """
        Analyze the quality of OCR results.

        Args:
            full_text_annotation: The full text annotation from Google Cloud Vision API
            text: The extracted text content

        Returns:
            Dictionary with the keys ``confidence_score``,
            ``word_count_accuracy`` and ``overall_accuracy`` (the mean of the
            first two). All values are 0.0 if the analysis fails.
        """
        try:
            confidence_score = self._calculate_confidence_score(full_text_annotation)
            word_count_accuracy = self._calculate_word_count_accuracy(text)
            overall_accuracy = (confidence_score + word_count_accuracy) / 2

            return {
                "confidence_score": confidence_score,
                "word_count_accuracy": word_count_accuracy,
                "overall_accuracy": overall_accuracy
            }
        except Exception as e:
            # Best-effort API: report the failure and fall back to zero scores.
            logger.exception("Error in accuracy analysis: %s", e)
            return {
                "confidence_score": 0.0,
                "word_count_accuracy": 0.0,
                "overall_accuracy": 0.0
            }

    def _calculate_confidence_score(self, full_text_annotation: Any) -> float:
        """
        Calculate the confidence score from the full text annotation.

        The annotation is walked as pages -> blocks -> paragraphs -> words and
        the ``confidence`` attribute of every word is averaged.

        Args:
            full_text_annotation: The full text annotation from Google Cloud Vision API

        Returns:
            Mean word confidence, or 0.0 when the annotation is missing, has
            no ``pages`` attribute, contains no words, or cannot be traversed.
        """
        try:
            if not full_text_annotation or not hasattr(full_text_annotation, 'pages'):
                return 0.0

            total_confidence = 0.0
            total_words = 0

            for page in full_text_annotation.pages:
                for block in page.blocks:
                    for paragraph in block.paragraphs:
                        for word in paragraph.words:
                            total_confidence += word.confidence
                            total_words += 1

            return total_confidence / total_words if total_words > 0 else 0.0
        except Exception as e:
            logger.exception("Error calculating confidence score: %s", e)
            return 0.0

    def _calculate_word_count_accuracy(self, text: str) -> float:
        """
        Calculate word count accuracy based on text characteristics.

        Four independent checks are run; each failed check lowers the score
        by 0.25:

        1. the text contains a character outside ``[a-zA-Z0-9\\s.,!?-]``;
        2. the text contains two or more consecutive whitespace characters;
        3. more than 30% of the words have irregular casing (an uppercase
           letter after the first letter together with a lowercase letter);
        4. more than 10% of the words are a single character long.

        Args:
            text: The extracted text content

        Returns:
            0.0 for empty or whitespace-only text, 0.3 when fewer than ten
            words were extracted, otherwise a score between 0 and 1.
        """
        try:
            if not text:
                return 0.0

            words = text.split()
            word_count = len(words)

            # Whitespace-only text contains no words at all.
            if word_count == 0:
                return 0.0

            # Too little text to judge reliably; use a fixed low score.
            if word_count < 10:
                return 0.3

            issues = 0
            total_checks = 4

            # Check 1: presence of special characters
            if _SPECIAL_CHAR_RE.search(text):
                issues += 1

            # Check 2: presence of runs of whitespace
            if _MULTI_SPACE_RE.search(text):
                issues += 1

            # Check 3: irregularly cased words (potential OCR errors).
            # Capitalized words and numeric tokens are deliberately not counted.
            mixed_case_words = sum(1 for word in words if _has_irregular_casing(word))
            if mixed_case_words > word_count * 0.3:
                issues += 1

            # Check 4: very short words (potential OCR errors)
            short_words = sum(1 for word in words if len(word) < 2)
            if short_words > word_count * 0.1:
                issues += 1

            return 1 - (issues / total_checks)
        except Exception as e:
            logger.exception("Error calculating word count accuracy: %s", e)
            return 0.0

    def get_accuracy_status(self, accuracy_metrics: Dict[str, float]) -> Tuple[str, str]:
        """
        Get the accuracy status and message based on metrics.

        Args:
            accuracy_metrics: Dictionary containing accuracy metrics; only
                ``overall_accuracy`` is read (default 0.0 when absent).

        Returns:
            Tuple of (status, message). Status is one of ``high``, ``medium``,
            ``low``, ``poor``, or ``unknown`` when the metrics cannot be read.
        """
        try:
            overall_accuracy = accuracy_metrics.get('overall_accuracy', 0.0)

            if overall_accuracy >= self.confidence_thresholds['high']:
                return 'high', 'OCR quality is excellent'
            if overall_accuracy >= self.confidence_thresholds['medium']:
                return 'medium', 'OCR quality is acceptable'
            if overall_accuracy >= self.confidence_thresholds['low']:
                return 'low', 'OCR quality needs improvement'
            return 'poor', 'OCR quality is poor'
        except Exception as e:
            logger.exception("Error getting accuracy status: %s", e)
            return 'unknown', 'Unable to determine accuracy status'

    def get_detailed_analysis(self, accuracy_metrics: Dict[str, float]) -> List[str]:
        """
        Get detailed analysis points based on accuracy metrics.

        Args:
            accuracy_metrics: Dictionary containing accuracy metrics; the keys
                ``confidence_score``, ``word_count_accuracy`` and
                ``overall_accuracy`` are read (each defaults to 0.0).

        Returns:
            List of analysis points, one per metric. If an error occurs, the
            points gathered so far are followed by a failure notice.
        """
        analysis_points = []

        try:
            confidence_score = accuracy_metrics.get('confidence_score', 0.0)
            word_count_accuracy = accuracy_metrics.get('word_count_accuracy', 0.0)
            overall_accuracy = accuracy_metrics.get('overall_accuracy', 0.0)

            high = self.confidence_thresholds['high']
            medium = self.confidence_thresholds['medium']

            # Analyze confidence score
            if confidence_score >= high:
                analysis_points.append("High confidence in text recognition")
            elif confidence_score >= medium:
                analysis_points.append("Moderate confidence in text recognition")
            else:
                analysis_points.append("Low confidence in text recognition")

            # Analyze word count accuracy
            if word_count_accuracy >= high:
                analysis_points.append("Excellent word count accuracy")
            elif word_count_accuracy >= medium:
                analysis_points.append("Acceptable word count accuracy")
            else:
                analysis_points.append("Poor word count accuracy")

            # Overall analysis
            if overall_accuracy >= high:
                analysis_points.append("Overall OCR quality is excellent")
            elif overall_accuracy >= medium:
                analysis_points.append("Overall OCR quality is acceptable")
            else:
                analysis_points.append("Overall OCR quality needs improvement")

        except Exception as e:
            logger.exception("Error getting detailed analysis: %s", e)
            analysis_points.append("Unable to perform detailed analysis")

        return analysis_points