Spaces:

danishjameel003
/

newtestingdanish

Sleeping

File size: 7,787 Bytes

459923e

import re
from typing import Dict, List, Tuple, Any

class OCRAccuracyAnalyzer:
    """Heuristic quality scoring for OCR output.

    Combines the mean per-word recognition confidence found in a full text
    annotation with a set of simple text-shape checks, and maps the
    resulting scores to human-readable status labels.

    All public methods are best-effort: any exception is reported with
    ``print`` and a neutral fallback value is returned instead of raising.
    """

    def __init__(self):
        """Initialize the OCR Accuracy Analyzer."""
        # Inclusive lower bounds used to bucket a score in [0, 1].
        # Scores below 'low' are reported as 'poor' by get_accuracy_status.
        self.confidence_thresholds = {
            'high': 0.8,
            'medium': 0.6,
            'low': 0.4
        }

    def analyze_ocr_quality(self, full_text_annotation: Any, text: str) -> Dict[str, float]:
        """
        Analyze the quality of OCR results.

        Args:
            full_text_annotation: The full text annotation from Google Cloud Vision API
            text: The extracted text content

        Returns:
            Dictionary with the keys ``confidence_score``,
            ``word_count_accuracy`` and ``overall_accuracy`` (the plain
            average of the first two). All three are 0.0 if the analysis
            fails.
        """
        try:
            confidence_score = self._calculate_confidence_score(full_text_annotation)
            word_count_accuracy = self._calculate_word_count_accuracy(text)

            # Both components carry equal weight in the overall score.
            overall_accuracy = (confidence_score + word_count_accuracy) / 2

            return {
                "confidence_score": confidence_score,
                "word_count_accuracy": word_count_accuracy,
                "overall_accuracy": overall_accuracy
            }
        except Exception as e:
            print(f"Error in accuracy analysis: {str(e)}")
            return {
                "confidence_score": 0.0,
                "word_count_accuracy": 0.0,
                "overall_accuracy": 0.0
            }

    def _calculate_confidence_score(self, full_text_annotation: Any) -> float:
        """
        Calculate the confidence score from the full text annotation.

        Walks ``pages -> blocks -> paragraphs -> words`` and averages each
        word's ``confidence`` attribute.

        Args:
            full_text_annotation: The full text annotation from Google Cloud Vision API

        Returns:
            Mean word confidence, or 0.0 when the annotation is falsy, has no
            ``pages`` attribute, contains no words, or cannot be traversed.
        """
        try:
            if not full_text_annotation or not hasattr(full_text_annotation, 'pages'):
                return 0.0

            total_confidence = 0.0
            total_words = 0

            for page in full_text_annotation.pages:
                for block in page.blocks:
                    for paragraph in block.paragraphs:
                        for word in paragraph.words:
                            total_confidence += word.confidence
                            total_words += 1

            # Guard against annotations that contain no words at all.
            return total_confidence / total_words if total_words > 0 else 0.0

        except Exception as e:
            print(f"Error calculating confidence score: {str(e)}")
            return 0.0

    def _calculate_word_count_accuracy(self, text: str) -> float:
        """
        Calculate word count accuracy based on text characteristics.

        The score starts at 1.0 and loses 0.25 for each of four heuristic
        checks that fires: unexpected characters, runs of whitespace, a high
        share of mixed-case words, and a high share of one-character words.

        Args:
            text: The extracted text content

        Returns:
            0.0 when the text is empty or contains no words, 0.3 when it has
            fewer than 10 words, otherwise ``1 - issues / 4``.
        """
        try:
            if not text:
                return 0.0

            words = text.split()

            # Whitespace-only text is truthy but contains no words; score it
            # like empty text instead of falling into the "few words" branch.
            if not words:
                return 0.0

            # Too little text to judge reliably with the checks below.
            if len(words) < 10:
                return 0.3

            issues = 0
            total_checks = 4

            # Check 1: characters outside ASCII letters, digits, whitespace
            # and basic punctuation.
            if re.search(r'[^a-zA-Z0-9\s.,!?-]', text):
                issues += 1

            # Check 2: two or more consecutive whitespace characters.
            if re.search(r'\s{2,}', text):
                issues += 1

            # Check 3: words mixing upper- and lower-case letters (potential
            # OCR errors). A token must contain both cases to count, so
            # numbers and punctuation-only tokens are not mistaken for
            # mixed-case words (str.isupper/islower are both False for them).
            mixed_case_words = sum(
                1
                for word in words
                if any(ch.isupper() for ch in word)
                and any(ch.islower() for ch in word)
            )
            if mixed_case_words > len(words) * 0.3:  # more than 30% of words
                issues += 1

            # Check 4: single-character words (potential OCR fragments).
            short_words = sum(1 for word in words if len(word) < 2)
            if short_words > len(words) * 0.1:  # more than 10% of words
                issues += 1

            return 1 - (issues / total_checks)

        except Exception as e:
            print(f"Error calculating word count accuracy: {str(e)}")
            return 0.0

    def get_accuracy_status(self, accuracy_metrics: Dict[str, float]) -> Tuple[str, str]:
        """
        Get the accuracy status and message based on metrics.

        Args:
            accuracy_metrics: Dictionary containing accuracy metrics; only
                ``overall_accuracy`` is read (missing key counts as 0.0).

        Returns:
            Tuple of (status, message), where status is one of ``high``,
            ``medium``, ``low``, ``poor``, or ``unknown`` if the metrics
            could not be evaluated.
        """
        try:
            overall_accuracy = accuracy_metrics.get('overall_accuracy', 0.0)

            if overall_accuracy >= self.confidence_thresholds['high']:
                return 'high', 'OCR quality is excellent'
            elif overall_accuracy >= self.confidence_thresholds['medium']:
                return 'medium', 'OCR quality is acceptable'
            elif overall_accuracy >= self.confidence_thresholds['low']:
                return 'low', 'OCR quality needs improvement'
            else:
                return 'poor', 'OCR quality is poor'

        except Exception as e:
            print(f"Error getting accuracy status: {str(e)}")
            return 'unknown', 'Unable to determine accuracy status'

    def _describe_score(self, score: float, high_msg: str, medium_msg: str, fallback_msg: str) -> str:
        """Pick the message matching *score* against the high/medium thresholds."""
        if score >= self.confidence_thresholds['high']:
            return high_msg
        if score >= self.confidence_thresholds['medium']:
            return medium_msg
        return fallback_msg

    def get_detailed_analysis(self, accuracy_metrics: Dict[str, float]) -> List[str]:
        """
        Get detailed analysis points based on accuracy metrics.

        Args:
            accuracy_metrics: Dictionary containing accuracy metrics; the keys
                ``confidence_score``, ``word_count_accuracy`` and
                ``overall_accuracy`` are read (missing keys count as 0.0).

        Returns:
            List of analysis points, one per metric in the order above. If
            evaluation fails part-way, the points gathered so far are kept
            and "Unable to perform detailed analysis" is appended.
        """
        analysis_points = []

        try:
            confidence_score = accuracy_metrics.get('confidence_score', 0.0)
            word_count_accuracy = accuracy_metrics.get('word_count_accuracy', 0.0)
            overall_accuracy = accuracy_metrics.get('overall_accuracy', 0.0)

            # Each append happens only after its own comparison succeeded, so
            # a bad value for a later metric keeps the earlier points.
            analysis_points.append(self._describe_score(
                confidence_score,
                "High confidence in text recognition",
                "Moderate confidence in text recognition",
                "Low confidence in text recognition",
            ))
            analysis_points.append(self._describe_score(
                word_count_accuracy,
                "Excellent word count accuracy",
                "Acceptable word count accuracy",
                "Poor word count accuracy",
            ))
            analysis_points.append(self._describe_score(
                overall_accuracy,
                "Overall OCR quality is excellent",
                "Overall OCR quality is acceptable",
                "Overall OCR quality needs improvement",
            ))

        except Exception as e:
            print(f"Error getting detailed analysis: {str(e)}")
            analysis_points.append("Unable to perform detailed analysis")

        return analysis_points