File size: 7,787 Bytes
459923e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import re
from typing import Dict, List, Tuple, Any

class OCRAccuracyAnalyzer:
    """Score OCR output from per-word confidences and simple text heuristics.

    Every public method is best-effort: unexpected errors are reported with
    ``print`` and replaced by a neutral fallback value instead of propagating.
    """

    # Texts with fewer words than this are not run through the heuristic
    # checks; they receive SHORT_TEXT_SCORE instead.
    MIN_WORD_COUNT = 10
    SHORT_TEXT_SCORE = 0.3

    # Fraction of words that must be exceeded before a check counts as an issue.
    MIXED_CASE_RATIO = 0.3
    SHORT_WORD_RATIO = 0.1

    # Anything other than ASCII letters, digits, whitespace and . , ! ? -
    _SPECIAL_CHAR_RE = re.compile(r'[^a-zA-Z0-9\s.,!?-]')
    # Two or more consecutive whitespace characters.
    _MULTI_SPACE_RE = re.compile(r'\s{2,}')

    def __init__(self):
        """Initialize the OCR Accuracy Analyzer with the default thresholds."""
        # Lower bounds (inclusive) of each quality tier; scores below 'low'
        # are treated as 'poor'.
        self.confidence_thresholds = {
            'high': 0.8,
            'medium': 0.6,
            'low': 0.4
        }

    def analyze_ocr_quality(self, full_text_annotation: Any, text: str) -> Dict[str, float]:
        """
        Analyze the quality of OCR results.

        Args:
            full_text_annotation: The full text annotation from Google Cloud Vision API
            text: The extracted text content

        Returns:
            Dictionary with the keys ``confidence_score``,
            ``word_count_accuracy`` and ``overall_accuracy`` (the mean of the
            other two). All three are 0.0 if the analysis fails.
        """
        try:
            confidence_score = self._calculate_confidence_score(full_text_annotation)
            word_count_accuracy = self._calculate_word_count_accuracy(text)

            return {
                "confidence_score": confidence_score,
                "word_count_accuracy": word_count_accuracy,
                "overall_accuracy": (confidence_score + word_count_accuracy) / 2
            }
        except Exception as e:
            print(f"Error in accuracy analysis: {str(e)}")
            return {
                "confidence_score": 0.0,
                "word_count_accuracy": 0.0,
                "overall_accuracy": 0.0
            }

    def _calculate_confidence_score(self, full_text_annotation: Any) -> float:
        """
        Calculate the mean per-word confidence of the full text annotation.

        Args:
            full_text_annotation: The full text annotation from Google Cloud Vision API.
                It is walked as pages -> blocks -> paragraphs -> words, and
                each word's ``confidence`` attribute is read.

        Returns:
            Mean word confidence, or 0.0 when the annotation is falsy, has no
            ``pages`` attribute, contains no words, or cannot be traversed.
        """
        try:
            if not full_text_annotation or not hasattr(full_text_annotation, 'pages'):
                return 0.0

            confidences: List[float] = []
            for page in full_text_annotation.pages:
                for block in page.blocks:
                    for paragraph in block.paragraphs:
                        confidences.extend(word.confidence for word in paragraph.words)

            if not confidences:
                return 0.0
            return sum(confidences, 0.0) / len(confidences)

        except Exception as e:
            print(f"Error calculating confidence score: {str(e)}")
            return 0.0

    @staticmethod
    def _is_mixed_case(word: str) -> bool:
        """Return True if *word* has an uppercase letter after its first letter
        while also containing a lowercase letter (e.g. ``tHe``, ``wOrd``).

        Tokens without letters, all-uppercase, all-lowercase and plainly
        capitalised words (``Hello``) are not mixed case. Legitimate
        camel-case spellings (``iPhone``) are still reported; the caller's
        ratio threshold is what keeps a few of them from counting as an issue.
        """
        letters = [ch for ch in word if ch.isalpha()]
        if not any(ch.islower() for ch in letters):
            return False
        return any(ch.isupper() for ch in letters[1:])

    def _calculate_word_count_accuracy(self, text: str) -> float:
        """
        Calculate a heuristic text-quality score based on text characteristics.

        Four checks are run; each failed check lowers the score by 0.25:
        special characters present, consecutive whitespace present, more than
        30% mixed-case words, more than 10% single-character words.

        Args:
            text: The extracted text content

        Returns:
            0.0 when the text is empty or contains no words, 0.3 when it has
            fewer than 10 words, otherwise ``1 - issues / 4``. 0.0 is also
            returned if the calculation raises.
        """
        try:
            if not text:
                return 0.0

            words = text.split()
            # Whitespace-only text has no words at all; score it like empty
            # text rather than like a short-but-real extraction.
            if not words:
                return 0.0

            word_count = len(words)
            if word_count < self.MIN_WORD_COUNT:
                return self.SHORT_TEXT_SCORE

            mixed_case_words = sum(1 for word in words if self._is_mixed_case(word))
            short_words = sum(1 for word in words if len(word) < 2)

            checks = (
                # Check 1: presence of special characters
                self._SPECIAL_CHAR_RE.search(text) is not None,
                # Check 2: presence of consecutive whitespace
                self._MULTI_SPACE_RE.search(text) is not None,
                # Check 3: too many mixed-case words (potential OCR errors)
                mixed_case_words > word_count * self.MIXED_CASE_RATIO,
                # Check 4: too many very short words (potential OCR errors)
                short_words > word_count * self.SHORT_WORD_RATIO,
            )
            issues = sum(1 for failed in checks if failed)

            return 1 - (issues / len(checks))

        except Exception as e:
            print(f"Error calculating word count accuracy: {str(e)}")
            return 0.0

    def _tier(self, score: float) -> str:
        """Map *score* to 'high', 'medium', 'low' or 'poor' using the
        inclusive lower bounds in ``self.confidence_thresholds``."""
        for name in ('high', 'medium', 'low'):
            if score >= self.confidence_thresholds[name]:
                return name
        return 'poor'

    def get_accuracy_status(self, accuracy_metrics: Dict[str, float]) -> Tuple[str, str]:
        """
        Get the accuracy status and message based on metrics.

        Args:
            accuracy_metrics: Dictionary containing accuracy metrics; only
                ``overall_accuracy`` is read (missing key counts as 0.0).

        Returns:
            Tuple of (status, message) where status is 'high', 'medium',
            'low' or 'poor', or 'unknown' if the metrics cannot be evaluated.
        """
        messages = {
            'high': 'OCR quality is excellent',
            'medium': 'OCR quality is acceptable',
            'low': 'OCR quality needs improvement',
            'poor': 'OCR quality is poor',
        }
        try:
            status = self._tier(accuracy_metrics.get('overall_accuracy', 0.0))
            return status, messages[status]

        except Exception as e:
            print(f"Error getting accuracy status: {str(e)}")
            return 'unknown', 'Unable to determine accuracy status'

    def get_detailed_analysis(self, accuracy_metrics: Dict[str, float]) -> List[str]:
        """
        Get detailed analysis points based on accuracy metrics.

        Args:
            accuracy_metrics: Dictionary containing accuracy metrics; the keys
                ``confidence_score``, ``word_count_accuracy`` and
                ``overall_accuracy`` are read (missing keys count as 0.0).

        Returns:
            List of analysis points: one for confidence, one for word count
            accuracy and one overall. If evaluation fails, the points gathered
            so far followed by "Unable to perform detailed analysis".
        """
        analysis_points = []

        # Confidence and word-count use three levels (below 'medium' is the
        # lowest); the overall line uses all four so that it agrees with the
        # wording of get_accuracy_status for the same metric.
        confidence_messages = {
            'high': "High confidence in text recognition",
            'medium': "Moderate confidence in text recognition",
        }
        word_count_messages = {
            'high': "Excellent word count accuracy",
            'medium': "Acceptable word count accuracy",
        }
        overall_messages = {
            'high': "Overall OCR quality is excellent",
            'medium': "Overall OCR quality is acceptable",
            'low': "Overall OCR quality needs improvement",
            'poor': "Overall OCR quality is poor",
        }

        try:
            confidence_score = accuracy_metrics.get('confidence_score', 0.0)
            word_count_accuracy = accuracy_metrics.get('word_count_accuracy', 0.0)
            overall_accuracy = accuracy_metrics.get('overall_accuracy', 0.0)

            analysis_points.append(
                confidence_messages.get(
                    self._tier(confidence_score),
                    "Low confidence in text recognition",
                )
            )
            analysis_points.append(
                word_count_messages.get(
                    self._tier(word_count_accuracy),
                    "Poor word count accuracy",
                )
            )
            analysis_points.append(overall_messages[self._tier(overall_accuracy)])

        except Exception as e:
            print(f"Error getting detailed analysis: {str(e)}")
            analysis_points.append("Unable to perform detailed analysis")

        return analysis_points