Spaces:
Sleeping
Sleeping
File size: 7,787 Bytes
459923e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 |
import re
from typing import Dict, List, Tuple, Any
class OCRAccuracyAnalyzer:
    """Heuristic quality scoring for OCR output.

    Two signals are combined into an overall score in the range [0, 1]:

    * the mean per-word ``confidence`` found in a full text annotation
      (documented by the callers as coming from the Google Cloud Vision API);
    * a text-shape heuristic that penalises patterns typical of bad OCR
      (unexpected characters, whitespace runs, irregular casing, many
      one-character tokens).

    Every public method is best-effort: failures are reported with ``print``
    and a neutral fallback value is returned instead of raising.
    """

    # Any character outside this ASCII whitelist counts as "special".
    _SPECIAL_CHAR_RE = re.compile(r'[^a-zA-Z0-9\s.,!?-]')
    # Two or more consecutive whitespace characters (of any kind).
    _WHITESPACE_RUN_RE = re.compile(r'\s{2,}')

    # Texts with fewer words than this get a fixed low score.
    _MIN_WORD_COUNT = 10
    _TOO_FEW_WORDS_SCORE = 0.3
    # Fractions of words above which a heuristic check counts as an issue.
    _MIXED_CASE_RATIO = 0.3
    _SHORT_WORD_RATIO = 0.1

    # (threshold key, message) pairs, checked from best to worst.
    _STATUS_LADDER = (
        ('high', 'OCR quality is excellent'),
        ('medium', 'OCR quality is acceptable'),
        ('low', 'OCR quality needs improvement'),
    )

    # (metric key, (high message, medium message, fallback message)).
    _ANALYSIS_LADDER = (
        ('confidence_score', (
            "High confidence in text recognition",
            "Moderate confidence in text recognition",
            "Low confidence in text recognition",
        )),
        ('word_count_accuracy', (
            "Excellent word count accuracy",
            "Acceptable word count accuracy",
            "Poor word count accuracy",
        )),
        ('overall_accuracy', (
            "Overall OCR quality is excellent",
            "Overall OCR quality is acceptable",
            "Overall OCR quality needs improvement",
        )),
    )

    def __init__(self):
        """Initialize the OCR Accuracy Analyzer with its score thresholds."""
        self.confidence_thresholds = {
            'high': 0.8,
            'medium': 0.6,
            'low': 0.4,
        }

    def analyze_ocr_quality(self, full_text_annotation: Any, text: str) -> Dict[str, float]:
        """
        Analyze the quality of OCR results.

        Args:
            full_text_annotation: The full text annotation from Google Cloud Vision API
            text: The extracted text content

        Returns:
            Dictionary with the keys ``confidence_score``,
            ``word_count_accuracy`` and ``overall_accuracy`` (the plain
            average of the first two). All values are 0.0 if analysis fails.
        """
        try:
            confidence_score = self._calculate_confidence_score(full_text_annotation)
            word_count_accuracy = self._calculate_word_count_accuracy(text)
            overall_accuracy = (confidence_score + word_count_accuracy) / 2
            return {
                "confidence_score": confidence_score,
                "word_count_accuracy": word_count_accuracy,
                "overall_accuracy": overall_accuracy,
            }
        except Exception as e:  # best-effort boundary: never raise to the caller
            print(f"Error in accuracy analysis: {e}")
            return {
                "confidence_score": 0.0,
                "word_count_accuracy": 0.0,
                "overall_accuracy": 0.0,
            }

    def _calculate_confidence_score(self, full_text_annotation: Any) -> float:
        """
        Calculate the confidence score from the full text annotation.

        Walks ``pages -> blocks -> paragraphs -> words`` and averages each
        word's ``confidence`` attribute.

        Args:
            full_text_annotation: The full text annotation from Google Cloud Vision API

        Returns:
            Mean word confidence, or 0.0 when the annotation is falsy, has no
            ``pages`` attribute, contains no words, or cannot be traversed.
        """
        try:
            if not full_text_annotation or not hasattr(full_text_annotation, 'pages'):
                return 0.0

            total_confidence = 0.0
            total_words = 0
            for page in full_text_annotation.pages:
                for block in page.blocks:
                    for paragraph in block.paragraphs:
                        for word in paragraph.words:
                            total_confidence += word.confidence
                            total_words += 1

            # Guard against division by zero for annotations without words.
            return total_confidence / total_words if total_words > 0 else 0.0
        except Exception as e:  # malformed annotation objects fall back to 0.0
            print(f"Error calculating confidence score: {e}")
            return 0.0

    @staticmethod
    def _has_irregular_casing(word: str) -> bool:
        """Return True when *word* mixes upper and lower case irregularly.

        Tokens without at least two cased letters (numbers, punctuation,
        single letters), all-lowercase words, all-uppercase words and
        ordinarily capitalised words ("Hello", "New-York") are regular.
        Anything else ("hELLo", "wOrLd") is considered irregular.
        """
        if word.istitle():
            return False
        cased = [ch for ch in word if ch.isupper() or ch.islower()]
        if len(cased) < 2:
            return False
        upper_count = sum(1 for ch in cased if ch.isupper())
        if upper_count == 0 or upper_count == len(cased):
            return False
        # A single capital in leading position is normal capitalisation even
        # when punctuation defeats str.istitle() (e.g. "Don't").
        return not (upper_count == 1 and cased[0].isupper())

    def _calculate_word_count_accuracy(self, text: str) -> float:
        """
        Calculate word count accuracy based on text characteristics.

        Four independent checks are run; each failed check removes 0.25 from
        a perfect score of 1.0:

        1. the text contains a character outside ``a-zA-Z0-9``, whitespace
           and ``.,!?-``;
        2. the text contains a run of two or more whitespace characters;
        3. more than 30% of the words have irregular casing;
        4. more than 10% of the words are a single character long.

        Args:
            text: The extracted text content

        Returns:
            0.0 for empty text, 0.3 for texts with fewer than 10 words,
            otherwise ``1 - issues / 4``.
        """
        try:
            if not text:
                return 0.0

            words = text.split()
            word_count = len(words)

            # Too little text to judge reliably: return a fixed low score.
            if word_count < self._MIN_WORD_COUNT:
                return self._TOO_FEW_WORDS_SCORE

            irregular_words = sum(1 for word in words if self._has_irregular_casing(word))
            short_words = sum(1 for word in words if len(word) < 2)

            checks = (
                # Check 1: presence of special characters
                self._SPECIAL_CHAR_RE.search(text) is not None,
                # Check 2: presence of whitespace runs
                self._WHITESPACE_RUN_RE.search(text) is not None,
                # Check 3: too many irregularly cased words
                irregular_words > word_count * self._MIXED_CASE_RATIO,
                # Check 4: too many very short words
                short_words > word_count * self._SHORT_WORD_RATIO,
            )
            issues = sum(1 for failed in checks if failed)
            return 1 - (issues / len(checks))
        except Exception as e:  # e.g. non-string input
            print(f"Error calculating word count accuracy: {e}")
            return 0.0

    def get_accuracy_status(self, accuracy_metrics: Dict[str, float]) -> Tuple[str, str]:
        """
        Get the accuracy status and message based on metrics.

        Args:
            accuracy_metrics: Dictionary containing accuracy metrics; only
                ``overall_accuracy`` is read (default 0.0 when missing).

        Returns:
            Tuple of (status, message). Status is ``'high'``, ``'medium'``,
            ``'low'`` or ``'poor'`` according to ``confidence_thresholds``,
            or ``'unknown'`` when the metrics cannot be evaluated.
        """
        try:
            overall_accuracy = accuracy_metrics.get('overall_accuracy', 0.0)
            for status, message in self._STATUS_LADDER:
                if overall_accuracy >= self.confidence_thresholds[status]:
                    return status, message
            return 'poor', 'OCR quality is poor'
        except Exception as e:  # e.g. metrics is None or holds non-numeric values
            print(f"Error getting accuracy status: {e}")
            return 'unknown', 'Unable to determine accuracy status'

    def get_detailed_analysis(self, accuracy_metrics: Dict[str, float]) -> List[str]:
        """
        Get detailed analysis points based on accuracy metrics.

        One point is produced for each of ``confidence_score``,
        ``word_count_accuracy`` and ``overall_accuracy`` (in that order),
        graded against the ``'high'`` and ``'medium'`` thresholds.

        Args:
            accuracy_metrics: Dictionary containing accuracy metrics

        Returns:
            List of analysis points. If evaluation fails part-way, the points
            gathered so far are kept and "Unable to perform detailed
            analysis" is appended.
        """
        analysis_points = []
        try:
            for metric_key, (high_msg, medium_msg, fallback_msg) in self._ANALYSIS_LADDER:
                score = accuracy_metrics.get(metric_key, 0.0)
                if score >= self.confidence_thresholds['high']:
                    analysis_points.append(high_msg)
                elif score >= self.confidence_thresholds['medium']:
                    analysis_points.append(medium_msg)
                else:
                    analysis_points.append(fallback_msg)
        except Exception as e:  # e.g. metrics is None or holds non-numeric values
            print(f"Error getting detailed analysis: {e}")
            analysis_points.append("Unable to perform detailed analysis")
        return analysis_points