import re
from typing import Dict, List, Tuple, Any
class OCRAccuracyAnalyzer:
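    """Heuristic accuracy analysis for OCR output.
    Combines the per-word confidence values reported by the Google Cloud Vision
    API with simple text-level heuristics to produce an overall accuracy score,
    a status label, and a list of human-readable analysis points. The score
    thresholds set in __init__ control how results are bucketed.
    """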
def __init__(self):
"""Initialize the OCR Accuracy Analyzer."""
self.confidence_thresholds = {
'high': 0.8,
'medium': 0.6,
'low': 0.4
}
def analyze_ocr_quality(self, full_text_annotation: Any, text: str) -> Dict[str, float]:
"""
Analyze the quality of OCR results.
Args:
full_text_annotation: The full text annotation from Google Cloud Vision API
text: The extracted text content
Returns:
Dictionary containing accuracy metrics
"""
try:
# Calculate confidence score
confidence_score = self._calculate_confidence_score(full_text_annotation)
# Calculate word count accuracy
word_count_accuracy = self._calculate_word_count_accuracy(text)
            # Overall accuracy is the unweighted mean of the two sub-scores
overall_accuracy = (confidence_score + word_count_accuracy) / 2
return {
"confidence_score": confidence_score,
"word_count_accuracy": word_count_accuracy,
"overall_accuracy": overall_accuracy
}
except Exception as e:
print(f"Error in accuracy analysis: {str(e)}")
return {
"confidence_score": 0.0,
"word_count_accuracy": 0.0,
"overall_accuracy": 0.0
}
def _calculate_confidence_score(self, full_text_annotation: Any) -> float:
"""
Calculate the confidence score from the full text annotation.
Args:
full_text_annotation: The full text annotation from Google Cloud Vision API
Returns:
Confidence score between 0 and 1
"""
try:
if not full_text_annotation or not hasattr(full_text_annotation, 'pages'):
return 0.0
total_confidence = 0.0
total_words = 0
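            # Walk the Vision API annotation hierarchy (pages -> blocks -> paragraphs -> words)
            # and average the per-word confidence values.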
for page in full_text_annotation.pages:
for block in page.blocks:
for paragraph in block.paragraphs:
for word in paragraph.words:
total_confidence += word.confidence
total_words += 1
return total_confidence / total_words if total_words > 0 else 0.0
except Exception as e:
print(f"Error calculating confidence score: {str(e)}")
return 0.0
def _calculate_word_count_accuracy(self, text: str) -> float:
"""
        Estimate word-level extraction quality from the word count and simple heuristic checks on the text.
        Args:
            text: The extracted text content
        Returns:
            Heuristic accuracy score between 0 and 1
"""
try:
if not text:
return 0.0
# Count words
words = text.split()
word_count = len(words)
# Check for minimum word count (assuming OCR should extract at least some words)
if word_count < 10:
return 0.3
# Check for common OCR issues
issues = 0
total_checks = 4
            # Check 1: stray symbols outside letters, digits, whitespace, and basic
            # punctuation (Unicode letters such as the Danish æ, ø, and å are allowed)
            if re.search(r'[^\w\s.,!?-]', text):
                issues += 1
            # Check 2: runs of two or more consecutive whitespace characters
            if re.search(r'\s{2,}', text):
                issues += 1
            # Check 3: words that are neither all-lowercase nor all-uppercase
            # (capitalised and mixed-case words; a high share can signal OCR errors)
            mixed_case_words = sum(1 for word in words if not word.isupper() and not word.islower())
            if mixed_case_words > len(words) * 0.3:  # more than 30% of words
                issues += 1
            # Check 4: single-character words, which are often stray OCR fragments
            short_words = sum(1 for word in words if len(word) < 2)
            if short_words > len(words) * 0.1:  # more than 10% of words
                issues += 1
return 1 - (issues / total_checks)
except Exception as e:
print(f"Error calculating word count accuracy: {str(e)}")
return 0.0
def get_accuracy_status(self, accuracy_metrics: Dict[str, float]) -> Tuple[str, str]:
"""
Get the accuracy status and message based on metrics.
Args:
accuracy_metrics: Dictionary containing accuracy metrics
Returns:
Tuple of (status, message)
"""
try:
overall_accuracy = accuracy_metrics.get('overall_accuracy', 0.0)
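            # Bucket the overall score using the thresholds configured in __init__.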
if overall_accuracy >= self.confidence_thresholds['high']:
return 'high', 'OCR quality is excellent'
elif overall_accuracy >= self.confidence_thresholds['medium']:
return 'medium', 'OCR quality is acceptable'
elif overall_accuracy >= self.confidence_thresholds['low']:
return 'low', 'OCR quality needs improvement'
else:
return 'poor', 'OCR quality is poor'
except Exception as e:
print(f"Error getting accuracy status: {str(e)}")
return 'unknown', 'Unable to determine accuracy status'
def get_detailed_analysis(self, accuracy_metrics: Dict[str, float]) -> List[str]:
"""
Get detailed analysis points based on accuracy metrics.
Args:
accuracy_metrics: Dictionary containing accuracy metrics
Returns:
List of analysis points
"""
analysis_points = []
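        # Build one observation per metric; on error a fallback message is appended instead.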
try:
confidence_score = accuracy_metrics.get('confidence_score', 0.0)
word_count_accuracy = accuracy_metrics.get('word_count_accuracy', 0.0)
overall_accuracy = accuracy_metrics.get('overall_accuracy', 0.0)
# Analyze confidence score
if confidence_score >= self.confidence_thresholds['high']:
analysis_points.append("High confidence in text recognition")
elif confidence_score >= self.confidence_thresholds['medium']:
analysis_points.append("Moderate confidence in text recognition")
else:
analysis_points.append("Low confidence in text recognition")
# Analyze word count accuracy
if word_count_accuracy >= self.confidence_thresholds['high']:
analysis_points.append("Excellent word count accuracy")
elif word_count_accuracy >= self.confidence_thresholds['medium']:
analysis_points.append("Acceptable word count accuracy")
else:
analysis_points.append("Poor word count accuracy")
# Overall analysis
if overall_accuracy >= self.confidence_thresholds['high']:
analysis_points.append("Overall OCR quality is excellent")
elif overall_accuracy >= self.confidence_thresholds['medium']:
analysis_points.append("Overall OCR quality is acceptable")
else:
analysis_points.append("Overall OCR quality needs improvement")
except Exception as e:
print(f"Error getting detailed analysis: {str(e)}")
analysis_points.append("Unable to perform detailed analysis")
return analysis_points
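
# ---------------------------------------------------------------------------
# Minimal usage sketch. In production the `full_text_annotation` argument is
# the response object returned by the Google Cloud Vision API; the stand-in
# below is an assumption for illustration only, built with SimpleNamespace to
# mimic the pages -> blocks -> paragraphs -> words hierarchy the analyzer
# expects, so the example runs without API credentials.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from types import SimpleNamespace

    def _word(confidence: float) -> SimpleNamespace:
        # Fake word node carrying only the confidence attribute the analyzer reads.
        return SimpleNamespace(confidence=confidence)

    # One page, one block, one paragraph, four words with varying confidence.
    fake_annotation = SimpleNamespace(pages=[
        SimpleNamespace(blocks=[
            SimpleNamespace(paragraphs=[
                SimpleNamespace(words=[_word(0.95), _word(0.88), _word(0.91), _word(0.76)])
            ])
        ])
    ])
    sample_text = "Dette er en kort prøvetekst der stammer fra et scannet dokument med flere ord."

    analyzer = OCRAccuracyAnalyzer()
    metrics = analyzer.analyze_ocr_quality(fake_annotation, sample_text)
    status, message = analyzer.get_accuracy_status(metrics)
    print(metrics)
    print(f"Status: {status} - {message}")
    for point in analyzer.get_detailed_analysis(metrics):
        print(f"- {point}")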