import re
from typing import Dict, List, Tuple, Any


class OCRAccuracyAnalyzer:
    def __init__(self):
        """Initialize the OCR Accuracy Analyzer."""
        self.confidence_thresholds = {
            'high': 0.8,
            'medium': 0.6,
            'low': 0.4
        }

    def analyze_ocr_quality(self, full_text_annotation: Any, text: str) -> Dict[str, float]:
        """
        Analyze the quality of OCR results.

        Args:
            full_text_annotation: The full text annotation from the Google Cloud Vision API
            text: The extracted text content

        Returns:
            Dictionary containing accuracy metrics
        """
        try:
            # Average per-word confidence reported by the OCR engine
            confidence_score = self._calculate_confidence_score(full_text_annotation)

            # Heuristic quality score derived from the extracted text itself
            word_count_accuracy = self._calculate_word_count_accuracy(text)

            # Overall accuracy is the mean of the two component scores
            overall_accuracy = (confidence_score + word_count_accuracy) / 2

            return {
                "confidence_score": confidence_score,
                "word_count_accuracy": word_count_accuracy,
                "overall_accuracy": overall_accuracy
            }
        except Exception as e:
            print(f"Error in accuracy analysis: {str(e)}")
            return {
                "confidence_score": 0.0,
                "word_count_accuracy": 0.0,
                "overall_accuracy": 0.0
            }

    def _calculate_confidence_score(self, full_text_annotation: Any) -> float:
        """
        Calculate the confidence score from the full text annotation.

        Args:
            full_text_annotation: The full text annotation from the Google Cloud Vision API

        Returns:
            Confidence score between 0 and 1
        """
        try:
            if not full_text_annotation or not hasattr(full_text_annotation, 'pages'):
                return 0.0

            total_confidence = 0.0
            total_words = 0

            # Average the per-word confidence over every word on every page
            for page in full_text_annotation.pages:
                for block in page.blocks:
                    for paragraph in block.paragraphs:
                        for word in paragraph.words:
                            total_confidence += word.confidence
                            total_words += 1

            return total_confidence / total_words if total_words > 0 else 0.0
        except Exception as e:
            print(f"Error calculating confidence score: {str(e)}")
            return 0.0

    def _calculate_word_count_accuracy(self, text: str) -> float:
        """
        Estimate text quality from word-level characteristics of the extracted text.

        This is a heuristic score (reported as "word_count_accuracy"), not a
        comparison against ground-truth text.

        Args:
            text: The extracted text content

        Returns:
            Word count accuracy score between 0 and 1
        """
        try:
            if not text:
                return 0.0

            # Count words
            words = text.split()
            word_count = len(words)

            # Penalize very short extractions (OCR should yield at least some words)
            if word_count < 10:
                return 0.3

            # Check for common OCR issues
            issues = 0
            total_checks = 4

            # Check 1: unexpected special characters
            if re.search(r'[^a-zA-Z0-9\s.,!?-]', text):
                issues += 1

            # Check 2: runs of multiple spaces
            if re.search(r'\s{2,}', text):
                issues += 1

            # Check 3: mixed-case words (potential OCR errors)
            mixed_case_words = sum(1 for word in words if not word.isupper() and not word.islower())
            if mixed_case_words > len(words) * 0.3:  # more than 30% of words are mixed case
                issues += 1

            # Check 4: single-character words (potential OCR errors)
            short_words = sum(1 for word in words if len(word) < 2)
            if short_words > len(words) * 0.1:  # more than 10% of words are single characters
                issues += 1

            return 1 - (issues / total_checks)
        except Exception as e:
            print(f"Error calculating word count accuracy: {str(e)}")
            return 0.0

    def get_accuracy_status(self, accuracy_metrics: Dict[str, float]) -> Tuple[str, str]:
        """
        Get the accuracy status and message based on metrics.

        Args:
            accuracy_metrics: Dictionary containing accuracy metrics

        Returns:
            Tuple of (status, message)
        """
        try:
            overall_accuracy = accuracy_metrics.get('overall_accuracy', 0.0)

            if overall_accuracy >= self.confidence_thresholds['high']:
                return 'high', 'OCR quality is excellent'
            elif overall_accuracy >= self.confidence_thresholds['medium']:
                return 'medium', 'OCR quality is acceptable'
            elif overall_accuracy >= self.confidence_thresholds['low']:
                return 'low', 'OCR quality needs improvement'
            else:
                return 'poor', 'OCR quality is poor'
        except Exception as e:
            print(f"Error getting accuracy status: {str(e)}")
            return 'unknown', 'Unable to determine accuracy status'

    def get_detailed_analysis(self, accuracy_metrics: Dict[str, float]) -> List[str]:
        """
        Get detailed analysis points based on accuracy metrics.

        Args:
            accuracy_metrics: Dictionary containing accuracy metrics

        Returns:
            List of analysis points
        """
        analysis_points = []
        try:
            confidence_score = accuracy_metrics.get('confidence_score', 0.0)
            word_count_accuracy = accuracy_metrics.get('word_count_accuracy', 0.0)
            overall_accuracy = accuracy_metrics.get('overall_accuracy', 0.0)

            # Analyze confidence score
            if confidence_score >= self.confidence_thresholds['high']:
                analysis_points.append("High confidence in text recognition")
            elif confidence_score >= self.confidence_thresholds['medium']:
                analysis_points.append("Moderate confidence in text recognition")
            else:
                analysis_points.append("Low confidence in text recognition")

            # Analyze word count accuracy
            if word_count_accuracy >= self.confidence_thresholds['high']:
                analysis_points.append("Excellent word count accuracy")
            elif word_count_accuracy >= self.confidence_thresholds['medium']:
                analysis_points.append("Acceptable word count accuracy")
            else:
                analysis_points.append("Poor word count accuracy")

            # Overall analysis
            if overall_accuracy >= self.confidence_thresholds['high']:
                analysis_points.append("Overall OCR quality is excellent")
            elif overall_accuracy >= self.confidence_thresholds['medium']:
                analysis_points.append("Overall OCR quality is acceptable")
            else:
                analysis_points.append("Overall OCR quality needs improvement")
        except Exception as e:
            print(f"Error getting detailed analysis: {str(e)}")
            analysis_points.append("Unable to perform detailed analysis")

        return analysis_points