import re
from typing import Dict, List, Tuple, Any


class OCRAccuracyAnalyzer:
    def __init__(self):
        """Initialize the OCR Accuracy Analyzer."""
        self.confidence_thresholds = {
            'high': 0.8,
            'medium': 0.6,
            'low': 0.4
        }

    def analyze_ocr_quality(self, full_text_annotation: Any, text: str) -> Dict[str, float]:
        """
        Analyze the quality of OCR results.

        Args:
            full_text_annotation: The full text annotation from the Google Cloud Vision API
            text: The extracted text content

        Returns:
            Dictionary containing accuracy metrics
        """
        try:
            # Average per-word confidence reported by the OCR engine
            confidence_score = self._calculate_confidence_score(full_text_annotation)

            # Heuristic quality score derived from the extracted text itself
            word_count_accuracy = self._calculate_word_count_accuracy(text)

            # Overall accuracy is the mean of the two component scores
            overall_accuracy = (confidence_score + word_count_accuracy) / 2

            return {
                "confidence_score": confidence_score,
                "word_count_accuracy": word_count_accuracy,
                "overall_accuracy": overall_accuracy
            }
        except Exception as e:
            print(f"Error in accuracy analysis: {str(e)}")
            return {
                "confidence_score": 0.0,
                "word_count_accuracy": 0.0,
                "overall_accuracy": 0.0
            }

    def _calculate_confidence_score(self, full_text_annotation: Any) -> float:
        """
        Calculate the confidence score from the full text annotation.

        Args:
            full_text_annotation: The full text annotation from the Google Cloud Vision API

        Returns:
            Confidence score between 0 and 1
        """
        try:
            if not full_text_annotation or not hasattr(full_text_annotation, 'pages'):
                return 0.0

            total_confidence = 0.0
            total_words = 0

            # Average the per-word confidence over every word on every page
            for page in full_text_annotation.pages:
                for block in page.blocks:
                    for paragraph in block.paragraphs:
                        for word in paragraph.words:
                            total_confidence += word.confidence
                            total_words += 1

            return total_confidence / total_words if total_words > 0 else 0.0
        except Exception as e:
            print(f"Error calculating confidence score: {str(e)}")
            return 0.0

    def _calculate_word_count_accuracy(self, text: str) -> float:
        """
        Estimate text quality from word-level characteristics of the extracted text.

        This is a heuristic score (reported as "word_count_accuracy"), not a
        comparison against ground-truth text.

        Args:
            text: The extracted text content

        Returns:
            Word count accuracy score between 0 and 1
        """
        try:
            if not text:
                return 0.0

            # Count words
            words = text.split()
            word_count = len(words)

            # Penalize very short extractions (OCR should yield at least some words)
            if word_count < 10:
                return 0.3

            # Check for common OCR issues
            issues = 0
            total_checks = 4

            # Check 1: unexpected special characters
            if re.search(r'[^a-zA-Z0-9\s.,!?-]', text):
                issues += 1

            # Check 2: runs of multiple spaces
            if re.search(r'\s{2,}', text):
                issues += 1

            # Check 3: mixed-case words (potential OCR errors)
            mixed_case_words = sum(1 for word in words if not word.isupper() and not word.islower())
            if mixed_case_words > len(words) * 0.3:  # more than 30% of words are mixed case
                issues += 1

            # Check 4: single-character words (potential OCR errors)
            short_words = sum(1 for word in words if len(word) < 2)
            if short_words > len(words) * 0.1:  # more than 10% of words are single characters
                issues += 1

            return 1 - (issues / total_checks)
        except Exception as e:
            print(f"Error calculating word count accuracy: {str(e)}")
            return 0.0

    def get_accuracy_status(self, accuracy_metrics: Dict[str, float]) -> Tuple[str, str]:
        """
        Get the accuracy status and message based on metrics.

        Args:
            accuracy_metrics: Dictionary containing accuracy metrics

        Returns:
            Tuple of (status, message)
        """
        try:
            overall_accuracy = accuracy_metrics.get('overall_accuracy', 0.0)

            if overall_accuracy >= self.confidence_thresholds['high']:
                return 'high', 'OCR quality is excellent'
            elif overall_accuracy >= self.confidence_thresholds['medium']:
                return 'medium', 'OCR quality is acceptable'
            elif overall_accuracy >= self.confidence_thresholds['low']:
                return 'low', 'OCR quality needs improvement'
            else:
                return 'poor', 'OCR quality is poor'
        except Exception as e:
            print(f"Error getting accuracy status: {str(e)}")
            return 'unknown', 'Unable to determine accuracy status'

    def get_detailed_analysis(self, accuracy_metrics: Dict[str, float]) -> List[str]:
        """
        Get detailed analysis points based on accuracy metrics.

        Args:
            accuracy_metrics: Dictionary containing accuracy metrics

        Returns:
            List of analysis points
        """
        analysis_points = []
        try:
            confidence_score = accuracy_metrics.get('confidence_score', 0.0)
            word_count_accuracy = accuracy_metrics.get('word_count_accuracy', 0.0)
            overall_accuracy = accuracy_metrics.get('overall_accuracy', 0.0)

            # Analyze confidence score
            if confidence_score >= self.confidence_thresholds['high']:
                analysis_points.append("High confidence in text recognition")
            elif confidence_score >= self.confidence_thresholds['medium']:
                analysis_points.append("Moderate confidence in text recognition")
            else:
                analysis_points.append("Low confidence in text recognition")

            # Analyze word count accuracy
            if word_count_accuracy >= self.confidence_thresholds['high']:
                analysis_points.append("Excellent word count accuracy")
            elif word_count_accuracy >= self.confidence_thresholds['medium']:
                analysis_points.append("Acceptable word count accuracy")
            else:
                analysis_points.append("Poor word count accuracy")

            # Overall analysis
            if overall_accuracy >= self.confidence_thresholds['high']:
                analysis_points.append("Overall OCR quality is excellent")
            elif overall_accuracy >= self.confidence_thresholds['medium']:
                analysis_points.append("Overall OCR quality is acceptable")
            else:
                analysis_points.append("Overall OCR quality needs improvement")
        except Exception as e:
            print(f"Error getting detailed analysis: {str(e)}")
            analysis_points.append("Unable to perform detailed analysis")

        return analysis_points