Spaces:

Rogue2003
/

Receipt_Agent

Running

App Files Files Community

Raghu committed 13 days ago

Commit

acf3ed2

1 Parent(s): 23980e2

Add TrOCR and PaddleOCR to OCR ensemble for improved accuracy

Browse files

Files changed (2) hide show

app.py +255 -116
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -369,16 +369,51 @@ class EnsembleDocumentClassifier:
 # ============================================================================
 class ReceiptOCR:
-    """Enhanced OCR with EasyOCR + Tesseract fallback, better preprocessing, and retry logic."""
     def __init__(self):
         self.reader = None
         self.use_tesseract = False
         try:
             import pytesseract
             self.use_tesseract = True
         except ImportError:
-            pass
     def load(self):
         if self.reader is None:
@@ -410,7 +445,7 @@ class ReceiptOCR:
             # Denoise
             denoised = cv2.fastNlMeansDenoising(enhanced, h=10)
-            # Convert back to RGB for EasyOCR
             return cv2.cvtColor(denoised, cv2.COLOR_GRAY2RGB)
         elif method == 'sharpen':
@@ -424,20 +459,106 @@ class ReceiptOCR:
                 sharpened = cv2.cvtColor(sharpened, cv2.COLOR_GRAY2RGB)
             return sharpened
-        elif method == 'binarize':
-            # Adaptive thresholding
-            if len(img_array.shape) == 3:
-                gray = cv2.cvtColor(img_array, cv2.COLOR_RGB2GRAY)
-            else:
-                gray = img_array
-            binary = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
-                                          cv2.THRESH_BINARY, 11, 2)
-            return cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
         return img_array
-    def _extract_with_tesseract(self, image):
-        """Fallback OCR using Tesseract."""
         if not self.use_tesseract:
             return []
@@ -449,7 +570,6 @@ class ReceiptOCR:
             else:
                 pil_image = Image.fromarray(image).convert('RGB')
-            # Get detailed output with bounding boxes
             data = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DICT)
             results = []
@@ -473,44 +593,6 @@ class ReceiptOCR:
             print(f"Tesseract OCR error: {e}")
             return []
-    def _merge_ocr_results(self, easyocr_results, tesseract_results):
-        """Merge results from multiple OCR engines, preferring higher confidence."""
-        if not tesseract_results:
-            return easyocr_results
-        # Create a map of EasyOCR results by approximate position
-        merged = []
-        used_tesseract = set()
-        for easy_result in easyocr_results:
-            best_match = None
-            best_iou = 0
-            # Find best matching Tesseract result
-            for i, tess_result in enumerate(tesseract_results):
-                if i in used_tesseract:
-                    continue
-                # Simple IoU calculation
-                iou = self._compute_iou(easy_result['bbox'], tess_result['bbox'])
-                if iou > best_iou and iou > 0.3:  # 30% overlap threshold
-                    best_iou = iou
-                    best_match = (i, tess_result)
-            if best_match and best_match[1]['confidence'] > easy_result['confidence']:
-                # Use Tesseract result if it's more confident
-                merged.append(best_match[1])
-                used_tesseract.add(best_match[0])
-            else:
-                merged.append(easy_result)
-        # Add unused Tesseract results
-        for i, tess_result in enumerate(tesseract_results):
-            if i not in used_tesseract:
-                merged.append(tess_result)
-        return merged
     def _compute_iou(self, box1, box2):
         """Compute Intersection over Union for bounding boxes."""
         x1_1, y1_1, x2_1, y2_1 = box1
@@ -528,78 +610,135 @@ class ReceiptOCR:
         return inter_area / union_area if union_area > 0 else 0
-    def extract_with_positions(self, image, min_confidence=0.3, use_fallback=True):
-        """Extract text with positions using EasyOCR + optional Tesseract fallback."""
-        if self.reader is None:
-            self.load()
-        original_image = image
         if isinstance(image, Image.Image):
-            image = np.array(image)
-        # Try EasyOCR first
         try:
-            results = self.reader.readtext(image)
         except Exception as e:
             print(f"EasyOCR error: {e}")
-            results = []
-        extracted = []
-        for bbox, text, conf in results:
-            if conf >= min_confidence:
-                x_coords = [p[0] for p in bbox]
-                y_coords = [p[1] for p in bbox]
-                extracted.append({
-                    'text': text.strip(),
-                    'confidence': conf,
-                    'bbox': [min(x_coords), min(y_coords), max(x_coords), max(y_coords)],
-                    'engine': 'easyocr'
-                })
-        # Check if we need fallback (low confidence or few results)
-        avg_confidence = np.mean([r['confidence'] for r in extracted]) if extracted else 0
-        needs_fallback = use_fallback and (len(extracted) < 3 or avg_confidence < 0.5)
-        if needs_fallback and self.use_tesseract:
-            # Try preprocessing + Tesseract
-            preprocessed = self._preprocess_image(original_image, method='enhance')
-            tesseract_results = self._extract_with_tesseract(preprocessed)
-            if tesseract_results:
-                # Merge results
-                extracted = self._merge_ocr_results(extracted, tesseract_results)
-        # If still poor results, try with preprocessing
-        if len(extracted) < 3 or avg_confidence < 0.4:
-            for method in ['enhance', 'sharpen']:
-                try:
-                    preprocessed = self._preprocess_image(original_image, method=method)
-                    retry_results = self.reader.readtext(preprocessed)
-                    retry_extracted = []
-                    for bbox, text, conf in retry_results:
-                        if conf >= min_confidence:
-                            x_coords = [p[0] for p in bbox]
-                            y_coords = [p[1] for p in bbox]
-                            retry_extracted.append({
-                                'text': text.strip(),
-                                'confidence': conf,
-                                'bbox': [min(x_coords), min(y_coords), max(x_coords), max(y_coords)],
-                                'engine': 'easyocr'
-                            })
-                    # Use retry if it's better
-                    retry_avg = np.mean([r['confidence'] for r in retry_extracted]) if retry_extracted else 0
-                    if retry_avg > avg_confidence:
-                        extracted = retry_extracted
-                        break
-                except Exception as e:
-                    continue
         # Sort by confidence (highest first)
-        extracted.sort(key=lambda x: x['confidence'], reverse=True)
-        return extracted
     def postprocess_receipt(self, ocr_results):
         """Extract structured fields from OCR results with improved patterns."""

 # ============================================================================
 class ReceiptOCR:
+    """Enhanced OCR with EasyOCR + TrOCR + PaddleOCR + Tesseract ensemble."""
     def __init__(self):
+        """Set up the OCR engines that can be initialised in this environment.
+
+        TrOCR and PaddleOCR are constructed eagerly; any exception raised while
+        setting them up is caught, reported with print(), and recorded in
+        ``trocr_available`` / ``paddleocr_available``. Tesseract is only probed
+        by importing ``pytesseract``. ``self.reader`` starts as ``None``;
+        ``_run_easyocr`` calls ``load()`` when it is still unset.
+        """
         self.reader = None
+        # NOTE(review): trocr_engine is not assigned anywhere else in the visible
+        # code; TrOCR state is held in trocr_processor / trocr_model below.
+        self.trocr_engine = None
+        self.paddleocr_engine = None
         self.use_tesseract = False
+        # Per-engine vote weights read by _merge_results. They do not sum to 1
+        # (0.40 + 0.35 + 0.30 + 0.20 = 1.25); _merge_results divides by the
+        # total weight of the engines that matched a region.
+        self.engine_weights = {
+            'trocr': 0.40,      # Highest weight - best quality
+            'easyocr': 0.35,
+            'paddleocr': 0.30,
+            'tesseract': 0.20
+        }
+        # Try to initialize TrOCR: load processor and model, move the model to
+        # DEVICE and switch it to eval mode. Any failure disables the engine.
+        try:
+            from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+            self.trocr_processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-printed")
+            self.trocr_model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-printed")
+            self.trocr_model = self.trocr_model.to(DEVICE)
+            self.trocr_model.eval()
+            self.trocr_available = True
+            print("TrOCR initialized")
+        except Exception as e:
+            # trocr_processor / trocr_model may be unset after a failure, so
+            # callers must check trocr_available before touching them.
+            self.trocr_available = False
+            print(f"TrOCR not available: {e}")
+        # Try to initialize PaddleOCR; any failure disables the engine.
+        try:
+            from paddleocr import PaddleOCR
+            self.paddleocr_engine = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
+            self.paddleocr_available = True
+            print("PaddleOCR initialized")
+        except Exception as e:
+            self.paddleocr_available = False
+            print(f"PaddleOCR not available: {e}")
+        # Try to initialize Tesseract: only the Python binding's importability
+        # is checked here.
         try:
             import pytesseract
             self.use_tesseract = True
         except ImportError:
+            self.use_tesseract = False
     def load(self):
         if self.reader is None:
             # Denoise
             denoised = cv2.fastNlMeansDenoising(enhanced, h=10)
+            # Convert back to RGB for OCR engines
             return cv2.cvtColor(denoised, cv2.COLOR_GRAY2RGB)
         elif method == 'sharpen':
                 sharpened = cv2.cvtColor(sharpened, cv2.COLOR_GRAY2RGB)
             return sharpened
         return img_array
+    def _run_easyocr(self, image):
+        """Run EasyOCR."""
+        if self.reader is None:
+            self.load()
+        results = self.reader.readtext(image)
+        extracted = []
+        for bbox, text, conf in results:
+            x_coords = [p[0] for p in bbox]
+            y_coords = [p[1] for p in bbox]
+            extracted.append({
+                'text': text.strip(),
+                'confidence': conf,
+                'bbox': [min(x_coords), min(y_coords), max(x_coords), max(y_coords)],
+                'engine': 'easyocr'
+            })
+        return extracted
+    def _run_trocr(self, image, boxes):
+        """Run TrOCR on detected text regions."""
+        if not self.trocr_available:
+            return []
+        if isinstance(image, np.ndarray):
+            pil_image = Image.fromarray(image).convert('RGB')
+        else:
+            pil_image = image.convert('RGB')
+        results = []
+        for box in boxes:
+            try:
+                if isinstance(box, list) and len(box) >= 4:
+                    # Convert to [x1, y1, x2, y2]
+                    if isinstance(box[0], list):
+                        x1 = int(min(p[0] for p in box))
+                        y1 = int(min(p[1] for p in box))
+                        x2 = int(max(p[0] for p in box))
+                        y2 = int(max(p[1] for p in box))
+                    else:
+                        x1, y1, x2, y2 = [int(b) for b in box[:4]]
+                    # Crop and recognize
+                    cropped = pil_image.crop((x1, y1, x2, y2))
+                    # TrOCR recognition
+                    pixel_values = self.trocr_processor(images=cropped, return_tensors="pt").pixel_values.to(DEVICE)
+                    with torch.no_grad():
+                        generated_ids = self.trocr_model.generate(
+                            pixel_values,
+                            max_length=128,
+                            num_beams=4,
+                            early_stopping=True
+                        )
+                    text = self.trocr_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+                    if text.strip():
+                        results.append({
+                            'text': text.strip(),
+                            'confidence': 0.9,  # TrOCR doesn't provide confidence, use high default
+                            'bbox': [x1, y1, x2, y2],
+                            'engine': 'trocr'
+                        })
+            except Exception as e:
+                continue
+        return results
+    def _run_paddleocr(self, image):
+        """Run PaddleOCR."""
+        if not self.paddleocr_available:
+            return []
+        try:
+            result = self.paddleocr_engine.ocr(image, cls=True)
+            if result is None or len(result) == 0 or result[0] is None:
+                return []
+            extracted = []
+            for line in result[0]:
+                if line is None:
+                    continue
+                bbox, (text, conf) = line
+                x_coords = [p[0] for p in bbox]
+                y_coords = [p[1] for p in bbox]
+                extracted.append({
+                    'text': text.strip(),
+                    'confidence': conf,
+                    'bbox': [min(x_coords), min(y_coords), max(x_coords), max(y_coords)],
+                    'engine': 'paddleocr'
+                })
+            return extracted
+        except Exception as e:
+            print(f"PaddleOCR error: {e}")
+            return []
+    def _run_tesseract(self, image):
+        """Run Tesseract OCR."""
         if not self.use_tesseract:
             return []
             else:
                 pil_image = Image.fromarray(image).convert('RGB')
             data = pytesseract.image_to_data(pil_image, output_type=pytesseract.Output.DICT)
             results = []
             print(f"Tesseract OCR error: {e}")
             return []
     def _compute_iou(self, box1, box2):
         """Compute Intersection over Union for bounding boxes."""
         x1_1, y1_1, x2_1, y2_1 = box1
         return inter_area / union_area if union_area > 0 else 0
+    def _merge_results(self, all_results):
+        """Merge results from multiple OCR engines using weighted voting."""
+        if not all_results:
+            return []
+        # Use the engine with most detections as base
+        base_engine = max(all_results.keys(), key=lambda k: len(all_results[k]))
+        base_results = all_results[base_engine]
+        merged = []
+        for base_result in base_results:
+            base_box = base_result['bbox']
+            base_text = base_result['text']
+            base_conf = base_result['confidence']
+            # Find matching results from other engines
+            matches = [(base_text, base_conf, self.engine_weights.get(base_engine, 0.3))]
+            for engine_name, results in all_results.items():
+                if engine_name == base_engine:
+                    continue
+                for result in results:
+                    iou = self._compute_iou(base_box, result['bbox'])
+                    if iou > 0.3:  # Same text region
+                        weight = self.engine_weights.get(engine_name, 0.2)
+                        matches.append((result['text'], result['confidence'], weight))
+            # Vote on the best text
+            if len(matches) == 1:
+                final_text = base_text
+                final_conf = base_conf
+            else:
+                # Weighted voting
+                text_scores = {}
+                for text, conf, weight in matches:
+                    if text not in text_scores:
+                        text_scores[text] = 0
+                    text_scores[text] += conf * weight
+                final_text = max(text_scores.keys(), key=lambda t: text_scores[t])
+                total_weight = sum(w for _, _, w in matches)
+                final_conf = min(0.99, text_scores[final_text] / total_weight if total_weight > 0 else 0.5)
+            merged.append({
+                'text': final_text,
+                'confidence': final_conf,
+                'bbox': base_box,
+                'engines_used': len(matches)
+            })
+        return merged
+    def extract_with_positions(self, image, min_confidence=0.3, use_ensemble=True):
+        """Extract text with positions using ensemble of OCR engines."""
         if isinstance(image, Image.Image):
+            img_array = np.array(image)
+        else:
+            img_array = image.copy()
+        all_results = {}
+        # Run EasyOCR (always available)
         try:
+            easyocr_results = self._run_easyocr(img_array)
+            if easyocr_results:
+                all_results['easyocr'] = easyocr_results
         except Exception as e:
             print(f"EasyOCR error: {e}")
+        # Run PaddleOCR if available
+        if self.paddleocr_available and use_ensemble:
+            try:
+                paddleocr_results = self._run_paddleocr(img_array)
+                if paddleocr_results:
+                    all_results['paddleocr'] = paddleocr_results
+            except Exception as e:
+                print(f"PaddleOCR error: {e}")
+        # Run Tesseract if available
+        if self.use_tesseract and use_ensemble:
+            try:
+                tesseract_results = self._run_tesseract(img_array)
+                if tesseract_results:
+                    all_results['tesseract'] = tesseract_results
+            except Exception as e:
+                print(f"Tesseract error: {e}")
+        # Run TrOCR on detected boxes (needs boxes from other engines)
+        if self.trocr_available and use_ensemble and all_results:
+            try:
+                # Get boxes from best available engine
+                source_engine = max(all_results.keys(), key=lambda k: len(all_results[k]))
+                boxes = [r['bbox'] for r in all_results[source_engine]]
+                trocr_results = self._run_trocr(img_array, boxes)
+                if trocr_results:
+                    all_results['trocr'] = trocr_results
+            except Exception as e:
+                print(f"TrOCR error: {e}")
+        # Merge results if ensemble, otherwise use EasyOCR only
+        if use_ensemble and len(all_results) > 1:
+            merged = self._merge_results(all_results)
+        elif 'easyocr' in all_results:
+            merged = all_results['easyocr']
+        else:
+            merged = []
+        # Filter by confidence
+        filtered = [r for r in merged if r['confidence'] >= min_confidence]
+        # If results are poor, try with preprocessing
+        avg_confidence = np.mean([r['confidence'] for r in filtered]) if filtered else 0
+        if len(filtered) < 3 or avg_confidence < 0.4:
+            try:
+                preprocessed = self._preprocess_image(image, method='enhance')
+                retry_results = self._run_easyocr(preprocessed)
+                retry_filtered = [r for r in retry_results if r['confidence'] >= min_confidence]
+                retry_avg = np.mean([r['confidence'] for r in retry_filtered]) if retry_filtered else 0
+                if retry_avg > avg_confidence:
+                    filtered = retry_filtered
+            except Exception:
+                pass
         # Sort by confidence (highest first)
+        filtered.sort(key=lambda x: x['confidence'], reverse=True)
+        return filtered
     def postprocess_receipt(self, ocr_results):
         """Extract structured fields from OCR results with improved patterns."""

requirements.txt CHANGED Viewed

@@ -2,6 +2,7 @@ torch>=2.0.0
 torchvision>=0.15.0
 transformers>=4.30.0
 easyocr>=1.7.0
 # Pin Gradio/gradio_client to a stable pair to avoid json_schema issues on Spaces
 gradio==3.41.2
 gradio_client==0.5.0

 torchvision>=0.15.0
 transformers>=4.30.0
 easyocr>=1.7.0
+paddleocr>=2.7.0
 # Pin Gradio/gradio_client to a stable pair to avoid json_schema issues on Spaces
 gradio==3.41.2
 gradio_client==0.5.0