Spaces:

Rogue2003
/

Receipt_Agent

Running

App Files Files Community

Raghu committed 15 days ago

Commit

b8f0f36

1 Parent(s): acf3ed2

Accuracy-first optimizations: improved total extraction, OCR-first field extraction, LayoutLM validation, adaptive OCR ensemble

Browse files

Files changed (1) hide show

app.py +69 -20

app.py CHANGED Viewed

@@ -664,7 +664,7 @@ class ReceiptOCR:
         return merged
-    def extract_with_positions(self, image, min_confidence=0.3, use_ensemble=True):
         """Extract text with positions using ensemble of OCR engines."""
         if isinstance(image, Image.Image):
             img_array = np.array(image)
@@ -785,21 +785,29 @@ class ReceiptOCR:
         return None
     def _extract_total(self, text):
-        """Extract total amount with improved patterns."""
-        # Look for TOTAL, AMOUNT, DUE keywords
-        patterns = [
-            r'(?:TOTAL|AMOUNT|DUE|BALANCE)[:\s]*\$?\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)',
-            r'\$\s*(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)',  # Any dollar amount
-        ]
-        for pattern in patterns:
-            matches = re.findall(pattern, text, re.IGNORECASE)
-            if matches:
-                # Return largest amount (usually the total)
-                amounts = [float(m.replace(',', '')) for m in matches]
-                return f"{max(amounts):.2f}"
-        return None
     def _extract_time(self, text):
         """Extract time."""
@@ -1110,7 +1118,15 @@ def process_receipt(image):
     ocr_results = []
     try:
         if receipt_ocr:
-            ocr_results = receipt_ocr.extract_with_positions(image)
             ocr_image = draw_ocr_boxes(image, ocr_results)
             lines = [f"{i+1}. [{r['confidence']:.0%}] {r['text']}" for i, r in enumerate(ocr_results)]
@@ -1119,14 +1135,47 @@ def process_receipt(image):
     except Exception as e:
         ocr_text = f"OCR error: {e}"
-    # 3. Field Extraction
     fields = {}
     fields_html = ""
     try:
-        if layoutlm_extractor:
-            fields = layoutlm_extractor.predict_fields(image, ocr_results)
-        elif receipt_ocr and ocr_results:
-            fields = receipt_ocr.postprocess_receipt(ocr_results)
         fields_html = "<div style='padding: 16px; background: #111827; color: #e5e7eb; border-radius: 12px; border: 1px solid #1f2937;'><h4 style=\"color: #e5e7eb;\">Extracted Fields</h4>"
         for name, value in [

         return merged
+    def extract_with_positions(self, image, min_confidence=0.3, use_ensemble=False):
         """Extract text with positions using ensemble of OCR engines."""
         if isinstance(image, Image.Image):
             img_array = np.array(image)
         return None
     def _extract_total(self, text):
+        """Extract total amount - improved to find largest amount near TOTAL keyword."""
+        # First, find all dollar amounts in the text
+        all_amounts = re.findall(r'\$(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)', text)
+        all_amounts = [float(a.replace(',', '')) for a in all_amounts]
+        if not all_amounts:
+            return None
+        # Look for "TOTAL", "AMOUNT DUE", "BALANCE" keywords and find amount near them
+        lines = text.split('\n')
+        for i, line in enumerate(lines):
+            line_upper = line.upper()
+            if any(keyword in line_upper for keyword in ['TOTAL', 'AMOUNT DUE', 'BALANCE DUE', 'DUE']):
+                # Check this line and next 2 lines for amount
+                search_text = ' '.join(lines[i:min(i+3, len(lines))])
+                matches = re.findall(r'\$(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)', search_text)
+                if matches:
+                    amounts_near_total = [float(m.replace(',', '')) for m in matches]
+                    # Return largest amount near TOTAL keyword
+                    return f"{max(amounts_near_total):.2f}"
+        # Fallback: return largest amount overall (usually the total)
+        return f"{max(all_amounts):.2f}"
     def _extract_time(self, text):
         """Extract time."""
     ocr_results = []
     try:
         if receipt_ocr:
+            # Try fast OCR first (EasyOCR + Tesseract only)
+            ocr_results = receipt_ocr.extract_with_positions(image, use_ensemble=False)
+            # If confidence is low, try full ensemble
+            if ocr_results:
+                avg_conf = np.mean([r['confidence'] for r in ocr_results])
+                if avg_conf < 0.5 or len(ocr_results) < 5:
+                    # Low confidence or few results, try full ensemble
+                    ocr_results = receipt_ocr.extract_with_positions(image, use_ensemble=True)
             ocr_image = draw_ocr_boxes(image, ocr_results)
             lines = [f"{i+1}. [{r['confidence']:.0%}] {r['text']}" for i, r in enumerate(ocr_results)]
     except Exception as e:
         ocr_text = f"OCR error: {e}"
+    # 3. Field Extraction (OCR-first, LayoutLM as fallback)
     fields = {}
     fields_html = ""
     try:
+        # Try OCR regex first (faster and often more accurate for totals)
+        ocr_fields = {}
+        if receipt_ocr and ocr_results:
+            ocr_fields = receipt_ocr.postprocess_receipt(ocr_results)
+            fields = ocr_fields.copy()
+        # Use LayoutLM only to fill in missing fields or validate
+        if layoutlm_extractor and ocr_results:
+            layoutlm_fields = layoutlm_extractor.predict_fields(image, ocr_results)
+            # For each field, merge intelligently
+            for field_name in ['vendor', 'date', 'total', 'time']:
+                ocr_val = ocr_fields.get(field_name)
+                layoutlm_val = layoutlm_fields.get(field_name)
+                if not ocr_val and layoutlm_val:
+                    # OCR didn't find it, use LayoutLM
+                    fields[field_name] = layoutlm_val
+                elif ocr_val and layoutlm_val and field_name == 'total':
+                    # For total: validate LayoutLM against OCR text
+                    ocr_text = ' '.join([r['text'] for r in ocr_results])
+                    layoutlm_clean = str(layoutlm_val).replace('$', '').replace('.', '').replace(',', '').strip()
+                    ocr_clean = ocr_text.replace('$', '').replace('.', '').replace(',', '')
+                    # Check if LayoutLM total appears in OCR text
+                    if layoutlm_clean in ocr_clean:
+                        # LayoutLM matches OCR, use it (might be more accurate)
+                        fields['total'] = layoutlm_val
+                    else:
+                        # LayoutLM doesn't match OCR, trust OCR (more reliable)
+                        fields['total'] = ocr_val
+                elif ocr_val and layoutlm_val and field_name != 'total':
+                    # For other fields, prefer LayoutLM if it's longer/more complete
+                    if len(str(layoutlm_val)) > len(str(ocr_val)):
+                        fields[field_name] = layoutlm_val
+                    else:
+                        fields[field_name] = ocr_val
         fields_html = "<div style='padding: 16px; background: #111827; color: #e5e7eb; border-radius: 12px; border: 1px solid #1f2937;'><h4 style=\"color: #e5e7eb;\">Extracted Fields</h4>"
         for name, value in [