Spaces:
Running
Running
# rag/ocr_mistral.py
"""Passport OCR entry point with a selectable backend.

When the ``USE_MISTRAL`` environment variable equals ``"true"``
(case-insensitive), ``parse_passport`` sends the document to Mistral OCR and
extracts profile fields with regular expressions.  Otherwise
``parse_passport`` is re-exported from ``.ocr_azure`` and no Mistral client is
imported or configured.
"""
import base64
import logging
import os
import re
from typing import Any, Dict, Optional, Sequence

logger = logging.getLogger(__name__)

# NOTE: `os` has to be imported before this line; reading the flag ahead of
# the imports raised NameError as soon as the module was imported.
USE_MISTRAL = os.getenv("USE_MISTRAL", "false").lower() == "true"

# Label patterns for each output field.  Patterns are tried in order and the
# first one that matches anywhere in the OCR text wins.
_FIELD_PATTERNS: Dict[str, Sequence[str]] = {
    "FirstName": (
        r"Given\s+Name[:\s]+([A-Za-z\-]+)",
        r"First\s+Name[:\s]+([A-Za-z\-]+)",
        r"Given\s+Names?[:\s]+([A-Za-z\-]+)",
        r"First\s+Names?[:\s]+([A-Za-z\-]+)",
    ),
    "LastName": (
        r"Family\s+Name[:\s]+([A-Za-z\-]+)",
        r"Last\s+Name[:\s]+([A-Za-z\-]+)",
        r"Surname[:\s]+([A-Za-z\-]+)",
        r"Family\s+Names?[:\s]+([A-Za-z\-]+)",
    ),
    "DateOfBirth": (
        r"Date\s+of\s+Birth[:\s]+(\d{2}/\d{2}/\d{4})",
        r"DOB[:\s]+(\d{2}/\d{2}/\d{4})",
        r"Birth\s+Date[:\s]+(\d{2}/\d{2}/\d{4})",
    ),
    "ANumber": (
        r"A-Number[:\s]*(A\d{8,9})",
        r"A\s*Number[:\s]*(A\d{8,9})",
        r"Alien\s+Number[:\s]*(A\d{8,9})",
    ),
}


def _get(obj: Any, name: str) -> Any:
    """Return ``obj[name]`` for dicts or ``obj.name`` for objects, else ``None``."""
    if isinstance(obj, dict):
        return obj.get(name)
    return getattr(obj, name, None)


def _build_document(file_bytes: bytes) -> Dict[str, str]:
    """Wrap raw bytes in the data-URL ``document`` payload sent to Mistral OCR."""
    b64 = base64.b64encode(file_bytes).decode("utf-8")
    if file_bytes.startswith(b"%PDF"):
        return {
            "type": "document_url",
            "document_url": f"data:application/pdf;base64,{b64}",
        }
    if file_bytes.startswith(b"\x89PNG\r\n\x1a\n"):
        return {
            "type": "image_url",
            "image_url": f"data:image/png;base64,{b64}",
        }
    # treat everything else as JPEG
    return {
        "type": "image_url",
        "image_url": f"data:image/jpeg;base64,{b64}",
    }


def _collect_text(resp: Any) -> str:
    """Join the text of every page in an OCR response with newlines."""
    pages = _get(resp, "pages") or []
    # The Mistral client returns page objects that expose `markdown`; dict
    # responses and a legacy `text` key are still accepted as fallbacks.
    return "\n".join(
        _get(page, "markdown") or _get(page, "text") or "" for page in pages
    )


def _first_match(patterns: Sequence[str], text: str) -> Optional[str]:
    """Return group 1 of the first pattern that matches *text*, else ``None``."""
    for pattern in patterns:
        if m := re.search(pattern, text, re.IGNORECASE):
            return m.group(1)
    return None


def _extract_fields(text: str) -> Dict[str, Any]:
    """Run every field's patterns over *text*; unmatched fields are omitted."""
    profile: Dict[str, Any] = {}
    for field_name, patterns in _FIELD_PATTERNS.items():
        value = _first_match(patterns, text)
        if value is not None:
            profile[field_name] = value
    return profile


if USE_MISTRAL:
    from mistralai import Mistral

    # Configure logging
    logging.basicConfig(level=logging.INFO)

    # load your .env or rely on HF_SPACE secrets / os.environ
    MISTRAL_KEY = os.getenv("MISTRAL_OCR_KEY")
    if not MISTRAL_KEY:
        raise RuntimeError("Please set MISTRAL_OCR_KEY in your environment")

    # initialize once
    _client = Mistral(api_key=MISTRAL_KEY)

    def parse_passport(file_bytes: bytes) -> Dict[str, Any]:
        """Send PDF/JPEG/PNG bytes to Mistral OCR and extract passport fields.

        Args:
            file_bytes: Raw document bytes.  The type is sniffed from the
                leading magic bytes; anything that is not PDF or PNG is sent
                as JPEG.

        Returns:
            A dict holding whichever of ``FirstName``, ``LastName``,
            ``DateOfBirth`` and ``ANumber`` were found, plus ``_raw_text``
            with the full OCR text.  On any failure the dict instead holds
            ``error`` (the exception message) and ``_raw_text`` (the text
            gathered before the failure, possibly empty).
        """
        # Bound up front so the error path can report partial text.
        full_text = ""
        try:
            resp = _client.ocr.process(
                model="mistral-ocr-latest",
                document=_build_document(file_bytes),
                include_image_base64=False,
            )
            full_text = _collect_text(resp)
            profile = _extract_fields(full_text)
            # Log field names only: the values and raw text are passport PII.
            logger.info("Extracted passport fields: %s", list(profile))
            # include raw text for debugging
            profile["_raw_text"] = full_text
            return profile
        except Exception as e:
            # Best-effort boundary: callers receive an error dict, not a raise.
            logger.exception("Error processing passport: %s", e)
            return {
                "error": str(e),
                "_raw_text": full_text,
            }
else:
    # dev chose Azure, so expose a shim that imports the azure parser.
    # A module-level `raise SystemExit` would terminate the importing process
    # instead of skipping the rest of this file, hence the if/else.
    from .ocr_azure import parse_passport_azure as parse_passport  # type: ignore