Spaces:
Runtime error
Runtime error
File size: 3,869 Bytes
ae2e698 c70099c 254fdf9 5ebcb93 ae2e698 254fdf9 a8683a1 ae2e698 8324e53 ae2e698 8324e53 ae2e698 254fdf9 b07dfbb 254fdf9 a726fb2 2c3e33d 254fdf9 65bef46 2c3e33d 254fdf9 65bef46 a726fb2 254fdf9 a726fb2 a8683a1 8324e53 254fdf9 65bef46 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
from paddleocr import PaddleOCR
import re
# Module-level PaddleOCR engine, constructed once at import time and reused by
# extract_kyc_fields() for every call.
# NOTE(review): use_angle_cls=True presumably enables PaddleOCR's text-angle
# classifier and lang='en' presumably selects the English model - confirm
# against the PaddleOCR documentation for the installed version.
ocr = PaddleOCR(use_angle_cls=True, lang='en')
def extract_kyc_fields(file_path, force_type=None):
    """Run OCR on a document image and extract PAN / Aadhaar KYC fields.

    Args:
        file_path: Image location handed straight to ``ocr.ocr``.
        force_type: Optional card type ("PAN" or "AADHAAR", any case). When
            truthy it overrides auto-detection.

    Returns:
        A dict that always contains ``"card_type"`` on the success path, plus:
          * PAN: ``"pan_number"`` (only if a number was found), ``"dob"``,
            ``"name"``.
          * AADHAAR: ``"aadhaar_number"`` (only if found), ``"dob"``,
            ``"gender"``, ``"name"``.
          * anything else: ``"error"`` explaining the document was not
            recognised.
        If any exception is raised while processing, the dict contains only
        ``"error"`` with the exception message.
    """
    pan_pattern = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
    aadhaar_pattern = r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b'

    try:
        result = ocr.ocr(file_path, cls=True)

        lines = []
        # Some PaddleOCR releases yield None (or a list holding None) for a
        # page with no detected text; treat that as "no lines" rather than
        # letting iteration over None surface as an OCR failure.
        for block in result or []:
            if not block:
                continue
            for line in block:
                # Each entry is indexed as line[1][0] to obtain its text.
                text = line[1][0].strip()
                if text:
                    lines.append(text)
        full_text = "\n".join(lines)

        # Search once; the matches drive both detection and extraction.
        pan_match = re.search(pan_pattern, full_text)
        aadhaar_match = re.search(aadhaar_pattern, full_text)

        if force_type:
            card_type = force_type.upper()
        elif pan_match:
            # PAN wins when both patterns are present.
            card_type = "PAN"
        elif aadhaar_match:
            card_type = "AADHAAR"
        else:
            card_type = "UNKNOWN"

        response = {"card_type": card_type}
        if card_type == "PAN":
            if pan_match:
                response["pan_number"] = pan_match.group(0)
            response["dob"] = extract_dob(lines)
            response["name"] = extract_pan_name(lines)
        elif card_type == "AADHAAR":
            if aadhaar_match:
                response["aadhaar_number"] = aadhaar_match.group(0)
            response["dob"] = extract_dob(lines)
            response["gender"] = extract_gender(lines)
            response["name"] = extract_aadhaar_name(lines)
        else:
            response["error"] = "Could not identify document as PAN or Aadhaar."
        return response
    except Exception as e:
        # Top-level boundary: callers receive an error dict, never a raise.
        return {"error": f"OCR processing failed: {str(e)}"}
def extract_dob(lines):
    """Return the first date-of-birth-like string found in *lines*.

    Lookup order, each pass scanning every line before the next begins:
      1. ``DD/MM/YYYY`` with ``.``, ``/`` or ``-`` as separators.
      2. ``YYYY-MM-DD``.
      3. A bare 19xx/20xx year, accepted only when the same line also
         contains "YOB", "YEAR" or "BIRTH" (case-insensitive).

    Returns the matched text, or ``"Not found"`` when nothing matches.
    """
    full_date_patterns = (
        r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',
        r'\b\d{4}-\d{2}-\d{2}\b',
    )
    for pattern in full_date_patterns:
        for text in lines:
            found = re.search(pattern, text)
            if found:
                return found.group(0)

    year_labels = ("YOB", "YEAR", "BIRTH")
    for text in lines:
        found = re.search(r'\b(19|20)\d{2}\b', text)
        if not found:
            continue
        shouted = text.upper()
        if any(label in shouted for label in year_labels):
            return found.group(0)

    return "Not found"
def extract_gender(lines):
    """Return the gender keyword found in the first line that contains one.

    Each line is upper-cased and searched for "FEMALE", "TRANSGENDER" and
    "MALE" as substrings. "FEMALE" is tested before "MALE" because "MALE" is
    a substring of "FEMALE"; testing "MALE" first would report every female
    card holder as male.

    Returns ``"FEMALE"``, ``"TRANSGENDER"``, ``"MALE"``, or ``"Not found"``
    when no line contains any of the keywords.
    """
    for line in lines:
        shouted = line.upper()
        # Longest/most specific keywords first; see docstring.
        if "FEMALE" in shouted:
            return "FEMALE"
        if "TRANSGENDER" in shouted:
            return "TRANSGENDER"
        if "MALE" in shouted:
            return "MALE"
    return "Not found"
def extract_pan_name(lines):
    """Return the holder name printed below the PAN card header.

    For every line containing "INCOME TAX DEPARTMENT" (case-insensitive), the
    lines after it are scanned and the first one is returned that, once
    stripped, consists solely of upper-case A-Z, whitespace and dots, does
    not contain "INDIA", "GOVT" or "DEPARTMENT", and holds no digit.

    Returns ``"Not found"`` if there is no header line or no line qualifies.
    """
    header_words = ("INDIA", "GOVT", "DEPARTMENT")
    for idx, header in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in header.upper():
            continue
        for raw in lines[idx + 1:]:
            candidate = raw.strip()
            if not re.match(r'^[A-Z\s.]+$', candidate):
                continue
            if any(word in candidate for word in header_words):
                continue
            if re.search(r'\d', candidate):
                continue
            return candidate.strip()
    return "Not found"
def extract_aadhaar_name(lines):
    """Guess the holder name from Aadhaar OCR lines.

    A line is name-like when it has no digit, at least two
    whitespace-separated words, and its upper-cased text contains none of
    "DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT" as a substring.

    Strategy:
      1. For each line (other than the first) containing a ``DD/MM/YYYY``
         style date, return the line directly above it if it is name-like.
      2. Otherwise return the first name-like line anywhere in *lines*.

    The result is stripped of surrounding whitespace; ``"Not found"`` is
    returned when no line qualifies.
    """
    blocked_words = ("DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT")

    def is_name_like(text):
        # Shared predicate for both passes.
        if re.search(r'\d', text):
            return False
        if len(text.split()) < 2:
            return False
        shouted = text.upper()
        return not any(word in shouted for word in blocked_words)

    # Pass 1: the line immediately above a date line.
    for idx, current in enumerate(lines):
        if not re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', current) or idx == 0:
            continue
        above = lines[idx - 1].strip()
        if is_name_like(above):
            return above

    # Pass 2: first name-like line anywhere.
    for entry in lines:
        if is_name_like(entry):
            return entry.strip()

    return "Not found"
|