File size: 3,869 Bytes
ae2e698
c70099c
 
254fdf9
5ebcb93
ae2e698
254fdf9
a8683a1
ae2e698
 
8324e53
 
 
 
 
 
ae2e698
8324e53
ae2e698
254fdf9
 
b07dfbb
254fdf9
 
 
 
 
 
 
a726fb2
 
 
2c3e33d
254fdf9
 
 
65bef46
 
2c3e33d
254fdf9
 
 
 
65bef46
 
 
a726fb2
254fdf9
 
 
a726fb2
a8683a1
8324e53
254fdf9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65bef46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from paddleocr import PaddleOCR
import re

# Initialize OCR
# A single module-level PaddleOCR instance, created once at import time and
# reused by extract_kyc_fields() for every call. It is configured with
# use_angle_cls=True and lang='en'.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

def extract_kyc_fields(file_path, force_type=None):
    """Run OCR on an ID document image and extract PAN / Aadhaar fields.

    Args:
        file_path: Image location handed straight to ``ocr.ocr``.
        force_type: Optional card type (e.g. ``"pan"`` or ``"aadhaar"``).
            When truthy it is upper-cased and used instead of auto-detection.

    Returns:
        A dict that always has one of these shapes:

        * PAN: ``card_type``, ``dob``, ``name`` and, when a PAN-shaped token
          is present, ``pan_number``.
        * AADHAAR: ``card_type``, ``dob``, ``gender``, ``name`` and, when a
          12-digit number is present, ``aadhaar_number``.
        * Anything else: ``card_type`` plus an ``error`` message.
        * On any exception: only ``error`` ("OCR processing failed: ...").
    """
    pan_pattern = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
    aadhaar_pattern = r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b'

    try:
        result = ocr.ocr(file_path, cls=True)

        # PaddleOCR may hand back None (for the whole result or for a single
        # page) when no text is detected; treat that as "no lines" rather than
        # letting iteration over None surface as an OCR failure.
        lines = []
        for block in result or []:
            for line in block or []:
                text = line[1][0].strip()
                if text:
                    lines.append(text)

        full_text = "\n".join(lines)

        # Search once and reuse the matches for both detection and extraction.
        pan_match = re.search(pan_pattern, full_text)
        aadhaar_match = re.search(aadhaar_pattern, full_text)

        if force_type:
            card_type = force_type.upper()
        elif pan_match:
            # A PAN-shaped token wins over an Aadhaar-shaped number.
            card_type = "PAN"
        elif aadhaar_match:
            card_type = "AADHAAR"
        else:
            card_type = "UNKNOWN"

        response = {"card_type": card_type}

        if card_type == "PAN":
            if pan_match:
                response["pan_number"] = pan_match.group(0)
            response["dob"] = extract_dob(lines)
            response["name"] = extract_pan_name(lines)

        elif card_type == "AADHAAR":
            if aadhaar_match:
                response["aadhaar_number"] = aadhaar_match.group(0)
            response["dob"] = extract_dob(lines)
            response["gender"] = extract_gender(lines)
            response["name"] = extract_aadhaar_name(lines)

        else:
            response["error"] = "Could not identify document as PAN or Aadhaar."

        return response
    except Exception as e:
        # Boundary handler: callers receive an error dict instead of a raise.
        return {"error": f"OCR processing failed: {str(e)}"}


def extract_dob(lines):
    """Return the first date-of-birth-like string found in the OCR lines.

    Search order (each step scans every line before the next one starts):

    1. ``DD/MM/YYYY`` with ``.``, ``/`` or ``-`` as separators.
    2. ``YYYY-MM-DD``.
    3. A bare 19xx/20xx year, but only on a line that also mentions
       "YOB", "YEAR" or "BIRTH" (case-insensitive).

    Returns the matched text, or ``"Not found"`` when nothing matches.
    """
    full_date_patterns = (
        r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',
        r'\b\d{4}-\d{2}-\d{2}\b',
    )
    for date_pattern in full_date_patterns:
        for text in lines:
            found = re.search(date_pattern, text)
            if found:
                return found.group(0)

    birth_labels = ("YOB", "YEAR", "BIRTH")
    for text in lines:
        upper_text = text.upper()
        if not any(label in upper_text for label in birth_labels):
            continue
        year = re.search(r'\b(19|20)\d{2}\b', text)
        if year:
            return year.group(0)

    return "Not found"


def extract_gender(lines):
    """Return the gender printed on the card, based on the OCR lines.

    Each line is upper-cased and checked for the substrings "FEMALE",
    "TRANSGENDER" and "MALE"; the first line containing any of them decides
    the result.

    Returns ``"FEMALE"``, ``"TRANSGENDER"``, ``"MALE"`` or ``"Not found"``.
    """
    # "FEMALE" must be tested before "MALE": "MALE" is a substring of
    # "FEMALE", so checking it first would label every female card as MALE.
    genders = ("FEMALE", "TRANSGENDER", "MALE")
    for line in lines:
        upper_line = line.upper()
        for gender in genders:
            if gender in upper_line:
                return gender
    return "Not found"


def extract_pan_name(lines):
    """Return the holder name from PAN card OCR lines.

    Looks for a line containing "INCOME TAX DEPARTMENT" (case-insensitive)
    and returns the first later line that, once stripped, consists only of
    capital letters, whitespace and dots, has no digits, and does not
    contain "INDIA", "GOVT" or "DEPARTMENT".

    Returns ``"Not found"`` when there is no header or no such line.
    """
    header_words = ("INDIA", "GOVT", "DEPARTMENT")

    for header_idx, header in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in header.upper():
            continue

        for raw in lines[header_idx + 1:]:
            candidate = raw.strip()
            if not re.match(r'^[A-Z\s.]+$', candidate):
                continue
            if re.search(r'\d', candidate):
                continue
            if any(word in candidate for word in header_words):
                # Still part of the card header, not a person's name.
                continue
            return candidate

    return "Not found"


def extract_aadhaar_name(lines):
    """Return the holder name from Aadhaar card OCR lines.

    First choice: the line directly above any line holding a
    ``DD/MM/YYYY``-style date (separators ``.``, ``/`` or ``-``), if that
    line looks like a name. Fallback: the first line anywhere that looks
    like a name.

    A line "looks like a name" when it has no digits, at least two
    whitespace-separated words, and none of "DOB", "INDIA", "MALE",
    "FEMALE" or "GOVERNMENT" (case-insensitive).

    Returns the stripped name, or ``"Not found"``.
    """
    non_name_words = ["DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT"]

    def looks_like_name(text):
        if re.search(r'\d', text):
            return False
        if len(text.split()) < 2:
            return False
        upper_text = text.upper()
        return not any(word in upper_text for word in non_name_words)

    # Pass 1: the line printed just above a date-of-birth line.
    for idx in range(1, len(lines)):
        if not re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', lines[idx]):
            continue
        above = lines[idx - 1].strip()
        if looks_like_name(above):
            return above

    # Pass 2: first plausible name anywhere in the text.
    for text in lines:
        if looks_like_name(text):
            return text.strip()

    return "Not found"