"""Extract KYC fields (number, name, DOB, gender) from PAN / Aadhaar card images via PaddleOCR."""

import re
from typing import Dict, List, Optional

from paddleocr import PaddleOCR

# Shared OCR engine, constructed once at import time and reused by every call.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

# Document-number patterns, searched against the full OCR text.
_PAN_RE = re.compile(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b')
_AADHAAR_RE = re.compile(r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b')

# Date-of-birth patterns, tried in this priority order by extract_dob().
_DOB_DMY_RE = re.compile(r'\b\d{2}[./-]\d{2}[./-]\d{4}\b')
_DOB_ISO_RE = re.compile(r'\b\d{4}-\d{2}-\d{2}\b')
_YEAR_RE = re.compile(r'\b(19|20)\d{2}\b')
_YEAR_LABELS = ("YOB", "YEAR", "BIRTH")

# Unanchored date pattern used only to locate the DOB line on an Aadhaar card.
_DATE_ANYWHERE_RE = re.compile(r'\d{2}[./-]\d{2}[./-]\d{4}')
_DIGIT_RE = re.compile(r'\d')

_PAN_NAME_RE = re.compile(r'^[A-Z\s.]+$')
_PAN_NAME_STOPWORDS = ("INDIA", "GOVT", "DEPARTMENT")
_AADHAAR_NAME_STOPWORDS = ("DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT")

# "FEMALE" must be tested before "MALE": the latter is a substring of the
# former, so the reverse order would classify every female card as male.
_GENDERS = ("FEMALE", "MALE", "TRANSGENDER")


def extract_kyc_fields(file_path, force_type: Optional[str] = None) -> Dict[str, str]:
    """Run OCR on a document image and extract PAN or Aadhaar fields.

    Args:
        file_path: Image input handed straight to ``ocr.ocr``.
        force_type: Optional document type ("PAN" or "AADHAAR", any case).
            When given, auto-detection is skipped and this type is used.

    Returns:
        A dict that always carries either ``"card_type"`` or ``"error"``.
        For PAN: ``pan_number`` (if found), ``dob``, ``name``.
        For Aadhaar: ``aadhaar_number`` (if found), ``dob``, ``gender``,
        ``name``. Fields that cannot be located hold ``"Not found"``.
        Any other type adds an ``"error"`` message next to ``card_type``.
        If OCR itself raises, only ``{"error": ...}`` is returned.
    """
    try:
        result = ocr.ocr(file_path, cls=True)
        lines = _collect_text_lines(result)
        full_text = "\n".join(lines)

        # Search once and reuse the matches for both detection and extraction.
        pan_match = _PAN_RE.search(full_text)
        aadhaar_match = _AADHAAR_RE.search(full_text)

        if force_type:
            card_type = force_type.upper()
        elif pan_match:
            # PAN takes precedence when both number patterns are present.
            card_type = "PAN"
        elif aadhaar_match:
            card_type = "AADHAAR"
        else:
            card_type = "UNKNOWN"

        response = {"card_type": card_type}

        if card_type == "PAN":
            if pan_match:
                response["pan_number"] = pan_match.group(0)
            response["dob"] = extract_dob(lines)
            response["name"] = extract_pan_name(lines)
        elif card_type == "AADHAAR":
            if aadhaar_match:
                response["aadhaar_number"] = aadhaar_match.group(0)
            response["dob"] = extract_dob(lines)
            response["gender"] = extract_gender(lines)
            response["name"] = extract_aadhaar_name(lines)
        else:
            response["error"] = "Could not identify document as PAN or Aadhaar."

        return response
    except Exception as e:
        # Boundary handler: callers receive an error dict instead of an exception.
        return {"error": f"OCR processing failed: {str(e)}"}


def _collect_text_lines(result) -> List[str]:
    """Flatten an ``ocr.ocr`` result into a list of non-empty, stripped strings.

    The recognised text of each entry is read from ``entry[1][0]``. A result
    or page that is ``None`` (PaddleOCR reports a page with no detected text
    this way) is treated as containing no lines rather than raising.
    """
    lines = []
    for block in result or []:
        for entry in block or []:
            text = entry[1][0].strip()
            if text:
                lines.append(text)
    return lines


def extract_dob(lines: List[str]) -> str:
    """Return the first date of birth found in ``lines``, or ``"Not found"``.

    Priority: a DD/MM/YYYY-style date (``.``, ``/`` or ``-`` separators) on
    any line, then a YYYY-MM-DD date on any line, then a bare 19xx/20xx year
    on a line that also mentions YOB, YEAR or BIRTH.
    """
    for pattern in (_DOB_DMY_RE, _DOB_ISO_RE):
        for line in lines:
            match = pattern.search(line)
            if match:
                return match.group(0)

    for line in lines:
        match = _YEAR_RE.search(line)
        if match and any(label in line.upper() for label in _YEAR_LABELS):
            return match.group(0)

    return "Not found"


def extract_gender(lines: List[str]) -> str:
    """Return "FEMALE", "MALE" or "TRANSGENDER" from the first line naming one.

    Matching is case-insensitive and substring-based. Returns
    ``"Not found"`` when no line mentions a gender.
    """
    for line in lines:
        upper = line.upper()
        for gender in _GENDERS:
            if gender in upper:
                return gender
    return "Not found"


def extract_pan_name(lines: List[str]) -> str:
    """Return the holder name from PAN card text, or ``"Not found"``.

    The name is taken to be the first line after an "INCOME TAX DEPARTMENT"
    header that consists solely of capital letters, whitespace and dots and
    does not contain INDIA, GOVT or DEPARTMENT.
    """
    for index, header in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in header.upper():
            continue
        for raw in lines[index + 1:]:
            candidate = raw.strip()
            # The all-caps pattern already rules out digits.
            if _PAN_NAME_RE.match(candidate) and not any(
                word in candidate for word in _PAN_NAME_STOPWORDS
            ):
                return candidate
    return "Not found"


def _is_plausible_aadhaar_name(text: str) -> bool:
    """Return True if ``text`` has no digits, at least two words and no label keyword."""
    upper = text.upper()
    return (
        not _DIGIT_RE.search(text)
        and len(text.split()) >= 2
        and not any(word in upper for word in _AADHAAR_NAME_STOPWORDS)
    )


def extract_aadhaar_name(lines: List[str]) -> str:
    """Return the holder name from Aadhaar card text, or ``"Not found"``.

    First preference is the line immediately above a line containing a
    DD/MM/YYYY-style date, if that line looks like a name. Otherwise the
    first line anywhere that looks like a name is used.
    """
    for index, line in enumerate(lines):
        if index > 0 and _DATE_ANYWHERE_RE.search(line):
            candidate = lines[index - 1].strip()
            if _is_plausible_aadhaar_name(candidate):
                return candidate

    for line in lines:
        if _is_plausible_aadhaar_name(line):
            return line.strip()

    return "Not found"