Spaces:

gopichandra
/

SMART_KYC_OCR

Runtime error

App Files Files Community

SMART_KYC_OCR / utils.py

gopichandra

Update utils.py

65bef46 verified 11 days ago

raw

history blame contribute delete

3.87 kB

	from paddleocr import PaddleOCR
	import re

	# Module-level OCR engine: constructed once when this module is imported and
	# reused by extract_kyc_fields() for every document it processes.
	# Configured with lang='en' and use_angle_cls=True (presumably this loads
	# PaddleOCR's text-angle classifier so rotated text can be recognised --
	# confirm against the PaddleOCR documentation for the installed version).
	ocr = PaddleOCR(use_angle_cls=True, lang='en')

	def extract_kyc_fields(file_path, force_type=None):
	    """Run OCR on a document image and extract PAN or Aadhaar KYC fields.

	    Args:
	        file_path: Image path passed straight to the module-level ``ocr``
	            engine.
	        force_type: Optional card type ("PAN" or "AADHAAR", any case). When
	            given, auto-detection is skipped and the upper-cased value is
	            used as the card type.

	    Returns:
	        A dict that always contains either ``"card_type"`` or ``"error"``:

	        * PAN: ``card_type``, ``pan_number`` (only if one was matched),
	          ``dob``, ``name``.
	        * AADHAAR: ``card_type``, ``aadhaar_number`` (only if one was
	          matched), ``dob``, ``gender``, ``name``.
	        * Anything else: ``card_type`` plus an ``error`` message.
	        * On any exception: ``{"error": "OCR processing failed: ..."}``.

	    This function never raises; failures are reported through the returned
	    dict so callers can surface them directly.
	    """
	    pan_pattern = r'\b[A-Z]{5}[0-9]{4}[A-Z]\b'
	    aadhaar_pattern = r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b'

	    try:
	        result = ocr.ocr(file_path, cls=True)

	        lines = []
	        for block in result or []:
	            # A page on which no text was detected can come back as None
	            # instead of a list; skip it rather than crash on iteration so
	            # the caller gets the "could not identify" response.
	            if not block:
	                continue
	            for line in block:
	                # Each entry is indexed as line[1][0] for the recognised text.
	                text = line[1][0].strip()
	                if text:
	                    lines.append(text)

	        full_text = "\n".join(lines)

	        # Search once and reuse the matches for detection and extraction.
	        pan_match = re.search(pan_pattern, full_text)
	        aadhaar_match = re.search(aadhaar_pattern, full_text)

	        if force_type:
	            card_type = force_type.upper()
	        elif pan_match:
	            # A PAN number takes precedence when both patterns are present.
	            card_type = "PAN"
	        elif aadhaar_match:
	            card_type = "AADHAAR"
	        else:
	            card_type = "UNKNOWN"

	        response = {"card_type": card_type}

	        if card_type == "PAN":
	            if pan_match:
	                response["pan_number"] = pan_match.group(0)
	            response["dob"] = extract_dob(lines)
	            response["name"] = extract_pan_name(lines)
	        elif card_type == "AADHAAR":
	            if aadhaar_match:
	                response["aadhaar_number"] = aadhaar_match.group(0)
	            response["dob"] = extract_dob(lines)
	            response["gender"] = extract_gender(lines)
	            response["name"] = extract_aadhaar_name(lines)
	        else:
	            response["error"] = "Could not identify document as PAN or Aadhaar."

	        return response
	    except Exception as e:
	        # Top-level boundary: convert any OCR/parsing failure into an error
	        # payload instead of propagating it to the caller.
	        return {"error": f"OCR processing failed: {str(e)}"}


	def extract_dob(lines):
	    """Return the first date (or year) of birth found in the OCR lines.

	    Patterns are tried in priority order, each across all lines before the
	    next pattern is considered:

	    1. ``DD/MM/YYYY`` with ``/``, ``.`` or ``-`` as separator.
	    2. ISO style ``YYYY-MM-DD``.
	    3. A bare four-digit year starting with 19 or 20, accepted only when the
	       same line also contains "YOB", "YEAR" or "BIRTH" (case-insensitive).

	    Args:
	        lines: Iterable of text lines produced by OCR.

	    Returns:
	        The matched text exactly as it appears in the line, or the string
	        ``"Not found"`` when nothing matches.
	    """
	    date_patterns = (
	        r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',
	        r'\b\d{4}-\d{2}-\d{2}\b',
	    )
	    for pattern in date_patterns:
	        for line in lines:
	            match = re.search(pattern, line)
	            if match:
	                return match.group(0)

	    year_labels = ("YOB", "YEAR", "BIRTH")
	    for line in lines:
	        # The pipe must be an unescaped alternation; an escaped "\|" would
	        # only match the literal text "19|20" and never a real year.
	        match = re.search(r'\b(19|20)\d{2}\b', line)
	        if match and any(label in line.upper() for label in year_labels):
	            return match.group(0)
	    return "Not found"


	def extract_gender(lines):
	    """Return the gender label found in the first OCR line that contains one.

	    Lines are scanned in order; matching is a case-insensitive substring
	    test against "FEMALE", "TRANSGENDER" and "MALE".

	    Args:
	        lines: Iterable of text lines produced by OCR.

	    Returns:
	        ``"FEMALE"``, ``"TRANSGENDER"`` or ``"MALE"``, or the string
	        ``"Not found"`` when no line contains any of them.
	    """
	    # "MALE" is a substring of "FEMALE", so the longer labels must be tested
	    # first; otherwise every female card is reported as MALE.
	    labels = ("FEMALE", "TRANSGENDER", "MALE")
	    for line in lines:
	        upper_line = line.upper()
	        for label in labels:
	            if label in upper_line:
	                return label
	    return "Not found"


	def extract_pan_name(lines):
	    """Find the card holder's name on a PAN card from its OCR lines.

	    For every line containing "INCOME TAX DEPARTMENT" (case-insensitive),
	    the lines that follow it are scanned in order. The first one that,
	    after stripping, consists solely of capital letters, whitespace and
	    dots, and does not contain "INDIA", "GOVT" or "DEPARTMENT", is returned.

	    Args:
	        lines: List of text lines produced by OCR.

	    Returns:
	        The stripped candidate line, or ``"Not found"`` when no header line
	        exists or no following line qualifies.
	    """
	    header_words = ("INDIA", "GOVT", "DEPARTMENT")

	    for header_pos, header_line in enumerate(lines):
	        if "INCOME TAX DEPARTMENT" not in header_line.upper():
	            continue

	        for following in lines[header_pos + 1:]:
	            candidate = following.strip()
	            if not re.match(r'^[A-Z\s.]+$', candidate):
	                continue
	            if any(word in candidate for word in header_words):
	                continue
	            if re.search(r'\d', candidate):
	                continue
	            return candidate.strip()

	    return "Not found"


	def extract_aadhaar_name(lines):
	    """Guess the card holder's name on an Aadhaar card from its OCR lines.

	    Two passes are made:

	    1. For each line containing a ``DD/MM/YYYY``-style date (separator
	       ``/``, ``.`` or ``-``), the line directly above it is returned if it
	       looks like a name.
	    2. Otherwise the first line anywhere that looks like a name is returned.

	    A line "looks like a name" when it has no digits, has at least two
	    whitespace-separated words, and does not contain "DOB", "INDIA",
	    "MALE", "FEMALE" or "GOVERNMENT" (case-insensitive).

	    Args:
	        lines: List of text lines produced by OCR.

	    Returns:
	        The stripped name line, or ``"Not found"`` when nothing qualifies.
	    """
	    blocked_words = ("DOB", "INDIA", "MALE", "FEMALE", "GOVERNMENT")

	    def looks_like_name(text):
	        # Shared predicate for both passes.
	        if re.search(r'\d', text):
	            return False
	        if len(text.split()) < 2:
	            return False
	        upper_text = text.upper()
	        return not any(word in upper_text for word in blocked_words)

	    # Pass 1: the line immediately above a date line.
	    for above, current in zip(lines, lines[1:]):
	        if re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', current):
	            name_guess = above.strip()
	            if looks_like_name(name_guess):
	                return name_guess

	    # Pass 2: first name-like line anywhere in the document.
	    for entry in lines:
	        if looks_like_name(entry):
	            return entry.strip()

	    return "Not found"