from paddleocr import PaddleOCR
import re

# Shared OCR engine, constructed once when this module is imported and reused
# by extract_kyc_fields for every document.
# NOTE(review): per the PaddleOCR docs, use_angle_cls=True loads the text-angle
# classifier (for rotated text) and lang='en' selects the English recognition
# model -- confirm against the installed PaddleOCR version.
ocr = PaddleOCR(use_angle_cls=True, lang='en')

def extract_kyc_fields(file_path, force_type=None):
    """Run OCR on a document image and extract PAN or Aadhaar fields.

    Args:
        file_path: Image location passed straight to ``ocr.ocr``.
        force_type: Optional document type ("PAN" or "AADHAAR", any case).
            When given, auto-detection is skipped and the matching
            extraction branch is used.

    Returns:
        A dict that always contains ``card_type`` on success, plus:

        * PAN: ``dob``, ``name`` and, when a PAN pattern is present in the
          text, ``pan_number``.
        * AADHAAR: ``dob``, ``gender``, ``name`` and, when a 12-digit
          pattern is present, ``aadhaar_number``.
        * anything else: an ``error`` message.

        If any exception is raised while processing, a dict with only an
        ``error`` key describing the failure is returned instead.
    """
    try:
        result = ocr.ocr(file_path, cls=True)

        # PaddleOCR reports a page on which no text was detected as None
        # (and may return None for the whole result). Treat both as "no
        # lines" so such images reach the normal "could not identify" path
        # instead of raising TypeError inside the loop.
        lines = []
        for block in result or []:
            for line in block or []:
                # Each detection is indexed as line[1][0] for its text.
                text = line[1][0].strip()
                if text:
                    lines.append(text)

        full_text = "\n".join(lines)

        # Search for each ID pattern once; the matches serve both the type
        # detection and the field extraction below.
        pan_match = re.search(r'\b[A-Z]{5}[0-9]{4}[A-Z]\b', full_text)
        aadhaar_match = re.search(
            r'\b\d{4}[\s\-]?\d{4}[\s\-]?\d{4}\b', full_text
        )

        if force_type:
            card_type = force_type.upper()
        elif pan_match:
            # A PAN pattern takes precedence over an Aadhaar pattern.
            card_type = "PAN"
        elif aadhaar_match:
            card_type = "AADHAAR"
        else:
            card_type = "UNKNOWN"

        response = {"card_type": card_type}

        if card_type == "PAN":
            if pan_match:
                response["pan_number"] = pan_match.group(0)
            response["dob"] = extract_dob(lines)
            response["name"] = extract_pan_name(lines)

        elif card_type == "AADHAAR":
            if aadhaar_match:
                response["aadhaar_number"] = aadhaar_match.group(0)
            response["dob"] = extract_dob(lines)
            response["gender"] = extract_gender(lines)
            response["name"] = extract_aadhaar_name(lines)

        else:
            response["error"] = "Could not identify document as PAN or Aadhaar."

        return response
    except Exception as e:
        # Boundary handler: callers receive an error dict rather than an
        # exception, whatever went wrong during OCR or parsing.
        return {"error": f"OCR processing failed: {str(e)}"}


def extract_dob(lines):
    """Find a date (or year) of birth in the OCR text lines.

    Patterns are tried in priority order, each over all lines before the
    next one is considered:

    1. ``DD/MM/YYYY`` with ``.``, ``/`` or ``-`` separators.
    2. ``YYYY-MM-DD``.
    3. A bare 19xx/20xx year, accepted only when the same line also
       contains "YOB", "YEAR" or "BIRTH" (case-insensitive).

    Returns the matched text, or "Not found" when nothing matches.
    """
    date_patterns = (
        r'\b\d{2}[./-]\d{2}[./-]\d{4}\b',
        r'\b\d{4}-\d{2}-\d{2}\b',
    )
    for pattern in date_patterns:
        for text in lines:
            hit = re.search(pattern, text)
            if hit is not None:
                return hit.group(0)

    birth_labels = ("YOB", "YEAR", "BIRTH")
    for text in lines:
        year = re.search(r'\b(19|20)\d{2}\b', text)
        if year is None:
            continue
        upper_text = text.upper()
        if any(label in upper_text for label in birth_labels):
            return year.group(0)

    return "Not found"


def extract_gender(lines):
    """Return the gender keyword found in the first line that contains one.

    Lines are scanned in order and compared case-insensitively.

    Returns:
        "FEMALE", "MALE" or "TRANSGENDER", or "Not found" when no line
        contains any of those keywords.
    """
    for line in lines:
        upper_line = line.upper()
        # "FEMALE" contains "MALE" as a substring, so the longer keyword
        # must be tested first; otherwise every female card reads as MALE.
        if "FEMALE" in upper_line:
            return "FEMALE"
        if "MALE" in upper_line:
            return "MALE"
        if "TRANSGENDER" in upper_line:
            return "TRANSGENDER"
    return "Not found"


def extract_pan_name(lines):
    """Pick the holder name from PAN card OCR lines.

    Looks for a line containing "INCOME TAX DEPARTMENT" (case-insensitive)
    and returns the first later line that consists only of uppercase
    letters, whitespace and dots, contains no digit, and does not contain
    "INDIA", "GOVT" or "DEPARTMENT".

    Returns the stripped candidate line, or "Not found" when there is no
    header line or no later line qualifies.
    """
    header_words = ("INDIA", "GOVT", "DEPARTMENT")
    for idx, header in enumerate(lines):
        if "INCOME TAX DEPARTMENT" not in header.upper():
            continue
        for raw in lines[idx + 1:]:
            candidate = raw.strip()
            if not re.match(r'^[A-Z\s.]+$', candidate):
                continue
            # Skip the remaining card header lines (e.g. "GOVT. OF INDIA").
            if any(word in candidate for word in header_words):
                continue
            if re.search(r'\d', candidate):
                continue
            return candidate.strip()
    return "Not found"


def extract_aadhaar_name(lines):
    """Pick the holder name from Aadhaar card OCR lines.

    Strategy:

    1. For each line containing a ``DD/MM/YYYY``-style date (separators
       ``.``, ``/`` or ``-``), try the line directly above it.
    2. Otherwise fall back to the first line anywhere that qualifies.

    A line qualifies as a name when it has no digits, at least two
    whitespace-separated words, and none of the card labels DOB, INDIA,
    MALE, FEMALE or GOVERNMENT as a whole word (case-insensitive).

    Returns the stripped name line, or "Not found".
    """
    # Labels are matched as whole words. Plain substring matching rejected
    # real names that merely contain a label, e.g. "Kamalesh" (MALE) or
    # "Dobriyal" (DOB).
    label_word = re.compile(r'\b(?:DOB|INDIA|MALE|FEMALE|GOVERNMENT)\b')

    def looks_like_name(text):
        return (
            not re.search(r'\d', text)
            and len(text.split()) >= 2
            and not label_word.search(text.upper())
        )

    # Preferred: the line printed immediately above the date of birth.
    for i, line in enumerate(lines):
        if i > 0 and re.search(r'\d{2}[./-]\d{2}[./-]\d{4}', line):
            possible_name = lines[i - 1].strip()
            if looks_like_name(possible_name):
                return possible_name

    # Fallback: first plausible line in reading order.
    for line in lines:
        if looks_like_name(line):
            return line.strip()
    return "Not found"