pdf-parser-api / backend /text_recog.py
blaxx14's picture
Update backend/text_recog.py
bc9be72 verified
import pytesseract
import cv2
import re
import platform
from .file_utils import convert_image_to_word
def configure_tesseract():
    """Point pytesseract at the Tesseract executable for the current OS.

    The built-in default location is used whenever that file exists: a
    per-user install path on Windows, ``/usr/bin/tesseract`` on every other
    platform. When the default is missing, the executable is looked up on
    ``PATH`` instead, so machines with a different install location still
    work. If neither lookup succeeds, the default path is assigned unchanged.
    """
    # Local imports keep this function self-contained with respect to the
    # module's import block.
    import os
    import shutil

    if platform.system() == "Windows":
        default_cmd = r"C:\Users\hp\AppData\Local\Programs\Tesseract-OCR\tesseract.exe"
    else:
        default_cmd = "/usr/bin/tesseract"

    if os.path.isfile(default_cmd):
        tesseract_cmd = default_cmd
    else:
        # Fall back to PATH rather than insisting on a location that may only
        # exist on one particular machine.
        tesseract_cmd = shutil.which("tesseract") or default_cmd

    pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
# Position of each KTP field within the cleaned OCR lines found between the
# "Berlaku Hingga" and "Gol. Darah" markers. The dict order is the key order of
# the result returned by extract_ktp_info().
# NOTE(review): these offsets depend on the line order Tesseract emits for a
# KTP scan -- confirm against real samples (in particular index 13, "provinsi").
_KTP_FIELD_INDEX = {
    "nik": 0,
    "nama": 1,
    "tempat_tgl_lahir": 2,
    "jenis_kelamin": 3,
    "alamat": 4,
    "rt_rw": 5,
    "kel/desa": 6,
    "kecamatan": 7,
    "provinsi": 13,
    "agama": 8,
    "kewarganegaraan": 10,
    "pekerjaan": 9,
}


def _extract_ktp_lines(text):
    """Return the cleaned lines located between the two KTP section markers.

    Args:
        text: Raw OCR output for the whole card.

    Returns:
        A list of non-blank lines found between ``Berlaku Hingga`` and
        ``Gol. Darah``, each stripped of surrounding whitespace and of one
        leading ``:`` or run of digits (plus the whitespace that follows it).

    Raises:
        ValueError: If the two markers cannot be found in ``text``.
    """
    # Collapse any run of newlines so blank lines never sit between values.
    text = re.sub(r'\n+', '\n', text)

    match = re.search(r'Berlaku Hingga\n(.*?)\nGol\. Darah', text, re.DOTALL)
    if match is None:
        raise ValueError(
            "KTP markers 'Berlaku Hingga' / 'Gol. Darah' not found in OCR text"
        )

    # NOTE(review): the prefix pattern also removes a leading digit run when no
    # ':' precedes it, which would blank out a purely numeric line -- confirm
    # this is intended.
    return [
        re.sub(r'^(:|\d+)?\s*', '', line.strip())
        for line in match.group(1).strip().split('\n')
        if line.strip()
    ]


def extract_ktp_info(image_path, filename):
    """Run OCR on a KTP image and return its fields as a dictionary.

    The image is converted to grayscale, read by Tesseract with the ``ind``
    language pack, and the lines between the ``Berlaku Hingga`` and
    ``Gol. Darah`` markers are mapped to fields by position.

    Args:
        image_path: Path of the image file to read.
        filename: Currently unused; its only reference is a disabled export
            step. Kept so existing callers keep working.

    Returns:
        A dict with the keys ``nik``, ``nama``, ``tempat_tgl_lahir``,
        ``jenis_kelamin``, ``alamat``, ``rt_rw``, ``kel/desa``, ``kecamatan``,
        ``provinsi``, ``agama``, ``kewarganegaraan`` and ``pekerjaan``.

    Raises:
        ValueError: If the image cannot be loaded, or the section markers are
            missing from the OCR text.
        IndexError: If the OCR text yields fewer lines than the field table
            requires.
    """
    configure_tesseract()

    img = cv2.imread(image_path)
    if img is None:
        # A failed load yields None; fail here with the offending path instead
        # of letting the colour conversion raise an opaque error.
        raise ValueError(f"Could not read image: {image_path!r}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray, lang='ind')

    lines = _extract_ktp_lines(text)
    print("Hasil List Bersih:")
    print(lines)

    required = max(_KTP_FIELD_INDEX.values()) + 1
    if len(lines) < required:
        raise IndexError(
            f"Expected at least {required} OCR lines for a KTP, got {len(lines)}"
        )

    return {field: lines[index] for field, index in _KTP_FIELD_INDEX.items()}