Check / preprocess.py
DamLoan's picture
Update preprocess.py
1f51108 verified
# Convert PDF files to Images
import os
import cv2
import numpy as np
from pdf2image import convert_from_path
import glob
# Convert a PDF file to images
def pdf_to_images(pdf_path, output_dir, dpi=300):
    """Render every page of a PDF to a JPEG file inside ``output_dir``.

    Each page is written as ``<pdf stem>_page_<n>.jpg`` (``n`` starts at 1)
    with JPEG quality 95. ``output_dir`` is created when it does not exist.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the page images.
        dpi: Resolution forwarded to ``convert_from_path``.

    Returns:
        The number of pages converted on success, or 0 if any step raised.
        The error is printed rather than propagated. Pages written before a
        failure stay on disk even though 0 is returned.
    """
    try:
        # Create the target directory up front so page.save() cannot fail
        # merely because it is missing. An empty output_dir means "current
        # directory" for os.path.join, and os.makedirs("") would raise.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        pages = convert_from_path(pdf_path, dpi=dpi)
        # The stem is identical for every page, so compute it once.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        for page_number, page in enumerate(pages, start=1):
            image_name = f"{stem}_page_{page_number}.jpg"
            image_path = os.path.join(output_dir, image_name)
            page.save(image_path, "JPEG", quality=95)
        return len(pages)
    except Exception as e:
        # Best-effort batch step: report the failure and let the caller
        # carry on with the next PDF.
        print(f"✗ Error processing {pdf_path}: {e}")
        return 0
# Process all PDF files
def process_all_pdfs():
    """Convert every ``*.pdf`` file found in ``pdf_directory`` to images.

    Reads the names ``pdf_directory`` and ``output_directory``, which are
    expected to be defined elsewhere in this module (not visible here), and
    hands each matching file to ``pdf_to_images``.

    Returns:
        The sum of the counts returned by ``pdf_to_images``, or 0 when no
        PDF file matches (a message is printed in that case).
    """
    pdf_files = glob.glob(os.path.join(pdf_directory, "*.pdf"))
    if not pdf_files:
        print(f"No PDF files found in {pdf_directory}")
        return 0
    # Return the total instead of discarding it so callers can report or
    # check how many images were produced.
    return sum(
        pdf_to_images(pdf_file, output_directory) for pdf_file in pdf_files
    )
# Image Preprocessing
import os
import cv2
import numpy as np
from PIL import Image
def preprocess_image(image_path):
    """Load an image and return a contrast-enhanced, binarized version.

    Steps: convert to grayscale, apply CLAHE (clip limit 2.0, 8x8 tiles),
    binarize with Otsu's threshold, then dilate once.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A single-channel array holding the thresholded image, whose pixels
        are 0 or 255.
    """
    # The context manager closes the file handle once the pixels are copied.
    # convert("RGB") normalizes grayscale, palette and alpha images to three
    # channels so the RGB -> gray conversion below always gets the layout
    # it expects.
    with Image.open(image_path) as pil_img:
        img = np.array(pil_img.convert("RGB"))

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast_img = clahe.apply(gray)

    # With THRESH_OTSU the threshold argument (0) is ignored and the value
    # is chosen automatically from the histogram.
    _, binary = cv2.threshold(
        contrast_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    # NOTE(review): a 1x1 kernel makes this dilation an identity operation;
    # use a larger kernel if thicker strokes are actually wanted.
    kernel = np.ones((1, 1), np.uint8)
    bold_img = cv2.dilate(binary, kernel, iterations=1)
    return bold_img