# Convert PDF files to images, then preprocess those images (e.g. for OCR).
import glob
import os

import cv2
import numpy as np
from pdf2image import convert_from_path
from PIL import Image


# --- Convert PDF files to Images -------------------------------------------

def pdf_to_images(pdf_path, output_dir, dpi=300):
    """Render every page of a PDF to a JPEG file in ``output_dir``.

    Files are named ``<pdf basename>_page_<n>.jpg`` with ``n`` starting at 1.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the JPEG files; it is created
            if it does not exist yet.
        dpi: Rendering resolution passed to ``convert_from_path``.

    Returns:
        The number of images written, or 0 if any step raised. Note that
        pages saved before a mid-way failure are left on disk even though
        0 is returned.
    """
    try:
        # Without this, saving into a missing directory fails for every page
        # and the whole PDF is silently reported as 0 images.
        os.makedirs(output_dir, exist_ok=True)

        pages = convert_from_path(pdf_path, dpi=dpi)

        # The base name is the same for every page; compute it once.
        base_name = os.path.splitext(os.path.basename(pdf_path))[0]
        for i, page in enumerate(pages):
            image_name = f"{base_name}_page_{i+1}.jpg"
            image_path = os.path.join(output_dir, image_name)
            page.save(image_path, "JPEG", quality=95)
        return len(pages)  # number of images created
    except Exception as e:
        # Best-effort batch conversion: report the failure and keep going
        # with the next file instead of aborting the whole run.
        print(f"✗ Error processing {pdf_path}: {e}")
        return 0


def process_all_pdfs(pdf_dir=None, output_dir=None, dpi=300):
    """Convert every ``*.pdf`` file in a directory to JPEG page images.

    Args:
        pdf_dir: Directory to scan for PDF files. When omitted, the
            module-level ``pdf_directory`` is used (expected to be defined
            elsewhere in the module; it is not visible in this section).
        output_dir: Directory that receives the images. When omitted, the
            module-level ``output_directory`` is used (same caveat).
        dpi: Rendering resolution forwarded to ``pdf_to_images``.

    Returns:
        Total number of images written across all PDFs (0 if none found).
    """
    if pdf_dir is None:
        pdf_dir = pdf_directory
    if output_dir is None:
        output_dir = output_directory

    # glob order is filesystem-dependent; sort for a reproducible run order.
    pdf_files = sorted(glob.glob(os.path.join(pdf_dir, "*.pdf")))
    if not pdf_files:
        print(f"No PDF files found in {pdf_dir}")
        return 0

    total_images = 0
    for pdf_file in pdf_files:
        total_images += pdf_to_images(pdf_file, output_dir, dpi=dpi)
    return total_images


# --- Image Preprocessing ---------------------------------------------------

def preprocess_image(image_path):
    """Load an image and return a binarized, contrast-enhanced version.

    Pipeline: load as RGB -> grayscale -> CLAHE contrast equalization ->
    Otsu threshold -> dilation.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A 2-D ``uint8`` NumPy array containing only the values 0 and 255.
    """
    # Use a context manager so the file handle is released promptly, and
    # normalize to RGB so grayscale, palette, 1-bit and CMYK inputs do not
    # break (or get misread by) the RGB->gray conversion below.
    with Image.open(image_path) as pil_img:
        img = np.array(pil_img.convert("RGB"))

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Local (tile-based) histogram equalization to even out contrast.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast_img = clahe.apply(gray)

    # The threshold value 0 is ignored: THRESH_OTSU picks it automatically.
    _, binary = cv2.threshold(
        contrast_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    # NOTE(review): a 1x1 kernel makes this dilation a no-op; kept to
    # preserve existing output. Use a larger kernel if a real thickening
    # effect is wanted.
    kernel = np.ones((1, 1), np.uint8)
    bold_img = cv2.dilate(binary, kernel, iterations=1)
    return bold_img