|
|
|
|
|
import os |
|
import cv2 |
|
import numpy as np |
|
from pdf2image import convert_from_path |
|
import glob |
|
|
|
|
|
def pdf_to_images(pdf_path, output_dir, dpi=300) -> int:
    """Render every page of a PDF to a JPEG file inside ``output_dir``.

    Files are named ``<pdf stem>_page_<n>.jpg`` with ``n`` starting at 1 and
    are saved with JPEG quality 95. The output directory is created when it
    does not exist yet.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the JPEG files.
        dpi: Rendering resolution forwarded to ``convert_from_path``.

    Returns:
        The number of pages converted, or 0 if any step raised. Errors are
        printed instead of propagated, so one bad file does not abort a
        caller that is looping over many PDFs.
    """
    try:
        # Saving a page fails if the target directory is missing, so make
        # sure it exists before any rendering work is done.
        os.makedirs(output_dir, exist_ok=True)

        # The stem is the same for every page; compute it once.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]

        pages = convert_from_path(pdf_path, dpi=dpi)
        for page_number, page in enumerate(pages, start=1):
            image_path = os.path.join(output_dir, f"{stem}_page_{page_number}.jpg")
            page.save(image_path, "JPEG", quality=95)
        return len(pages)
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and signal
        # "nothing converted" to the caller.
        print(f"✗ Error processing {pdf_path}: {e}")
        return 0
|
|
|
|
|
def process_all_pdfs():
    """Convert every ``*.pdf`` file found in ``pdf_directory`` to page images.

    Reads the module-level names ``pdf_directory`` (where PDFs are searched)
    and ``output_directory`` (passed to ``pdf_to_images`` as the target);
    both are defined outside this function.

    Returns:
        The sum of the values returned by ``pdf_to_images`` over all PDFs,
        or 0 when no PDF file matches. A message is printed in the latter
        case.
    """
    # glob returns matches in arbitrary order; sort for a reproducible run.
    pdf_files = sorted(glob.glob(os.path.join(pdf_directory, "*.pdf")))

    if not pdf_files:
        print(f"No PDF files found in {pdf_directory}")
        return 0

    # Return the running total instead of discarding it so callers can
    # report how many images were produced.
    return sum(pdf_to_images(pdf_file, output_directory) for pdf_file in pdf_files)
|
|
|
|
|
|
|
import os |
|
import cv2 |
|
import numpy as np |
|
from PIL import Image |
|
|
|
def preprocess_image(image_path):
    """Load an image and return a contrast-enhanced, binarized copy.

    Pipeline: decode to RGB, convert to grayscale, apply CLAHE
    (clip limit 2.0, 8x8 tiles), binarize with Otsu's threshold, then run a
    single dilation pass.

    Args:
        image_path: Path of the image file, opened with Pillow.

    Returns:
        A single-channel ``numpy`` array whose pixels are 0 or 255.
    """
    # Pillow opens files lazily; the context manager releases the file
    # handle as soon as the pixel data has been copied into the array.
    with Image.open(image_path) as pil_img:
        # Normalize grayscale, palette and alpha-channel inputs to 3-channel
        # RGB so the RGB->gray conversion below always gets a valid layout.
        rgb = np.array(pil_img.convert("RGB"))

    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)

    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast_img = clahe.apply(gray)

    # Threshold value 0 is ignored: THRESH_OTSU picks the threshold itself.
    _, binary = cv2.threshold(
        contrast_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )

    # NOTE(review): a 1x1 kernel makes this dilation an identity operation,
    # so the result equals ``binary``. Kept as-is to preserve current output;
    # a larger kernel (e.g. 2x2) is needed if thickening strokes was intended.
    kernel = np.ones((1, 1), np.uint8)
    return cv2.dilate(binary, kernel, iterations=1)