File size: 1,594 Bytes
1f51108
b83ba6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f51108
b83ba6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f51108
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# Convert PDF files to Images

import os
import cv2
import numpy as np
from pdf2image import convert_from_path
import glob

# Convert a PDF file into one image per page
def pdf_to_images(pdf_path, output_dir, dpi=300):
    """Render every page of a PDF to a JPEG file inside ``output_dir``.

    Each page is written as ``<pdf stem>_page_<n>.jpg`` (``n`` starts at 1)
    with JPEG quality 95. ``output_dir`` is created when it does not exist.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the page images.
        dpi: Resolution forwarded to ``convert_from_path``.

    Returns:
        The number of pages saved, or 0 if any step raised. Errors are
        printed instead of propagated; pages written before the failure
        stay on disk even though 0 is returned.
    """
    try:
        # A missing directory would otherwise make every page.save() fail.
        # os.makedirs("") raises, so an empty path (current directory) is
        # skipped to keep that case working as before.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        pages = convert_from_path(pdf_path, dpi=dpi)

        # The file stem is the same for every page; compute it once.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        for page_number, page in enumerate(pages, start=1):
            image_path = os.path.join(output_dir, f"{stem}_page_{page_number}.jpg")
            page.save(image_path, "JPEG", quality=95)
        return len(pages)  # number of images created
    except Exception as e:
        # Best-effort: report the failure and signal it with 0 rather than raising.
        print(f"✗ Error processing {pdf_path}: {e}")
        return 0

# Process all PDF files
def process_all_pdfs(pdf_dir=None, output_dir=None):
    """Convert every ``*.pdf`` file in a directory to page images.

    Args:
        pdf_dir: Directory searched for ``*.pdf`` files. When omitted, the
            module-level ``pdf_directory`` is used.
        output_dir: Directory handed to ``pdf_to_images``. When omitted, the
            module-level ``output_directory`` is used.

    Returns:
        The sum of the page counts returned by ``pdf_to_images``; 0 when no
        PDF files are found.
    """
    if pdf_dir is None:
        pdf_dir = pdf_directory

    # Sorted so files are processed in a reproducible order.
    pdf_files = sorted(glob.glob(os.path.join(pdf_dir, "*.pdf")))

    if not pdf_files:
        print(f"No PDF files found in {pdf_dir}")
        return 0

    # Resolved only once it is needed, so the empty-directory path above
    # never touches the output setting.
    if output_dir is None:
        output_dir = output_directory

    # Return the total instead of discarding it so callers can report it.
    return sum(pdf_to_images(pdf_file, output_dir) for pdf_file in pdf_files)

# Image Preprocessing

import os
import cv2
import numpy as np
from PIL import Image

def preprocess_image(image_path):
    """Load an image and return a contrast-enhanced, binarised version.

    Steps: load as RGB, convert to grayscale, apply CLAHE
    (clip limit 2.0, 8x8 tiles), Otsu-threshold to 0/255, then dilate.

    Args:
        image_path: Path of the image file to load.

    Returns:
        A single-channel NumPy array holding the thresholded image.
    """
    # Image.open keeps the file open until the image is closed; the context
    # manager releases the handle as soon as the pixels have been copied.
    with Image.open(image_path) as pil_img:
        # Force 3-channel RGB so grayscale or palette images, which would
        # otherwise yield a 2-D array, are valid input for COLOR_RGB2GRAY.
        img = np.array(pil_img.convert("RGB"))

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Local contrast enhancement before global thresholding.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast_img = clahe.apply(gray)

    # Threshold value 0 is a placeholder: THRESH_OTSU picks the threshold.
    _, binary = cv2.threshold(contrast_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # NOTE(review): a 1x1 kernel makes this dilation leave the image
    # unchanged. Kept as-is to preserve current output; use a larger kernel
    # if thickening was intended — confirm the desired polarity first.
    kernel = np.ones((1, 1), np.uint8)
    bold_img = cv2.dilate(binary, kernel, iterations=1)

    return bold_img