# NOTE: non-code page residue captured along with this file ("Spaces: Running, Running"); not part of the module.
import os | |
import shutil | |
import cv2 | |
import numpy as np | |
from os import makedirs | |
from os.path import join | |
from pathlib import Path | |
from PIL import Image | |
from pdf2image import convert_from_path | |
from pdf_features import PdfFeatures | |
from src.configuration import IMAGES_ROOT_PATH, XMLS_PATH | |
class PdfImages:
    """Page images of a PDF bundled with the document's ``PdfFeatures``.

    Creating an instance immediately writes every page image to
    ``IMAGES_ROOT_PATH`` (see :meth:`save_images`). Use
    :meth:`from_pdf_path` to build an instance directly from a PDF file.
    """

    def __init__(self, pdf_features: PdfFeatures, pdf_images: list[Image.Image], dpi: int = 72):
        """Store the features and page images, then save the images to disk.

        Args:
            pdf_features: Features object of the document; its ``file_name``
                is used to name the saved image files.
            pdf_images: One PIL image per page.
            dpi: Resolution associated with the images. It is only stored
                here; this class does not use it to resize anything.
        """
        self.pdf_features: PdfFeatures = pdf_features
        self.pdf_images: list[Image.Image] = pdf_images
        self.dpi: int = dpi
        # Persisting is part of construction: the files exist on disk as soon
        # as the object does.
        self.save_images()

    def show_images(self, next_image_delay: int = 2):
        """Display each page in an OpenCV window, one after another.

        Args:
            next_image_delay: Seconds to keep each page on screen. The value
                is converted to milliseconds for ``cv2.waitKey``.
        """
        for image_index, image in enumerate(self.pdf_images):
            # The array is converted from RGB to BGR channel order before
            # being handed to OpenCV.
            image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
            # Window titles are one-based page numbers.
            cv2.imshow(f"Page: {image_index + 1}", image_np)
            cv2.waitKey(next_image_delay * 1000)
            # Each page gets its own window title, so close the window before
            # moving on instead of letting them pile up.
            cv2.destroyAllWindows()

    def save_images(self):
        """Write every page image to ``IMAGES_ROOT_PATH`` as a ``.jpg`` file.

        Files are named ``<file_name>_<index>.jpg`` where ``index`` is the
        zero-based page position and ``file_name`` comes from
        ``self.pdf_features``. The directory is created if missing.
        """
        makedirs(IMAGES_ROOT_PATH, exist_ok=True)
        for image_index, image in enumerate(self.pdf_images):
            image_name = f"{self.pdf_features.file_name}_{image_index}.jpg"
            image.save(join(IMAGES_ROOT_PATH, image_name))

    @staticmethod
    def remove_images():
        """Delete ``IMAGES_ROOT_PATH`` and everything inside it.

        Note that this removes the whole directory, not only the files
        written by one particular instance. A directory that is already
        absent is treated as "nothing to do".
        """
        try:
            shutil.rmtree(IMAGES_ROOT_PATH)
        except FileNotFoundError:
            # Already removed (or never created): the goal state is reached.
            pass

    @staticmethod
    def from_pdf_path(
        pdf_path: str | Path, pdf_name: str = "", xml_file_name: str = "", dpi: int = 72
    ) -> "PdfImages":
        """Build a ``PdfImages`` from a PDF file on disk.

        Args:
            pdf_path: Location of the PDF file.
            pdf_name: Name to assign to ``pdf_features.file_name``. When
                empty, the name is derived from ``pdf_path``: the parent
                directory name if the file is called ``document.pdf``,
                otherwise the file name without its extension.
            xml_file_name: Optional file name, resolved under ``XMLS_PATH``
                and passed to ``PdfFeatures.from_pdf_path``. When empty,
                ``None`` is passed instead.
            dpi: Resolution used to render the pages to images.

        Returns:
            A new ``PdfImages``; its constructor also saves the page images.
        """
        xml_path = Path(XMLS_PATH, xml_file_name) if xml_file_name else None
        if xml_path is not None:
            # exist_ok makes a separate existence check unnecessary.
            os.makedirs(xml_path.parent, exist_ok=True)

        pdf_features: PdfFeatures = PdfFeatures.from_pdf_path(pdf_path, xml_path)

        if not pdf_name:
            source = Path(pdf_path)
            # Files called "document.pdf" are identified by their folder name.
            pdf_name = source.parent.name if source.name == "document.pdf" else source.stem
        pdf_features.file_name = pdf_name

        pdf_images = convert_from_path(pdf_path, dpi=dpi)
        return PdfImages(pdf_features, pdf_images, dpi)