Spaces:
Running
Running
File size: 2,428 Bytes
2e237ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import json
from os import makedirs
from pdf_features import PdfToken
from domain.PdfImages import PdfImages
from configuration import DOCLAYNET_TYPE_BY_ID
from configuration import JSONS_ROOT_PATH, JSON_TEST_FILE_PATH
def save_annotations_json(annotations: list, width_height: list, images: list):
    """Write the collected annotations to ``JSON_TEST_FILE_PATH`` as a single JSON document.

    The document has four top-level keys: "info", "images", "categories" and
    "annotations".

    Args:
        annotations: Annotation dictionaries, stored unchanged under "annotations".
        width_height: One ``(width, height)`` pair per entry of ``images``, in the same order.
        images: Image identifiers; each becomes an image entry whose file name is the
            identifier followed by ".jpg" and whose "id" is its position in this list.

    Raises:
        IndexError: If ``width_height`` has fewer entries than ``images``.
    """
    images_dict = []
    for position, image_id in enumerate(images):
        # Look the size up by position instead of ``images.index(image_id)``:
        # ``index`` rescans the list for every image and always resolves to the
        # first occurrence, so a repeated identifier would get the wrong size.
        dimensions = width_height[position]
        images_dict.append(
            {
                "id": position,
                "file_name": image_id + ".jpg",
                "width": dimensions[0],
                "height": dimensions[1],
            }
        )

    categories_dict = [{"id": key, "name": value} for key, value in DOCLAYNET_TYPE_BY_ID.items()]

    info_dict = {
        "description": "PDF Document Layout Analysis Dataset",
        "url": "",
        "version": "1.0",
        "year": 2025,
        "contributor": "",
        "date_created": "2025-01-01",
    }

    coco_dict = {"info": info_dict, "images": images_dict, "categories": categories_dict, "annotations": annotations}
    JSON_TEST_FILE_PATH.write_text(json.dumps(coco_dict))
def get_annotation(index: int, image_id: str, token: PdfToken):
    """Build the annotation dictionary for a single token.

    Args:
        index: Value stored under "id".
        image_id: Value stored under "image_id".
        token: Token whose bounding box and token type are read.

    Returns:
        A dictionary with the keys "area", "iscrowd", "score", "image_id",
        "bbox", "category_id" and "id", in that order. "area" and "score"
        are always 1 and "iscrowd" is always 0.
    """
    box = token.bounding_box
    # The box is stored as [left, top, width, height].
    coordinates = [box.left, box.top, box.width, box.height]

    annotation = dict(area=1, iscrowd=0, score=1, image_id=image_id)
    annotation["bbox"] = coordinates
    annotation["category_id"] = token.token_type.get_index()
    annotation["id"] = index
    return annotation
def get_annotations_for_document(annotations, images, index, pdf_images, width_height):
    """Append the image ids, page sizes and token annotations of one document.

    The three lists are extended in place; nothing is returned.

    Args:
        annotations: List that receives one annotation per token.
        images: List that receives one image id per page, built as
            "<file name>_<page number - 1>".
        index: Id given to the first annotation of this document; each
            following token gets the next integer.
        pdf_images: Object exposing ``pdf_features`` (with ``file_name`` and
            ``pages``) and ``pdf_images`` (indexable by page position, each
            item with ``width`` and ``height``).
        width_height: List that receives one ``(width, height)`` tuple per page.
    """
    document = pdf_images.pdf_features
    next_id = index

    for position, page in enumerate(document.pages):
        page_image_id = f"{document.file_name}_{page.page_number - 1}"
        images.append(page_image_id)

        rendered_page = pdf_images.pdf_images[position]
        width_height.append((rendered_page.width, rendered_page.height))

        for annotation_id, token in enumerate(page.tokens, start=next_id):
            annotations.append(get_annotation(annotation_id, page_image_id, token))
            # Carry the running id over to the next page.
            next_id = annotation_id + 1
def get_annotations(pdf_images_list: list[PdfImages]):
    """Collect the annotations of every document and save them as one JSON file.

    Ensures ``JSONS_ROOT_PATH`` exists, gathers image ids, page sizes and
    token annotations document by document, then hands everything to
    ``save_annotations_json``.

    Args:
        pdf_images_list: Documents to process, in order. Annotation ids are
            assigned consecutively across all of them, starting at 0.
    """
    makedirs(JSONS_ROOT_PATH, exist_ok=True)

    annotations, images, width_height = [], [], []
    next_annotation_id = 0

    for document in pdf_images_list:
        get_annotations_for_document(annotations, images, next_annotation_id, document, width_height)
        # Advance the id past every token of the document just processed.
        token_counts = (len(page.tokens) for page in document.pdf_features.pages)
        next_annotation_id += sum(token_counts)

    save_annotations_json(annotations, width_height, images)
|