# Provenance: commit 2e237ce by Wasim, "Sync: robust vehicle parser + full project" (970 bytes).
from statistics import mode
from pdf_features import PdfToken
from pdf_features import Rectangle
from pdf_token_type_labels import TokenType
class PdfSegment:
    """A piece of text on one PDF page, with its bounding box and a type label."""

    def __init__(
        self,
        page_number: int,
        bounding_box: Rectangle,
        text_content: str,
        segment_type: TokenType,
        pdf_name: str = "",
    ):
        """Store every argument unchanged on the attribute of the same name.

        Args:
            page_number: Page the segment belongs to.
            bounding_box: Rectangle covering the segment.
            text_content: Text of the segment.
            segment_type: Type label assigned to the segment.
            pdf_name: Optional name of the source PDF; defaults to "".
        """
        self.page_number = page_number
        self.bounding_box = bounding_box
        self.text_content = text_content
        self.segment_type = segment_type
        self.pdf_name = pdf_name

    @staticmethod
    def from_pdf_tokens(pdf_tokens: list[PdfToken], pdf_name: str = ""):
        """Build a single segment out of a group of tokens.

        The segment text is the tokens' ``content`` joined with single
        spaces, its type is the most frequent ``token_type`` (as chosen by
        ``statistics.mode``), its page number is read from the first token,
        and its bounding box is the result of ``Rectangle.merge_rectangles``
        applied to the tokens' boxes.

        Args:
            pdf_tokens: Tokens to combine. Must not be empty:
                ``statistics.mode`` raises ``StatisticsError`` on empty data.
            pdf_name: Optional name of the source PDF; defaults to "".

        Returns:
            A new ``PdfSegment`` describing the combined tokens.
        """
        joined_text: str = " ".join(token.content for token in pdf_tokens)
        token_boxes = [token.bounding_box for token in pdf_tokens]
        token_types = [token.token_type for token in pdf_tokens]
        dominant_type = mode(token_types)

        # The page number is taken from the first token only.
        first_page = pdf_tokens[0].page_number
        merged_box = Rectangle.merge_rectangles(token_boxes)

        return PdfSegment(
            page_number=first_page,
            bounding_box=merged_box,
            text_content=joined_text,
            segment_type=dominant_type,
            pdf_name=pdf_name,
        )