Wasim
Sync: robust vehicle parser + full project
2e237ce
from domain.PdfSegment import PdfSegment
from pdf_features import PdfPage
from pdf_token_type_labels import TokenType
from pydantic import BaseModel
class SegmentBox(BaseModel):
    """Pydantic model for one rectangular segment located on a PDF page.

    Stores the box geometry (``left``, ``top``, ``width``, ``height``), the
    page number together with that page's width and height, and the
    segment's text, ``TokenType`` and an optional string ``id``.
    """

    left: float
    top: float
    width: float
    height: float
    page_number: int
    page_width: int
    page_height: int
    text: str = ""
    type: TokenType = TokenType.TEXT
    id: str = ""

    def __hash__(self):
        """Return a hash computed over every field of the box."""
        field_values = (
            self.left,
            self.top,
            self.width,
            self.height,
            self.page_number,
            self.page_width,
            self.page_height,
            self.text,
            self.type,
            self.id,
        )
        return hash(field_values)

    def to_dict(self):
        """Return the box as a plain dictionary.

        ``type`` is emitted as the enum member's ``value``. The ``id`` field
        is not part of the returned dictionary.
        """
        # Fields copied verbatim, in the order they appear in the output.
        plain_fields = (
            "left",
            "top",
            "width",
            "height",
            "page_number",
            "page_width",
            "page_height",
            "text",
        )
        payload = {name: getattr(self, name) for name in plain_fields}
        payload["type"] = self.type.value
        return payload

    @staticmethod
    def from_pdf_segment(pdf_segment: PdfSegment, pdf_pages: list[PdfPage]):
        """Build a ``SegmentBox`` from a ``PdfSegment``.

        Geometry is read from ``pdf_segment.bounding_box``; the page width and
        height are taken from ``pdf_pages`` at index ``page_number - 1``.
        ``id`` is left at its default.
        """
        box = pdf_segment.bounding_box
        page_number = pdf_segment.page_number
        page = pdf_pages[page_number - 1]
        return SegmentBox(
            left=box.left,
            top=box.top,
            width=box.width,
            height=box.height,
            page_number=page_number,
            page_width=page.page_width,
            page_height=page.page_height,
            text=pdf_segment.text_content,
            type=pdf_segment.segment_type,
        )
if __name__ == "__main__":
    # Manual smoke check: print the raw value behind TokenType.TEXT.
    default_token_type = TokenType.TEXT
    print(default_token_type.value)