Spaces:
Running
Running
from statistics import mode | |
from pdf_features import PdfToken | |
from pdf_features import Rectangle | |
from pdf_token_type_labels import TokenType | |
class PdfSegment:
    """A contiguous piece of a PDF page: its location, text and dominant type."""

    def __init__(
        self, page_number: int, bounding_box: Rectangle, text_content: str, segment_type: TokenType, pdf_name: str = ""
    ):
        """Store the segment's attributes as given.

        Args:
            page_number: Page the segment is on.
            bounding_box: Rectangle enclosing the segment.
            text_content: Text of the segment.
            segment_type: Type label assigned to the segment.
            pdf_name: Name of the source PDF; defaults to an empty string.
        """
        self.page_number = page_number
        self.bounding_box = bounding_box
        self.text_content = text_content
        self.segment_type = segment_type
        self.pdf_name = pdf_name

    @staticmethod
    def from_pdf_tokens(pdf_tokens: list[PdfToken], pdf_name: str = "") -> "PdfSegment":
        """Build a segment by aggregating a non-empty list of tokens.

        The text is the tokens' ``content`` joined with single spaces, the
        bounding box is ``Rectangle.merge_rectangles`` applied to the tokens'
        boxes, the type is the most common ``token_type`` among the tokens,
        and the page number is taken from the first token.

        Args:
            pdf_tokens: Tokens that make up the segment; must not be empty.
            pdf_name: Name of the source PDF; defaults to an empty string.

        Returns:
            A new ``PdfSegment`` describing the combined tokens.

        Raises:
            statistics.StatisticsError: If ``pdf_tokens`` is empty (raised by
                ``statistics.mode``).
        """
        text: str = " ".join(pdf_token.content for pdf_token in pdf_tokens)
        bounding_boxes = [pdf_token.bounding_box for pdf_token in pdf_tokens]
        # mode() is evaluated before the first-token lookup and the rectangle
        # merge so that an empty token list fails with StatisticsError.
        segment_type = mode([token.token_type for token in pdf_tokens])
        return PdfSegment(
            pdf_tokens[0].page_number, Rectangle.merge_rectangles(bounding_boxes), text, segment_type, pdf_name
        )