File size: 970 Bytes
2e237ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
from statistics import mode
from pdf_features import PdfToken
from pdf_features import Rectangle
from pdf_token_type_labels import TokenType


class PdfSegment:
    """A typed region of a PDF page together with the text it holds.

    Instances are plain records: the constructor stores its arguments
    unchanged, and ``from_pdf_tokens`` derives those arguments from a group
    of tokens.
    """

    def __init__(
        self, page_number: int, bounding_box: Rectangle, text_content: str, segment_type: TokenType, pdf_name: str = ""
    ):
        """Store the given values on the instance without modification.

        Args:
            page_number: Page the segment belongs to.
            bounding_box: Rectangle enclosing the segment.
            text_content: Text carried by the segment.
            segment_type: Label assigned to the segment.
            pdf_name: Name of the originating PDF; defaults to an empty string.
        """
        self.pdf_name = pdf_name
        self.segment_type = segment_type
        self.text_content = text_content
        self.bounding_box = bounding_box
        self.page_number = page_number

    @staticmethod
    def from_pdf_tokens(pdf_tokens: list[PdfToken], pdf_name: str = ""):
        """Build a single segment out of a group of tokens.

        The segment's text is the tokens' ``content`` values joined with a
        single space, its type is ``statistics.mode`` of the tokens'
        ``token_type`` values, its page number is read from the first token,
        and its bounding box is whatever ``Rectangle.merge_rectangles``
        returns for the tokens' bounding boxes.

        Args:
            pdf_tokens: Tokens to combine. Must not be empty:
                ``statistics.mode`` raises ``StatisticsError`` on empty data.
            pdf_name: Name of the originating PDF, passed through unchanged.

        Returns:
            A new ``PdfSegment`` describing the combined tokens.
        """
        joined_text: str = " ".join(token.content for token in pdf_tokens)
        token_boxes = [token.bounding_box for token in pdf_tokens]
        token_types = [token.token_type for token in pdf_tokens]
        dominant_type = mode(token_types)

        # Only the first token is consulted for the page number.
        first_page = pdf_tokens[0].page_number
        merged_box = Rectangle.merge_rectangles(token_boxes)

        return PdfSegment(
            page_number=first_page,
            bounding_box=merged_box,
            text_content=joined_text,
            segment_type=dominant_type,
            pdf_name=pdf_name,
        )