Spaces:
Running
Running
| from statistics import mode | |
| from pdf_features import PdfToken | |
| from pdf_features import Rectangle | |
| from pdf_token_type_labels import TokenType | |
class PdfSegment:
    """A contiguous piece of a PDF page described by its location, text and type.

    Attributes:
        page_number: Page number the segment is reported on.
        bounding_box: Rectangle enclosing the segment.
        text_content: Text carried by the segment.
        segment_type: Token type assigned to the segment as a whole.
        pdf_name: Name of the source PDF; empty string when not provided.
    """

    def __init__(
        self, page_number: int, bounding_box: Rectangle, text_content: str, segment_type: TokenType, pdf_name: str = ""
    ):
        """Store the given values unchanged on the instance."""
        self.page_number = page_number
        self.bounding_box = bounding_box
        self.text_content = text_content
        self.segment_type = segment_type
        self.pdf_name = pdf_name

    # Declared static: the function takes no ``self``/``cls``. Without the
    # decorator, calling it on an instance would bind the instance to
    # ``pdf_tokens``. Class-level calls behave exactly as before.
    @staticmethod
    def from_pdf_tokens(pdf_tokens: list[PdfToken], pdf_name: str = "") -> "PdfSegment":
        """Build a single segment out of a group of tokens.

        The segment text is the tokens' ``content`` joined with single spaces,
        the bounding box is ``Rectangle.merge_rectangles`` applied to the
        tokens' bounding boxes, the segment type is the most common
        ``token_type`` among the tokens, and the page number is taken from the
        first token.

        Args:
            pdf_tokens: Tokens that make up the segment; must not be empty.
            pdf_name: Name of the source PDF, stored as-is on the segment.

        Returns:
            A new ``PdfSegment`` summarising the given tokens.

        Raises:
            statistics.StatisticsError: If ``pdf_tokens`` is empty, because
                ``statistics.mode`` cannot compute a mode of no data.
        """
        text: str = " ".join(pdf_token.content for pdf_token in pdf_tokens)
        bounding_boxes = [pdf_token.bounding_box for pdf_token in pdf_tokens]
        # statistics.mode returns the first-encountered value on ties (Python 3.8+).
        segment_type = mode([pdf_token.token_type for pdf_token in pdf_tokens])
        return PdfSegment(
            pdf_tokens[0].page_number, Rectangle.merge_rectangles(bounding_boxes), text, segment_type, pdf_name
        )