Spaces:
Running
Running
| from domain.PdfSegment import PdfSegment | |
| from pdf_features import PdfPage | |
| from pdf_token_type_labels import TokenType | |
| from pydantic import BaseModel | |
class SegmentBox(BaseModel):
    """Bounding box of one PDF segment plus its page size, text and token type.

    Instances are hashable: the hash is built from every field, including
    ``id``.
    """

    # Box geometry, copied from the segment's bounding box by from_pdf_segment.
    left: float
    top: float
    width: float
    height: float
    # Page the box belongs to; from_pdf_segment treats it as 1-based.
    page_number: int
    # Dimensions of that page.
    page_width: int
    page_height: int
    # Text content of the segment; empty when not supplied.
    text: str = ""
    # Segment type; defaults to TokenType.TEXT.
    type: TokenType = TokenType.TEXT
    # Optional identifier; empty when not supplied.
    id: str = ""

    def __hash__(self) -> int:
        """Return a hash computed over all fields of the box."""
        return hash(
            (
                self.left,
                self.top,
                self.width,
                self.height,
                self.page_number,
                self.page_width,
                self.page_height,
                self.text,
                self.type,
                self.id,
            )
        )

    def to_dict(self) -> dict:
        """Return the box as a plain dict, with ``type`` as its enum value.

        The ``id`` field is not part of the returned dict.
        """
        return {
            "left": self.left,
            "top": self.top,
            "width": self.width,
            "height": self.height,
            "page_number": self.page_number,
            "page_width": self.page_width,
            "page_height": self.page_height,
            "text": self.text,
            "type": self.type.value,
        }

    # NOTE(review): declared static because the function takes neither self
    # nor cls; without the decorator a call through an instance would bind the
    # instance to ``pdf_segment``. Class-level calls behave as before.
    @staticmethod
    def from_pdf_segment(pdf_segment: PdfSegment, pdf_pages: list[PdfPage]) -> "SegmentBox":
        """Build a SegmentBox from a PdfSegment and the document's pages.

        Args:
            pdf_segment: Segment providing the bounding box, page number,
                text content and segment type.
            pdf_pages: Pages of the document; ``pdf_segment.page_number - 1``
                is used as the index into this list to read the page size.

        Returns:
            A new SegmentBox; ``id`` keeps its default value.

        Raises:
            IndexError: If the page index is out of range for ``pdf_pages``.
                No lower-bound check is done, so a page number below 1
                results in a negative index.
        """
        # Look the page up once; both dimensions come from the same page.
        page = pdf_pages[pdf_segment.page_number - 1]
        bounding_box = pdf_segment.bounding_box
        return SegmentBox(
            left=bounding_box.left,
            top=bounding_box.top,
            width=bounding_box.width,
            height=bounding_box.height,
            page_number=pdf_segment.page_number,
            page_width=page.page_width,
            page_height=page.page_height,
            text=pdf_segment.text_content,
            type=pdf_segment.segment_type,
        )
if __name__ == "__main__":
    # Manual check when run as a script: print the value of TokenType.TEXT.
    print(TokenType.TEXT.value)