Spaces:
Running
Running
File size: 1,820 Bytes
2e237ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
from domain.PdfSegment import PdfSegment
from pdf_features import PdfPage
from pdf_token_type_labels import TokenType
from pydantic import BaseModel
class SegmentBox(BaseModel):
    """Pydantic model for a rectangular segment located on a PDF page.

    Holds the rectangle geometry, the page it sits on together with that
    page's dimensions, the segment text, its token type and an optional id.
    """

    # Rectangle geometry (from_pdf_segment copies these from a bounding box).
    left: float
    top: float
    width: float
    height: float
    # Page the segment is on. NOTE(review): presumably 1-based, because
    # from_pdf_segment looks the page up with ``page_number - 1`` -- confirm.
    page_number: int
    # Dimensions of that page.
    page_width: int
    page_height: int
    # Optional content fields, each with a default.
    text: str = ""
    type: TokenType = TokenType.TEXT
    id: str = ""

    def __hash__(self) -> int:
        """Return a hash built from every field value, in declaration order."""
        geometry = (self.left, self.top, self.width, self.height)
        page = (self.page_number, self.page_width, self.page_height)
        content = (self.text, self.type, self.id)
        # Concatenation yields one flat 10-tuple, which is what gets hashed.
        return hash(geometry + page + content)

    def to_dict(self) -> dict:
        """Return the segment as a plain dict.

        ``type`` is emitted as the enum's ``value``; ``id`` is not included.
        """
        copied_as_is = (
            "left",
            "top",
            "width",
            "height",
            "page_number",
            "page_width",
            "page_height",
            "text",
        )
        payload = {name: getattr(self, name) for name in copied_as_is}
        payload["type"] = self.type.value
        return payload

    @staticmethod
    def from_pdf_segment(pdf_segment: PdfSegment, pdf_pages: list[PdfPage]) -> "SegmentBox":
        """Create a SegmentBox from a PdfSegment and the document's pages.

        Geometry comes from ``pdf_segment.bounding_box``; the page dimensions
        are read from ``pdf_pages[pdf_segment.page_number - 1]``. ``id`` is
        not passed, so it keeps its default empty string.
        """
        box = pdf_segment.bounding_box
        page = pdf_pages[pdf_segment.page_number - 1]
        return SegmentBox(
            left=box.left,
            top=box.top,
            width=box.width,
            height=box.height,
            page_number=pdf_segment.page_number,
            page_width=page.page_width,
            page_height=page.page_height,
            text=pdf_segment.text_content,
            type=pdf_segment.segment_type,
        )
if __name__ == "__main__":
    # Manual smoke check: print the underlying value of the TEXT token type.
    print(TokenType.TEXT.value)
|