from domain.PdfSegment import PdfSegment
from pdf_features import PdfFeatures
from pdf_features import PdfToken


class PdfSegmentation:
    """Map the tokens of a PDF onto the segments they overlap the most.

    On construction, every token of every page in ``pdf_features`` is matched
    against the segments that share that page's ``page_number``; the outcome
    is stored in ``tokens_by_segments``.
    """

    # Score at or above which a segment is accepted immediately and the
    # remaining candidates are not examined.
    # NOTE(review): presumably ``get_intersection_percentage`` reports a value
    # on a 0-100 scale -- confirm against the bounding-box implementation.
    _EARLY_EXIT_SCORE = 99

    def __init__(self, pdf_features: PdfFeatures, pdf_segments: list[PdfSegment]):
        """Store the inputs and compute the token-to-segment assignment.

        Args:
            pdf_features: Source of pages; each page is read for its
                ``page_number`` and ``tokens``.
            pdf_segments: Candidate segments; each is read for its
                ``page_number`` and ``bounding_box``.
        """
        self.pdf_features: PdfFeatures = pdf_features
        self.pdf_segments: list[PdfSegment] = pdf_segments
        self.tokens_by_segments: dict[PdfSegment, list[PdfToken]] = self.find_tokens_by_segments()

    @staticmethod
    def find_segment_for_token(
        token: PdfToken,
        segments: list[PdfSegment],
        tokens_by_segments: dict[PdfSegment, list[PdfToken]],
    ) -> None:
        """Append ``token`` to the entry of the segment it intersects the most.

        Segments are examined in order. A segment replaces the current best
        only when its score is strictly greater, so ties keep the earliest
        segment and a segment scoring 0 is never selected. When no segment
        scores above 0, ``tokens_by_segments`` is left untouched.

        Args:
            token: Token to place; its ``bounding_box`` is compared with each
                segment's ``bounding_box``.
            segments: Candidate segments for this token.
            tokens_by_segments: Mapping that is updated in place.
        """
        best_score: float = 0
        most_probable_segment: PdfSegment | None = None

        for segment in segments:
            score = token.bounding_box.get_intersection_percentage(segment.bounding_box)
            if score > best_score:
                best_score = score
                most_probable_segment = segment
                # Good enough: stop scanning the remaining candidates.
                if best_score >= PdfSegmentation._EARLY_EXIT_SCORE:
                    break

        # Compare against None explicitly so that a segment object that happens
        # to evaluate as falsy is still recorded.
        if most_probable_segment is not None:
            tokens_by_segments.setdefault(most_probable_segment, []).append(token)

    def find_tokens_by_segments(self) -> dict[PdfSegment, list[PdfToken]]:
        """Build the mapping from each segment to the tokens assigned to it.

        Returns:
            A dict keyed by segment. Segments that received no token do not
            appear as keys.
        """
        # Group the segments by page once instead of re-filtering the whole
        # segment list for every page. Relative order within a page is kept,
        # which matters because ties are resolved in favour of the earliest
        # segment. Requires ``page_number`` to be hashable.
        segments_by_page: dict = {}
        for segment in self.pdf_segments:
            segments_by_page.setdefault(segment.page_number, []).append(segment)

        tokens_by_segments: dict[PdfSegment, list[PdfToken]] = {}
        for page in self.pdf_features.pages:
            page_segments = segments_by_page.get(page.page_number, [])
            for token in page.tokens:
                self.find_segment_for_token(token, page_segments, tokens_by_segments)
        return tokens_by_segments