Spaces:
Running
Running
File size: 1,844 Bytes
2e237ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
from adapters.infrastructure.toc.TitleFeatures import TitleFeatures
from adapters.infrastructure.toc.PdfSegmentation import PdfSegmentation
class MergeTwoSegmentsTitles:
    """Fuse consecutive title segments that appear to belong to one title.

    The constructor builds a ``TitleFeatures`` list from the given
    segmentation and immediately runs :meth:`merge`; the outcome is available
    afterwards in ``titles_merged``.
    """

    def __init__(self, pdf_segmentation: PdfSegmentation):
        # Candidate titles, in the order returned by TitleFeatures.
        self.title_features_list: list[TitleFeatures] = TitleFeatures.from_pdf_segmentation(pdf_segmentation)
        # Filled by merge(): titles left after adjacent ones were fused.
        self.titles_merged: list[TitleFeatures] = []
        self.merge()

    def merge(self):
        """Walk adjacent title pairs and fold mergeable ones together.

        When a title should be merged with its successor, the successor's
        slot in ``title_features_list`` is overwritten with the result of
        ``successor.append(title)`` and the title itself is not emitted; the
        combined entry is then compared against the next one in turn.
        Otherwise the title is appended to ``titles_merged`` unchanged.
        Note that ``title_features_list`` is modified in place.
        """
        titles = self.title_features_list
        if not titles:
            return

        last_position = len(titles) - 1
        for position in range(last_position):
            current = titles[position]
            following = titles[position + 1]
            if self.should_merge(current, following):
                # Carry the merged result forward so chains of more than two
                # segments keep accumulating.
                titles[position + 1] = following.append(current)
            else:
                self.titles_merged.append(current)

        # The final entry has no successor to merge into, so it is always kept.
        self.titles_merged.append(titles[last_position])

    @staticmethod
    def should_merge(title: TitleFeatures, other_title: TitleFeatures):
        """Return True when ``other_title`` should be merged with ``title``.

        Every check below is a veto: the pair is merged only if none fires.
        """
        # Distance threshold shared by the vertical and horizontal checks
        # (units are whatever TitleFeatures coordinates use - not visible here).
        gap_limit = 15
        # first_characters_type values that block a merge when both titles
        # have one of them (presumably numbering styles - TODO confirm).
        blocking_first_character_types = (1, 2, 3)

        # Titles on different pages are never merged.
        if other_title.pdf_segment.page_number != title.pdf_segment.page_number:
            return False

        # The second title's top must be close to the first title's bottom.
        if abs(other_title.top - title.bottom) > gap_limit:
            return False

        # Horizontal veto: fires if either opposite-edge distance is too large.
        horizontally_apart = (
            abs(other_title.left - title.right) > gap_limit
            or abs(other_title.right - title.left) > gap_limit
        )
        if horizontally_apart:
            return False

        both_blocking_types = (
            title.first_characters_type in blocking_first_character_types
            and other_title.first_characters_type in blocking_first_character_types
        )
        if both_blocking_types:
            return False

        # Two bullet-point entries stay separate.
        if title.bullet_points_type and other_title.bullet_points_type:
            return False

        # Finally, the features each title exposes for merging must match.
        features_differ = title.get_features_to_merge() != other_title.get_features_to_merge()
        return not features_differ
|