Wasim
Sync: robust vehicle parser + full project
2e237ce
raw
history blame
1.84 kB
from adapters.infrastructure.toc.TitleFeatures import TitleFeatures
from adapters.infrastructure.toc.PdfSegmentation import PdfSegmentation
class MergeTwoSegmentsTitles:
    """Merge neighbouring titles that ``should_merge`` accepts.

    The titles are obtained from ``TitleFeatures.from_pdf_segmentation`` and
    merged right away during construction; the outcome is stored in
    ``titles_merged``.
    """

    def __init__(self, pdf_segmentation: PdfSegmentation):
        """Build the title list from *pdf_segmentation* and run ``merge``."""
        self.title_features_list: list[TitleFeatures] = TitleFeatures.from_pdf_segmentation(pdf_segmentation)
        self.titles_merged: list[TitleFeatures] = []
        self.merge()

    def merge(self):
        """Walk the titles pairwise and fill ``titles_merged``.

        For each title and its successor: when ``should_merge`` accepts the
        pair, the successor slot in ``title_features_list`` is overwritten
        with ``successor.append(title)`` (so the combined title is compared
        against the next one in turn); otherwise the title is emitted as is.
        The last title of the list is always emitted.
        """
        titles = self.title_features_list
        for position in range(len(titles) - 1):
            current = titles[position]
            following = titles[position + 1]
            if self.should_merge(current, following):
                # Fold the current title into its successor in place; it is
                # emitted later as part of that successor.
                titles[position + 1] = following.append(current)
            else:
                self.titles_merged.append(current)

        # Nothing follows the final entry, so it is emitted unconditionally.
        if titles:
            self.titles_merged.append(titles[-1])

    @staticmethod
    def should_merge(title: TitleFeatures, other_title: TitleFeatures):
        """Return ``True`` when *other_title* may be merged with *title*.

        All of the following must hold, checked in this order:

        * both titles are on the same page;
        * ``other_title.top`` is within 15 of ``title.bottom``;
        * ``other_title.left`` is within 15 of ``title.right`` and
          ``other_title.right`` is within 15 of ``title.left``;
        * the two titles do not both have a ``first_characters_type``
          of 1, 2 or 3;
        * the two titles do not both have a truthy ``bullet_points_type``;
        * ``get_features_to_merge()`` does not differ between them.
        """
        # NOTE(review): the unit of this tolerance depends on the coordinate
        # system of TitleFeatures, which is not visible here.
        tolerance = 15
        numbered_types = [1, 2, 3]

        if other_title.pdf_segment.page_number != title.pdf_segment.page_number:
            return False

        vertical_gap = abs(other_title.top - title.bottom)
        if vertical_gap > tolerance:
            return False

        if abs(other_title.left - title.right) > tolerance:
            return False
        if abs(other_title.right - title.left) > tolerance:
            return False

        if title.first_characters_type in numbered_types and other_title.first_characters_type in numbered_types:
            return False

        if title.bullet_points_type and other_title.bullet_points_type:
            return False

        return not (title.get_features_to_merge() != other_title.get_features_to_merge())