File size: 1,844 Bytes
2e237ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from adapters.infrastructure.toc.TitleFeatures import TitleFeatures
from adapters.infrastructure.toc.PdfSegmentation import PdfSegmentation


class MergeTwoSegmentsTitles:
    """Fold adjacent title candidates together when they pass `should_merge`.

    The candidates come from ``TitleFeatures.from_pdf_segmentation`` and the
    merge runs once, at construction time. The outcome is available in
    ``titles_merged``.
    """

    def __init__(self, pdf_segmentation: PdfSegmentation):
        # Candidate titles extracted from the segmentation, in the order returned.
        self.title_features_list: list[TitleFeatures] = TitleFeatures.from_pdf_segmentation(pdf_segmentation)
        # Filled by merge(), which is invoked right away.
        self.titles_merged: list[TitleFeatures] = []
        self.merge()

    def merge(self):
        """Walk the candidates pairwise and populate ``titles_merged``.

        For each candidate that should be merged with its successor, the
        successor slot in ``title_features_list`` is overwritten in place with
        ``successor.append(candidate)`` and the candidate itself is not
        emitted. Every other candidate (including the last one) is appended to
        ``titles_merged`` as is.
        """
        candidates = self.title_features_list
        final_position = len(candidates) - 1

        for position in range(len(candidates)):
            # Re-read the slot: it may hold the result of a previous merge.
            current = candidates[position]

            if position == final_position:
                # Nothing follows the last candidate, so it is always emitted.
                self.titles_merged.append(current)
                break

            following = candidates[position + 1]
            if self.should_merge(current, following):
                # The combined entry takes the successor's place so that it can
                # chain into the next comparison.
                # NOTE(review): relies on TitleFeatures.append returning the
                # combined object - confirm against TitleFeatures.
                candidates[position + 1] = following.append(current)
            else:
                self.titles_merged.append(current)

    @staticmethod
    def should_merge(title: TitleFeatures, other_title: TitleFeatures):
        """Return True when ``title`` and ``other_title`` pass every merge check.

        The checks, applied in order, reject the pair when:
        the page numbers differ; ``other_title.top`` is more than 15 away from
        ``title.bottom``; either opposite-edge horizontal distance exceeds 15;
        both have a ``first_characters_type`` in 1, 2 or 3; both have a truthy
        ``bullet_points_type``; or their ``get_features_to_merge()`` results
        differ.
        """
        # Largest edge-to-edge distance tolerated, in the units of the
        # top/bottom/left/right attributes.
        max_distance = 15
        # A pair is rejected when both titles start with one of these types.
        # NOTE(review): the meaning of 1/2/3 is defined by TitleFeatures.
        exclusive_first_character_types = (1, 2, 3)

        if not other_title.pdf_segment.page_number == title.pdf_segment.page_number:
            return False

        vertical_gap = abs(other_title.top - title.bottom)
        if vertical_gap > max_distance:
            return False

        if (
            abs(other_title.left - title.right) > max_distance
            or abs(other_title.right - title.left) > max_distance
        ):
            return False

        both_typed_starts = (
            title.first_characters_type in exclusive_first_character_types
            and other_title.first_characters_type in exclusive_first_character_types
        )
        if both_typed_starts:
            return False

        if title.bullet_points_type and other_title.bullet_points_type:
            return False

        features_differ = title.get_features_to_merge() != other_title.get_features_to_merge()
        return not features_differ