File size: 7,736 Bytes
2e237ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import string
import roman
import numpy as np
from domain.PdfSegment import PdfSegment
from pdf_features import PdfToken
from pdf_features import Rectangle
from domain.SegmentBox import SegmentBox
from adapters.infrastructure.toc.data.TOCItem import TOCItem
from adapters.infrastructure.toc.methods.two_models_v3_segments_context_2.Modes import Modes
from adapters.infrastructure.toc.PdfSegmentation import PdfSegmentation


class TitleFeatures:
    """Textual, typographic and positional features of one PDF segment.

    An instance is built from a ``PdfSegment`` and the tokens that belong to
    it.  All features are computed eagerly in ``__init__`` and exposed as
    plain attributes; ``get_features_to_merge`` and ``get_features_toc`` pack
    a subset of them into tuples.

    Feature encodings:

    * ``first_characters``: the leading chunk of the segment text, up to the
      first space, newline or tab.
    * ``first_characters_type``: ``1`` if every non-marker character of
      ``first_characters`` is in ``"IVXL"``, ``2`` if every one is in
      ``"ivxl"``, ``3`` if every one is an ASCII digit, ``4`` if every one is
      equal to its upper-cased form, ``0`` otherwise.  The first matching
      rule wins.
    * ``bullet_points_type``: 1-based position in ``SPECIAL_MARKERS`` of the
      last character of ``first_characters``, or ``0`` when that character is
      not a special marker.
    * ``first_characters_special_markers_count``: number of special markers
      in ``first_characters`` excluding its last character.
    * ``bold`` / ``italics``: fraction of tokens flagged bold / italics.
    * ``text_centered``: ``1`` when left and right margins differ by less
      than ``CENTERED_TOLERANCE``, else ``0``.
    * ``indentation``: offset of the left edge from ``modes.left_space_mode``
      in steps of ``INDENTATION_STEP``, or ``-1`` when the segment is not
      closer to the left side of the page.
    """

    # Order matters: ``bullet_points_type`` is the 1-based index in this list.
    SPECIAL_MARKERS = [".", "(", ")", "\\", "/", ":", ";", "-", "_", "[", "]", "•", "◦", "*", ","]
    ALPHABET = list(string.ascii_lowercase)
    ALPHABET_UPPERCASE = list(string.ascii_uppercase)
    ROMAN_NUMBERS = [roman.toRoman(i) for i in range(1, 151)]
    ROMAN_NUMBERS_LOWERCASE = [x.lower() for x in ROMAN_NUMBERS]
    # Ordered sequences in which "the previous bullet" can be looked up.
    BULLET_POINTS = [ALPHABET, ALPHABET_UPPERCASE, ROMAN_NUMBERS, ROMAN_NUMBERS_LOWERCASE]

    # Maximum |left margin - right margin| (in bounding-box units) for a
    # segment to count as centered.
    CENTERED_TOLERANCE = 10
    # Horizontal distance (in bounding-box units) that makes one indentation level.
    INDENTATION_STEP = 15
    # Characters that end the leading chunk stored in ``first_characters``.
    FIRST_CHARACTERS_SEPARATORS = " \n\t"

    def __init__(self, pdf_segment: PdfSegment, segment_tokens: list[PdfToken], pdf_features, modes: Modes):
        """Compute every feature for ``pdf_segment``.

        Args:
            pdf_segment: The segment being described; its bounding box and
                page number are read.
            segment_tokens: Tokens belonging to the segment, in reading
                order.  May be empty, in which case font-related features
                keep their defaults.
            pdf_features: Document-level object; only ``pages`` is read here.
            modes: Document-level statistics; only ``left_space_mode`` is
                read here.
        """
        self.modes = modes
        self.pdf_segment = pdf_segment
        self.pdf_features = pdf_features

        self.segment_tokens: list[PdfToken] = segment_tokens
        self.first_characters: str = ""
        self.first_characters_special_markers_count: int = 0
        self.font_size: float = 0.0
        self.text_content: str = ""
        # NOTE(review): ``width`` is never assigned anywhere in this class.
        self.width: float = 0
        self.font_family: str = ""
        self.font_color: str = ""
        self.line_height: float = 0.0
        self.uppercase: bool = False
        self.bold: float = 0.0
        self.italics: float = 0.0
        self.first_characters_type = 0
        self.bullet_points_type = 0
        self.text_centered: int = 0
        self.is_left: bool = False
        self.indentation: int = -1
        self.left: int = self.pdf_segment.bounding_box.left
        self.top: int = self.pdf_segment.bounding_box.top
        self.right: int = self.pdf_segment.bounding_box.right
        self.bottom: int = self.pdf_segment.bounding_box.bottom

        # Order matters: the later steps read ``text_content``.
        self.initialize_text_properties()
        self.process_first_characters()
        self.process_font_properties()
        self.process_positional_properties()

    def initialize_text_properties(self):
        """Set ``text_content`` to the token contents joined by single spaces."""
        self.text_content = " ".join(token.content for token in self.segment_tokens)

    def process_first_characters(self):
        """Derive the ``first_characters*`` and ``bullet_points_type`` features.

        Leading spaces, newlines and tabs are skipped so that text starting
        with whitespace no longer produces an empty chunk (which previously
        raised ``IndexError``).  When no chunk can be extracted at all, every
        derived feature is set to ``0``.
        """
        remaining_text = self.text_content.lstrip(self.FIRST_CHARACTERS_SEPARATORS)
        self.first_characters = remaining_text.split(" ")[0].split("\n")[0].split("\t")[0]

        if not self.first_characters:
            self.first_characters_type = 0
            self.bullet_points_type = 0
            self.first_characters_special_markers_count = 0
            return

        clean_first_characters = [x for x in self.first_characters if x not in self.SPECIAL_MARKERS]
        # (type code, per-character predicate); the first rule that holds for
        # every character wins.
        # NOTE(review): a chunk made only of special markers has no clean
        # characters, so the first rule holds vacuously and the type is 1.
        # Kept unchanged because consumers of these features may rely on it.
        type_rules = (
            (1, lambda character: character in "IVXL"),
            (2, lambda character: character in "ivxl"),
            (3, lambda character: character in "1234567890"),
            (4, lambda character: character == character.upper()),
        )
        self.first_characters_type = next(
            (
                type_code
                for type_code, matches in type_rules
                if all(matches(character) for character in clean_first_characters)
            ),
            0,
        )

        last_character = self.first_characters[-1]
        self.bullet_points_type = (
            self.SPECIAL_MARKERS.index(last_character) + 1 if last_character in self.SPECIAL_MARKERS else 0
        )
        self.first_characters_special_markers_count = sum(
            1 for x in self.first_characters[:-1] if x in self.SPECIAL_MARKERS
        )

    def process_font_properties(self):
        """Derive font family, color, size and the bold/italics/uppercase flags.

        Family and color come from the first token; ``bold`` and ``italics``
        are the fraction of tokens carrying the flag; ``font_size`` is the
        mean token font size.  With no tokens the font attributes keep the
        defaults set in ``__init__`` instead of raising.
        """
        self.uppercase = self.text_content.upper() == self.text_content

        if not self.segment_tokens:
            return

        first_font = self.segment_tokens[0].font
        tokens_count = len(self.segment_tokens)
        self.font_family = first_font.font_id
        self.font_color = first_font.color
        self.bold = sum(token.font.bold for token in self.segment_tokens) / tokens_count
        self.italics = sum(token.font.italics for token in self.segment_tokens) / tokens_count
        self.font_size = np.mean([token.font.font_size for token in self.segment_tokens])

    def process_positional_properties(self):
        """Derive ``line_height``, ``text_centered``, ``is_left`` and ``indentation``.

        The page is looked up as ``pdf_features.pages[page_number - 1]``, so
        ``page_number`` is treated as a 1-based index into that list.
        """
        if self.segment_tokens:
            # The font size of the first token is used as the line height.
            self.line_height = self.segment_tokens[0].font.font_size

        page_width = self.pdf_features.pages[self.pdf_segment.page_number - 1].page_width
        right_margin = page_width - self.right
        self.text_centered = 1 if abs(self.left - right_margin) < self.CENTERED_TOLERANCE else 0
        # A centered segment is never considered left-aligned.
        self.is_left = self.left < right_margin if not self.text_centered else False
        self.indentation = (
            int((self.left - self.modes.left_space_mode) / self.INDENTATION_STEP) if self.is_left else -1
        )

    def get_features_to_merge(self) -> tuple[int, int]:
        """Return ``(bold, italics)`` as 0/1 flags (1 when any token has the flag)."""
        return (
            1 if self.bold else 0,
            1 if self.italics else 0,
        )

    def get_features_toc(self) -> tuple[int, int, int, int, int]:
        """Return the TOC feature tuple.

        The tuple is ``(bold, italics, first_characters_type,
        first_characters_special_markers_count, bullet_points_type)`` where
        ``bold`` and ``italics`` are 0/1 flags.
        """
        return (
            1 if self.bold else 0,
            1 if self.italics else 0,
            self.first_characters_type,
            self.first_characters_special_markers_count,
            self.bullet_points_type,
        )

    def get_possible_previous_point(self) -> list[str]:
        """Return the bullet labels that could directly precede this one.

        ``first_characters`` is split, from the right, into a run of trailing
        special markers, the last marker-free part, and the remaining prefix.
        Each predecessor of the last part (see ``get_previous_items``) is
        re-assembled as ``prefix + predecessor + trailing markers``.

        When the last part has no predecessor and ``first_characters`` has at
        least four characters, ``[first_characters]`` itself is returned.
        """
        previous_characters = self.first_characters
        final_special_markers = ""
        last_part = ""
        for letter in reversed(self.first_characters):
            is_marker = letter in self.SPECIAL_MARKERS
            if is_marker and last_part:
                # A marker to the left of the last part ends the scan; it
                # stays in the prefix.
                break

            if is_marker:
                final_special_markers = letter + final_special_markers
            else:
                last_part = letter + last_part
            previous_characters = previous_characters[:-1]

        previous_items = self.get_previous_items(last_part)

        if not previous_items and len(self.first_characters) >= 4:
            return [self.first_characters]

        return [previous_characters + x + final_special_markers for x in previous_items]

    def get_previous_items(self, item: str):
        """Return every candidate predecessor of ``item``.

        ``item`` is looked up in each ``BULLET_POINTS`` sequence; when found
        at a position other than the first, the preceding element is added.
        If ``item`` consists only of decimal digits, ``str(int(item) - 1)`` is
        added as well.

        ``str.isdecimal`` is used rather than ``str.isnumeric`` because the
        latter also accepts characters such as "²" or "½" that ``int`` cannot
        parse, which would raise ``ValueError``.
        """
        previous_items = []

        for bullet_points in self.BULLET_POINTS:
            if item not in bullet_points:
                continue
            position = bullet_points.index(item)
            if position:
                previous_items.append(bullet_points[position - 1])

        if item.isdecimal():
            previous_items.append(str(int(item) - 1))

        return previous_items

    @staticmethod
    def from_pdf_segmentation(pdf_segmentation: PdfSegmentation) -> list["TitleFeatures"]:
        """Build one ``TitleFeatures`` per segment of ``pdf_segmentation``.

        A single ``Modes`` instance, built from the segmentation's
        ``pdf_features``, is shared by all returned objects.
        """
        pdf_features = pdf_segmentation.pdf_features
        modes = Modes(pdf_features=pdf_features)
        return [
            TitleFeatures(pdf_segment, pdf_segmentation.tokens_by_segments[pdf_segment], pdf_features, modes)
            for pdf_segment in pdf_segmentation.pdf_segments
        ]

    def to_toc_item(self, indentation):
        """Return a ``TOCItem`` labelled with this segment's text at ``indentation``."""
        return TOCItem(
            indentation=indentation,
            label=self.text_content,
            selection_rectangle=SegmentBox.from_pdf_segment(self.pdf_segment, self.pdf_features.pages),
        )

    def append(self, other_title_features: "TitleFeatures"):
        """Return a new ``TitleFeatures`` covering this segment followed by the other.

        The merged segment keeps this segment's page number and type, uses
        the merged bounding box of both segments and the concatenation of
        both token lists.  Neither operand is modified.
        """
        other_segment = other_title_features.pdf_segment
        merged_bounding_box = Rectangle.merge_rectangles([self.pdf_segment.bounding_box, other_segment.bounding_box])
        # NOTE(review): the two segment texts are joined without a separator;
        # confirm whether a space is expected here.
        merged_content = self.pdf_segment.text_content + other_segment.text_content
        merged_segment = PdfSegment(
            self.pdf_segment.page_number, merged_bounding_box, merged_content, self.pdf_segment.segment_type
        )
        segment_tokens = self.segment_tokens + other_title_features.segment_tokens
        return TitleFeatures(merged_segment, segment_tokens, pdf_features=self.pdf_features, modes=self.modes)