Spaces:
Running
Running
File size: 7,736 Bytes
2e237ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 |
import string
import roman
import numpy as np
from domain.PdfSegment import PdfSegment
from pdf_features import PdfToken
from pdf_features import Rectangle
from domain.SegmentBox import SegmentBox
from adapters.infrastructure.toc.data.TOCItem import TOCItem
from adapters.infrastructure.toc.methods.two_models_v3_segments_context_2.Modes import Modes
from adapters.infrastructure.toc.PdfSegmentation import PdfSegmentation
class TitleFeatures:
    """Features computed from one PDF segment and the tokens it contains.

    The instance stores text, font and position attributes of the segment and
    exposes them as small feature tuples (``get_features_to_merge`` and
    ``get_features_toc``). It can also be converted into a ``TOCItem`` or
    merged with another ``TitleFeatures`` via ``append``.
    """

    # Characters treated as punctuation/bullet markers at the start of a title.
    # Kept as a list because ``bullet_points_type`` depends on the position of
    # a marker inside it.
    SPECIAL_MARKERS = [".", "(", ")", "\\", "/", ":", ";", "-", "_", "[", "]", "•", "◦", "*", ","]
    ALPHABET = list(string.ascii_lowercase)
    ALPHABET_UPPERCASE = list(string.ascii_uppercase)
    ROMAN_NUMBERS = [roman.toRoman(i) for i in range(1, 151)]
    ROMAN_NUMBERS_LOWERCASE = [x.lower() for x in ROMAN_NUMBERS]
    # Ordered sequences in which "the previous item" of an enumerator is looked up.
    BULLET_POINTS = [ALPHABET, ALPHABET_UPPERCASE, ROMAN_NUMBERS, ROMAN_NUMBERS_LOWERCASE]

    # Maximum difference between the left and right margins for a segment to
    # count as centered (same units as the bounding box coordinates).
    CENTERED_TOLERANCE = 10
    # Width of one indentation level relative to ``modes.left_space_mode``.
    INDENTATION_STEP = 15

    def __init__(self, pdf_segment: PdfSegment, segment_tokens: list[PdfToken], pdf_features, modes: Modes):
        """Store the inputs and compute every derived attribute.

        Args:
            pdf_segment: Segment whose bounding box and page number are used.
            segment_tokens: Tokens belonging to ``pdf_segment``; their
                ``content`` and ``font`` are read.
            pdf_features: Object exposing ``pages``; the page of the segment
                is looked up to read ``page_width``.
            modes: Object exposing ``left_space_mode``, used for indentation.
        """
        self.modes = modes
        self.pdf_segment = pdf_segment
        self.pdf_features = pdf_features
        self.segment_tokens: list[PdfToken] = segment_tokens

        self.first_characters: str = ""
        self.first_characters_special_markers_count: int = 0
        self.font_size: float = 0.0
        self.text_content: str = ""
        self.width: float = 0
        self.font_family: str = ""
        self.font_color: str = ""
        self.line_height: float = 0.0
        self.uppercase: bool = False
        # Fraction (0.0 - 1.0) of tokens that are bold / italic.
        self.bold: float = 0.0
        self.italics: float = 0.0
        self.first_characters_type = 0
        self.bullet_points_type = 0
        self.text_centered: int = 0
        self.is_left: bool = False
        self.indentation: int = -1

        bounding_box = self.pdf_segment.bounding_box
        self.left: int = bounding_box.left
        self.top: int = bounding_box.top
        self.right: int = bounding_box.right
        self.bottom: int = bounding_box.bottom

        self.initialize_text_properties()
        self.process_first_characters()
        self.process_font_properties()
        self.process_positional_properties()

    def initialize_text_properties(self):
        """Build ``text_content`` by joining the token contents with spaces."""
        self.text_content = " ".join(token.content for token in self.segment_tokens)

    def process_first_characters(self):
        """Classify the first whitespace-delimited chunk of the text.

        Sets:
            first_characters: Text up to the first space, newline or tab.
            first_characters_type: 1 if every non-marker character is one of
                ``IVXL``, 2 for ``ivxl``, 3 for ASCII digits, 4 if every
                character is unchanged by ``upper()``, else 0. The first
                matching rule wins.
            bullet_points_type: 1-based position of the last character in
                ``SPECIAL_MARKERS``, or 0 when it is not a marker.
            first_characters_special_markers_count: Number of markers among
                all characters except the last one.
        """
        self.first_characters = self.text_content.split(" ")[0].split("\n")[0].split("\t")[0]
        clean_first_characters = [x for x in self.first_characters if x not in self.SPECIAL_MARKERS]

        # NOTE: when the cleaned prefix is empty every rule holds vacuously, so
        # the type is 1. This is kept unchanged because the value is exposed
        # through get_features_toc().
        characters_checker = {
            1: lambda letters: all(letter in "IVXL" for letter in letters),
            2: lambda letters: all(letter in "ivxl" for letter in letters),
            3: lambda letters: all(letter in "1234567890" for letter in letters),
            4: lambda letters: all(letter == letter.upper() for letter in letters),
        }
        self.first_characters_type = next(
            (index for index, type_checker in characters_checker.items() if type_checker(clean_first_characters)), 0
        )

        # Slicing instead of indexing: an empty prefix (empty text, or text that
        # starts with a space) yields "" here rather than raising IndexError.
        last_character = self.first_characters[-1:]
        if last_character in self.SPECIAL_MARKERS:
            self.bullet_points_type = self.SPECIAL_MARKERS.index(last_character) + 1
        else:
            self.bullet_points_type = 0

        self.first_characters_special_markers_count = sum(
            1 for x in self.first_characters[:-1] if x in self.SPECIAL_MARKERS
        )

    def process_font_properties(self):
        """Derive font attributes from the tokens.

        Family and color come from the first token; ``bold`` and ``italics``
        are the fraction of tokens with that style; ``font_size`` is the mean
        token font size. With no tokens the font attributes keep the defaults
        assigned in ``__init__``.
        """
        self.uppercase = self.text_content.upper() == self.text_content

        tokens = self.segment_tokens
        if not tokens:
            # Nothing to read fonts from; avoid IndexError / ZeroDivisionError.
            return

        first_font = tokens[0].font
        self.font_family = first_font.font_id
        self.font_color = first_font.color
        self.bold = sum(token.font.bold for token in tokens) / len(tokens)
        self.italics = sum(token.font.italics for token in tokens) / len(tokens)
        self.font_size = np.mean([token.font.font_size for token in tokens])

    def process_positional_properties(self):
        """Derive centering, left alignment and indentation level.

        A segment is centered when its left margin and right margin differ by
        less than ``CENTERED_TOLERANCE``. A non-centered segment is "left"
        when its left margin is the smaller one; only then is ``indentation``
        computed, otherwise it stays -1.
        """
        if self.segment_tokens:
            # The line height is taken from the first token's font size.
            self.line_height = self.segment_tokens[0].font.font_size

        page_width = self.pdf_features.pages[self.pdf_segment.page_number - 1].page_width
        right_margin = page_width - self.right

        self.text_centered = 1 if abs(self.left - right_margin) < self.CENTERED_TOLERANCE else 0
        self.is_left = self.left < right_margin if not self.text_centered else False
        if self.is_left:
            self.indentation = int((self.left - self.modes.left_space_mode) / self.INDENTATION_STEP)
        else:
            self.indentation = -1

    def get_features_to_merge(self) -> tuple[int, int]:
        """Return ``(bold, italics)`` as 0/1 flags (1 when the fraction is non-zero)."""
        return (
            1 if self.bold else 0,
            1 if self.italics else 0,
        )

    def get_features_toc(self) -> tuple[int, int, int, int, int]:
        """Return the bold/italics flags followed by the first-characters features."""
        return (
            1 if self.bold else 0,
            1 if self.italics else 0,
            self.first_characters_type,
            self.first_characters_special_markers_count,
            self.bullet_points_type,
        )

    def get_possible_previous_point(self) -> list[str]:
        """Return the enumerators that could directly precede this title's one.

        The prefix in ``first_characters`` is split, from the right, into
        trailing markers, the last enumerator part, and whatever remains in
        front. The last part is replaced by each of its predecessors, e.g.
        ``"1.2."`` -> ``["1.1."]`` and ``"b)"`` -> ``["a)"]``.

        When no predecessor exists and the prefix has at least four
        characters, the prefix itself is returned as the only candidate;
        otherwise the result is empty.
        """
        previous_characters = self.first_characters
        final_special_markers = ""
        last_part = ""

        # Consume the prefix right-to-left; `previous_characters` always holds
        # the part that has not been consumed yet.
        for letter in reversed(self.first_characters):
            is_marker = letter in self.SPECIAL_MARKERS
            if is_marker and last_part:
                # A marker in front of the last part ends it (e.g. the "." in "1.2").
                break
            if is_marker:
                final_special_markers = letter + final_special_markers
            else:
                last_part = letter + last_part
            previous_characters = previous_characters[:-1]

        previous_items = self.get_previous_items(last_part)
        if not previous_items and len(self.first_characters) >= 4:
            return [self.first_characters]

        return [previous_characters + x + final_special_markers for x in previous_items]

    def get_previous_items(self, item: str) -> list[str]:
        """Return every predecessor of ``item`` in the known enumerations.

        ``item`` is looked up in each sequence of ``BULLET_POINTS``; the first
        element of a sequence has no predecessor. A decimal number additionally
        yields ``number - 1``.
        """
        previous_items = []
        for bullet_points in self.BULLET_POINTS:
            if item not in bullet_points:
                continue
            position = bullet_points.index(item)
            if position:
                previous_items.append(bullet_points[position - 1])

        # isdecimal() rather than isnumeric(): characters such as "²" or "½"
        # are numeric but are rejected by int() with a ValueError.
        if item.isdecimal():
            previous_items.append(str(int(item) - 1))

        return previous_items

    @staticmethod
    def from_pdf_segmentation(pdf_segmentation: PdfSegmentation) -> list["TitleFeatures"]:
        """Build one ``TitleFeatures`` per segment of ``pdf_segmentation``.

        A single ``Modes`` instance, created from the segmentation's
        ``pdf_features``, is shared by all the returned objects.
        """
        pdf_features = pdf_segmentation.pdf_features
        modes = Modes(pdf_features=pdf_features)
        return [
            TitleFeatures(pdf_segment, pdf_segmentation.tokens_by_segments[pdf_segment], pdf_features, modes)
            for pdf_segment in pdf_segmentation.pdf_segments
        ]

    def to_toc_item(self, indentation):
        """Return a ``TOCItem`` labelled with this title's text at ``indentation``."""
        return TOCItem(
            indentation=indentation,
            label=self.text_content,
            selection_rectangle=SegmentBox.from_pdf_segment(self.pdf_segment, self.pdf_features.pages),
        )

    def append(self, other_title_features: "TitleFeatures") -> "TitleFeatures":
        """Return a new ``TitleFeatures`` covering this segment and the other one.

        The bounding boxes are merged, the segment texts are concatenated
        (without a separator) and the token lists are joined. Page number and
        segment type are taken from this instance. Neither input is modified.
        """
        other_segment = other_title_features.pdf_segment
        merged_bounding_box = Rectangle.merge_rectangles([self.pdf_segment.bounding_box, other_segment.bounding_box])
        merged_content = self.pdf_segment.text_content + other_segment.text_content
        merged_segment = PdfSegment(
            self.pdf_segment.page_number, merged_bounding_box, merged_content, self.pdf_segment.segment_type
        )
        segment_tokens = self.segment_tokens + other_title_features.segment_tokens
        return TitleFeatures(merged_segment, segment_tokens, pdf_features=self.pdf_features, modes=self.modes)
|