Spaces:
Running
Running
File size: 7,736 Bytes
2e237ce |
|
import string
import roman
import numpy as np
from domain.PdfSegment import PdfSegment
from pdf_features import PdfToken
from pdf_features import Rectangle
from domain.SegmentBox import SegmentBox
from adapters.infrastructure.toc.data.TOCItem import TOCItem
from adapters.infrastructure.toc.methods.two_models_v3_segments_context_2.Modes import Modes
from adapters.infrastructure.toc.PdfSegmentation import PdfSegmentation
class TitleFeatures:
    """Features computed for one PDF segment, used for table-of-contents extraction.

    On construction the instance derives, from the segment's tokens and
    bounding box:

    * text features: the joined token text and its first whitespace-delimited
      chunk (``first_characters``), plus numbering/bullet descriptors of it;
    * font features: font id, colour, bold/italic ratios, mean font size;
    * positional features: centring, left alignment and indentation level.
    """

    # Characters treated as punctuation/bullet decoration around a numbering token.
    # The 1-based position in this list is used as the ``bullet_points_type`` feature,
    # so the order must not change.
    SPECIAL_MARKERS = [".", "(", ")", "\\", "/", ":", ";", "-", "_", "[", "]", "•", "◦", "*", ","]
    ALPHABET = list(string.ascii_lowercase)
    ALPHABET_UPPERCASE = list(string.ascii_uppercase)
    ROMAN_NUMBERS = [roman.toRoman(i) for i in range(1, 151)]
    ROMAN_NUMBERS_LOWERCASE = [x.lower() for x in ROMAN_NUMBERS]
    # Ordered sequences in which "the previous item" of a numbering token is looked up.
    BULLET_POINTS = [ALPHABET, ALPHABET_UPPERCASE, ROMAN_NUMBERS, ROMAN_NUMBERS_LOWERCASE]

    def __init__(self, pdf_segment: PdfSegment, segment_tokens: list[PdfToken], pdf_features, modes: Modes):
        """Store the inputs and compute every feature eagerly.

        Args:
            pdf_segment: Segment whose bounding box and page number are used.
            segment_tokens: Tokens belonging to the segment; their ``content``
                and ``font`` attributes are read.
            pdf_features: Object exposing ``pages`` (indexed by page number - 1),
                each page having a ``page_width``.
            modes: Object exposing ``left_space_mode``, used as the indentation origin.
        """
        self.modes = modes
        self.pdf_segment = pdf_segment
        self.pdf_features = pdf_features
        self.segment_tokens: list[PdfToken] = segment_tokens

        # Defaults; overwritten by the process_* methods below.
        self.first_characters: str = ""
        self.first_characters_special_markers_count: int = 0
        self.font_size: float = 0.0
        self.text_content: str = ""
        self.width: float = 0
        self.font_family: str = ""
        self.font_color: str = ""
        self.line_height: float = 0.0
        self.uppercase: bool = False
        # Ratios of bold / italic tokens in the segment (0.0 .. 1.0).
        self.bold: float = 0.0
        self.italics: float = 0.0
        self.first_characters_type = 0
        self.bullet_points_type = 0
        self.text_centered: int = 0
        self.is_left: bool = False
        self.indentation: int = -1

        bounding_box = self.pdf_segment.bounding_box
        self.left: int = bounding_box.left
        self.top: int = bounding_box.top
        self.right: int = bounding_box.right
        self.bottom: int = bounding_box.bottom

        self.initialize_text_properties()
        self.process_first_characters()
        self.process_font_properties()
        self.process_positional_properties()

    def initialize_text_properties(self):
        """Build ``text_content`` by joining the token contents with single spaces."""
        self.text_content = " ".join(token.content for token in self.segment_tokens)

    def process_first_characters(self):
        """Derive the numbering/bullet features from the first chunk of the text.

        Sets ``first_characters`` (text up to the first space, newline or tab),
        ``first_characters_type`` (0 = none, 1 = only ``IVXL``, 2 = only ``ivxl``,
        3 = only digits, 4 = every character equals its uppercase form),
        ``bullet_points_type`` (1-based index of the trailing character in
        ``SPECIAL_MARKERS``, 0 if it is not a marker) and
        ``first_characters_special_markers_count`` (markers before the last character).
        """
        self.first_characters = self.text_content.split(" ")[0].split("\n")[0].split("\t")[0]

        if not self.first_characters:
            # Empty text, or text starting with whitespace: there is no leading
            # token to classify, and indexing [-1] below would raise IndexError.
            self.first_characters_type = 0
            self.bullet_points_type = 0
            self.first_characters_special_markers_count = 0
            return

        clean_first_characters = [x for x in self.first_characters if x not in self.SPECIAL_MARKERS]

        # Checked in order; the first matching predicate decides the type.
        # NOTE: a marker-only token (e.g. "•") leaves nothing after cleaning, so the
        # first predicate is vacuously true and the type is 1. This is kept as-is
        # because consumers of the feature may rely on the existing values.
        characters_checker = (
            (1, lambda chars: all(letter in "IVXL" for letter in chars)),
            (2, lambda chars: all(letter in "ivxl" for letter in chars)),
            (3, lambda chars: all(letter in "1234567890" for letter in chars)),
            (4, lambda chars: all(letter == letter.upper() for letter in chars)),
        )
        self.first_characters_type = next(
            (index for index, type_checker in characters_checker if type_checker(clean_first_characters)), 0
        )

        last_character = self.first_characters[-1]
        self.bullet_points_type = (
            self.SPECIAL_MARKERS.index(last_character) + 1 if last_character in self.SPECIAL_MARKERS else 0
        )
        self.first_characters_special_markers_count = sum(
            1 for x in self.first_characters[:-1] if x in self.SPECIAL_MARKERS
        )

    def process_font_properties(self):
        """Derive font features from the tokens.

        Font id and colour come from the first token; ``bold`` and ``italics``
        are the fraction of tokens with that style; ``font_size`` is the mean
        token font size; ``uppercase`` is true when the text equals its
        uppercase form. With no tokens, the defaults set in ``__init__`` are kept.
        """
        tokens = self.segment_tokens
        if not tokens:
            # Nothing to measure; avoids IndexError / ZeroDivisionError below.
            return

        first_font = tokens[0].font
        self.font_family = first_font.font_id
        self.font_color = first_font.color
        self.bold = sum(token.font.bold for token in tokens) / len(tokens)
        self.italics = sum(token.font.italics for token in tokens) / len(tokens)
        self.uppercase = self.text_content.upper() == self.text_content
        self.font_size = np.mean([token.font.font_size for token in tokens])

    def process_positional_properties(self):
        """Derive layout features from the bounding box and the page width.

        ``text_centered`` is 1 when the left and right margins differ by less
        than 10 units; ``is_left`` is true for non-centred segments whose left
        margin is smaller than the right one; ``indentation`` is the distance
        from ``modes.left_space_mode`` in steps of 15 units for left-aligned
        segments, otherwise -1.
        """
        if self.segment_tokens:
            # Line height is approximated by the first token's font size.
            self.line_height = self.segment_tokens[0].font.font_size

        page_width = self.pdf_features.pages[self.pdf_segment.page_number - 1].page_width
        right_margin = page_width - self.right

        self.text_centered = 1 if abs(self.left - right_margin) < 10 else 0
        self.is_left = not self.text_centered and self.left < right_margin
        self.indentation = int((self.left - self.modes.left_space_mode) / 15) if self.is_left else -1

    def get_features_to_merge(self) -> tuple[int, int]:
        """Return ``(bold, italics)`` as 0/1 flags (1 when the ratio is non-zero)."""
        return (
            1 if self.bold else 0,
            1 if self.italics else 0,
        )

    def get_features_toc(self) -> tuple[int, int, int, int, int]:
        """Return the TOC feature tuple.

        Order: bold flag, italics flag, ``first_characters_type``,
        ``first_characters_special_markers_count``, ``bullet_points_type``.
        """
        return (
            1 if self.bold else 0,
            1 if self.italics else 0,
            self.first_characters_type,
            self.first_characters_special_markers_count,
            self.bullet_points_type,
        )

    def get_possible_previous_point(self) -> list[str]:
        """Return the numbering tokens that could directly precede this one.

        ``first_characters`` is split, from the right, into a prefix, the last
        run of non-marker characters and the trailing special markers; e.g.
        ``"1.2."`` becomes prefix ``"1."``, last part ``"2"``, trailing ``"."``.
        Each predecessor of the last part (see ``get_previous_items``) is
        re-assembled with the same prefix and trailing markers (``"1.1."``).

        When the last part has no predecessor and ``first_characters`` has at
        least 4 characters, ``[first_characters]`` itself is returned.
        """
        previous_characters = self.first_characters
        final_special_markers = ""
        last_part = ""

        for letter in reversed(self.first_characters):
            is_marker = letter in self.SPECIAL_MARKERS
            if is_marker and last_part:
                # Reached the separator in front of the last numbering part.
                break
            if is_marker:
                # Still inside the trailing run of markers.
                final_special_markers = letter + final_special_markers
            else:
                last_part = letter + last_part
            # Each consumed character is removed from the end of the prefix.
            previous_characters = previous_characters[:-1]

        previous_items = self.get_previous_items(last_part)
        if not previous_items and len(self.first_characters) >= 4:
            return [self.first_characters]

        return [previous_characters + x + final_special_markers for x in previous_items]

    def get_previous_items(self, item: str) -> list[str]:
        """Return every candidate predecessor of a single numbering part.

        For each sequence in ``BULLET_POINTS`` that contains ``item`` at a
        position other than the first, the preceding element is included
        (so ``"i"`` yields ``"h"`` from the alphabet but nothing from the roman
        numerals, where it is first). If ``item`` consists only of decimal
        digits, ``str(int(item) - 1)`` is included as well.
        """
        previous_items = []

        for bullet_points in self.BULLET_POINTS:
            if item not in bullet_points:
                continue
            position = bullet_points.index(item)
            if position:
                previous_items.append(bullet_points[position - 1])

        # isdecimal() rather than isnumeric(): the latter also accepts characters
        # such as "²" or "½", for which int() raises ValueError.
        if item.isdecimal():
            previous_items.append(str(int(item) - 1))

        return previous_items

    @staticmethod
    def from_pdf_segmentation(pdf_segmentation: PdfSegmentation) -> list["TitleFeatures"]:
        """Build one ``TitleFeatures`` per segment of ``pdf_segmentation``.

        A single ``Modes`` instance, created from the segmentation's
        ``pdf_features``, is shared by all returned objects.
        """
        pdf_features = pdf_segmentation.pdf_features
        modes = Modes(pdf_features=pdf_features)
        return [
            TitleFeatures(pdf_segment, pdf_segmentation.tokens_by_segments[pdf_segment], pdf_features, modes)
            for pdf_segment in pdf_segmentation.pdf_segments
        ]

    def to_toc_item(self, indentation):
        """Return a ``TOCItem`` labelled with ``text_content`` at the given indentation."""
        return TOCItem(
            indentation=indentation,
            label=self.text_content,
            selection_rectangle=SegmentBox.from_pdf_segment(self.pdf_segment, self.pdf_features.pages),
        )

    def append(self, other_title_features: "TitleFeatures") -> "TitleFeatures":
        """Return a new ``TitleFeatures`` that merges this segment with another.

        The bounding boxes are merged, the segment texts are concatenated
        (without a separator) and the token lists are joined. The page number
        and segment type are taken from this instance. Neither input is modified.
        """
        other_segment = other_title_features.pdf_segment
        merged_bounding_box = Rectangle.merge_rectangles([self.pdf_segment.bounding_box, other_segment.bounding_box])
        merged_content = self.pdf_segment.text_content + other_segment.text_content
        merged_segment = PdfSegment(
            self.pdf_segment.page_number, merged_bounding_box, merged_content, self.pdf_segment.segment_type
        )
        segment_tokens = self.segment_tokens + other_title_features.segment_tokens
        return TitleFeatures(merged_segment, segment_tokens, pdf_features=self.pdf_features, modes=self.modes)
|