Spaces:
Running
Running
| import string | |
| import roman | |
| import numpy as np | |
| from domain.PdfSegment import PdfSegment | |
| from pdf_features import PdfToken | |
| from pdf_features import Rectangle | |
| from domain.SegmentBox import SegmentBox | |
| from adapters.infrastructure.toc.data.TOCItem import TOCItem | |
| from adapters.infrastructure.toc.methods.two_models_v3_segments_context_2.Modes import Modes | |
| from adapters.infrastructure.toc.PdfSegmentation import PdfSegmentation | |
class TitleFeatures:
    """Features of one PDF segment used to decide whether it is a title / TOC entry.

    The constructor derives everything from the segment's tokens and bounding box:
    the joined text, the first "word" and its numbering style, font statistics and
    the horizontal placement on the page.
    """

    # Characters treated as numbering punctuation / bullet glyphs.
    # The order matters: ``bullet_points_type`` is the 1-based index into this list.
    SPECIAL_MARKERS = [".", "(", ")", "\\", "/", ":", ";", "-", "_", "[", "]", "•", "◦", "*", ","]
    ALPHABET = list(string.ascii_lowercase)
    ALPHABET_UPPERCASE = list(string.ascii_uppercase)
    ROMAN_NUMBERS = [roman.toRoman(i) for i in range(1, 151)]
    ROMAN_NUMBERS_LOWERCASE = [x.lower() for x in ROMAN_NUMBERS]
    # Ordered sequences in which "the previous item" of a numbering label is looked up.
    BULLET_POINTS = [ALPHABET, ALPHABET_UPPERCASE, ROMAN_NUMBERS, ROMAN_NUMBERS_LOWERCASE]

    def __init__(self, pdf_segment: PdfSegment, segment_tokens: list[PdfToken], pdf_features, modes: Modes):
        """Store the inputs and compute every derived feature.

        Args:
            pdf_segment: Segment whose bounding box and page number are used.
            segment_tokens: Tokens belonging to the segment, in reading order.
            pdf_features: Document-level features; ``pages`` is indexed by page number - 1.
            modes: Document-level modes; ``left_space_mode`` is used for indentation.
        """
        self.modes = modes
        self.pdf_segment = pdf_segment
        self.pdf_features = pdf_features
        self.segment_tokens: list[PdfToken] = segment_tokens

        self.first_characters: str = ""
        self.first_characters_special_markers_count: int = 0
        self.font_size: float = 0.0
        self.text_content: str = ""
        self.width: float = 0
        self.font_family: str = ""
        self.font_color: str = ""
        self.line_height: float = 0.0
        self.uppercase: bool = False
        # Fractions of tokens that are bold / italic, hence floats (not booleans).
        self.bold: float = 0.0
        self.italics: float = 0.0
        self.first_characters_type = 0
        self.bullet_points_type = 0
        self.text_centered: int = 0
        self.is_left: bool = False
        self.indentation: int = -1

        self.left: int = self.pdf_segment.bounding_box.left
        self.top: int = self.pdf_segment.bounding_box.top
        self.right: int = self.pdf_segment.bounding_box.right
        self.bottom: int = self.pdf_segment.bounding_box.bottom

        self.initialize_text_properties()
        self.process_first_characters()
        self.process_font_properties()
        self.process_positional_properties()

    def initialize_text_properties(self):
        """Set ``text_content`` to the token contents joined by single spaces."""
        self.text_content = " ".join(token.content for token in self.segment_tokens)

    def process_first_characters(self):
        """Classify the first whitespace-delimited chunk of the text.

        Sets:
            first_characters: text up to the first space, newline or tab.
            first_characters_type: 1 if every non-marker character is in "IVXL",
                2 if in "ivxl", 3 if an ASCII digit, 4 if equal to its own
                uppercase form, otherwise 0. The first matching rule wins.
            bullet_points_type: 1-based position of the last character in
                ``SPECIAL_MARKERS``, or 0 when it is not a marker.
            first_characters_special_markers_count: number of markers among all
                characters except the last one.
        """
        self.first_characters = self.text_content.split(" ")[0].split("\n")[0].split("\t")[0]

        if not self.first_characters:
            # Empty text, or text starting with whitespace: there is no last character
            # to inspect (indexing [-1] would raise IndexError), so report "no numbering".
            self.first_characters_type = 0
            self.bullet_points_type = 0
            self.first_characters_special_markers_count = 0
            return

        clean_first_characters = [x for x in self.first_characters if x not in self.SPECIAL_MARKERS]
        # Checked in key order; a marker-only chunk leaves the list empty, which
        # vacuously satisfies the first rule.
        characters_checker = {
            1: lambda characters: all(letter in "IVXL" for letter in characters),
            2: lambda characters: all(letter in "IVXL".lower() for letter in characters),
            3: lambda characters: all(letter in "1234567890" for letter in characters),
            4: lambda characters: all(letter == letter.upper() for letter in characters),
        }
        self.first_characters_type = next(
            (index for index, type_checker in characters_checker.items() if type_checker(clean_first_characters)), 0
        )

        last_character = self.first_characters[-1]
        self.bullet_points_type = (
            self.SPECIAL_MARKERS.index(last_character) + 1 if last_character in self.SPECIAL_MARKERS else 0
        )
        self.first_characters_special_markers_count = sum(
            1 for x in self.first_characters[:-1] if x in self.SPECIAL_MARKERS
        )

    def process_font_properties(self):
        """Derive font features from the tokens.

        Font family and colour come from the first token; ``bold`` and ``italics``
        are the fraction of tokens carrying that style; ``font_size`` is the mean
        token font size; ``uppercase`` is true when upper-casing the text leaves it
        unchanged. With no tokens the defaults set in ``__init__`` are kept.
        """
        if not self.segment_tokens:
            # Nothing to measure; avoids IndexError / ZeroDivisionError below.
            return

        token_count = len(self.segment_tokens)
        first_font = self.segment_tokens[0].font
        self.font_family = first_font.font_id
        self.font_color = first_font.color
        self.bold = sum(token.font.bold for token in self.segment_tokens) / token_count
        self.italics = sum(token.font.italics for token in self.segment_tokens) / token_count
        self.uppercase = self.text_content.upper() == self.text_content
        self.font_size = np.mean([token.font.font_size for token in self.segment_tokens])

    def process_positional_properties(self):
        """Derive placement features from the bounding box and the page width.

        ``text_centered`` is 1 when the left and right margins differ by less than
        10; ``is_left`` is true when the text is not centered and the left margin is
        the smaller one; ``indentation`` is the distance from ``modes.left_space_mode``
        in steps of 15 for left-placed text, otherwise -1.
        """
        if self.segment_tokens:
            # The first token's font size stands in for the line height.
            self.line_height = self.segment_tokens[0].font.font_size

        page_width = self.pdf_features.pages[self.pdf_segment.page_number - 1].page_width
        right_margin = page_width - self.right
        self.text_centered = 1 if abs(self.left - right_margin) < 10 else 0
        self.is_left = self.left < right_margin if not self.text_centered else False
        self.indentation = int((self.left - self.modes.left_space_mode) / 15) if self.is_left else -1

    def get_features_to_merge(self) -> tuple[int, int]:
        """Return ``(bold, italics)`` as 0/1 flags (1 when any token has the style)."""
        return (
            1 if self.bold else 0,
            1 if self.italics else 0,
        )

    def get_features_toc(self) -> tuple[int, int, int, int, int]:
        """Return the bold/italics flags followed by the three first-word features."""
        return (
            1 if self.bold else 0,
            1 if self.italics else 0,
            self.first_characters_type,
            self.first_characters_special_markers_count,
            self.bullet_points_type,
        )

    def get_possible_previous_point(self) -> list[str]:
        """Return the numbering labels that could directly precede this one.

        The label in ``first_characters`` is split into a prefix, its last counter
        and trailing markers (for "1.2." that is "1.", "2" and "."). Each possible
        predecessor of the counter is re-assembled with the same prefix and markers.
        When the counter has no predecessor and the label is at least four characters
        long, the label itself is returned as the only candidate.
        """
        previous_characters = self.first_characters
        final_special_markers = ""
        last_part = ""

        # Walk from the end: first collect trailing markers, then the last counter,
        # and stop at the marker that separates it from the prefix.
        for letter in reversed(self.first_characters):
            is_marker = letter in self.SPECIAL_MARKERS
            if is_marker and last_part:
                break
            if is_marker:
                final_special_markers = letter + final_special_markers
            else:
                last_part = letter + last_part
            previous_characters = previous_characters[:-1]

        previous_items = self.get_previous_items(last_part)
        if not previous_items and len(self.first_characters) >= 4:
            return [self.first_characters]

        return [previous_characters + x + final_special_markers for x in previous_items]

    def get_previous_items(self, item: str) -> list[str]:
        """Return every predecessor of ``item`` in the known numbering sequences.

        ``item`` is looked up in each list of ``BULLET_POINTS``; when found at a
        position other than the first, the preceding element is added. A decimal
        number additionally yields the number minus one.
        """
        previous_items = []
        for bullet_points in self.BULLET_POINTS:
            if item not in bullet_points:
                continue
            position = bullet_points.index(item)
            if position:  # the first element of a sequence has no predecessor
                previous_items.append(bullet_points[position - 1])

        # isdecimal() (not isnumeric()) so that only strings int() can parse get here;
        # characters such as "²" or "½" are numeric but make int() raise ValueError.
        if item.isdecimal():
            previous_items.append(str(int(item) - 1))

        return previous_items

    @staticmethod
    def from_pdf_segmentation(pdf_segmentation: PdfSegmentation) -> list["TitleFeatures"]:
        """Build one ``TitleFeatures`` per segment of ``pdf_segmentation``.

        A single ``Modes`` instance is computed from the document features and
        shared by all returned objects.
        """
        modes = Modes(pdf_features=pdf_segmentation.pdf_features)
        return [
            TitleFeatures(
                pdf_segment, pdf_segmentation.tokens_by_segments[pdf_segment], pdf_segmentation.pdf_features, modes
            )
            for pdf_segment in pdf_segmentation.pdf_segments
        ]

    def to_toc_item(self, indentation):
        """Return a ``TOCItem`` for this title at the given indentation level."""
        return TOCItem(
            indentation=indentation,
            label=self.text_content,
            selection_rectangle=SegmentBox.from_pdf_segment(self.pdf_segment, self.pdf_features.pages),
        )

    def append(self, other_title_features: "TitleFeatures"):
        """Return a new ``TitleFeatures`` covering this segment followed by the other.

        The bounding boxes are merged, the segments' text is concatenated without
        a separator and the token lists are concatenated; page number and segment
        type are taken from this segment. Neither operand is modified.
        """
        other_segment = other_title_features.pdf_segment
        merged_bounding_box = Rectangle.merge_rectangles([self.pdf_segment.bounding_box, other_segment.bounding_box])
        merged_content = self.pdf_segment.text_content + other_segment.text_content
        merged_segment = PdfSegment(
            self.pdf_segment.page_number, merged_bounding_box, merged_content, self.pdf_segment.segment_type
        )
        segment_tokens = self.segment_tokens + other_title_features.segment_tokens
        return TitleFeatures(merged_segment, segment_tokens, pdf_features=self.pdf_features, modes=self.modes)