Spaces:
Runtime error
Runtime error
| import logging | |
| import re | |
| from components.parser.features.documents_dataset import DatasetRow, DocumentsDataset | |
| from components.parser.features.hierarchy_parser import Hierarchy | |
| from components.parser.xml.structures import ParsedXML | |
| logger = logging.getLogger(__name__) | |
class DatasetCreator:
    """
    Build a ``DocumentsDataset`` from parsed XML documents and their hierarchies.

    Every entry of a text or table hierarchy becomes one ``DatasetRow``. Rows
    receive a running ``Index``; the metadata columns of text rows are decoded
    from the '_'-separated hierarchy key.
    """

    # Placeholder stored in every metadata field the key does not provide.
    _UNKNOWN = 'unknown'

    # DatasetRow keyword names filled from a text-hierarchy key. The spelling
    # ``Pargaraph`` is the one this class has always passed to DatasetRow.
    _TEXT_KEY_FIELDS = (
        'LevelParagraph',
        'Pargaraph',
        'Duplicate',
        'PartLevel1',
        'PartLevel2',
        'Appendix',
        'LevelParagraphAppendix',
        'PargaraphAppendix',
        'DuplicateAppendix',
        'PartLevel1Appendix',
    )

    # A key belongs to an appendix when it contains "Приложение" followed by
    # an uppercase Cyrillic letter and at least one digit.
    _APPENDIX_KEY = re.compile(r'Приложение[А-Я]\d+')

    def __init__(
        self,
    ):
        """
        Initialise the creator with the row counter set to zero.
        """
        self._index = 0

    def create_dataset(
        self,
        parsed_xmls: dict[int, ParsedXML],
        hierarchies: dict[int, tuple[Hierarchy, Hierarchy]],
        start_index: int = 0,
    ) -> DocumentsDataset:
        """
        Create a dataset from the processed documents.

        Args:
            parsed_xmls: Parsed XML data keyed by document id. It is indexed
                with every id found in ``hierarchies``; a missing id
                propagates the lookup error.
            hierarchies: For each document id, a ``(text, table)`` pair of
                chunk hierarchies.
            start_index: Value of ``Index`` for the first produced row;
                following rows are numbered consecutively.

        Returns:
            DocumentsDataset: Dataset built from all rows, in iteration order
            of ``hierarchies``; within a document, text rows precede table
            rows.
        """
        logger.info('Starting dataset creation from hierarchies')
        self._index = start_index

        dataset_rows = []
        for doc_id, (text_hierarchy, table_hierarchy) in hierarchies.items():
            xml_data = parsed_xmls[doc_id]
            logger.debug(
                f'Processing document {doc_id} with {len(text_hierarchy)} text sections and {len(table_hierarchy)} table sections'
            )
            # Text rows are numbered before table rows of the same document.
            dataset_rows.extend(self._process_text_hierarchy(text_hierarchy, xml_data))
            dataset_rows.extend(self._process_table_hierarchy(table_hierarchy, xml_data))

        logger.info(f'Created dataset with {len(dataset_rows)} rows')
        return DocumentsDataset(dataset_rows)

    def _process_text_hierarchy(
        self,
        text_hierarchy: Hierarchy,
        xml_data: ParsedXML,
    ) -> list[DatasetRow]:
        """
        Convert every entry of a text hierarchy into a ``DatasetRow``.

        Args:
            text_hierarchy: Mapping from hierarchy key to chunk text.
            xml_data: Parsed document providing ``id`` and ``name``.

        Returns:
            One row per key, in the order yielded by ``text_hierarchy.keys()``.
            ``self._index`` is advanced by one for each row.
        """
        rows = []
        for key in text_hierarchy.keys():
            # Decode the key first so that a malformed key fails before the
            # row is assembled.
            key_fields = self._parse_text_key(key)
            rows.append(
                DatasetRow(
                    Index=self._index,
                    Text=text_hierarchy[key],
                    DocName=f'{xml_data.id}.XML',
                    DocNumber=xml_data.id,
                    Title=xml_data.name,
                    **key_fields,
                )
            )
            self._index += 1
        return rows

    @classmethod
    def _parse_text_key(cls, key: str) -> dict[str, object]:
        """
        Decode a text-hierarchy key into ``DatasetRow`` keyword arguments.

        The key is split on '_' and dispatched on its content: table of
        contents ("Содержание"), foreword ("Предисловие"), appendix, or a
        regular section. Fields the key does not provide stay ``'unknown'``.

        Args:
            key: Hierarchy key.

        Returns:
            Dict with exactly the names listed in ``_TEXT_KEY_FIELDS``.

        Raises:
            IndexError: If the key has fewer segments than its kind requires.
            ValueError: If a two-segment key carries a non-numeric part
                number after '^'.
        """
        parts = key.split('_')
        fields: dict[str, object] = dict.fromkeys(cls._TEXT_KEY_FIELDS, cls._UNKNOWN)

        if 'Содержание' in key:
            fields['LevelParagraph'] = -1
            fields['Pargaraph'] = parts[1]
        elif 'Предисловие' in key:
            fields['LevelParagraph'] = -1
            # Anything after '^' in the second segment is dropped.
            fields['Pargaraph'] = parts[1].split('^')[0]
        elif cls._APPENDIX_KEY.search(key):
            cls._fill_appendix_fields(fields, key, parts)
        else:
            cls._fill_section_fields(fields, key, parts)
        return fields

    @classmethod
    def _split_duplicate(cls, segment: str) -> tuple[str, str]:
        """
        Split ``'<head>:Duplicate<n>'`` into ``(head, n)``.

        Without a ':' the whole segment is the head and the duplicate marker
        is ``'unknown'``. Text after a second ':' is ignored.
        """
        head, *rest = segment.split(':')
        if not rest:
            return head, cls._UNKNOWN
        return head, rest[0].replace('Duplicate', '')

    @classmethod
    def _fill_appendix_fields(
        cls,
        fields: dict[str, object],
        key: str,
        parts: list[str],
    ) -> None:
        """
        Fill the ``*Appendix`` fields of ``fields`` in place from an appendix key.
        """
        # The appendix id is the first character left in the second segment
        # once the word "Приложение" is removed.
        fields['Appendix'] = parts[1].replace('Приложение', '')[0]

        size = len(parts)
        if size == 3:
            fields['PartLevel1Appendix'] = parts[-1]
        elif size == 4 and 'Таблица' in key:
            # NOTE(review): table entries keep the fourth segment verbatim
            # (no PatternText/Duplicate stripping) — confirm this is intended.
            fields['LevelParagraphAppendix'] = -1
            fields['PargaraphAppendix'] = parts[3]
        elif size in (4, 5):
            fields['LevelParagraphAppendix'] = parts[2]
            head, fields['DuplicateAppendix'] = cls._split_duplicate(parts[3])
            fields['PargaraphAppendix'] = head.replace('PatternText', '')
            if size == 5:
                fields['PartLevel1Appendix'] = parts[-1].replace('PartLevel', '')

    @classmethod
    def _fill_section_fields(
        cls,
        fields: dict[str, object],
        key: str,
        parts: list[str],
    ) -> None:
        """
        Fill the main-body fields of ``fields`` in place from a section key.
        """
        size = len(parts)
        if size == 2:
            fields['LevelParagraph'] = -1
            if '^' in parts[1]:
                # The text after '^' carries the numeric part, stored as int.
                part_token = parts[1].split('^')[1]
                fields['PartLevel1'] = int(part_token.replace('PartLevel', ''))
            return
        if size < 3:
            return

        # The level is the last character of the second segment.
        fields['LevelParagraph'] = parts[1][-1]
        fields['Pargaraph'], fields['Duplicate'] = cls._split_duplicate(
            parts[2].replace('PatternText', '')
        )

        if size == 4:
            if 'Table' in key:
                fields['PartLevel1'] = parts[3]
            else:
                fields['PartLevel1'] = parts[3].replace('PartLevel', '')
        elif size == 5:
            fields['PartLevel1'] = parts[3].replace('PartLevel', '')
            # Both the historical 'PartLeveL' spelling and the regular
            # 'PartLevel' prefix are stripped, so the stored value is the
            # bare part number whichever one the key uses.
            fields['PartLevel2'] = (
                parts[4].replace('PartLeveL', '').replace('PartLevel', '')
            )

    def _process_table_hierarchy(
        self,
        table_hierarchy: Hierarchy,
        xml_data: ParsedXML,
    ) -> list[DatasetRow]:
        """
        Convert every entry of a table hierarchy into a ``DatasetRow``.

        The ``Table`` field is the second '_'-separated segment of the key
        with the text ``'Table'`` removed.

        Args:
            table_hierarchy: Mapping from hierarchy key to table text.
            xml_data: Parsed document providing ``id`` and ``name``.

        Returns:
            One row per key, in the order yielded by
            ``table_hierarchy.keys()``. ``self._index`` is advanced by one
            for each row.
        """
        rows = []
        for key in table_hierarchy.keys():
            table_id = key.split('_')[1].replace('Table', '')
            rows.append(
                DatasetRow(
                    Index=self._index,
                    Text=table_hierarchy[key],
                    DocName=f'{xml_data.id}.XML',
                    DocNumber=xml_data.id,
                    Title=xml_data.name,
                    Table=table_id,
                )
            )
            self._index += 1
        return rows