Spaces:
Runtime error
Runtime error
| from typing import List, Tuple, Optional | |
| import pandas as pd | |
class MetadataManager:
    """Expand vector-search hits into the document paragraph that contains them.

    The manager keeps a table of document fragments (one row per fragment) and,
    for every hit index, collects the neighbouring rows that belong to the same
    numbered paragraph, appendix paragraph or table.

    Columns read by this class: ``DocNumber``, ``DocName``, ``Title``, ``Text``,
    ``Index``, ``Table``, ``PartLevel1``, ``Appendix``, ``LevelParagraphAppendix``,
    ``PargaraphAppendix``, ``Pargaraph``, ``LevelParagraph``, ``Duplicate`` and
    ``DuplicateAppendix`` (column spellings are those of the data set).

    NOTE: hit indexes are used positionally on the full table and as labels on
    per-document slices, so the table is assumed to carry a default 0..n-1 index.
    """

    def __init__(self, df: pd.DataFrame, logger):
        """Store the logger and a cleaned copy of ``df``.

        Args:
            df: Fragment table. The ``Embedding`` column, when present, is left
                out of the stored copy; the caller's frame is not modified.
            logger: Object with an ``info`` method, used to report lookup failures.
        """
        self.logger = logger
        # Drop on a copy: the original code removed the column from the caller's frame.
        frame = df.drop(columns=['Embedding'], errors='ignore')
        # Every missing value becomes the sentinel string 'unknown' used by all checks below.
        self.df = frame.where(pd.notna(frame), 'unknown')

    @staticmethod
    def __search_sub_level(df: pd.DataFrame, header_text: Optional[str] = None) -> List:
        """Turn the rows of ``df`` into a list of text pieces with a shared header.

        Args:
            df: Rows that make up one paragraph, in display order.
            header_text: Header repeated inside the rows. Defaults to the text
                of the first row.

        Returns:
            One string per row. In the first row the header is followed by a
            newline; in every other row the header is removed and a newline is
            appended.
        """
        if header_text is None:
            header_text = df.iloc[0]['Text']
        header = f'{header_text}'

        paragraphs = []
        for position, text in enumerate(df['Text']):
            if position == 0:
                text = text.replace(header, f'{header}\n')
            else:
                text = text.replace(header, '') + '\n'
            paragraphs.append(text)
        return paragraphs

    @staticmethod
    def __check_duplicates(df: pd.DataFrame, ind: int) -> pd.DataFrame:
        """Keep only rows whose ``Duplicate`` value equals that of row ``ind``.

        A missing value (``None``/NaN) on row ``ind`` selects the rows whose
        ``Duplicate`` is missing as well.
        """
        marker = df.loc[ind]['Duplicate']
        if pd.isna(marker):
            return df[df['Duplicate'].isna()]
        return df[df['Duplicate'] == marker]

    @staticmethod
    def __check_appendix_duplicates(df: pd.DataFrame, ind: int) -> pd.DataFrame:
        """Keep only rows whose ``DuplicateAppendix`` value equals that of row ``ind``.

        A missing value (``None``/NaN) on row ``ind`` selects the rows whose
        ``DuplicateAppendix`` is missing as well.
        """
        marker = df.loc[ind]['DuplicateAppendix']
        if pd.isna(marker):
            return df[df['DuplicateAppendix'].isna()]
        return df[df['DuplicateAppendix'] == marker]

    def __sub_level_result(self, df: pd.DataFrame, shape: int) -> Tuple[List, Optional[int]]:
        """Return the texts and first index label of ``df``, or ``([], None)`` if it has at most ``shape`` rows."""
        if df.shape[0] <= shape:
            return [], None
        return self.__search_sub_level(df), df.index[0]

    def _paragraph_appendix_content(self, df, pattern: str, ind: int,
                                    shape: int) -> Tuple[List, Optional[int]]:
        """Collect the content of an appendix paragraph, including its sub-items.

        Args:
            df: Rows to search in.
            pattern: Regular expression matched against ``PargaraphAppendix``.
            ind: Index label of the hit row; that row is always kept.
            shape: Row count at or below which an empty result is returned.

        Returns:
            ``(texts, first_index_label)``, or ``([], None)`` when too few rows remain.

        Examples:
            3.1. Paragraph:
                1) - Content 1;
                2) - Content 2;
                3) - Content 3;
        """
        matches = df['PargaraphAppendix'].str.match(pattern, na=False)
        df = df[matches | (df.index == ind)]
        df = self.__check_appendix_duplicates(df, ind)
        return self.__sub_level_result(df, shape)

    def _paragraph_content(self, df, pattern: str, ind: int, shape: int) -> Tuple[List, Optional[int]]:
        """Collect the content of a paragraph, including its sub-items.

        Rows are kept when their ``Pargaraph`` matches ``pattern`` and they carry
        the same ``Duplicate`` value as the hit row; the hit row itself is always kept.

        Args:
            df: Rows to search in.
            pattern: Regular expression matched against ``Pargaraph``.
            ind: Index label of the hit row.
            shape: Row count at or below which an empty result is returned.

        Returns:
            ``(texts, first_index_label)``, or ``([], None)`` when too few rows remain.

        Examples:
            3.1. Paragraph:
                1) - Content 1;
                2) - Content 2;
                3) - Content 3;
        """
        matches = df['Pargaraph'].str.match(pattern, na=False)
        same_duplicate = df['Duplicate'] == df.loc[ind]['Duplicate']
        is_hit = df.index == ind
        # Parentheses spell out the precedence the original expression relied on.
        df = df[(matches & same_duplicate) | is_hit]
        return self.__sub_level_result(df, shape)

    def _paragraph_content2(self, df, pattern: str, ind: int, shape: int) -> Tuple[List, Optional[int]]:
        """Collect the content of a paragraph by pattern only.

        Unlike ``_paragraph_content`` this neither filters by ``Duplicate`` nor
        forces the hit row into the result.

        Args:
            df: Rows to search in.
            pattern: Regular expression matched against ``Pargaraph``.
            ind: Index label of the hit row (unused, kept for a uniform signature).
            shape: Row count at or below which an empty result is returned.

        Returns:
            ``(texts, first_index_label)``, or ``([], None)`` when too few rows remain.
        """
        df = df[df['Pargaraph'].str.match(pattern, na=False)]
        return self.__sub_level_result(df, shape)

    @staticmethod
    def _first_unknown_index(df):
        """Return the ``Text`` of the last row whose ``PartLevel1`` is 'unknown', or ``None``."""
        unknown_rows = df[df['PartLevel1'].isin(['unknown'])]
        if unknown_rows.empty:
            return None
        return unknown_rows.iloc[-1]['Text']

    def _search_other_info(self, ind, doc_number):
        """Build the context text surrounding hit ``ind`` inside document ``doc_number``.

        Args:
            ind: Index of the hit row.
            doc_number: ``DocNumber`` value of the document the hit belongs to.

        Returns:
            ``(context, start_index)``. ``context`` is a string, or an empty
            list when the hit sits in a part whose ``PartLevel1`` mentions 'Table'.

        Raises:
            KeyError: If ``ind`` is not a label of the document's rows.
            IndexError: If a pattern filter leaves no rows to take a start index from.
        """
        df = self.df[self.df['DocNumber'] == doc_number]
        row = df.loc[ind]
        start_index_paragraph = row['Index'] - 1

        # Table rows are returned as they are.
        if row['Table'] != 'unknown':
            return row['Text'], ind
        if row['PartLevel1'] != 'unknown' and 'Table' in str(row['PartLevel1']):
            return [], ind

        if row['Appendix'] != 'unknown':
            df = df[df['Appendix'] == row['Appendix']]
            number = row['PargaraphAppendix']
            if row['LevelParagraphAppendix'] == 'unknown' and number == 'unknown':
                # No numbering at all: take the hit and the rows labelled up to ind + 7.
                df = df.loc[ind:ind + 7]
                start_index_paragraph = df.index[0]
                paragraph = self.__search_sub_level(df)
            elif number != 'unknown':
                escaped = number.replace('.', r'\.')
                pattern = f'^{escaped}?\\d?.?$'
                if df[df['PargaraphAppendix'].str.match(pattern, na=False)].shape[0] == 1:
                    # Only the hit matched, so widen the pattern. Splitting the
                    # escaped number leaves a trailing backslash on each part,
                    # which the literal '.' that follows turns into an escaped dot.
                    parts = [elem for elem in escaped.split('.') if elem]
                    if len(parts) == 1:
                        pattern = f'^{parts[0]}.?\\d?.?$'
                    else:
                        parent = '.'.join(parts[:-1])
                        pattern = f'^{parent}.\\d.?$'
                df = df[df['PargaraphAppendix'].str.match(pattern, na=False)]
                start_index_paragraph = df.index[0]
                paragraph = self.__search_sub_level(df)
            else:
                # Clamp at 0: a negative start would slice from the end of the table.
                window_start = max(int(ind - 10), 0)
                paragraph = self.df.iloc[window_start:ind + 10]['Text'].values
                start_index_paragraph = df.index[0]
            return ' '.join(paragraph), start_index_paragraph

        if row['Pargaraph'] == 'unknown':
            # Documents without numbered items: pass a few rows above and below the hit.
            header_text = self._first_unknown_index(df)
            df = df.loc[int(ind - 2):ind + 2]
            paragraph = self.__search_sub_level(df, header_text)
        else:
            number = row['Pargaraph']
            escaped = number.replace('.', r'\.')
            # Look for sub-items stored under the same paragraph number.
            paragraph, start_index_paragraph = self._paragraph_content(df, fr'^{escaped}?$', ind, 2)
            if len(paragraph) == 0:
                pattern = f'{escaped}\\d?.?\\d?\\d?.?$'
                paragraph, start_index_paragraph = self._paragraph_content2(df, pattern, ind, 0)
            if len(paragraph) == 0 and row['LevelParagraph'] != '0':
                # Fall back to the siblings under the parent number.
                parts = [elem for elem in number.split('.') if elem]
                parent = '.'.join(parts[:-1])
                pattern = f'^{parent}\\.\\d\\d?.?$'
                paragraph, start_index_paragraph = self._paragraph_content(df, pattern, ind, 0)
            elif len(paragraph) == 0 and row['LevelParagraph'] == '0':
                pattern = escaped
                if '.' not in pattern:
                    pattern = pattern + r'\.'
                pattern = f'^{pattern}\\d.?\\d?.?$'
                paragraph, start_index_paragraph = self._paragraph_content(df, pattern, ind, 0)
        return ' '.join(paragraph), start_index_paragraph

    @staticmethod
    def filter_answer(answer):
        """Drop answers that repeat an already seen ``start_index_paragraph``.

        Args:
            answer: Mapping of answer dicts, each with a ``start_index_paragraph`` key.

        Returns:
            A new dict keyed 0..k-1 holding the first answer for every distinct
            ``start_index_paragraph``, in the original order.
        """
        seen = set()
        new_answer = {}
        for item in answer.values():
            start = item['start_index_paragraph']
            if start in seen:
                continue
            seen.add(start)
            new_answer[len(new_answer)] = item
        return new_answer

    def _clear_doc_name(self, ind):
        """Return the document name of row ``ind`` without its first '_' part and file suffixes."""
        name = ' '.join(self.df.iloc[ind]['DocName'].split('_')[1:])
        # '.DOCX' has to be removed before '.DOC', otherwise a stray 'X' is left behind.
        for fragment in ('.txt', '.json', '.DOCX', '.DOC', 'tables'):
            name = name.replace(fragment, '')
        return name

    def search(self, indexes: List) -> dict:
        """Build answer records for the given hit indexes.

        Args:
            indexes: Positions of the hit rows in the table.

        Returns:
            Dict keyed 0..k-1. Each value holds ``id``, ``index_answer``,
            ``doc_name``, ``title``, ``text_answer``, ``other_info`` (a
            one-element list with the context) and ``start_index_paragraph``.
            Hits that resolve to the same ``start_index_paragraph`` are
            collapsed to the first one.
        """
        answers = {}
        for i, ind in enumerate(indexes):
            row = self.df.iloc[ind]
            doc_number = row['DocNumber']
            try:
                other_info, start_index_paragraph = self._search_other_info(ind, doc_number)
            except (KeyError, IndexError):
                # Inconsistent table (missing label or nothing left after filtering):
                # fall back to the hit text itself.
                other_info, start_index_paragraph = row['Text'], ind
                self.logger.info('Ошибка в индексе, проверьте БД!')
            if len(other_info) == 0:
                other_info, start_index_paragraph = row['Text'], ind

            answers[i] = {
                'id': doc_number,
                'index_answer': int(ind),
                'doc_name': self._clear_doc_name(ind),
                'title': row['Title'],
                'text_answer': row['Text'],
                'other_info': [other_info],
                'start_index_paragraph': int(start_index_paragraph),
            }
        return self.filter_answer(answers)