Spaces:
Runtime error
Runtime error
| from typing import Tuple, List, Dict, Union | |
| import faiss | |
| import pandas as pd | |
| import numpy as np | |
| import torch | |
| from common.constants import COLUMN_DOC_NAME | |
| from common.constants import COLUMN_EMBEDDING | |
| from common.constants import COLUMN_LABELS_STR | |
| from common.constants import COLUMN_NAMES | |
| from common.constants import COLUMN_TABLE_NAME | |
| from common.constants import COLUMN_TYPE_DOC_MAP | |
class FaissVectorDatabase:
    """Couple a FAISS index with the metadata rows that describe each vector.

    Attributes set by the constructor:
        df: Metadata table. Rows are addressed positionally (``iloc``) with
            the labels returned by the FAISS index, so row order must match
            the order in which the embeddings were added to the index.
        global_df: Optional second table, read only by
            ``search_by_group_and_person``.
        path_to_metadata: Path the metadata was loaded from (``None`` when a
            DataFrame was passed directly).
        index: ``faiss.IndexFlatL2`` built from the ``COLUMN_EMBEDDING`` column.
    """

    def __init__(self, path_to_metadata: str = None, df: pd.DataFrame = None, global_df: pd.DataFrame = None):
        """Load (or accept) the metadata table and build the FAISS index.

        Args:
            path_to_metadata: Pickle file to read the metadata from; used only
                when ``df`` is not a DataFrame.
            df: Ready-made metadata table; takes precedence over the path.
            global_df: Optional table used by ``search_by_group_and_person``.

        Raises:
            ValueError: If neither ``df`` nor ``path_to_metadata`` is given.
        """
        # Define both attributes on every code path so that later reads never
        # fail with AttributeError depending on how the object was built.
        self.path_to_metadata = path_to_metadata
        self.global_df = global_df
        if isinstance(df, pd.DataFrame):
            self.df = df
        else:
            if path_to_metadata is None:
                raise ValueError('Either `df` or `path_to_metadata` must be provided.')
            self.__load_metadata()
        self.__crate_index()

    def __load_metadata(self):
        """Load the metadata file and replace missing values with ``None``."""
        # NOTE(security): read_pickle executes arbitrary code embedded in the
        # file; only load metadata files that come from a trusted source.
        self.df = pd.read_pickle(self.path_to_metadata)
        self.df = self.df.where(pd.notna(self.df), None)

    def __crate_index(self):
        """Create the faiss index from the ``COLUMN_EMBEDDING`` column.

        The method name keeps its historical spelling.

        Raises:
            ValueError: If the embeddings do not form a 2-D matrix
                (for example when the metadata table is empty).
        """
        # FAISS works on contiguous float32 matrices; cast explicitly instead
        # of relying on whatever dtype the stored lists happen to produce.
        embeddings = np.ascontiguousarray(
            np.array(self.df[COLUMN_EMBEDDING].tolist()), dtype=np.float32
        )
        if embeddings.ndim != 2:
            raise ValueError(
                f'Embeddings must form a 2-D matrix, got shape {embeddings.shape}.'
            )
        dim = embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(embeddings)

    @staticmethod
    def _check_query_shape(emb_query) -> None:
        """Raise ``ValueError`` unless the query embedding is two-dimensional."""
        # An explicit raise (rather than ``assert``) keeps the check active
        # when Python runs with -O and gives the error an actual message.
        if len(emb_query.shape) != 2:
            raise ValueError('Не правильный размер вектора!')

    def _collect_paragraph(self, pattern: str, doc_number: str, ind: int, shape: int,
                           only_unduplicated: bool) -> Tuple[List, Union[int, None]]:
        """Shared implementation of ``_paragraph_content`` and ``_paragraph_content2``.

        Args:
            pattern: Regular expression matched against the ``'Pargaraph'`` column.
            doc_number: Value the ``'DocNumber'`` column must equal.
            ind: Positional index of the reference row in ``self.df``.
            shape: If at most this many rows are selected, nothing is returned.
            only_unduplicated: When the reference row has no ``'Duplicate'``
                marker, keep only rows that have no marker either.

        Returns:
            ``(paragraphs, start_index)``: the selected texts and the index
            label of the first selected row, or ``([], None)``.
        """
        same_document = self.df['DocNumber'] == doc_number
        matches_pattern = self.df['Pargaraph'].str.match(pattern, na=False)
        rows = self.df[same_document & matches_pattern]

        duplicate = self.df.iloc[ind]['Duplicate']
        if duplicate is not None:
            rows = rows[rows['Duplicate'] == duplicate]
        elif only_unduplicated:
            rows = rows[rows['Duplicate'].isna()]

        if rows.shape[0] <= shape:
            return [], None

        # The first selected row is treated as the header: it is kept (with a
        # trailing newline) and its text is stripped from the following rows.
        header_text = rows.iloc[0]['Text']
        paragraphs = []
        for position, text in enumerate(rows['Text']):
            if position == 0:
                text = text.replace(f'{header_text}', f'{header_text}\n')
            else:
                text = text.replace(f'{header_text}', '') + '\n'
            paragraphs.append(text)
        return paragraphs, rows.index[0]

    def _paragraph_content2(self, pattern: str, doc_number: str, ind: int, shape: int) -> Tuple[List, Union[int, None]]:
        """Return the content of a paragraph, including its sub-items.

        Same as ``_paragraph_content`` except that, when the reference row has
        no ``'Duplicate'`` marker, rows are NOT filtered by that column.

        Args:
            pattern: Search pattern (regular expression).
            doc_number: Document number.
            ind: Positional index of the row in the DataFrame.
            shape: DataFrame size at (or below) which an empty list is returned.

        Returns:
            The list of sub-sections and the index label of the first one,
            or ``([], None)``.

        Examples:
            3.1. Paragraph:
                1) - Content 1;
                2) - Content 2;
                3) - Content 3;
        """
        return self._collect_paragraph(pattern, doc_number, ind, shape, only_unduplicated=False)

    def _paragraph_content(self, pattern: str, doc_number: str, ind: int, shape: int) -> Tuple[List, Union[int, None]]:
        """Return the content of a paragraph, including its sub-items.

        Rows are restricted to those sharing the reference row's
        ``'Duplicate'`` value; if the reference row has none, only rows
        without a ``'Duplicate'`` value are kept.

        Args:
            pattern: Search pattern (regular expression).
            doc_number: Document number.
            ind: Positional index of the row in the DataFrame.
            shape: DataFrame size at (or below) which an empty list is returned.

        Returns:
            The list of sub-sections and the index label of the first one,
            or ``([], None)``.

        Examples:
            3.1. Paragraph:
                1) - Content 1;
                2) - Content 2;
                3) - Content 3;
        """
        return self._collect_paragraph(pattern, doc_number, ind, shape, only_unduplicated=True)

    def _get_top_paragraph(self):
        """Placeholder; not implemented."""
        pass

    def _search_other_info(self, ind, doc_number):
        """Collect the surrounding context for the row at position ``ind``.

        Args:
            ind: Positional index of the hit in ``self.df``.
            doc_number: Document number of the hit.

        Returns:
            ``(other_info, start_index_paragraph)``:
            * table rows (``'PartLevel1'`` contains ``'Table'``): ``([], ind)``;
            * appendix rows: the row's own text and ``ind``;
            * rows without a paragraph number: the row's own text and ``[]``
              (the empty list is the historical value and is kept for callers);
            * otherwise: the joined paragraph texts and the index label of the
              first paragraph row (``None`` when nothing was found).
        """
        row = self.df.iloc[ind]

        part_level = row['PartLevel1']
        if part_level is not None and 'Table' in str(part_level):
            return [], ind

        if row['Appendix'] is not None:
            # Read the hit row directly: positional access is what the rest of
            # the class uses, and no filtered copy of the table is needed.
            return [str(row['Text'])], ind

        paragraph_number = row['Pargaraph']
        if paragraph_number is None:
            return [str(row['Text'])], []

        other_info = []
        escaped = paragraph_number.replace('.', r'\.')
        # First try: rows carrying the same paragraph number (the trailing
        # "?" makes the last character of the number optional).
        paragraph, start_index_paragraph = self._paragraph_content(
            fr'^{escaped}?$', doc_number, ind, 1
        )

        # Special case for one specific heading: also attach the next row.
        if 'Компания обязуется в области охраны труда' in escaped and ind + 1 < len(self.df):
            other_info.append(str(self.df.iloc[ind + 1]['Text']))

        # TODO: known bug related to documents without numbered items.
        if not paragraph:
            if row['LevelParagraph'] != '0':
                # Fall back to the siblings: drop the last number component
                # and match "<parent>.<digit>". The parent is joined with an
                # escaped dot so that "4.1" cannot match e.g. "4X1".
                parts = [part for part in paragraph_number.split('.') if part]
                parent = r'\.'.join(parts[:-1])
                pattern = f'^{parent}\\.\\d.?$'
            else:
                # Top-level paragraph: match its direct children.
                pattern = escaped
                if '.' not in pattern:
                    pattern = pattern + r'\.'
                pattern = f'^{pattern}\\d.?$'
            paragraph, start_index_paragraph = self._paragraph_content2(
                pattern, doc_number, ind, 0
            )

        other_info.append(' '.join(paragraph))
        return other_info, start_index_paragraph

    def search(self, emb_query: torch.Tensor, k_neighbors: int, other_information: bool) -> dict:
        """Find the nearest answers for a query embedding.

        Args:
            emb_query: Query embedding; must be two-dimensional. Only row 0 of
                the search result is used.
            k_neighbors: Number of nearest answers to retrieve.
            other_information: Whether to add the surrounding paragraph
                context (``'other_info'`` and ``'start_index_paragraph'``).

        Returns:
            Dictionary keyed by result rank with the answer and its metadata.
            Ranks for which the index returned no neighbour are omitted.

        Raises:
            ValueError: If ``emb_query`` is not two-dimensional.
        """
        self._check_query_shape(emb_query)
        distances, indexes = self.index.search(emb_query, k_neighbors)
        answers = {}
        for i, ind in enumerate(indexes[0]):
            # FAISS pads the result with -1 when fewer than k vectors exist;
            # iloc[-1] would silently return the last row instead.
            if ind < 0:
                continue
            row = self.df.iloc[ind]
            answer = {
                'distance': float(distances[0][i]),
                'index_answer': int(ind),
                'doc_name': row['DocName'],
                'text_answer': row['Text'],
            }
            if other_information:
                other_info, start_index_paragraph = self._search_other_info(ind, row['DocNumber'])
                answer['other_info'] = other_info
                answer['start_index_paragraph'] = start_index_paragraph
            answers[i] = answer
        return answers

    def search_transaction_map(self, emb_query: torch.Tensor, k_neighbors: int) -> Dict[int, dict]:
        """Find the nearest answers for a query among the transaction maps.

        Args:
            emb_query: Query embedding; must be two-dimensional.
            k_neighbors: Number of nearest answers to retrieve.

        Returns:
            Dictionary keyed by result rank. Ranks for which the index
            returned no neighbour are omitted.

        Raises:
            ValueError: If ``emb_query`` is not two-dimensional.

        Notes:
            Each entry is a dictionary of the form
            {
                'distance': distance between the vectors
                'index_answer': positional index of the answer in df
                'doc_name': value of the COLUMN_DOC_NAME column
                'text_answer': value of the COLUMN_TABLE_NAME column
                COLUMN_LABELS_STR: value of that column
                COLUMN_NAMES: value of that column
                COLUMN_TYPE_DOC_MAP: value of that column
            }
        """
        self._check_query_shape(emb_query)
        distances, indexes = self.index.search(emb_query, k_neighbors)
        answers = {}
        for i, ind in enumerate(indexes[0]):
            if ind < 0:  # FAISS "no neighbour" marker
                continue
            row = self.df.iloc[ind]
            answers[i] = {
                # Plain Python numbers, consistent with ``search``.
                'distance': float(distances[0][i]),
                'index_answer': int(ind),
                'doc_name': row[COLUMN_DOC_NAME],
                'text_answer': row[COLUMN_TABLE_NAME],
                COLUMN_LABELS_STR: row[COLUMN_LABELS_STR],
                COLUMN_NAMES: row[COLUMN_NAMES],
                COLUMN_TYPE_DOC_MAP: row[COLUMN_TYPE_DOC_MAP],
            }
        return answers

    def search_by_group_and_person(self, emb_query: torch.Tensor, query: str, k_neighbors: int) -> Dict[int, dict]:
        """Look up people by name mentioned in the query, else by vector search.

        If a full name from ``global_df['ФИО']`` (or its first word) occurs in
        ``query``, the first such person is returned immediately, keyed by the
        position of the name in the list of unique names. Otherwise the index
        is searched and every hit's ``'unique_value'`` is matched against the
        ``'Должность'`` and ``'Группа'`` columns of ``global_df``.

        Args:
            emb_query: Query embedding; must be two-dimensional.
            query: Raw text of the query.
            k_neighbors: Number of nearest neighbours for the fallback search.

        Returns:
            Dictionary of entries with the keys ``'name'``, ``'position'``,
            ``'group'`` and ``'position_in_group'``.

        Raises:
            ValueError: If ``emb_query`` is not two-dimensional or no
                ``global_df`` was supplied to the constructor.
        """
        self._check_query_shape(emb_query)
        if self.global_df is None:
            raise ValueError('`global_df` is required for search_by_group_and_person.')

        answers = {}
        for i, name in enumerate(self.global_df['ФИО'].unique()):
            if not isinstance(name, str):
                continue  # missing names cannot be matched against the text
            if name in query or name.split(' ')[0] in query:
                person_rows = self.global_df[self.global_df['ФИО'] == name]
                answers[i] = {
                    'name': name,
                    'position': person_rows['Должность'].unique(),
                    'group': person_rows['Группа'].unique(),
                    'position_in_group': person_rows['Должность внутри группы'].unique(),
                }
                return answers

        _, indexes = self.index.search(emb_query, k_neighbors)
        for i, ind in enumerate(indexes[0]):
            if ind < 0:  # FAISS "no neighbour" marker
                continue
            unique_value = self.df.iloc[ind]['unique_value']
            matched = self.global_df[
                (self.global_df['Должность'] == unique_value)
                | (self.global_df['Группа'] == unique_value)
            ]
            answers[i] = {
                'name': matched['ФИО'].unique(),
                'position': matched['Должность'].unique(),
                'group': matched['Группа'].unique(),
                'position_in_group': matched['Должность внутри группы'].unique(),
            }
        return answers