Spaces:
Runtime error
Runtime error
| import json | |
| import logging | |
| import sys | |
| import time | |
| from pathlib import Path | |
| from elasticsearch import Elasticsearch | |
| from tqdm import tqdm | |
# Directory three levels above this file; appended to sys.path below.
ROOT_DIR = Path(__file__).resolve().parent.parent.parent
# sys.path holds strings, so the membership test must use the string form:
# comparing the Path object itself never matches and would append a duplicate
# entry every time this module is (re)executed.
if str(ROOT_DIR) not in sys.path:
    sys.path.append(str(ROOT_DIR))
| def create_index_elastic_people( | |
| path: str, | |
| logger: logging.Logger | None = None, | |
| ): | |
| if logger is None: | |
| logger = logging.getLogger(__name__) | |
| # Подключение к Elasticsearch | |
| es = Elasticsearch(hosts='localhost:9200') | |
| INDEX_NAME = 'people_search' | |
| # Удаление старого индекса, если он существует | |
| if es.indices.exists(index=INDEX_NAME): | |
| es.indices.delete(index=INDEX_NAME) | |
| mapping = { | |
| "settings": { | |
| "analysis": { | |
| "char_filter": { | |
| "quote_removal": { | |
| "type": "pattern_replace", | |
| "pattern": "[\"«»]", | |
| "replacement": "", | |
| } | |
| }, | |
| "filter": { | |
| # "russian_stemmer": { | |
| # "type": "stemmer", | |
| # "name": "russian" | |
| # }, | |
| "custom_stopwords": { | |
| "type": "stop", | |
| "stopwords": [ | |
| "кто", | |
| "является", | |
| "куратором", | |
| "руководит", | |
| "отвечает", | |
| "бизнес", | |
| "за что", | |
| "ООО", | |
| "ОАО", | |
| "НН", | |
| "персональный", | |
| "состав", | |
| "персональный", | |
| "состав", | |
| "Комитета", | |
| "ПАО", | |
| "ГМК", | |
| "Норильский никель", | |
| "Рабочей группы", | |
| "что", | |
| "как", | |
| "почему", | |
| "зачем", | |
| "где", | |
| "когда", | |
| ], | |
| } | |
| }, | |
| "analyzer": { | |
| "custom_analyzer": { | |
| "type": "custom", | |
| "char_filter": ["quote_removal"], | |
| "tokenizer": "standard", | |
| "filter": [ | |
| "lowercase", | |
| "custom_stopwords", | |
| # "russian_stemmer" | |
| ], | |
| } | |
| }, | |
| } | |
| }, | |
| "mappings": { | |
| "properties": { | |
| "business_processes": { | |
| "type": "nested", | |
| "properties": { | |
| "production_activities_section": { | |
| "type": "text", | |
| "analyzer": "custom_analyzer", | |
| "search_analyzer": "custom_analyzer", | |
| }, | |
| "processes_name": { | |
| "type": "text", | |
| "analyzer": "custom_analyzer", | |
| "search_analyzer": "custom_analyzer", | |
| }, | |
| "level_process": { | |
| "type": "text", | |
| "analyzer": "custom_analyzer", | |
| "search_analyzer": "custom_analyzer", | |
| }, | |
| }, | |
| }, | |
| "organizatinal_structure": { | |
| "type": "nested", | |
| "properties": { | |
| "position": { | |
| "type": "text", | |
| "analyzer": "custom_analyzer", | |
| "search_analyzer": "custom_analyzer", | |
| }, | |
| "leads": { | |
| "type": "nested", | |
| "properties": { | |
| "0": { | |
| "type": "text", | |
| "analyzer": "custom_analyzer", | |
| "search_analyzer": "custom_analyzer", | |
| }, | |
| "1": { | |
| "type": "text", | |
| "analyzer": "custom_analyzer", | |
| "search_analyzer": "custom_analyzer", | |
| }, | |
| }, | |
| }, | |
| "subordinate": { | |
| "type": "object", | |
| "properties": { | |
| "person_name": { | |
| "type": "text", | |
| "analyzer": "custom_analyzer", | |
| "search_analyzer": "custom_analyzer", | |
| }, | |
| "position": { | |
| "type": "text", | |
| "analyzer": "custom_analyzer", | |
| "search_analyzer": "custom_analyzer", | |
| }, | |
| }, | |
| }, | |
| }, | |
| }, | |
| "business_curator": { | |
| "type": "nested", | |
| "properties": { | |
| "division": { | |
| "type": "text", | |
| "analyzer": "custom_analyzer", | |
| "search_analyzer": "custom_analyzer", | |
| }, | |
| "company_name": { | |
| "type": "text", | |
| "analyzer": "custom_analyzer", | |
| "search_analyzer": "custom_analyzer", | |
| }, | |
| }, | |
| }, | |
| "groups": { | |
| "type": "nested", | |
| "properties": { | |
| "group_name": { | |
| "type": "text", | |
| "analyzer": "custom_analyzer", | |
| "search_analyzer": "custom_analyzer", | |
| }, | |
| "position_in_group": { | |
| "type": "text", | |
| "analyzer": "custom_analyzer", | |
| "search_analyzer": "custom_analyzer", | |
| }, | |
| "block": {"type": "keyword", "null_value": "unknown"}, | |
| }, | |
| }, | |
| "person_name": { | |
| "type": "text", | |
| "analyzer": "custom_analyzer", | |
| "search_analyzer": "custom_analyzer", | |
| }, | |
| } | |
| }, | |
| } | |
| # Создание индекса с указанным маппингом | |
| es.indices.create(index=INDEX_NAME, body=mapping) | |
| group_names = [] | |
| for ind, path in tqdm(enumerate(Path(path).iterdir())): | |
| # Открываем файл и читаем его содержимое | |
| try: | |
| with open(path, 'r', encoding='utf-8') as file: | |
| data = json.load(file) | |
| # Индексирование документа в Elasticsearch | |
| es.index(index=INDEX_NAME, id=ind + 1, body=data) | |
| time.sleep(0.5) | |
| except: | |
| print(f"Ошибка при чтении или добавлении файла {path.name} в индекс") | |
| if es.indices.exists(index=INDEX_NAME): | |
| print(f"Index '{INDEX_NAME}' exists.") | |
| # Подсчет количества документов в индексе | |
| count_response = es.count(index=INDEX_NAME) | |
| print(f"Total documents in '{INDEX_NAME}': {count_response['count']}") | |
| def get_elastic_people_query(query): | |
| has_business_curator = ( | |
| "бизнес куратор" in query.lower() or "бизнес-куратор" in query.lower() | |
| ) | |
| business_curator_boost = 20 if has_business_curator else 15 | |
| return { | |
| "query": { | |
| "function_score": { | |
| "query": { | |
| "bool": { | |
| "should": [ | |
| { | |
| "multi_match": { | |
| "query": query, | |
| "fields": ["person_name^3"], | |
| "fuzziness": "AUTO", | |
| "analyzer": "custom_analyzer", | |
| } | |
| }, | |
| { | |
| "nested": { | |
| "path": "business_processes", | |
| "query": { | |
| "multi_match": { | |
| "query": query, | |
| "fields": [ | |
| "business_processes.production_activities_section", | |
| "business_processes.processes_name", | |
| ], | |
| "fuzziness": "AUTO", | |
| "analyzer": "custom_analyzer", | |
| } | |
| }, | |
| } | |
| }, | |
| { | |
| "nested": { | |
| "path": "organizatinal_structure", | |
| "query": { | |
| "multi_match": { | |
| "query": query, | |
| "fields": [ | |
| "organizatinal_structure.position^2" | |
| ], | |
| "fuzziness": "AUTO", | |
| "analyzer": "custom_analyzer", | |
| } | |
| }, | |
| } | |
| }, | |
| { | |
| "nested": { | |
| "path": "business_curator", | |
| "query": { | |
| "multi_match": { | |
| "query": query, | |
| "fields": [ | |
| f"business_curator.company_name^{business_curator_boost}" | |
| ], | |
| "fuzziness": "AUTO", | |
| "analyzer": "custom_analyzer", | |
| } | |
| }, | |
| } | |
| }, | |
| ] | |
| } | |
| } | |
| } | |
| } | |
| } | |
| query = 'кто бизнес куратор ООО Медвежий ручей?' | |
| # Выполнение поиска в Elasticsearch | |
| response = es.search(index=INDEX_NAME, body=get_elastic_people_query(query), size=2) | |
| logger.info(f"Number of hits: {response['hits']['total']['value']}") | |
| # Вывод результата поиска | |
| for hit in response['hits']['hits']: | |
| logger.info(hit['_source']) | |
if __name__ == '__main__':
    # Without a configured handler the INFO-level search results logged by
    # create_index_elastic_people would be dropped.
    logging.basicConfig(level=logging.INFO)
    # An optional first command-line argument overrides the default directory.
    path = (
        sys.argv[1]
        if len(sys.argv) > 1
        else '/mnt/ntr_work/data/фывфыаыфвфы/person_card'
    )
    create_index_elastic_people(path)