Spaces:
Runtime error
Runtime error
| from underthesea import sent_tokenize | |
| def substring(w, ls): | |
| for w2 in ls: | |
| if w != w2 and w in w2: | |
| return True | |
| return False | |
| def get_ner_phrases(sent_ner_result): | |
| ner_list = [] | |
| current_ner = [sent_ner_result[0]["word"]] | |
| current_idx = sent_ner_result[0]["index"] | |
| for i in range(1, len(sent_ner_result)): | |
| if sent_ner_result[i]["index"] == current_idx + 1: | |
| current_ner.append(sent_ner_result[i]["word"]) | |
| else: | |
| ner_list.append((' '.join(current_ner), sent_ner_result[i - 1]['entity'])) | |
| current_ner = [sent_ner_result[i]["word"]] | |
| current_idx = sent_ner_result[i]["index"] | |
| ner_list.append((' '.join(current_ner), sent_ner_result[len(sent_ner_result) - 1]['entity'])) | |
| return ner_list | |
| def get_named_entities(nlp, doc): | |
| ner_lists = [] | |
| for sent in sent_tokenize(doc): | |
| sent_ner_result = nlp(sent) | |
| if len(sent_ner_result) > 0: | |
| ner_lists += get_ner_phrases(sent_ner_result) | |
| ner_list_non_dup = [] | |
| for (entity, ner_type) in ner_lists: | |
| if entity not in ner_list_non_dup and ner_type.startswith('I'): | |
| ner_list_non_dup.append(entity) | |
| ner_list_final = [w.replace(" ##", "") for w in ner_list_non_dup if not substring(w, ner_list_non_dup)] | |
| return ner_list_final | |