Spaces:

dperales
/

ITACA_Insurance_Core_v4

Sleeping

App Files Files Community

ITACA_Insurance_Core_v4 / named_entity_recognition.py

dperales

Upload 12 files

b2fbe3d over 2 years ago

raw

history blame contribute delete

2.19 kB

	from transformers import AutoTokenizer, AutoModelForTokenClassification
	from transformers import pipeline


	class NamedEntityRecognition:
	    """
	    Named Entity Recognition on text data.

	    Attributes:
	        tokenizer: The Hugging Face tokenizer loaded for the chosen checkpoint
	        model: The Hugging Face token-classification model for that checkpoint
	        nlp: The Hugging Face "ner" pipeline built from the two objects above
	    """

	    # Checkpoint used when the caller does not ask for a specific one.
	    DEFAULT_MODEL = "xlm-roberta-large-finetuned-conll03-english"

	    # Predictions whose text is one of these are treated as noise and are
	    # rendered as plain text instead of as a labelled entity.
	    _EXCLUDED_SPANS = frozenset({'', '.', '. ', ' '})

	    def __init__(self, model_name=DEFAULT_MODEL):
	        """
	        Load the tokenizer and model and build the NER pipeline.

	        Parameters:
	            model_name (str): Checkpoint identifier passed to ``from_pretrained``;
	                defaults to the checkpoint this class has always used
	        """
	        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
	        self.model = AutoModelForTokenClassification.from_pretrained(model_name)
	        # NOTE(review): newer transformers releases document
	        # `aggregation_strategy` as the replacement for `grouped_entities`;
	        # confirm against the installed version before switching.
	        self.nlp = pipeline(
	            "ner",
	            model=self.model,
	            tokenizer=self.tokenizer,
	            grouped_entities=True,
	        )

	    def get_annotation(self, preds, text):
	        """
	        Get html annotation for displaying entities over text.

	        The text is cut at every prediction's ``start``/``end`` offset. Each
	        prediction's own slice is labelled by position, so the label never
	        depends on the predicted ``word`` matching the raw text, and repeated
	        surface forms keep their individual labels.

	        Parameters:
	            preds (list): List of dicts, one per entity, each holding at least
	                'start', 'end' and 'entity_group' (and optionally 'word');
	                assumed to be ordered by position and non-overlapping
	            text (str): The user input string the predictions were made on

	        Returns:
	            final_annotation (list): Pieces of ``text`` in reading order; plain
	                pieces are str, entity pieces are ``(piece, entity_group, "")``
	                tuples. Pieces may be empty strings (for example when the text
	                starts or ends with an entity).
	        """
	        final_annotation = []
	        cursor = 0
	        for pred in preds:
	            start, end = pred['start'], pred['end']

	            # Plain text between the previous entity and this one.
	            final_annotation.append(text[cursor:start])

	            span = text[start:end]
	            is_noise = (
	                span in self._EXCLUDED_SPANS
	                or pred.get('word') in self._EXCLUDED_SPANS
	            )
	            if is_noise:
	                # Exclude bad preds: keep the characters, drop the label.
	                final_annotation.append(span)
	            else:
	                final_annotation.append((span, pred['entity_group'], ""))

	            cursor = end

	        # Whatever follows the last entity (the whole text when preds is empty).
	        final_annotation.append(text[cursor:])
	        return final_annotation

	    def classify(self, text):
	        """
	        Recognize Named Entities in text.

	        Parameters:
	            text (str): The user input string to generate entity tags for

	        Returns:
	            preds: The raw output of the NER pipeline for ``text``
	            ner_annotation (list): The result of ``get_annotation`` for those
	                predictions
	        """
	        preds = self.nlp(text)
	        ner_annotation = self.get_annotation(preds, text)
	        return preds, ner_annotation