Spaces:

somosnlp-hackathon-2023
/

demo_DiagTrast

Sleeping

demo_DiagTrast / utils.py

First try testing model

3be88b5 over 2 years ago

1.16 kB

	import re
	import nltk
	from nltk.corpus import stopwords

	def eliminar_acento(s):
		"""Return ``s`` with acute-accented vowels replaced by plain vowels.

		Only the five acute-accented vowels are mapped, in both lower and
		upper case ("á" -> "a", "Á" -> "A", ...). Every other character,
		including "ñ" and "ü", is returned unchanged.
		"""
		acentuadas = "áéíóú"
		simples = "aeiou"
		# A single translation table covers both cases in one pass.
		tabla = str.maketrans(
			acentuadas + acentuadas.upper(),
			simples + simples.upper(),
		)
		return s.translate(tabla)

	# Lazily populated cache of the Spanish stop-word list (see helper below).
	_STOPWORDS_ES = None


	def _spanish_stopwords():
		"""Return the NLTK Spanish stop words as a frozenset, loading them once."""
		global _STOPWORDS_ES
		if _STOPWORDS_ES is None:
			try:
				palabras = stopwords.words('spanish')
			except LookupError:
				# Corpus not installed yet: fetch it once, then retry.
				nltk.download('stopwords')
				palabras = stopwords.words('spanish')
			_STOPWORDS_ES = frozenset(palabras)
		return _STOPWORDS_ES


	def eliminar_patrones_stopwords(text):
		"""Remove Spanish stop words from ``text``.

		The text is split on whitespace, tokens that appear verbatim in the
		NLTK Spanish stop-word list are dropped, and the remaining tokens are
		joined with single spaces. Matching is exact and case-sensitive, so
		callers should lower-case the text beforehand.

		The stop-word corpus is downloaded only if it is missing locally, and
		the resulting set is cached, so repeated calls do not hit the network
		or re-read the corpus.
		"""
		lstopwords = _spanish_stopwords()
		# split() with no arguments already ignores leading/trailing whitespace.
		return ' '.join(word for word in text.split() if word not in lstopwords)

	def eliminar_espacios_blancos(texto):
		"""Strip punctuation-like characters from ``texto``.

		Despite its name, this function leaves whitespace untouched. It applies
		three substitutions in order and returns the result:

		1. delete the literal sequence ":|_";
		2. collapse the gendered ending "o/a" to "o";
		3. delete every character that is neither a word character nor
		   whitespace.
		"""
		sustituciones = (
			# NOTE(review): the pipe is escaped, so this matches only the exact
			# three-character sequence ":|_", not ":" or "_" individually.
			# Confirm whether an alternation was intended.
			(r"\:\|\_", ''),
			(r"o\/a", 'o'),
			(r'[^\w\s]', ''),
		)
		for patron, reemplazo in sustituciones:
			texto = re.sub(patron, reemplazo, texto)
		return texto

	def clean_text(original):
		"""Normalise raw text and return the cleaned string.

		Steps, in order:

		1. dots inside a token ("a.b") become spaces, then any remaining dots
		   are deleted;
		2. acute accents are stripped via ``eliminar_acento``;
		3. punctuation is removed via ``eliminar_espacios_blancos``;
		4. runs of spaces are collapsed to one space;
		5. the text is lower-cased;
		6. stop words are removed via ``eliminar_patrones_stopwords``.
		"""
		# Replace dots that join word characters with spaces so the parts stay
		# separate tokens, then drop any dots left over (e.g. sentence ends).
		sin_puntos = re.sub(
			r'\w+(?:\.+\w+)*',
			lambda coincidencia: coincidencia.group(0).replace('.', ' '),
			original,
		)
		sin_puntos = re.sub(r'\.', '', sin_puntos)

		texto = eliminar_acento(sin_puntos)
		texto = eliminar_espacios_blancos(texto)
		texto = re.sub(r" +", ' ', texto)

		texto = texto.lower()
		# NOTE(review): accents are stripped before stop-word filtering, so any
		# accented entries in the stop-word list can no longer match. Confirm
		# this ordering is intended.
		#
		# The former trailing space-collapsing substitution stored its result in
		# an unused variable and never affected the return value, so it has
		# been dropped.
		return eliminar_patrones_stopwords(texto)