| import re | |
| import nltk | |
| from nltk.corpus import stopwords | |
def eliminar_acento(s):
    """Return *s* with acute-accented vowels replaced by their plain form.

    Only the five vowels á, é, í, ó, ú and their uppercase counterparts
    are mapped; every other character (including ñ and ü) is left as is.
    """
    plain_by_accented = {"á": "a", "é": "e", "í": "i", "ó": "o", "ú": "u"}

    # Build one translation table covering both cases, so the whole string
    # is rewritten in a single pass.
    table = {}
    for accented, plain in plain_by_accented.items():
        table[ord(accented)] = plain
        table[ord(accented.upper())] = plain.upper()

    return s.translate(table)
# Stop-word sets already loaded, keyed by NLTK language name, so the corpus
# is read (and, if needed, downloaded) at most once per process.
_STOPWORDS_POR_IDIOMA = {}


def _cargar_stopwords(idioma):
    """Return the NLTK stop-word set for *idioma*, downloading it only if missing."""
    if idioma not in _STOPWORDS_POR_IDIOMA:
        try:
            palabras = stopwords.words(idioma)
        except LookupError:
            # Corpus not installed locally: fetch it once, then retry.
            nltk.download('stopwords')
            palabras = stopwords.words(idioma)
        _STOPWORDS_POR_IDIOMA[idioma] = frozenset(palabras)
    return _STOPWORDS_POR_IDIOMA[idioma]


def eliminar_patrones_stopwords(text):
    """Remove Spanish stop words from *text*.

    The text is split on whitespace, every token that appears verbatim in
    NLTK's Spanish stop-word list is dropped, and the remaining tokens are
    joined back with single spaces. Matching is exact and case-sensitive,
    so callers should lowercase the text beforehand.

    Args:
        text: The string to filter.

    Returns:
        The filtered text, with tokens separated by single spaces.
    """
    lstopwords = _cargar_stopwords('spanish')
    return ' '.join(word for word in text.split() if word not in lstopwords)
def eliminar_espacios_blancos(texto):
    """Strip punctuation-like characters from *texto*.

    Despite its name, this function never removes whitespace. It applies
    three substitutions in order:

    1. delete colons and underscores,
    2. collapse the gendered ending "o/a" to "o",
    3. delete every character that is neither a word character nor
       whitespace.

    Step 2 must run before step 3, which would otherwise remove the slash.
    """
    sustituciones = (
        (r"\:|\_", ''),
        (r"o\/a", 'o'),
        (r'[^\w\s]', ''),
    )
    for patron, reemplazo in sustituciones:
        texto = re.sub(patron, reemplazo, texto)
    return texto
def clean_text(original):
    """Normalize *original* into a lowercase, stop-word-free string.

    Steps, in order:

    1. Dots between word characters ("a.b") become spaces so the parts
       stay separate tokens; any remaining dots are deleted.
    2. Acute accents are removed via ``eliminar_acento``.
    3. Punctuation is removed via ``eliminar_espacios_blancos``.
    4. Runs of spaces are collapsed and the text is lowercased.
    5. Spanish stop words are removed via ``eliminar_patrones_stopwords``,
       which also rejoins the tokens with single spaces.

    NOTE(review): accents are stripped before stop-word filtering, so any
    accented entries in the stop-word list cannot match the already
    unaccented text. Confirm whether this order is intended.

    Args:
        original: The raw input string.

    Returns:
        The cleaned text.
    """
    # Turn intra-token dots into spaces, then drop whatever dots are left
    # (e.g. sentence-final ones).
    sin_puntos = re.sub(
        r'\w+(?:\.+\w+)*',
        lambda x: x.group(0).replace('.', ' '),
        original,
    )
    sin_puntos = re.sub(r'\.', '', sin_puntos)

    texto = eliminar_acento(sin_puntos)
    texto = eliminar_espacios_blancos(texto)
    texto = re.sub(r" +", ' ', texto)
    texto = texto.lower()

    # The stop-word filter already returns single-space-joined tokens, so
    # no further whitespace cleanup is needed afterwards.
    return eliminar_patrones_stopwords(texto)