# core/van_normalizer.py
import re

import nltk
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

# make sure you have these (run once if missing):
# python -m nltk.downloader punkt averaged_perceptron_tagger wordnet omw-1.4

# Shared, module-level lemmatizer: WordNetLemmatizer is stateless, so one
# instance can safely be reused by every call to normalize_to_van().
lemmatizer = WordNetLemmatizer()
def normalize_to_van(text: str) -> str:
    """
    VAN-based normalization (optimized for clinical trial domain).

    Pipeline:
    - Lowercases and removes punctuation (keeps letters, digits, hyphens)
    - Tokenizes and POS-tags with NLTK
    - Keeps only Nouns (N*), Adjectives (J*), and Verbs (V*)
    - Explicitly drops determiners/articles and auxiliary "be" forms
    - Lemmatizes each kept token to its base form for its POS
    - Returns a space-joined string suitable for FAISS embedding

    Parameters
    ----------
    text : str
        Raw input text; falsy values (``""``, ``None``) are accepted.

    Returns
    -------
    str
        Space-separated lemmas, or ``""`` for empty input.
    """
    if not text:
        return ""

    # Basic cleanup: lowercase, then blank out every character that is not
    # a lowercase letter, digit, whitespace, or hyphen.
    text = re.sub(r"[^a-z0-9\s-]", " ", text.lower().strip())

    # Dropped regardless of POS tag: articles/determiners and auxiliary
    # "be" forms carry no signal for this domain.
    stopwords = {"a", "an", "the", "is", "are", "was", "were", "be", "been", "being"}

    # First letter of the Penn Treebank tag -> WordNet POS code expected by
    # WordNetLemmatizer. Tags outside V/J/N are discarded entirely.
    tag_to_pos = {"V": "v", "J": "a", "N": "n"}

    # Single pass: filter and lemmatize together (the original did this in
    # two loops over an intermediate list).
    lemmas = []
    for word, tag in pos_tag(word_tokenize(text)):
        if word in stopwords:
            continue
        pos = tag_to_pos.get(tag[:1])
        if pos is None:
            # Not a verb/adjective/noun -- outside the VAN vocabulary.
            continue
        lemmas.append(lemmatizer.lemmatize(word, pos))

    # Lemmas contain no whitespace, so a plain join is already normalized;
    # the original's trailing strip()/collapse-spaces pass was dead code.
    return " ".join(lemmas)