# core/van_normalizer.py
import re

import nltk
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

# Make sure you have these (run once if missing):
#   python -m nltk.downloader punkt averaged_perceptron_tagger wordnet omw-1.4

lemmatizer = WordNetLemmatizer()


def normalize_to_van(text: str) -> str:
    """
    VAN-based normalization (optimized for clinical trial domain):
    - Lowercases and removes punctuation
    - Tokenizes and POS-tags
    - Keeps only Nouns (N), Adjectives (J), and key Verbs (V)
    - Explicitly removes determiners/articles (a, an, the)
    - Lemmatizes each token to its base form
    - Returns a space-joined string suitable for FAISS embedding
    """
    if not text:
        return ""

    # Basic cleanup
    text = text.lower().strip()
    text = re.sub(r"[^a-z0-9\s-]", " ", text)  # remove punctuation

    tokens = word_tokenize(text)

    # POS tagging
    tagged = pos_tag(tokens)

    filtered = []
    for word, tag in tagged:
        # Skip common determiners, articles, and auxiliary verbs
        if word in {"a", "an", "the", "is", "are", "was", "were", "be", "been", "being"}:
            continue
        # Keep only verbs, adjectives, and nouns
        if tag.startswith("V") or tag.startswith("J") or tag.startswith("N"):
            filtered.append((word, tag))

    # Lemmatize each word with its appropriate part of speech
    lemmas = []
    for word, tag in filtered:
        pos = (
            "v" if tag.startswith("V")
            else "a" if tag.startswith("J")
            else "n"
        )
        lemmas.append(lemmatizer.lemmatize(word, pos))

    # Join and clean
    normalized = " ".join(lemmas).strip()
    normalized = re.sub(r"\s+", " ", normalized)  # collapse multiple spaces
    return normalized
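

if __name__ == "__main__":
    # Illustrative usage sketch only (not part of the original module): runs
    # normalize_to_van on a made-up clinical-trial-style sentence as a quick
    # sanity check.
    sample = "The patients were randomized to the experimental treatment arm."
    print(normalize_to_van(sample))
    # Expected shape of the output: lemmatized nouns/adjectives/verbs only,
    # e.g. something like "patient randomize experimental treatment arm"
    # (exact tokens depend on the installed NLTK tagger/lemmatizer versions).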