"""Gradio app that scores free text and maps it onto pathways and GNH domains.

For a piece of input text the app computes:

* a 1-10 sentiment score, an emotion label and a 1-10 "accomplishment" score;
* a sentence-embedding "context vector", optionally pushed toward (boost) or
  away from (limit) pathway phrases loaded from a CSV file;
* the best-matching pathway and a ranking of GNH domains by similarity.

Importing this module loads several models and reads ``CSV_PATH``.
"""

import bisect
import os
import re
from functools import lru_cache
from typing import Dict, List, Optional, Tuple

import nltk
import spacy
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
import pandas as pd
import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util


# =========================
# 0) Lightweight setup
# =========================
def ensure_spacy():
    """Return the ``en_core_web_sm`` spaCy pipeline, downloading it if missing.

    Raises:
        OSError: if the model still cannot be loaded after the download.
    """
    try:
        return spacy.load("en_core_web_sm")
    except OSError:
        # NOTE: use ``from spacy.cli import ...`` rather than
        # ``import spacy.cli``. The latter binds ``spacy`` as a *local* name
        # for the whole function, which makes the ``spacy.load`` call above
        # raise UnboundLocalError and forces a download on every start.
        from spacy.cli import download as download_spacy_model

        download_spacy_model("en_core_web_sm")
        return spacy.load("en_core_web_sm")


def ensure_nltk():
    """Download the NLTK ``punkt`` tokenizer data if it is not installed."""
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt")


ensure_nltk()
nlp = ensure_spacy()

# =========================
# 1) Models (loaded once at import)
# =========================
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
bert_sentiment = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)
emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_name)
emotion_model = AutoModelForSequenceClassification.from_pretrained(emotion_model_name)

# =========================
# 2) GNH definitions
# =========================
# Domain label -> short description that is embedded and compared to the input.
GNH_DOMAINS: Dict[str, str] = {
    "Mental Wellness": "mental health, emotional clarity, peace of mind",
    "Social Wellness": "relationships, community, friendship, social harmony",
    "Economic Wellness": "income, savings, financial stability, cost of living",
    "Workplace Wellness": "career, work-life balance, promotion, productivity",
    "Physical Wellness": "physical health, sleep, fitness, exercise",
    "Environmental Wellness": "green space, nature, environmental care",
    "Health": "healthcare, medical care, recovery, well-being",
    "Education Value": "learning, education, school, knowledge, wisdom",
    "Good Governance": "freedom, justice, fairness, democratic participation",
    "Living Standards": "housing, wealth, basic needs, affordability",
    "Cultural Diversity": "tradition, language, cultural expression, heritage",
    "Political Wellness": "rights, law, free speech, civic participation",
    "Ecological Diversity": "biodiversity, forest, ecosystem, wildlife",
}

# Domain label -> bar color used in the indicator plot.
GNH_COLORS: Dict[str, str] = {
    "Economic Wellness": "#808080",
    "Mental Wellness": "#ffc0cb",
    "Workplace Wellness": "#ffd700",
    "Physical Wellness": "#f5deb3",
    "Social Wellness": "#ffa500",
    "Political Wellness": "#ffffff",
    "Environmental Wellness": "#87ceeb",
    "Ecological Diversity": "#228B22",
    "Health": "#ff6347",
    "Good Governance": "#000000",
    "Education Value": "#8b4513",
    "Living Standards": "#ffff00",
    "Cultural Diversity": "#9370db",
}

# =========================
# 3) Pathways (CSV + images)
# =========================
CSV_PATH = "la matrice plus.csv"

# UI label -> internal key
SEQUENCE_ALIASES = {
    "Auto (recommend)": "auto",
    "Direct": "direct",
    "Fem": "feminine",
    "Knot": "knot",
    "Masc": "masculine",
    "Pain": "pain",
    "Prayer": "prayer",
    "Precise": "precise",
    "Practical": "practical",
    "Plot": "plot",
    "Spiritual": "spiritual",
    "Sad": "sad",
}

# Internal key -> image file. Keys must match the *values* of SEQUENCE_ALIASES,
# hence "masculine" (not "masc") for the "Masc" UI label.
SEQUENCE_IMAGE_FILES = {
    "direct": "direct pathway.png",
    "feminine": "fem pathway.png",
    "knot": "knot pathway.png",
    "masculine": "masc pathway.png",
    "pain": "pain pathway.png",
    "prayer": "prayer pathway.png",
    "precise": "precise pathway.png",
    "practical": "practical pathway.png",
    "plot": "plot pathway.png",
    "spiritual": "spiritual pathway.png",
    "sad": "sad pathway.png",
}


# ---- load pathway phrases + colors (many-to-many) ----
def load_pathway_info(
    csv_path: str,
) -> Tuple[Dict[str, str], Dict[str, List[str]], Dict[str, List[str]], List[str]]:
    """Read the pathway CSV and build phrase and color lookup tables.

    Only rows whose ``color`` cell (stripped, lower-cased) is one of the known
    sequence keys are used. For each such row:

    * column ``r`` is split on commas/whitespace into a list of color names;
    * every other column except ``color``/``r``/``g``/``b`` is joined, in
      column order, into the pathway phrase.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        ``(phrases, seq_to_colors, color_to_seqs, color_vocab)`` where
        ``phrases`` maps sequence key -> phrase, the two middle dicts hold the
        many-to-many sequence/color relation, and ``color_vocab`` is the sorted
        list of all color names seen.

    Raises:
        Whatever ``pandas.read_csv`` raises for an unreadable file, and
        ``KeyError`` if the file has no ``color`` column.
    """
    df = pd.read_csv(csv_path)
    known_keys = set(SEQUENCE_ALIASES.values()) - {"auto"}

    # Normalize exactly like the per-row key below so padded cells still match.
    normalized = df["color"].astype(str).str.strip().str.lower()
    rows = df[normalized.isin(known_keys)]

    phrase_columns = [c for c in df.columns if c not in ("color", "r", "g", "b")]

    phrases: Dict[str, str] = {}
    seq_to_colors: Dict[str, List[str]] = {}
    color_to_seqs: Dict[str, List[str]] = {}

    for _, row in rows.iterrows():
        key = str(row["color"]).strip().lower()

        # Colors list from column 'r' (e.g. "red, orange"). A missing/NaN cell
        # means "no colors" and must not turn into the literal color "nan".
        raw_colors = row.get("r")
        colors_field = "" if pd.isna(raw_colors) else str(raw_colors)
        colors = [c.strip().lower() for c in re.split(r"[,\s]+", colors_field) if c.strip()]
        colors = list(dict.fromkeys(colors))  # dedupe, keep order
        seq_to_colors[key] = colors
        for color in colors:
            seqs = color_to_seqs.setdefault(color, [])
            if key not in seqs:
                seqs.append(key)

        # Phrase: all non-null cells of the remaining columns, whitespace-normalized.
        parts = []
        for col in phrase_columns:
            value = row.get(col)
            if pd.notna(value):
                cell = str(value).strip()
                if cell and cell.lower() != "nan":
                    parts.append(cell)
        phrases[key] = " ".join(" ".join(parts).split())

    # Color vocabulary used to spot e.g. "red-pathway" in free text.
    color_vocab = sorted(color_to_seqs)
    return phrases, seq_to_colors, color_to_seqs, color_vocab


PATHWAY_PHRASES, SEQ_TO_COLORS, COLOR_TO_SEQS, COLOR_VOCAB = load_pathway_info(CSV_PATH)


def sequence_to_image_path(seq_key: str) -> Optional[str]:
    """Return the image file for ``seq_key`` if it is known and exists on disk."""
    fname = SEQUENCE_IMAGE_FILES.get(seq_key)
    return fname if (fname and os.path.exists(fname)) else None


# =========================
# 4) Scoring
# =========================
def classify_emotion(text: str) -> Tuple[str, float]:
    """Return the top emotion label for ``text`` and its softmax probability."""
    inputs = emotion_tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = emotion_model(**inputs).logits
    probs = F.softmax(logits, dim=1).squeeze()
    labels = emotion_model.config.id2label
    idx = int(torch.argmax(probs).item())
    return labels[idx], float(probs[idx].item())


def score_sentiment(text: str) -> float:
    """Map the sentiment classifier output for ``text`` onto a 1-10 scale.

    Only the first 512 characters are classified. A ``POSITIVE`` label yields
    ``5 + 5 * score``; any other label yields ``1 + 4 * (1 - score)``. The
    result is clamped to [1, 10] and rounded to two decimals.
    """
    out = bert_sentiment(text[:512])[0]
    label, score = out["label"], out["score"]
    scaled = 5 + 5 * score if label == "POSITIVE" else 1 + 4 * (1 - score)
    return round(min(10, max(1, scaled)), 2)


def score_accomplishment(text: str) -> float:
    """Heuristic 1-10 score based on cue words and past-tense verbs.

    Starts at 5.0, adds 1.5 per token that is a cue word and 0.5 per token
    tagged ``VBD`` or ``VBN``, then clamps to [1, 10].
    """
    doc = nlp(text)
    score = 5.0
    # NOTE(review): "walked away" is compared against single tokens and so can
    # never match as written; kept as-is to leave the scoring unchanged.
    key_phrases = {
        "finally", "told", "decided", "quit", "refused", "stood",
        "walked", "walked away", "returned", "return",
    }
    for token in doc:
        if token.text.lower() in key_phrases:
            score += 1.5
        if token.tag_ in {"VBD", "VBN"}:
            score += 0.5
    return round(min(10, max(1, score)), 2)


# =========================
# 5) Pathway-aware vector math
# =========================
def encode_text(t: str):
    """Encode ``t`` with the sentence-embedding model and return a tensor."""
    return sbert_model.encode(t, convert_to_tensor=True)


@lru_cache(maxsize=256)
def _encode_reference(text: str):
    """Cached embedding for the fixed reference strings (phrases, GNH text).

    These strings never change after import, so re-encoding them on every
    request is wasted work. The returned tensor is shared between callers and
    must not be modified in place.
    """
    return encode_text(text)


def composite_vector(
    base_text: str,
    boost_terms: List[str],
    boost_seq_keys: List[str],
    limit_seq_keys: List[str],
    boost_w: float = 0.6,
    limit_w: float = 0.6,
):
    """Build the context vector for ``base_text``.

    The embedding of ``base_text`` is shifted by ``+boost_w`` times the
    embedding of every non-blank boost term and boosted pathway phrase, and by
    ``-limit_w`` times the embedding of every limited pathway phrase. Keys
    without a (non-empty) phrase are ignored.
    """
    vec = encode_text(base_text)

    for term in boost_terms:
        cleaned = term.strip()
        if cleaned:
            vec = vec + boost_w * encode_text(cleaned)

    for key in boost_seq_keys:
        phrase = PATHWAY_PHRASES.get(key, "")
        if phrase:
            vec = vec + boost_w * _encode_reference(phrase)

    for key in limit_seq_keys:
        phrase = PATHWAY_PHRASES.get(key, "")
        if phrase:
            vec = vec - limit_w * _encode_reference(phrase)

    return vec


def best_sequence_for_vector(vec) -> Tuple[str, float]:
    """Return the pathway key whose phrase is most cosine-similar to ``vec``.

    Falls back to ``("direct", -1.0)`` when no pathway has a phrase.
    """
    best_key, best_sim = None, -1.0
    for key, phrase in PATHWAY_PHRASES.items():
        if not phrase:
            continue
        sim = float(util.cos_sim(vec, _encode_reference(phrase)).item())
        if sim > best_sim:
            best_key, best_sim = key, sim
    return best_key or "direct", best_sim


def semantic_indicator_mapping_from_vec(
    vec,
    sentiment_score: float,
    sentiment_weight: float = 0.3,
) -> Dict[str, float]:
    """Score every GNH domain against ``vec``, blended with the sentiment.

    For each domain the cosine similarity to its description is clamped to
    [0, 1] and mixed with ``sentiment_score / 10`` using ``sentiment_weight``.

    Returns:
        Domain label -> score (3 decimals), ordered from highest to lowest.
    """
    out: Dict[str, float] = {}
    for label, desc in GNH_DOMAINS.items():
        sim = float(util.cos_sim(vec, _encode_reference(desc)).item())
        sim = max(0.0, min(1.0, sim))
        blended = (1 - sentiment_weight) * sim + sentiment_weight * (sentiment_score / 10.0)
        out[label] = round(blended, 3)
    return dict(sorted(out.items(), key=lambda kv: -kv[1]))


# =========================
# 6) Color cues from free text (many-to-many)
# =========================
_COLOR_RE = re.compile(
    r"\b(" + "|".join(map(re.escape, COLOR_VOCAB)) + r")\s*(?:\-?\s*pathway)?\b",
    re.I,
)
_TOKEN_RE = re.compile(r"\w+|\S")
_LIMIT_CUES = {"limit", "reduce", "lessen", "avoid", "diminish", "lower", "constrain", "suppress"}


def infer_color_directives(text: str) -> Tuple[List[str], List[str]]:
    """Split the colors mentioned in ``text`` into boost and limit lists.

    A color mention (optionally followed by "-pathway") counts as *limit* when
    one of the limit cue words appears among the four tokens before it, e.g.
    '... limit ... red-pathway ...' -> limit 'red'. Every other mentioned
    color counts as *boost*.

    Returns:
        ``(boost_colors, limit_colors)``: lower-cased color names, each list
        de-duplicated in order of first appearance.
    """
    # With no known colors the alternation in _COLOR_RE is empty and would
    # match the empty string everywhere, so bail out early.
    if not text or not COLOR_VOCAB:
        return [], []

    # Tokenize and match on the SAME string so character offsets line up.
    lowered = text.lower()
    tokens: List[str] = []
    starts: List[int] = []
    for tok in _TOKEN_RE.finditer(lowered):
        tokens.append(tok.group())
        starts.append(tok.start())

    boost_colors: List[str] = []
    limit_colors: List[str] = []
    for match in _COLOR_RE.finditer(lowered):
        # Index of the token that contains the start of the match.
        idx = max(0, bisect.bisect_right(starts, match.start()) - 1)
        window = tokens[max(0, idx - 4):idx]
        is_limited = any(word in _LIMIT_CUES for word in window)
        (limit_colors if is_limited else boost_colors).append(match.group(1).lower())

    return list(dict.fromkeys(boost_colors)), list(dict.fromkeys(limit_colors))


def colors_to_seq_keys(colors: List[str]) -> List[str]:
    """Expand color names to the pathway keys linked to them (ordered, unique)."""
    keys: List[str] = []
    for color in colors:
        for key in COLOR_TO_SEQS.get(color, []):
            if key not in keys:
                keys.append(key)
    return keys


# =========================
# 7) Plot helper
# =========================
def indicators_plot(indicators: Dict[str, float]):
    """Return a horizontal bar chart (matplotlib Figure) of ``indicators``."""
    labels = list(indicators.keys())
    values = list(indicators.values())
    colors = [GNH_COLORS.get(label, "#cccccc") for label in labels]

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.barh(labels, values, color=colors)
    ax.invert_yaxis()
    ax.set_title("GNH Indicator Similarity (Pathway-weighted)")
    ax.set_xlabel("Score")
    fig.tight_layout()
    # Drop the figure from pyplot's global registry so figures do not pile up
    # across requests; the Figure object itself can still be rendered.
    plt.close(fig)
    return fig


# =========================
# 8) Gradio app
# =========================
SEQ_CHOICES = list(SEQUENCE_ALIASES.keys())
SEQ_MULTI_CHOICES = [k for k in SEQUENCE_ALIASES.keys() if k != "Auto (recommend)"]


def normalize_seq_keys(ui_labels: List[str]) -> List[str]:
    """Translate UI labels to lower-cased internal keys.

    Labels without an alias are passed through lower-cased; ``None`` is
    treated as an empty selection.
    """
    return [SEQUENCE_ALIASES.get(label, label).lower() for label in (ui_labels or [])]


def _merge_unique(first: List[str], second: List[str]) -> List[str]:
    """Concatenate two lists, dropping items of ``second`` already present."""
    merged = list(first)
    for item in second:
        if item not in merged:
            merged.append(item)
    return merged


def analyze(
    text: str,
    seq_choice: str,
    boost_terms_raw: str,
    boost_seq_labels: List[str],
    limit_seq_labels: List[str],
    boost_w: float,
    limit_w: float,
):
    """Run the full analysis for the Gradio callback.

    Args:
        text: Input text; blank input returns neutral placeholder values.
        seq_choice: UI label of the primary pathway; unknown labels and
            "Auto (recommend)" select the best-matching pathway automatically.
        boost_terms_raw: Comma-separated free-text boost terms.
        boost_seq_labels: UI labels of pathways to boost.
        limit_seq_labels: UI labels of pathways to limit.
        boost_w: Weight applied to boost terms and boosted pathways.
        limit_w: Weight applied to limited pathways.

    Returns:
        An 8-tuple matching the UI outputs: sentiment, emotion string,
        accomplishment, pathway summary, pathway phrase, top-5 indicator text,
        indicator figure, and image path (or ``None``).
    """
    if not text or not text.strip():
        return (5.0, "neutral (0.0)", 5.0, "—", "—", "{}", None, None)

    # 1) scores
    sentiment = score_sentiment(text)
    emotion, emo_conf = classify_emotion(text)
    accomplishment = score_accomplishment(text)

    # 2) UI selections
    boost_seqs_user = normalize_seq_keys(boost_seq_labels)
    limit_seqs_user = normalize_seq_keys(limit_seq_labels)

    # 3) boosts/limits: explicit terms plus color cues found in the text
    boost_terms = [t.strip() for t in boost_terms_raw.split(",")] if boost_terms_raw else []
    boost_colors, limit_colors = infer_color_directives(text)
    boost_seqs_from_colors = colors_to_seq_keys(boost_colors)
    limit_seqs_from_colors = colors_to_seq_keys(limit_colors)

    boost_seq_keys = _merge_unique(boost_seqs_user, boost_seqs_from_colors)
    limit_seq_keys = _merge_unique(limit_seqs_user, limit_seqs_from_colors)

    # 4) build context vector
    context_vec = composite_vector(
        base_text=text,
        boost_terms=boost_terms,
        boost_seq_keys=boost_seq_keys,
        limit_seq_keys=limit_seq_keys,
        boost_w=boost_w,
        limit_w=limit_w,
    )

    # 5) choose pathway (Auto or specific)
    chosen_key = SEQUENCE_ALIASES.get(seq_choice, "auto")
    if chosen_key == "auto":
        final_key, final_sim = best_sequence_for_vector(context_vec)
    else:
        final_key = chosen_key
        phrase_for_final = PATHWAY_PHRASES.get(final_key, "")
        final_sim = (
            float(util.cos_sim(context_vec, _encode_reference(phrase_for_final)).item())
            if phrase_for_final
            else 0.0
        )

    # 6) outputs
    phrase = PATHWAY_PHRASES.get(final_key) or "—"
    img_path = sequence_to_image_path(final_key)

    indicators = semantic_indicator_mapping_from_vec(context_vec, sentiment_score=sentiment)
    fig = indicators_plot(indicators)
    top5 = list(indicators.items())[:5]
    top5_str = "\n".join(f"{k}: {v}" for k, v in top5)

    emo_str = f"{emotion} ({emo_conf:.3f})"
    meta = f"{final_key} (relevance={final_sim:.3f})"
    # Show how color cues were mapped to pathways.
    if boost_colors or limit_colors:
        meta += f" | boost colors: {', '.join(boost_colors) or '—'} → {', '.join(boost_seqs_from_colors) or '—'}"
        meta += f" | limit colors: {', '.join(limit_colors) or '—'} → {', '.join(limit_seqs_from_colors) or '—'}"

    return (
        sentiment,       # number
        emo_str,         # text
        accomplishment,  # number
        meta,            # chosen pathway + relevance + color cue mapping
        phrase,          # pathway phrase
        top5_str,        # GNH top5
        fig,             # plot
        img_path,        # image path (optional)
    )


with gr.Blocks(title="RGB Root Matriz Color Plotter") as demo:
    gr.Markdown(
        "## RGB Root Matriz Color Plotter\n"
        "Type a phrase. Choose a **Sequence** or keep **Auto** to recommend a pathway. "
        "You’ll get sentiment, emotion, accomplishment, GNH bars, and the pathway phrase + image from the dataset."
    )

    with gr.Row():
        inp = gr.Textbox(
            lines=4,
            label="Input text",
            placeholder="e.g., use gratitude from a return and inspiration from clarity to limit from red-pathway the pain from orange-pathway.",
        )

    with gr.Row():
        seq = gr.Dropdown(choices=SEQ_CHOICES, value="Auto (recommend)", label="Primary Pathway")

    with gr.Row():
        boost_terms = gr.Textbox(
            label="Boost terms (comma-separated)",
            placeholder="gratitude, inspiration, clarity",
        )

    with gr.Row():
        boost_seqs = gr.CheckboxGroup(choices=SEQ_MULTI_CHOICES, label="Boost sequences (optional)")
        limit_seqs = gr.CheckboxGroup(choices=SEQ_MULTI_CHOICES, label="Limit sequences (optional)")

    with gr.Row():
        boost_w = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="Boost weight")
        limit_w = gr.Slider(0.0, 1.5, value=0.6, step=0.05, label="Limit weight")

    btn = gr.Button("Analyze", variant="primary")

    with gr.Row():
        sent = gr.Number(label="Sentiment (1–10)")
        emo = gr.Text(label="Emotion")
        acc = gr.Number(label="Accomplishment (1–10)")

    with gr.Row():
        chosen = gr.Text(label="Chosen pathway (relevance + color mapping)")
        phrase_out = gr.Text(label="Pathway phrase")

    with gr.Row():
        gnh_top = gr.Text(label="Top GNH Indicators (Top 5)")
        gnh_plot = gr.Plot(label="GNH Similarity (Pathway-weighted)")

    with gr.Row():
        pathway_img = gr.Image(label="Pathway image", type="filepath")

    btn.click(
        fn=analyze,
        inputs=[inp, seq, boost_terms, boost_seqs, limit_seqs, boost_w, limit_w],
        outputs=[sent, emo, acc, chosen, phrase_out, gnh_top, gnh_plot, pathway_img],
    )

if __name__ == "__main__":
    demo.launch()