Spaces:

jeevitha-app
/

Sentiment_analyzer

Sleeping

App Files Files Community

jeevitha-app committed 24 days ago

Commit

dbfb384

verified ·

1 Parent(s): 1ef9d51

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -210

app.py CHANGED Viewed

@@ -1,221 +1,84 @@
-# app.py
-# Gradio app: English + Persian sentiment with SHAP-based interpretability and word highlighting
 import joblib
-import numpy as np
-import pandas as pd
 import shap
 import matplotlib.pyplot as plt
-import io
-import base64
-import html
-from typing import Tuple, Dict, List
-import math
-import gradio as gr
-# --------- Load models (replace filenames if you used different names) ----------
-ENG_MODEL_PATH = "best_model.pkl"
-ENG_VEC_PATH   = "tfidf_vectorizer.pkl"
-PER_MODEL_PATH = "logistic_regression.pkl"
-PER_VEC_PATH   = "tfidf_vectorizer_persian.pkl"
-eng_model = joblib.load(ENG_MODEL_PATH)
-eng_vectorizer = joblib.load(ENG_VEC_PATH)
-per_model = joblib.load(PER_MODEL_PATH)
-per_vectorizer = joblib.load(PER_VEC_PATH)
-CLASS_NAMES_EN = ["Negative", "Neutral", "Positive"]
-CLASS_NAMES_PER = ["منفی", "خنثی", "مثبت"]
-# --------- Utility: create bar data for gradio BarPlot ----------
-def probs_to_bar(probs: List[float], lang: str):
-    names = CLASS_NAMES_EN if lang == "English" else CLASS_NAMES_PER
-    return {names[i]: float(probs[i]) for i in range(len(probs))}
-# --------- Utility: create HTML highlight from SHAP values ----------
-def make_html_highlight(original_text: str,
-                        feature_names: np.ndarray,
-                        shap_values_feature: np.ndarray,
-                        vectorizer_vocab: dict,
-                        max_display: int = 30) -> str:
-    """
-    Simple token-level highlighting:
-    - Tokenize by whitespace (preserves original punctuation).
-    - For each token, attempt to map token.lower() to the vectorizer vocab;
-      if found, get SHAP impact for that feature name.
-    - Color red for positive contribution, blue for negative.
-    Returns an HTML-safe string.
-    """
-    # Build mapping word -> shap value if present in vocabulary
-    # vectorizer_vocab maps token -> idx in feature_names
-    token_to_shap = {}
-    for idx, fname in enumerate(feature_names):
-        # Often fname is the token/ngram itself
-        token_to_shap[fname] = shap_values_feature[idx]
-    # Tokenize (simple)
-    tokens = original_text.split()
-    # Compute max magnitude for scaling opacity
-    mags = []
-    for t in tokens:
-        key = t.lower()
-        val = None
-        # Try several common variants: exact, lower, strip punctuation from ends
-        if key in vectorizer_vocab:
-            val = shap_values_feature[vectorizer_vocab[key]]
-        else:
-            key2 = ''.join(ch for ch in key if ch.isalnum())
-            if key2 in vectorizer_vocab:
-                val = shap_values_feature[vectorizer_vocab[key2]]
-        mags.append(abs(val) if val is not None else 0.0)
-    max_mag = max(mags) if mags else 1.0
-    if max_mag == 0:
-        max_mag = 1.0
-    # Build HTML with span coloring
-    html_tokens = []
-    for t in tokens:
-        display = html.escape(t)
-        key = t.lower()
-        val = None
-        if key in vectorizer_vocab:
-            val = shap_values_feature[vectorizer_vocab[key]]
-        else:
-            key2 = ''.join(ch for ch in key if ch.isalnum())
-            if key2 in vectorizer_vocab:
-                val = shap_values_feature[vectorizer_vocab[key2]]
-        if val is None or abs(val) < 1e-6:
-            html_tokens.append(f"<span style='padding:2px'>{display}</span>")
-        else:
-            sign = "pos" if val > 0 else "neg"
-            mag = min(1.0, abs(val) / max_mag)  # scale 0..1
-            opacity = 0.15 + 0.85 * mag  # avoid fully transparent
-            color = f"rgba(220,20,60,{opacity})" if sign == "pos" else f"rgba(30,144,255,{opacity})"
-            border = "1px solid rgba(0,0,0,0.04)"
-            html_tokens.append(
-                f"<span style='background:{color};padding:2px;margin:1px;border-radius:4px;display:inline-block;{border}'>"
-                f"{display}</span>"
-            )
-    highlighted_html = "<div style='line-height:1.6;font-size:16px'>" + " ".join(html_tokens) + "</div>"
-    return highlighted_html
-# --------- Core function: predict + interpret ----------
-def explain_and_predict(text: str, language: str):
-    text = text or ""
     if language == "English":
-        model = eng_model
-        vectorizer = eng_vectorizer
-        class_names = CLASS_NAMES_EN
     else:
-        model = per_model
-        vectorizer = per_vectorizer
-        class_names = CLASS_NAMES_PER
-    if text.strip() == "":
-        return "⚠️ Please enter text.", {}, {"Word": [], "SHAP Impact": []}, "<i>No input</i>"
-    # vectorize
-    vec = vectorizer.transform([text])
-    probs = model.predict_proba(vec)[0]
-    pred_class = int(np.argmax(probs))
-    label = class_names[pred_class]
-    confidence = float(probs[pred_class])
-    # Build SHAP explainer on a small background (use small subset via dummy background)
-    # NOTE: building explainer can be slow; in Spaces you can build once at import
-    # For robustness we build a simple LinearExplainer on vector space
-    # Use small dense sample from training if available - here use vectorizer vocabulary size fallback
-    # Convert to dense for LinearExplainer
-    try:
-        # Use a small background of zeros (cheap) — LinearExplainer can accept arrays
-        background = np.zeros((1, vec.shape[1]))
-        explainer = shap.LinearExplainer(model, background, feature_names=vectorizer.get_feature_names_out())
-        # compute shap on the numeric vector
-        vec_dense = vec.toarray()
-        shap_vals = explainer(vec_dense)  # returns shap.Explanation
-    except Exception:
-        # fallback: use PermutationExplainer on numeric input (slower)
-        explainer = shap.Explainer(model.predict_proba, vec)
-        shap_vals = explainer(vec)
-    # shap_vals.values shape: (n_outputs, n_features) OR Explanation with values (n_features, n_classes)
-    # Normalize to feature vector for chosen class
-    # shap_vals may be multi-output: shap_vals.values => (n_samples, n_features, n_classes) or similar
-    try:
-        # preferred shape: shap_vals.values -> (1, n_features, n_classes)
-        values = shap_vals.values  # ND array
-        if values.ndim == 3:
-            # pick sample 0, class pred_class
-            shap_per_feature = values[0, :, pred_class]
-        elif values.ndim == 2:
-            # shape (n_samples, n_features) for single class models — take sample 0
-            shap_per_feature = values[0, :]
-        else:
-            # try to flatten
-            shap_per_feature = np.ravel(values)[0:vec.shape[1]]
-    except Exception:
-        # Last resort: try shap_vals[0].values
-        try:
-            shap_per_feature = shap_vals[0].values[:, pred_class]
-        except Exception:
-            shap_per_feature = np.zeros(vec.shape[1])
-    # Feature names & vocab
-    feature_names = np.array(vectorizer.get_feature_names_out())
-    vocab = {k: v for k, v in (getattr(vectorizer, "vocabulary_", {})).items()}
-    # Build top contributing words list (pairs)
-    # shap_per_feature length must match len(feature_names)
-    if len(shap_per_feature) != len(feature_names):
-        # try to align by vectorizer.vocabulary_
-        full_shap = np.zeros(len(feature_names))
-        # if shap_per_feature smaller, attempt to use indices from vocab
-        min_len = min(len(shap_per_feature), len(full_shap))
-        full_shap[:min_len] = shap_per_feature[:min_len]
-        shap_per_feature = full_shap
-    # Top positive and negative features
-    n = 10
-    idx_sorted = np.argsort(-np.abs(shap_per_feature))
-    top_idx = idx_sorted[:n]
-    top_words = feature_names[top_idx].tolist()
-    top_contribs = shap_per_feature[top_idx].tolist()
-    # Build word table for display
-    word_table = {"Word": top_words, "SHAP Impact": top_contribs}
-    # Build highlight HTML (token-level approx using unigram mapping)
-    highlight_html = make_html_highlight(text, feature_names, shap_per_feature, vocab)
-    # Return: label string, probabilities dict, table dict, html highlight
-    return f"🎯 **{label}** (confidence: {confidence:.2f})", probs_to_bar(probs.tolist(), language), word_table, highlight_html
-# --------- Gradio UI build ----------
-with gr.Blocks() as demo:
-    gr.Markdown("## 🌍 Multilingual Sentiment Analysis (English 🇬🇧 & Persian 🇮🇷) — Interpretable")
-    with gr.Row():
-        language = gr.Radio(["English", "Persian"], value="English", label="Choose language")
-        text_input = gr.Textbox(lines=4, placeholder="Type comment here...", label="Input text")
-    with gr.Row():
-        btn = gr.Button("Analyze")
-    with gr.Row():
-        pred_out = gr.Markdown()
-    with gr.Row():
-        bar = gr.BarPlot(label="Class probabilities")
-        table = gr.Dataframe(headers=["Word", "SHAP Impact"], label="Top contributing words")
-    with gr.Row():
-        html_out = gr.HTML(label="Word-level Highlight (red = pushes toward prediction, blue = pushes away)")
-    def run(text, lang):
-        label, probs, word_table, html_highlight = explain_and_predict(text, lang)
-        # format outputs for gradio
-        return label, probs, pd.DataFrame(word_table), html_highlight
-    btn.click(fn=run, inputs=[text_input, language], outputs=[pred_out, bar, table, html_out])
-if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", share=True)

+import gradio as gr
 import joblib
 import shap
+import numpy as np
 import matplotlib.pyplot as plt
+import tempfile
+import os
+# ---------------------------------------------------------
+# Load both models and vectorizers
+# ---------------------------------------------------------
+# Everything below runs once at import time, so each request handled by
+# predict_sentiment reuses the same in-memory objects.
+# NOTE(security): joblib.load unpickles its input and can execute arbitrary
+# code; only ship .pkl files from a trusted source in models/.
+english_model = joblib.load("models/english_model.pkl")
+english_vec   = joblib.load("models/english_vectorizer.pkl")
+persian_model = joblib.load("models/persian_model.pkl")
+persian_vec   = joblib.load("models/persian_vectorizer.pkl")
+# Display labels, looked up by the argmax index of predict_proba for both
+# languages.
+# NOTE(review): assumes both models order their classes as
+# negative, neutral, positive -- confirm against each model's classes_.
+class_names = ["Negative", "Neutral", "Positive"]
+# ---------------------------------------------------------
+# Prediction + Interpretability Function
+# ---------------------------------------------------------
+def predict_sentiment(text, language):
+    """Classify ``text`` and explain the prediction with SHAP.
+
+    Args:
+        text: The comment to analyse. ``None`` or whitespace-only input is
+            rejected with a prompt instead of being scored.
+        language: ``"English"`` selects the English model/vectorizer; any
+            other value selects the Persian pair.
+
+    Returns:
+        A ``(markdown, image_path)`` tuple. ``markdown`` reports the label,
+        its probability and the most influential words. ``image_path`` is
+        the path of a PNG bar chart of those words' SHAP impacts, or
+        ``None`` when there is no input or no vocabulary word contributed.
+    """
+    if not text or not text.strip():
+        return "Please enter text!", None
     if language == "English":
+        model, vec = english_model, english_vec
     else:
+        model, vec = persian_model, persian_vec
+    X = vec.transform([text])
+    probs = model.predict_proba(X)[0]
+    pred_idx = int(np.argmax(probs))
+    label = class_names[pred_idx]
+    # --- SHAP interpretability ---
+    # Explain against an all-zero ("empty document") baseline. Using the
+    # sample itself as background makes every linear attribution
+    # coef * (x - baseline) collapse to zero.
+    background = np.zeros((1, X.shape[1]))
+    explainer = shap.LinearExplainer(model, background)
+    shap_vals = explainer(X.toarray())
+    values = np.asarray(shap_vals.values)
+    if values.ndim == 3:
+        # (samples, features, classes): take the predicted class column.
+        shap_values = values[0, :, pred_idx]
+    else:
+        # (samples, features): single-output explanation.
+        shap_values = values[0]
+    feature_names = vec.get_feature_names_out()
+    # Rank by absolute impact, then drop features with no contribution
+    # (vocabulary entries absent from the text) so they cannot pad the list.
+    ranked = np.argsort(-np.abs(shap_values))[:10]
+    top_idx = [i for i in ranked if shap_values[i] != 0]
+    tokens = [feature_names[i] for i in top_idx]
+    impacts = [shap_values[i] for i in top_idx]
+    # Save temporary bar chart (skipped when nothing contributed)
+    tmp_path = None
+    if tokens:
+        fig, ax = plt.subplots(figsize=(6, 3))
+        try:
+            colors = ["crimson" if v > 0 else "steelblue" for v in impacts]
+            ax.barh(tokens, impacts, color=colors)
+            ax.invert_yaxis()
+            ax.set_title(f"Top Words driving {label} prediction")
+            # Work on this figure explicitly rather than pyplot's global
+            # "current figure", which concurrent requests could swap out.
+            fig.tight_layout()
+            # mkstemp creates the file atomically; mktemp only returns a
+            # name and is deprecated because of the resulting race.
+            fd, tmp_path = tempfile.mkstemp(suffix=".png")
+            os.close(fd)
+            fig.savefig(tmp_path)
+        finally:
+            # Always release the figure, even if plotting or saving fails.
+            plt.close(fig)
+    explanation = f"""
+**Predicted Sentiment:** {label}\n
+**Confidence:** {probs[pred_idx]:.2f}\n
+**Top Influential Words:**\n
+{', '.join(tokens)}
+"""
+    return explanation, tmp_path
+# ---------------------------------------------------------
+# Gradio UI
+# ---------------------------------------------------------
+# Two inputs (comment text, language choice) feed predict_sentiment; its
+# markdown summary and chart image path fill the two outputs.
+iface = gr.Interface(
+    fn=predict_sentiment,
+    inputs=[
+        gr.Textbox(lines=3, label="Enter comment"),
+        # Preselect a language: with no selection the callback receives
+        # None, which predict_sentiment's else-branch treats as Persian.
+        gr.Radio(["English", "Persian"], value="English", label="Choose Dataset/Language")
+    ],
+    outputs=[
+        gr.Markdown(label="Prediction + Interpretation"),
+        gr.Image(label="Top Word Contributions")
+    ],
+    title="🌍 Multi-Lingual Sentiment Analysis (English + Persian)",
+    description="Select a language, type a comment, and see both the prediction and SHAP interpretability."
+)
+# Starts the web server when the script is executed.
+iface.launch()