Spaces:

entropy25
/

multilingual-sentiment-analyzer

Sleeping

App Files Files Community

entropy25 committed on Jul 27

Commit

71b3ce2

verified ·

1 Parent(s): 3b49170

Update app.py

Browse files

Files changed (1) hide show

app.py +285 -91

app.py CHANGED Viewed

@@ -21,6 +21,8 @@ import nltk
 from nltk.corpus import stopwords
 import langdetect
 import pandas as pd
 # Configuration
 @dataclass
@@ -43,7 +45,8 @@ class Config:
     MODELS = {
         'en': "cardiffnlp/twitter-roberta-base-sentiment-latest",
-        'multilingual': "cardiffnlp/twitter-xlm-roberta-base-sentiment"
     }
     # Color themes
@@ -77,22 +80,33 @@ class ModelManager:
         self._load_default_model()
     def _load_default_model(self):
-        """Load the default English model"""
         try:
-            model_name = config.MODELS['multilingual']  # Use multilingual as default
             self.tokenizers['default'] = AutoTokenizer.from_pretrained(model_name)
             self.models['default'] = AutoModelForSequenceClassification.from_pretrained(model_name)
             self.models['default'].to(self.device)
             logger.info(f"Default model loaded: {model_name}")
         except Exception as e:
-            logger.error(f"Failed to load default model: {e}")
             raise
     def get_model(self, language='en'):
         """Get model for specific language"""
-        if language in ['en', 'auto'] or language not in config.SUPPORTED_LANGUAGES:
             return self.models['default'], self.tokenizers['default']
-        return self.models['default'], self.tokenizers['default']  # Use multilingual for all
     @staticmethod
     def detect_language(text: str) -> str:
@@ -318,7 +332,166 @@ class SentimentAnalyzer:
                 })
         return results
-class PlotlyVisualizer:
     """Enhanced visualizations with Plotly"""
     @staticmethod
@@ -675,12 +848,12 @@ def analyze_batch_texts(batch_text: str, language: str, theme: str,
         logger.error(f"Batch analysis failed: {e}")
         return f"Error: {str(e)}", None, None, None
-def analyze_advanced_text(text: str, language: str, theme: str, include_keywords: bool,
-                         keyword_count: int, min_confidence: float):
-    """Advanced analysis with additional features"""
     try:
         if not text.strip():
-            return "Please enter text", None, None
         # Map display names back to language codes
         language_map = {
@@ -694,14 +867,31 @@ def analyze_advanced_text(text: str, language: str, theme: str, include_keywords
         }
         language_code = language_map.get(language, 'auto')
         result = SentimentAnalyzer.analyze_text(text, language_code)
-        # Advanced keyword extraction
-        if include_keywords:
-            result['keywords'] = TextProcessor.extract_keywords(text, keyword_count)
-        # Confidence filtering
-        meets_confidence = result['confidence'] >= min_confidence
         # Add to history
         history_entry = {
@@ -715,39 +905,45 @@ def analyze_advanced_text(text: str, language: str, theme: str, include_keywords
             'language': result['language'],
             'timestamp': datetime.now().isoformat(),
             'analysis_type': 'advanced',
-            'meets_confidence_threshold': meets_confidence
         }
         history_manager.add_entry(history_entry)
-        # Create visualizations
         gauge_fig = PlotlyVisualizer.create_sentiment_gauge(result, theme)
         bars_fig = PlotlyVisualizer.create_probability_bars(result, theme)
         # Create detailed info text
-        confidence_status = "✅ High Confidence" if meets_confidence else "⚠️ Low Confidence"
         info_text = f"""
 **Advanced Analysis Results:**
 - **Sentiment:** {result['sentiment']} ({result['confidence']:.3f} confidence)
-- **Confidence Status:** {confidence_status}
 - **Language:** {result['language'].upper()}
 - **Text Statistics:**
   - Words: {result['word_count']}
   - Characters: {result['char_count']}
   - Average word length: {result['char_count']/max(result['word_count'], 1):.1f}
         """
-        if include_keywords:
-            info_text += f"\n- **Top Keywords:** {', '.join(result['keywords'])}"
-        if not meets_confidence:
-            info_text += f"\n\n⚠️ **Note:** Confidence ({result['confidence']:.3f}) is below threshold ({min_confidence})"
-        return info_text, gauge_fig, bars_fig
     except Exception as e:
         logger.error(f"Advanced analysis failed: {e}")
-        return f"Error: {str(e)}", None, None
 def get_history_stats():
     """Get enhanced history statistics"""
@@ -868,36 +1064,35 @@ def get_recent_analyses():
     return summary_text
 SAMPLE_TEXTS = [
     # Auto Detect
-    ["The film had its moments, but overall it felt a bit too long and lacked emotional depth. Some scenes were visually impressive, yet they failed to connect emotionally. By the end, I found myself disengaged and unsatisfied."],
     # English
-    ["I was completely blown away by the movie — the performances were raw and powerful, and the story stayed with me long after the credits rolled. Every scene felt purposeful, and the emotional arc was handled with incredible nuance. It's the kind of film that makes you reflect deeply on your own life."],
     # Chinese
-    ["这部电影节奏拖沓，剧情老套，完全没有让我产生任何共鸣，是一次失望的观影体验。演员的表演也显得做作，缺乏真实感。看到最后甚至有点不耐烦，整体表现乏善可陈。"],
     # Spanish
-    ["Una obra maestra del cine contemporáneo, con actuaciones sobresalientes, un guion bien escrito y una dirección impecable. Cada plano parecía cuidadosamente pensado, y la historia avanzaba con una intensidad emocional que mantenía al espectador cautivado. Definitivamente una película que vale la pena volver a ver."],
     # French
-    ["Je m'attendais à beaucoup mieux. Le scénario était confus, les dialogues ennuyeux, et je me suis presque endormi au milieu du film. Même la mise en scène, habituellement un point fort, manquait cruellement d'inspiration cette fois-ci."],
     # German
-    ["Der Film war ein emotionales Erlebnis mit großartigen Bildern, einem mitreißenden Soundtrack und einer Geschichte, die zum Nachdenken anregt. Besonders beeindruckend war die schauspielerische Leistung der Hauptdarsteller, die eine tiefe Menschlichkeit vermittelten. Es ist ein Film, der lange nachwirkt."],
     # Swedish
-    ["Filmen var en besvikelse – tråkig handling, överdrivet skådespeleri och ett slut som inte gav något avslut alls. Den kändes forcerad och saknade en tydlig röd tråd. Jag gick från biografen med en känsla av tomhet och frustration."]
 ]
-BATCH_SAMPLE = """I love this product! It works perfectly and exceeded my expectations. I've been using it every day and it hasn’t let me down once.
-The service was terrible and slow. I had to wait over an hour, and no one seemed to care about helping me. Really frustrating experience overall.
-Not sure if I like it or not. Some features are nice, but others are confusing or don’t work as expected. I’m still deciding whether it’s worth keeping.
-Amazing quality and fast delivery! The packaging was secure, and the product looked even better than in the pictures. I’ll definitely order from here again.
-Could be better, but it's okay. It does the job, but there are some issues with the build quality. Not bad, just not great either."""
 # Gradio Interface
 with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Multilingual Sentiment Analyzer") as demo:
@@ -945,6 +1140,53 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Multilingual Sentiment An
             gauge_plot = gr.Plot(label="Sentiment Gauge")
             bars_plot = gr.Plot(label="Probability Distribution")
     with gr.Tab("📊 Batch Analysis"):
         with gr.Row():
             with gr.Column(scale=2):
@@ -992,54 +1234,6 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Multilingual Sentiment An
             batch_summary_plot = gr.Plot(label="Sentiment Summary")
             batch_confidence_plot = gr.Plot(label="Confidence Distribution")
-    with gr.Tab("🔬 Advanced Analysis"):
-        with gr.Row():
-            with gr.Column(scale=2):
-                advanced_input = gr.Textbox(
-                    label="Text for Advanced Analysis",
-                    placeholder="Enter text for detailed analysis...",
-                    lines=4
-                )
-                with gr.Row():
-                    advanced_language = gr.Dropdown(
-                        choices=['Auto Detect', 'English', 'Chinese', 'Spanish', 'French', 'German', 'Swedish'],
-                        value='Auto Detect',
-                        label="Language"
-                    )
-                    advanced_theme = gr.Dropdown(
-                        choices=list(config.THEMES.keys()),
-                        value='default',
-                        label="Theme"
-                    )
-                with gr.Row():
-                    include_keywords = gr.Checkbox(label="Extract Keywords", value=True)
-                    keyword_count = gr.Slider(
-                        minimum=3,
-                        maximum=10,
-                        value=5,
-                        step=1,
-                        label="Number of Keywords"
-                    )
-                min_confidence_slider = gr.Slider(
-                    minimum=0.0,
-                    maximum=1.0,
-                    value=0.7,
-                    step=0.1,
-                    label="Minimum Confidence Threshold"
-                )
-                advanced_analyze_btn = gr.Button("🔬 Advanced Analyze", variant="primary", size="lg")
-            with gr.Column(scale=1):
-                advanced_result_info = gr.Markdown("Configure settings and click Advanced Analyze")
-        with gr.Row():
-            advanced_gauge_plot = gr.Plot(label="Sentiment Gauge")
-            advanced_bars_plot = gr.Plot(label="Probability Distribution")
     with gr.Tab("📈 History & Analytics"):
         with gr.Row():
             with gr.Column():
@@ -1109,8 +1303,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Multilingual Sentiment An
     # Advanced Analysis
     advanced_analyze_btn.click(
         analyze_advanced_text,
-        inputs=[advanced_input, advanced_language, advanced_theme, include_keywords, keyword_count, min_confidence_slider],
-        outputs=[advanced_result_info, advanced_gauge_plot, advanced_bars_plot]
     )
     # History & Analytics

 from nltk.corpus import stopwords
 import langdetect
 import pandas as pd
+import shap
+from lime.lime_text import LimeTextExplainer
 # Configuration
 @dataclass
     MODELS = {
         'en': "cardiffnlp/twitter-roberta-base-sentiment-latest",
+        'multilingual': "cardiffnlp/twitter-xlm-roberta-base-sentiment",
+        'zh': "uer/roberta-base-finetuned-dianping-chinese"
     }
     # Color themes
         self._load_default_model()
     def _load_default_model(self):
         """Load the multilingual default model and the dedicated Chinese model.

         Each tokenizer/model pair is stored in ``self.tokenizers`` /
         ``self.models`` under ``'default'`` and ``'zh'`` respectively, and
         each model is moved to ``self.device``.

         Raises:
             Exception: any error raised while loading is logged and re-raised
                 unchanged, so a failure of either model aborts the whole load.
         """
         try:
             # The multilingual checkpoint is registered as the default model.
             model_name = config.MODELS['multilingual']
             default_tokenizer = AutoTokenizer.from_pretrained(model_name)
             self.tokenizers['default'] = default_tokenizer
             default_model = AutoModelForSequenceClassification.from_pretrained(model_name)
             self.models['default'] = default_model
             default_model.to(self.device)
             logger.info(f"Default model loaded: {model_name}")

             # The Chinese checkpoint is registered under its own 'zh' slot.
             zh_model_name = config.MODELS['zh']
             zh_tokenizer = AutoTokenizer.from_pretrained(zh_model_name)
             self.tokenizers['zh'] = zh_tokenizer
             zh_model = AutoModelForSequenceClassification.from_pretrained(zh_model_name)
             self.models['zh'] = zh_model
             zh_model.to(self.device)
             logger.info(f"Chinese model loaded: {zh_model_name}")
         except Exception as e:
             logger.error(f"Failed to load models: {e}")
             raise
     def get_model(self, language='en'):
         """Return the ``(model, tokenizer)`` pair to use for *language*.

         Args:
             language: Language code. ``'zh'`` selects the dedicated Chinese
                 model; every other value (including ``'en'`` and ``'auto'``)
                 selects the default model.

         Returns:
             Tuple ``(model, tokenizer)`` taken from ``self.models`` and
             ``self.tokenizers``.
         """
         # Fall back to the default pair when no 'zh' entry is registered,
         # rather than failing with a KeyError.
         if language == 'zh' and 'zh' in self.models and 'zh' in self.tokenizers:
             return self.models['zh'], self.tokenizers['zh']
         # All remaining codes share the default pair, so no further
         # branching on the language is needed.
         return self.models['default'], self.tokenizers['default']
     @staticmethod
     def detect_language(text: str) -> str:
                 })
         return results
class ExplainabilityAnalyzer:
    """LIME and attention-based explanations for a sequence classifier."""

    @staticmethod
    def create_prediction_function(model, tokenizer, device):
        """Build the ``texts -> class probabilities`` callable used by LIME.

        Args:
            model: Classifier whose output exposes ``.logits``.
            tokenizer: Tokenizer matching *model*.
            device: Device the tokenized inputs are moved to.

        Returns:
            A function accepting a string or a sequence of strings and
            returning a NumPy array with one row of softmax probabilities
            per input text.
        """
        def predict_proba(texts):
            if isinstance(texts, str):
                texts = [texts]
            results = []
            for text in texts:
                inputs = tokenizer(text, return_tensors="pt", padding=True,
                                   truncation=True, max_length=config.MAX_TEXT_LENGTH).to(device)
                with torch.no_grad():
                    outputs = model(**inputs)
                    probs = torch.nn.functional.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
                results.append(probs)
            return np.array(results)
        return predict_proba

    @staticmethod
    def analyze_with_lime(text: str, model, tokenizer, device, num_features: int = 10) -> Dict:
        """Explain the model's prediction for *text* with LIME.

        The explanation targets the last class index, which is ``'Positive'``
        in both class-name lists used below. A positive score therefore means
        the word pushes the prediction towards positive sentiment, and a
        negative score pushes it away.

        Args:
            text: Text to explain.
            model: Classifier passed to :meth:`create_prediction_function`.
            tokenizer: Tokenizer matching *model*.
            device: Device used for inference.
            num_features: Maximum number of words reported.

        Returns:
            On success, a dict with ``'method'``, ``'feature_importance'``
            (list of ``(word, score)``), ``'explanation'`` (the LIME object)
            and ``'explained_label'`` (index of the explained class).
            On failure, ``{'method': 'LIME', 'error': <message>}``.
        """
        try:
            predict_fn = ExplainabilityAnalyzer.create_prediction_function(model, tokenizer, device)

            # Probe the classifier once to learn how many classes it emits.
            n_classes = len(predict_fn([text])[0])
            class_names = (['Negative', 'Neutral', 'Positive'] if n_classes == 3
                           else ['Negative', 'Positive'])
            # LIME explains label 1 by default, which is 'Neutral' for a
            # three-class model; always explain the positive class instead.
            positive_label = n_classes - 1

            explainer = LimeTextExplainer(class_names=class_names)
            explanation = explainer.explain_instance(
                text,
                predict_fn,
                labels=(positive_label,),
                num_features=int(num_features),
                num_samples=100
            )
            feature_importance = explanation.as_list(label=positive_label)
            return {
                'method': 'LIME',
                'feature_importance': feature_importance,
                'explanation': explanation,
                'explained_label': positive_label
            }
        except Exception as e:
            logger.error(f"LIME analysis failed: {e}")
            return {'method': 'LIME', 'error': str(e)}

    @staticmethod
    def _token_attention_scores(attentions):
        """Collapse per-layer attention maps into one score per token.

        Args:
            attentions: Sequence of tensors, one per layer. Per the
                transformers documentation each is shaped
                ``(batch, heads, query, key)``.

        Returns:
            1-D NumPy array with one value per key position: the attention
            that position receives, averaged over layers, batch, heads and
            query positions.
        """
        stacked = torch.stack(tuple(attentions))  # (layers, batch, heads, query, key)
        # The query axis must be reduced as well. Stopping at (0, 1, 2)
        # leaves a (query, key) matrix whose rows cannot be turned into a
        # single float per token.
        return stacked.mean(dim=(0, 1, 2, 3)).cpu().numpy()

    @staticmethod
    def analyze_with_attention(text: str, model, tokenizer, device) -> Dict:
        """Compute an average attention score for every token of *text*.

        Args:
            text: Text to analyse.
            model: Model that accepts ``output_attentions=True``.
            tokenizer: Tokenizer matching *model*.
            device: Device used for inference.

        Returns:
            On success, a dict with ``'method'``, ``'tokens'`` and
            ``'attention_weights'`` (list of ``(token, score)``).
            On failure, ``{'method': 'Attention', 'error': <message>}``.
        """
        try:
            inputs = tokenizer(text, return_tensors="pt", padding=True,
                               truncation=True, max_length=config.MAX_TEXT_LENGTH,
                               return_attention_mask=True).to(device)
            with torch.no_grad():
                outputs = model(**inputs, output_attentions=True)
                attentions = outputs.attentions
            if attentions is None:
                # Reported through the error dict by the handler below.
                raise ValueError("model did not return attention weights")

            tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
            scores = ExplainabilityAnalyzer._token_attention_scores(attentions)
            attention_weights = [(token, float(score)) for token, score in zip(tokens, scores)]
            return {
                'method': 'Attention',
                'tokens': tokens,
                'attention_weights': attention_weights
            }
        except Exception as e:
            logger.error(f"Attention analysis failed: {e}")
            return {'method': 'Attention', 'error': str(e)}
class AdvancedVisualizer:
    """Plotly figures for the LIME and attention explainability results."""

    @staticmethod
    def _message_figure(message: str) -> go.Figure:
        """Return an otherwise empty figure showing *message* as an annotation."""
        fig = go.Figure()
        fig.add_annotation(text=message, x=0.5, y=0.5, showarrow=False)
        return fig

    @staticmethod
    def create_lime_plot(lime_result: Dict, theme: str = 'default') -> go.Figure:
        """Create a horizontal bar chart of LIME feature importances.

        Args:
            lime_result: Dict with either an ``'error'`` message or a
                ``'feature_importance'`` list of ``(feature, score)`` pairs.
            theme: Accepted for signature compatibility; not read here.

        Returns:
            A bar chart with negative scores in red and the rest in green,
            or a message-only figure when *lime_result* carries an error or
            no features.
        """
        if 'error' in lime_result:
            return AdvancedVisualizer._message_figure(f"LIME Error: {lime_result['error']}")

        feature_importance = lime_result.get('feature_importance') or []
        if not feature_importance:
            # zip(*[]) yields nothing to unpack, so bail out with a message.
            return AdvancedVisualizer._message_figure("LIME returned no features to plot")

        features, scores = zip(*feature_importance)
        colors = ['red' if score < 0 else 'green' for score in scores]
        fig = go.Figure(data=[
            go.Bar(
                y=features,
                x=scores,
                orientation='h',
                marker_color=colors,
                text=[f'{score:.3f}' for score in scores],
                textposition='auto'
            )
        ])
        fig.update_layout(
            title="LIME Feature Importance",
            xaxis_title="Importance Score",
            yaxis_title="Features",
            height=400,
            showlegend=False
        )
        return fig

    @staticmethod
    def create_attention_plot(attention_result: Dict, theme: str = 'default') -> go.Figure:
        """Create a bar chart of per-token attention weights.

        Args:
            attention_result: Dict with either an ``'error'`` message or an
                ``'attention_weights'`` list of ``(token, weight)`` pairs.
            theme: Accepted for signature compatibility; not read here.

        Returns:
            A bar chart of min-max normalised weights, one bar per token
            position, or a message-only figure when *attention_result*
            carries an error or no weights.
        """
        if 'error' in attention_result:
            return AdvancedVisualizer._message_figure(
                f"Attention Error: {attention_result['error']}"
            )

        pairs = attention_result.get('attention_weights') or []
        if not pairs:
            return AdvancedVisualizer._message_figure("No attention weights to plot")

        tokens, weights = zip(*pairs)
        weights = np.asarray(weights, dtype=float)
        # Min-max normalise for display; constant weights are left as-is to
        # avoid dividing by zero.
        span = weights.max() - weights.min()
        normalized_weights = (weights - weights.min()) / span if span > 0 else weights

        fig = go.Figure(data=[
            go.Bar(
                x=list(range(len(tokens))),
                y=normalized_weights,
                text=list(tokens),
                textposition='outside',
                # The colorscale belongs to the marker; go.Bar has no
                # top-level `colorscale` property and rejects it.
                marker=dict(color=normalized_weights, colorscale='Viridis')
            )
        ])
        fig.update_layout(
            title="Attention Weights",
            xaxis_title="Token Position",
            yaxis_title="Attention Weight (Normalized)",
            height=400,
            showlegend=False
        )
        return fig
     """Enhanced visualizations with Plotly"""
     @staticmethod
         logger.error(f"Batch analysis failed: {e}")
         return f"Error: {str(e)}", None, None, None
+def analyze_advanced_text(text: str, language: str, theme: str, use_lime: bool,
+                         use_attention: bool, lime_features: int):
+    """Advanced analysis with SHAP and LIME explainability"""
     try:
         if not text.strip():
+            return "Please enter text", None, None, None, None
         # Map display names back to language codes
         language_map = {
         }
         language_code = language_map.get(language, 'auto')
+        # Basic sentiment analysis
         result = SentimentAnalyzer.analyze_text(text, language_code)
+        # Get model for explainability analysis
+        model, tokenizer = model_manager.get_model(language_code)
+        # Initialize explainability results
+        lime_result = None
+        attention_result = None
+        lime_plot = None
+        attention_plot = None
+        # LIME Analysis
+        if use_lime:
+            lime_result = ExplainabilityAnalyzer.analyze_with_lime(
+                text, model, tokenizer, model_manager.device, lime_features
+            )
+            lime_plot = AdvancedVisualizer.create_lime_plot(lime_result, theme)
+        # Attention Analysis
+        if use_attention:
+            attention_result = ExplainabilityAnalyzer.analyze_with_attention(
+                text, model, tokenizer, model_manager.device
+            )
+            attention_plot = AdvancedVisualizer.create_attention_plot(attention_result, theme)
         # Add to history
         history_entry = {
             'language': result['language'],
             'timestamp': datetime.now().isoformat(),
             'analysis_type': 'advanced',
+            'explainability_used': use_lime or use_attention
         }
         history_manager.add_entry(history_entry)
+        # Create basic visualizations
         gauge_fig = PlotlyVisualizer.create_sentiment_gauge(result, theme)
         bars_fig = PlotlyVisualizer.create_probability_bars(result, theme)
         # Create detailed info text
         info_text = f"""
 **Advanced Analysis Results:**
 - **Sentiment:** {result['sentiment']} ({result['confidence']:.3f} confidence)
 - **Language:** {result['language'].upper()}
 - **Text Statistics:**
   - Words: {result['word_count']}
   - Characters: {result['char_count']}
   - Average word length: {result['char_count']/max(result['word_count'], 1):.1f}
+- **Keywords:** {', '.join(result['keywords'])}
+**Explainability Analysis:**
         """
+        if use_lime:
+            if 'error' not in lime_result:
+                info_text += f"\n- **LIME:** ✅ Analyzed top {lime_features} features"
+            else:
+                info_text += f"\n- **LIME:** ❌ Error occurred"
+        if use_attention:
+            if 'error' not in attention_result:
+                info_text += f"\n- **Attention:** ✅ Token-level attention weights computed"
+            else:
+                info_text += f"\n- **Attention:** ❌ Error occurred"
+        return info_text, gauge_fig, bars_fig, lime_plot, attention_plot
     except Exception as e:
         logger.error(f"Advanced analysis failed: {e}")
+        return f"Error: {str(e)}", None, None, None, None
 def get_history_stats():
     """Get enhanced history statistics"""
     return summary_text
+# Sample data
 SAMPLE_TEXTS = [
     # Auto Detect
+    ["The film had its moments, but overall it felt a bit too long and lacked emotional depth."],
     # English
+    ["I was completely blown away by the movie — the performances were raw and powerful, and the story stayed with me long after the credits rolled."],
     # Chinese
+    ["这部电影节奏拖沓，剧情老套，完全没有让我产生任何共鸣，是一次失望的观影体验。"],
     # Spanish
+    ["Una obra maestra del cine contemporáneo, con actuaciones sobresalientes, un guion bien escrito y una dirección impecable."],
     # French
+    ["Je m'attendais à beaucoup mieux. Le scénario était confus, les dialogues ennuyeux, et je me suis presque endormi au milieu du film."],
     # German
+    ["Der Film war ein emotionales Erlebnis mit großartigen Bildern, einem mitreißenden Soundtrack und einer Geschichte, die zum Nachdenken anregt."],
     # Swedish
+    ["Filmen var en besvikelse – tråkig handling, överdrivet skådespeleri och ett slut som inte gav något avslut alls."]
 ]
+BATCH_SAMPLE = """I love this product! It works perfectly.
+The service was terrible and slow.
+Not sure if I like it or not.
+Amazing quality and fast delivery!
+Could be better, but it's okay."""
 # Gradio Interface
 with gr.Blocks(theme=gr.themes.Soft(), title="Advanced Multilingual Sentiment Analyzer") as demo:
             gauge_plot = gr.Plot(label="Sentiment Gauge")
             bars_plot = gr.Plot(label="Probability Distribution")
+    with gr.Tab("🔬 Advanced Analysis"):
+        with gr.Row():
+            with gr.Column(scale=2):
+                advanced_input = gr.Textbox(
+                    label="Text for Advanced Analysis",
+                    placeholder="Enter text for explainability analysis...",
+                    lines=4
+                )
+                with gr.Row():
+                    advanced_language = gr.Dropdown(
+                        choices=['Auto Detect', 'English', 'Chinese', 'Spanish', 'French', 'German', 'Swedish'],
+                        value='Auto Detect',
+                        label="Language"
+                    )
+                    advanced_theme = gr.Dropdown(
+                        choices=list(config.THEMES.keys()),
+                        value='default',
+                        label="Theme"
+                    )
+                gr.Markdown("### 🔍 Explainability Options")
+                with gr.Row():
+                    use_lime = gr.Checkbox(label="Use LIME Analysis", value=True)
+                    use_attention = gr.Checkbox(label="Use Attention Weights", value=True)
+                lime_features = gr.Slider(
+                    minimum=5,
+                    maximum=20,
+                    value=10,
+                    step=1,
+                    label="LIME Features Count"
+                )
+                advanced_analyze_btn = gr.Button("🔬 Advanced Analyze", variant="primary", size="lg")
+            with gr.Column(scale=1):
+                advanced_result_info = gr.Markdown("Configure explainability settings and click Advanced Analyze")
+        with gr.Row():
+            advanced_gauge_plot = gr.Plot(label="Sentiment Gauge")
+            advanced_bars_plot = gr.Plot(label="Probability Distribution")
+        with gr.Row():
+            lime_plot = gr.Plot(label="LIME Feature Importance")
+            attention_plot = gr.Plot(label="Attention Weights")
     with gr.Tab("📊 Batch Analysis"):
         with gr.Row():
             with gr.Column(scale=2):
             batch_summary_plot = gr.Plot(label="Sentiment Summary")
             batch_confidence_plot = gr.Plot(label="Confidence Distribution")
     with gr.Tab("📈 History & Analytics"):
         with gr.Row():
             with gr.Column():
     # Advanced Analysis
     advanced_analyze_btn.click(
         analyze_advanced_text,
+        inputs=[advanced_input, advanced_language, advanced_theme, use_lime, use_attention, lime_features],
+        outputs=[advanced_result_info, advanced_gauge_plot, advanced_bars_plot, lime_plot, attention_plot]
     )
     # History & Analytics