Update src/streamlit_app.py

src/streamlit_app.py  CHANGED  (+42 -7)
@@ -83,42 +83,77 @@ def highlight_entities(text, df_entities):
         highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
     # Use a div to mimic the Streamlit input box style for the report
     return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
+
+
 def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
     """
-    Performs basic Topic Modeling using LDA on the extracted entities
-
+    Performs basic Topic Modeling using LDA on the extracted entities,
+    allowing for n-grams to capture multi-word entities like 'Dr. Emily Carter'.
     """
+    # 1. Prepare Documents: Use unique entities (they are short, clean documents)
     documents = df_entities['text'].unique().tolist()
+
     if len(documents) < 2:
         return None
+
     N = min(num_top_words, len(documents))
+
     try:
+        # 2. Vectorizer: Use TfidfVectorizer, but allow unigrams, bigrams, and trigrams (ngram_range)
+        # to capture multi-word entities. We keep stop_words='english' for the *components* of the entity.
         tfidf_vectorizer = TfidfVectorizer(
-            max_df=
-            min_df=
-            stop_words='english'
+            max_df=0.95,
+            min_df=2,  # Only consider words/phrases that appear at least twice to find topics
+            stop_words='english',
+            ngram_range=(1, 3)  # This is the KEY to capturing "Dr. Emily Carter" as a single token (if it appears enough times)
         )
+
         tfidf = tfidf_vectorizer.fit_transform(documents)
         tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
+
+        # Check if the vocabulary is too small after tokenization/ngram generation
+        if len(tfidf_feature_names) < num_topics:
+            # Re-run with min_df=1 if vocab is too small
+            tfidf_vectorizer = TfidfVectorizer(
+                max_df=1.0, min_df=1, stop_words='english', ngram_range=(1, 3)
+            )
+            tfidf = tfidf_vectorizer.fit_transform(documents)
+            tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
+            if len(tfidf_feature_names) < num_topics:
+                return None
+
+        # 3. LDA Model Fit
         lda = LatentDirichletAllocation(
-            n_components=num_topics, max_iter=5, learning_method='online',
+            n_components=num_topics, max_iter=5, learning_method='online',
+            random_state=42, n_jobs=-1
         )
         lda.fit(tfidf)
+
+        # 4. Extract Topic Data
         topic_data_list = []
         for topic_idx, topic in enumerate(lda.components_):
             top_words_indices = topic.argsort()[:-N - 1:-1]
+            # These top_words will now include phrases like 'emily carter' or 'european space agency'
            top_words = [tfidf_feature_names[i] for i in top_words_indices]
            word_weights = [topic[i] for i in top_words_indices]
+
            for word, weight in zip(top_words, word_weights):
                topic_data_list.append({
                    'Topic_ID': f'Topic #{topic_idx + 1}',
                    'Word': word,
                    'Weight': weight,
                })
+
        return pd.DataFrame(topic_data_list)
+
     except Exception as e:
-
+        # A broader catch for robustness
+        # st.error(f"Topic modeling failed: {e}")  # Keep commented out for cleaner app
         return None
+
+
+
+

 def create_topic_word_bubbles(df_topic_data):
     """Generates a Plotly Bubble Chart for top words across
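
The top-word extraction relies on the slice topic.argsort()[:-N - 1:-1], which is easy to misread. A tiny self-contained check (weights invented for illustration) confirms it yields the indices of the N largest weights in descending order:

import numpy as np

N = 2
topic = np.array([0.1, 0.7, 0.3, 0.9])  # invented per-word weights for one topic
# argsort() sorts ascending; stepping backwards with [:-N - 1:-1] takes the
# last N entries in reverse, i.e. the indices of the N largest weights.
print(topic.argsort()[:-N - 1:-1])      # [3 1] -> weights 0.9, then 0.7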
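
To exercise the change outside Streamlit, the vectorizer and LDA settings from the diff can be run end to end. This is a minimal sketch, not the app itself: the entity strings are invented, and it uses the diff's fallback parameters (max_df=1.0, min_df=1) because the toy corpus is too small for the primary min_df=2 path.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Invented stand-ins for df_entities['text'].unique().tolist()
documents = [
    'Dr. Emily Carter', 'Emily Carter', 'European Space Agency',
    'Space Agency budget', 'Mars rover landing', 'Mars rover camera',
]

# Fallback parameters from the diff; the primary path uses max_df=0.95, min_df=2
vectorizer = TfidfVectorizer(max_df=1.0, min_df=1,
                             stop_words='english', ngram_range=(1, 3))
tfidf = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

# Same LDA settings as the diff (n_jobs omitted for portability)
lda = LatentDirichletAllocation(n_components=2, max_iter=5,
                                learning_method='online', random_state=42)
lda.fit(tfidf)

# Print the top words/phrases per topic, mirroring the loop in the diff
N = 5
for topic_idx, topic in enumerate(lda.components_):
    top = topic.argsort()[:-N - 1:-1]
    print(f'Topic #{topic_idx + 1}:', [feature_names[i] for i in top])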