AIEcosystem committed on
Commit
27fe016
·
verified ·
1 Parent(s): b5938f9

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +42 -7
src/streamlit_app.py CHANGED
@@ -83,42 +83,77 @@ def highlight_entities(text, df_entities):
83
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
84
  # Use a div to mimic the Streamlit input box style for the report
85
  return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
 
 
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
    """
    Perform basic LDA topic modeling on the extracted entity texts.

    Parameters
    ----------
    df_entities : pd.DataFrame
        Must contain a 'text' column holding entity strings
        (assumed schema — confirm against the caller).
    num_topics : int
        Number of LDA components to fit.
    num_top_words : int
        Maximum number of top words reported per topic.

    Returns
    -------
    pd.DataFrame | None
        Long-form frame with 'Topic_ID', 'Word', 'Weight' columns,
        or None when fewer than two documents exist or fitting fails.
    """
    # Each unique entity string is treated as one (short) document.
    documents = df_entities['text'].unique().tolist()

    # LDA needs at least two documents to produce a meaningful split.
    if len(documents) < 2:
        return None

    N = min(num_top_words, len(documents))

    try:
        # BUG FIX: the original passed max_df=1 — an *integer* — which tells
        # sklearn to discard every term appearing in MORE than one document.
        # Combined with min_df=1 that prunes all shared vocabulary and
        # frequently raises "empty vocabulary" on real input.  max_df=1.0
        # (a *proportion*) keeps all terms, which is the evident intent.
        tfidf_vectorizer = TfidfVectorizer(
            max_df=1.0,
            min_df=1,
            stop_words='english'
        )
        tfidf = tfidf_vectorizer.fit_transform(documents)
        tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

        lda = LatentDirichletAllocation(
            n_components=num_topics, max_iter=5, learning_method='online',
            random_state=42, n_jobs=-1
        )
        lda.fit(tfidf)

        topic_data_list = []
        for topic_idx, topic in enumerate(lda.components_):
            # argsort is ascending; this slice takes the N largest weights
            # in descending order.
            top_words_indices = topic.argsort()[:-N - 1:-1]
            top_words = [tfidf_feature_names[i] for i in top_words_indices]
            word_weights = [topic[i] for i in top_words_indices]

            for word, weight in zip(top_words, word_weights):
                topic_data_list.append({
                    'Topic_ID': f'Topic #{topic_idx + 1}',
                    'Word': word,
                    'Weight': weight,
                })

        return pd.DataFrame(topic_data_list)

    except Exception as e:
        # Surface the failure in the Streamlit UI rather than crashing the app.
        st.error(f"Topic modeling failed: {e}")
        return None
 
 
 
 
122
 
123
  def create_topic_word_bubbles(df_topic_data):
124
  """Generates a Plotly Bubble Chart for top words across
 
83
  highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
84
  # Use a div to mimic the Streamlit input box style for the report
85
  return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
    """
    Perform LDA topic modeling on the extracted entity texts, allowing
    n-grams (1-3) so multi-word entities like 'Dr. Emily Carter' can
    surface as single vocabulary tokens.

    Parameters
    ----------
    df_entities : pd.DataFrame
        Must contain a 'text' column holding entity strings
        (assumed schema — confirm against the caller).
    num_topics : int
        Number of LDA components to fit.
    num_top_words : int
        Maximum number of top words reported per topic.

    Returns
    -------
    pd.DataFrame | None
        Long-form frame with 'Topic_ID', 'Word', 'Weight' columns, or
        None when there are too few documents/terms or fitting fails.
    """
    # 1. Prepare documents: each unique entity is one short, clean document.
    documents = df_entities['text'].unique().tolist()

    if len(documents) < 2:
        return None

    N = min(num_top_words, len(documents))

    try:
        # 2. First pass: prune rare and ubiquitous terms (min_df=2,
        # max_df=0.95) while allowing unigrams through trigrams.
        #
        # BUG FIX: with min_df=2, fit_transform itself raises
        # ValueError ("After pruning, no terms remain") on small corpora.
        # The original code let that escape to the outer broad `except`,
        # which made its own min_df=1 fallback unreachable.  Catch the
        # pruning error here and route into the fallback instead.
        try:
            tfidf_vectorizer = TfidfVectorizer(
                max_df=0.95,
                min_df=2,
                stop_words='english',
                ngram_range=(1, 3)
            )
            tfidf = tfidf_vectorizer.fit_transform(documents)
            tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
        except ValueError:
            # Everything was pruned away — force the permissive fallback.
            tfidf_feature_names = []

        # Re-run with min_df=1 if the vocabulary is too small for num_topics.
        if len(tfidf_feature_names) < num_topics:
            tfidf_vectorizer = TfidfVectorizer(
                max_df=1.0, min_df=1, stop_words='english', ngram_range=(1, 3)
            )
            tfidf = tfidf_vectorizer.fit_transform(documents)
            tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
            if len(tfidf_feature_names) < num_topics:
                return None

        # 3. LDA model fit.
        lda = LatentDirichletAllocation(
            n_components=num_topics, max_iter=5, learning_method='online',
            random_state=42, n_jobs=-1
        )
        lda.fit(tfidf)

        # 4. Extract topic data.  Top "words" may now be phrases such as
        # 'emily carter' thanks to the n-gram range above.
        topic_data_list = []
        for topic_idx, topic in enumerate(lda.components_):
            # argsort is ascending; this slice takes the N largest weights
            # in descending order.
            top_words_indices = topic.argsort()[:-N - 1:-1]
            top_words = [tfidf_feature_names[i] for i in top_words_indices]
            word_weights = [topic[i] for i in top_words_indices]

            for word, weight in zip(top_words, word_weights):
                topic_data_list.append({
                    'Topic_ID': f'Topic #{topic_idx + 1}',
                    'Word': word,
                    'Weight': weight,
                })

        return pd.DataFrame(topic_data_list)

    except Exception:
        # Topic modeling is best-effort in this app: any unexpected failure
        # degrades gracefully to "no topics" rather than crashing the page.
        return None
 
158
  def create_topic_word_bubbles(df_topic_data):
159
  """Generates a Plotly Bubble Chart for top words across