AIEcosystem committed on
Commit
758f35b
·
verified ·
1 Parent(s): 0b61c41

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +620 -227
src/streamlit_app.py CHANGED
@@ -2,263 +2,656 @@ import os
2
  os.environ['HF_HOME'] = '/tmp'
3
  import time
4
  import streamlit as st
 
5
  import pandas as pd
6
  import io
7
  import plotly.express as px
8
- import zipfile
 
 
9
  import string
10
- from cryptography.fernet import Fernet
11
- from streamlit_extras.stylable_container import stylable_container
12
- from typing import Optional
 
 
 
 
 
 
 
 
 
13
  from gliner import GLiNER
14
- from comet_ml import Experiment
15
-
16
- # --- Page Configuration and UI Elements ---
17
- st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
18
- st.subheader("DataHarvest", divider="violet")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
20
- st.markdown(':rainbow[**Supported Languages: English**]')
21
-
22
- expander = st.expander("**Important notes**")
23
- expander.write("""**Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
24
 
25
- **Results:** Results are presented in easy-to-read tables, visualized in an interactive tree map, pie chart and bar chart, and are available for download along with a Glossary of tags.
26
 
27
- **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.
28
 
29
- **Usage Limits:** You can request results unlimited times for one (1) month.
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.""")
 
 
 
 
 
 
 
 
32
 
33
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
34
 
35
- with st.sidebar:
36
- st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
37
- code = '''
38
- <iframe
39
- src="https://aiecosystem-dataharvest.hf.space"
40
- frameborder="0"
41
- width="850"
42
- height="450"
43
- ></iframe>
44
-
45
- '''
46
- st.code(code, language="html")
47
- st.text("")
48
- st.text("")
49
-
50
- st.subheader("🚀 Ready to build your own AI Web App?", divider="violet")
51
- st.link_button("AI Web App Builder", "https://nlpblogs.com/build-your-named-entity-recognition-app/", type="primary")
52
-
53
- # --- Comet ML Setup ---
54
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
55
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
56
  COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
57
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
58
- if not comet_initialized:
59
- st.warning("Comet ML not initialized. Check environment variables.")
60
- print("Warning: Comet ML environment variables are not set. Logging will be disabled.")
61
-
62
- # --- Label Definitions ---
63
- labels = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
64
- category_mapping = {
65
- "People": ["person", "organization", "position"],
66
- "Locations": ["country", "city"],
67
- "Time": ["date", "time"],
68
- "Numbers": ["money", "cardinal"]
69
- }
70
-
71
  # --- Model Loading ---
72
  @st.cache_resource
73
  def load_ner_model():
74
- """Loads the GLiNER model and caches it."""
75
- try:
76
- return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
77
- except Exception as e:
78
- st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
79
- st.stop()
80
  model = load_ner_model()
81
- reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
82
-
83
- # --- Session State Initialization ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  if 'show_results' not in st.session_state:
85
- st.session_state.show_results = False
86
  if 'last_text' not in st.session_state:
87
- st.session_state.last_text = ""
88
  if 'results_df' not in st.session_state:
89
- st.session_state.results_df = pd.DataFrame()
90
  if 'elapsed_time' not in st.session_state:
91
- st.session_state.elapsed_time = 0.0
92
-
 
 
 
 
 
 
 
 
 
 
 
 
93
  # --- Text Input and Clear Button ---
94
- word_limit = 200
95
- text = st.text_area(f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter", height=250, key='my_text_area')
 
 
 
 
96
  word_count = len(text.split())
97
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
98
-
99
- def clear_text():
100
- """Clears the text area and hides results."""
101
- st.session_state['my_text_area'] = ""
102
- st.session_state.show_results = False
103
- st.session_state.last_text = ""
104
- st.session_state.results_df = pd.DataFrame()
105
- st.session_state.elapsed_time = 0.0
106
-
107
  st.button("Clear text", on_click=clear_text)
108
-
109
- # --- Post-processing function to remove trailing punctuation ---
110
- def remove_trailing_punctuation(text_string):
111
- """
112
- Removes trailing punctuation from a string.
113
-
114
- Args:
115
- text_string (str): The input string.
116
-
117
- Returns:
118
- str: The string with trailing punctuation removed.
119
- """
120
- return text_string.rstrip(string.punctuation)
121
-
122
- # --- Results Section ---
123
  if st.button("Results"):
124
- if not text.strip():
125
- st.warning("Please enter some text to extract entities.")
126
- st.session_state.show_results = False
127
- elif word_count > word_limit:
128
- st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
129
- st.session_state.show_results = False
130
- else:
131
- # Check if the text is different from the last time
132
- if text != st.session_state.last_text:
133
- st.session_state.show_results = True
134
- st.session_state.last_text = text
135
- start_time = time.time()
136
- with st.spinner("Extracting entities...", show_time=True):
137
- # Pass the raw text directly to the model
138
- entities = model.predict_entities(text, labels)
139
- df = pd.DataFrame(entities)
140
-
141
- # Apply post-processing to remove punctuation
142
- if not df.empty:
143
- df['text'] = df['text'].apply(remove_trailing_punctuation)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
 
145
- st.session_state.results_df = df
146
- if not df.empty:
147
- df['category'] = df['label'].map(reverse_category_mapping)
148
- if comet_initialized:
149
- experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
150
- experiment.log_parameter("input_text", text)
151
- experiment.log_table("predicted_entities", df)
152
- experiment.end()
153
- end_time = time.time()
154
- st.session_state.elapsed_time = end_time - start_time
155
- # Place the message here, so it only runs once per button click
156
- st.info(f"Results processed in **{st.session_state.elapsed_time:.2f} seconds**.")
157
- # If the text is the same, do nothing but keep results displayed
158
- else:
159
- st.session_state.show_results = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
- # Display results if the state variable is True
162
- if st.session_state.show_results:
163
- df = st.session_state.results_df
164
- if not df.empty:
165
- df['category'] = df['label'].map(reverse_category_mapping)
166
- st.subheader("Grouped Entities by Category", divider="violet")
167
-
168
- category_names = sorted(list(category_mapping.keys()))
169
- category_tabs = st.tabs(category_names)
170
-
171
- for i, category_name in enumerate(category_names):
172
- with category_tabs[i]:
173
- df_category_filtered = df[df['category'] == category_name]
174
- if not df_category_filtered.empty:
175
- st.dataframe(df_category_filtered.drop(columns=['category']), use_container_width=True)
176
- else:
177
- st.info(f"No entities found for the '{category_name}' category.")
178
-
179
- with st.expander("See Glossary of tags"):
180
- st.write('''
181
- - **start**: ['index of the start of the corresponding entity']
182
- - **end**: ['index of the end of the corresponding entity']
183
- - **text**: ['entity extracted from your text data']
184
- - **label**: ['label (tag) assigned to a given extracted entity']
185
- - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
186
-
187
- ''')
188
-
189
- st.divider()
190
- # Tree map
191
- st.subheader("Tree map", divider="violet")
192
- fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'label', 'text'], values='score', color='category')
193
- fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
194
- expander = st.expander("**Download**")
195
- expander.write("""You can easily download the tree map by hovering over it. Look for the download icon that appears in the top right corner.
196
- """)
197
- st.plotly_chart(fig_treemap)
198
-
199
- # Pie and Bar charts
200
- grouped_counts = df['category'].value_counts().reset_index()
201
- grouped_counts.columns = ['category', 'count']
202
- col1, col2 = st.columns(2)
203
-
204
- with col1:
205
- st.subheader("Pie chart", divider="violet")
206
- fig_pie = px.pie(grouped_counts, values='count', names='category', hover_data=['count'], labels={'count': 'count'}, title='Percentage of predicted categories')
207
- fig_pie.update_traces(textposition='inside', textinfo='percent+label')
208
- expander = st.expander("**Download**")
209
- expander.write("""You can easily download the pie chart by hovering over it. Look for the download icon that appears in the top right corner.
210
- """)
211
- st.plotly_chart(fig_pie)
212
-
213
- with col2:
214
- st.subheader("Bar chart", divider="violet")
215
- fig_bar = px.bar(grouped_counts, x="count", y="category", color="category", text_auto=True, title='Occurrences of predicted categories')
216
- expander = st.expander("**Download**")
217
- expander.write("""You can easily download the bar chart by hovering over it. Look for the download icon that appears in the top right corner.
218
- """)
219
- st.plotly_chart(fig_bar)
220
-
221
- # Most Frequent Entities
222
- st.subheader("Most Frequent Entities", divider="violet")
223
- word_counts = df['text'].value_counts().reset_index()
224
- word_counts.columns = ['Entity', 'Count']
225
- repeating_entities = word_counts[word_counts['Count'] > 1]
226
-
227
- if not repeating_entities.empty:
228
- st.dataframe(repeating_entities, use_container_width=True)
229
- fig_repeating_bar = px.bar(repeating_entities, x='Entity', y='Count', color='Entity')
230
- fig_repeating_bar.update_layout(xaxis={'categoryorder': 'total descending'})
231
- expander = st.expander("**Download**")
232
- expander.write("""You can easily download the bar chart by hovering over it. Look for the download icon that appears in the top right corner.
233
- """)
234
- st.plotly_chart(fig_repeating_bar)
235
- else:
236
- st.warning("No entities were found that occur more than once.")
237
-
238
- # Download Section
239
- st.divider()
240
- dfa = pd.DataFrame(data={'Column Name': ['start', 'end', 'text', 'label', 'score'],
241
- 'Description': ['index of the start of the corresponding entity', 'index of the end of the corresponding entity', 'entity extracted from your text data', 'label (tag) assigned to a given extracted entity', 'accuracy score; how accurately a tag has been assigned to a given entity']})
242
-
243
- buf = io.BytesIO()
244
- with zipfile.ZipFile(buf, "w") as myzip:
245
- myzip.writestr("Summary of the results.csv", df.to_csv(index=False))
246
- myzip.writestr("Most Frequent Entities.csv", repeating_entities.to_csv(index=False))
247
- myzip.writestr("Glossary of tags.csv", dfa.to_csv(index=False))
248
-
249
- with stylable_container(
250
- key="download_button",
251
- css_styles="""button { background-color: #8A2BE2; border: 1px solid black; padding: 5px; color: white; }""",
252
- ):
253
- st.download_button(
254
- label="Download results and glossary (zip)",
255
- data=buf.getvalue(),
256
- file_name="nlpblogs_results.zip",
257
- mime="application/zip"
258
- )
259
- st.text("")
260
- st.text("")
261
- else:
262
- st.warning("No entities were found in the provided text.")
263
 
264
-
 
2
  os.environ['HF_HOME'] = '/tmp'
3
  import time
4
  import streamlit as st
5
+ import streamlit.components.v1 as components
6
  import pandas as pd
7
  import io
8
  import plotly.express as px
9
+ import plotly.graph_objects as go
10
+ import numpy as np
11
+ import re
12
  import string
13
+ import json
14
+ # --- PPTX Imports ---
15
+ from io import BytesIO
16
+ from pptx import Presentation
17
+ from pptx.util import Inches, Pt
18
+ from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
19
+ import plotly.io as pio # Required for image export
20
+ # ---------------------------
21
+ # --- Stable Scikit-learn LDA Imports ---
22
+ from sklearn.feature_extraction.text import TfidfVectorizer
23
+ from sklearn.decomposition import LatentDirichletAllocation
24
+ # ------------------------------
25
  from gliner import GLiNER
26
+ from streamlit_extras.stylable_container import stylable_container
27
+ # Using a try/except for comet_ml import
28
try:
    from comet_ml import Experiment
except ImportError:
    class Experiment:
        """No-op stand-in used when ``comet_ml`` is not installed.

        It exposes the methods this app calls on a Comet experiment and
        ignores every argument, so logging calls become silent no-ops.
        All methods accept positional and keyword arguments so that either
        call style works against the stub.
        """

        def __init__(self, *args, **kwargs):
            pass

        def log_parameter(self, *args, **kwargs):
            pass

        def log_table(self, *args, **kwargs):
            pass

        def end(self, *args, **kwargs):
            pass
36
+ # --- Model Home Directory (Fix for deployment environments) ---
37
+ # Set HF_HOME environment variable to a writable path
38
+ os.environ['HF_HOME'] = '/tmp'
39
+ # --- Color Map for Highlighting and Network Graph Nodes ---
40
# Hex colour used for each entity label (text highlighting and graph nodes).
entity_color_map = dict(
    person="#10b981",
    country="#3b82f6",
    city="#4ade80",
    organization="#f59e0b",
    date="#8b5cf6",
    time="#ec4899",
    cardinal="#06b6d4",
    money="#f43f5e",
    position="#a855f7",
)
# --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
# The label list follows the insertion order of the colour map.
labels = [*entity_color_map]
# Display category -> entity labels that belong to it.
category_mapping = dict(
    People=["person", "organization", "position"],
    Locations=["country", "city"],
    Time=["date", "time"],
    Numbers=["money", "cardinal"],
)
# Inverse lookup: entity label -> display category.
reverse_category_mapping = {
    member: group
    for group, members in category_mapping.items()
    for member in members
}
59
+ # --- Utility Functions for Analysis and Plotly ---
60
def extract_label(node_name):
    """Return the label found in the trailing parentheses of *node_name*.

    For a node string such as ``'Text (Label)'`` this yields ``'Label'``.
    When the string does not end with a non-empty parenthesised group,
    ``"Unknown"`` is returned.
    """
    found = re.search(r'\(([^)]+)\)$', node_name)
    if found is None:
        return "Unknown"
    return found.group(1)
64
def remove_trailing_punctuation(text_string):
    """Strip every ASCII punctuation character from the end of the string.

    Characters are taken from ``string.punctuation``; leading and interior
    punctuation is left untouched.
    """
    strippable = string.punctuation
    trimmed = text_string.rstrip(strippable)
    return trimmed
67
def highlight_entities(text, df_entities):
    """Generate HTML showing *text* with the extracted entities highlighted.

    Args:
        text: The analysed text.
        df_entities: Entity table with 'start', 'end', 'label' and
            (optionally) 'text' columns; 'start'/'end' are character offsets
            into *text*.

    Returns:
        str: A ``<div style="...">`` block containing the HTML-escaped text
        with one coloured ``<span>`` per highlighted entity. The same wrapper
        is returned when there are no entities.

    Notes:
        * All text and labels are HTML-escaped, since the input is
          user-supplied and ends up in rendered HTML.
        * Entities that overlap an already highlighted span (nested
          entities) or have an empty span are skipped; rewriting them in
          place would corrupt the offsets of the surrounding markup.
        * When the entity's 'text' is a prefix of the source slice (for
          example after trailing punctuation was stripped), only that prefix
          is highlighted and the remainder is kept as plain text, so no
          character of the original text is lost.
    """
    from html import escape  # stdlib; local import keeps the block self-contained

    parts = []
    cursor = 0  # first character of `text` not yet emitted

    if not df_entities.empty:
        # Walk the text left to right; for equal starts the longest span wins.
        ordered = df_entities.sort_values(
            by=['start', 'end'], ascending=[True, False]
        ).to_dict('records')
        for entity in ordered:
            start = int(entity['start'])
            end = int(entity['end'])
            if end <= start or start < cursor:
                continue
            segment = text[start:end]
            shown = str(entity.get('text', segment))
            if not shown or not segment.startswith(shown):
                shown = segment
            label = str(entity['label'])
            color = entity_color_map.get(label, '#000000')
            parts.append(escape(text[cursor:start]))
            # Span with background colour and the label as a tooltip.
            parts.append(
                f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{escape(label)}">{escape(shown)}</span>'
            )
            parts.append(escape(segment[len(shown):]))
            cursor = end

    parts.append(escape(text[cursor:]))
    highlighted_text = "".join(parts)
    # The div mimics the Streamlit input box style for the report.
    return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
86
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
    """
    Run LDA topic modeling over the unique extracted entity texts.

    Each unique value of the 'text' column is treated as one document. The
    documents are TF-IDF vectorised (English stop words removed) and fitted
    with a ``LatentDirichletAllocation`` model.

    Args:
        df_entities: Entity table with a 'text' column.
        num_topics: Number of LDA components.
        num_top_words: Maximum number of highest-weighted words reported per
            topic; capped at the size of the fitted vocabulary.

    Returns:
        pandas.DataFrame with columns 'Topic_ID', 'Word' and 'Weight' (one
        row per topic/word pair), or ``None`` when fewer than two unique
        entity texts are available or when modeling fails. Failures are
        reported to the user through ``st.error``.
    """
    # Missing values cannot be vectorised, so they are not counted as documents.
    documents = df_entities['text'].dropna().unique().tolist()
    if len(documents) < 2:
        return None
    try:
        tfidf_vectorizer = TfidfVectorizer(
            max_df=0.95,
            min_df=1,
            stop_words='english'
        )
        tfidf = tfidf_vectorizer.fit_transform(documents)
        tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
        # The number of reportable words is bounded by the vocabulary, not by
        # the number of documents.
        top_n = min(num_top_words, len(tfidf_feature_names))
        lda = LatentDirichletAllocation(
            n_components=num_topics, max_iter=5, learning_method='online', random_state=42, n_jobs=-1
        )
        lda.fit(tfidf)
        topic_data_list = []
        for topic_idx, topic in enumerate(lda.components_):
            # Indices of the top_n largest weights, largest first.
            top_words_indices = topic.argsort()[::-1][:top_n]
            for word_idx in top_words_indices:
                topic_data_list.append({
                    'Topic_ID': f'Topic #{topic_idx + 1}',
                    'Word': tfidf_feature_names[word_idx],
                    'Weight': topic[word_idx],
                })
        return pd.DataFrame(topic_data_list)
    except Exception as e:
        # Deliberate best-effort boundary: a modeling failure must not take
        # down the page, so it is surfaced in the UI instead of raised.
        st.error(f"Topic modeling failed: {e}")
        return None
122
def create_topic_word_bubbles(df_topic_data):
    """Generate a Plotly bubble chart of the top words across all topics.

    Args:
        df_topic_data: Table with 'Topic_ID', 'Word' and 'Weight' columns
            (the shape returned by ``perform_topic_modeling``), or ``None``.

    Returns:
        A Plotly figure with one bubble per topic/word row (bubble size and
        y position = word weight, colour = topic), or ``None`` when there is
        no data to plot.
    """
    if df_topic_data is None or df_topic_data.empty:
        return None
    # Rename the perform_topic_modeling columns to the lowercase names used
    # below; rename() returns a copy, so the caller's frame is not modified.
    df_plot = df_topic_data.rename(
        columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'}
    ).reset_index(drop=True)
    # Positional x coordinate, independent of whatever index the caller had.
    df_plot['x_pos'] = df_plot.index
    fig = px.scatter(
        df_plot,
        x='x_pos',
        y='weight',
        size='weight',
        color='topic',
        hover_name='word',
        size_max=80,
        title='Topic Word Weights (Bubble Chart)',
        color_discrete_sequence=px.colors.qualitative.Bold,
        labels={
            'x_pos': 'Entity/Word Index',
            'weight': 'Word Weight',
            'topic': 'Topic ID'
        },
        custom_data=['word', 'weight', 'topic']
    )
    fig.update_layout(
        xaxis_title="Entity/Word (Bubble size = Word Weight)",
        yaxis_title="Word Weight",
        # Label each x position with its word so the axis matches its title.
        xaxis={
            'tickangle': -45,
            'showgrid': False,
            'tickmode': 'array',
            'tickvals': df_plot['x_pos'].tolist(),
            'ticktext': df_plot['word'].tolist(),
        },
        yaxis={'showgrid': True},
        showlegend=True,
        plot_bgcolor='#f9f9f9',
        paper_bgcolor='#f9f9f9',
        height=600,
        margin=dict(t=50, b=100, l=50, r=10),
    )
    fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>', marker=dict(line=dict(width=1, color='DarkSlateGrey')))
    return fig
159
def generate_network_graph(df, raw_text):
    """
    Build a network-style Plotly figure of the extracted entities.

    Every distinct entity text becomes one node placed on a jittered circle;
    two nodes are joined by an edge when both entity texts occur in the same
    sentence of *raw_text*.

    Args:
        df: Entity table with 'text', 'label' and 'score' columns.
        raw_text: The text the entities were extracted from.

    Returns:
        plotly.graph_objects.Figure: the network plot, or an empty figure
        with an explanatory title when fewer than two distinct entities exist.

    Notes:
        * A text that was tagged with several labels is drawn once, using
          its first row; node positions are keyed by text and must be unique.
        * Entity matching is case-insensitive and anchored at word
          boundaries, so a short entity does not match inside a longer word.
        * The jitter uses a fixed seed so the layout is stable across reruns.
    """
    from itertools import combinations  # stdlib; local import keeps the block self-contained

    entity_counts = df['text'].value_counts().reset_index()
    entity_counts.columns = ['text', 'frequency']

    # Empty texts are dropped because they would "occur" in every sentence.
    unique_entities = (
        df[df['text'] != '']
        .drop_duplicates(subset='text')
        .merge(entity_counts, on='text')
    )
    if unique_entities.shape[0] < 2:
        return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")

    # Node layout: evenly spaced angles on a circle plus a small jitter.
    num_nodes = len(unique_entities)
    thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
    radius = 10
    rng = np.random.default_rng(42)
    unique_entities['x'] = radius * np.cos(thetas) + rng.normal(0, 0.5, num_nodes)
    unique_entities['y'] = radius * np.sin(thetas) + rng.normal(0, 0.5, num_nodes)
    pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')

    # One compiled whole-word, case-insensitive pattern per entity text.
    entity_patterns = {
        entity_text: re.compile(
            r'(?<!\w)' + re.escape(entity_text) + r'(?!\w)', re.IGNORECASE
        )
        for entity_text in unique_entities['text']
    }

    # Edges: unordered pairs of entities found in the same sentence.
    edges = set()
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
    for sentence in sentences:
        present = sorted(
            entity_text
            for entity_text, pattern in entity_patterns.items()
            if pattern.search(sentence)
        )
        # Pairs drawn from a sorted list are already in canonical order.
        edges.update(combinations(present, 2))

    # Plotly draws disjoint segments from one trace when they are separated
    # by None entries.
    edge_x = []
    edge_y = []
    for n1, n2 in edges:
        edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
        edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])

    fig = go.Figure()
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines',
        name='Co-occurrence Edges',
        showlegend=False
    )
    fig.add_trace(edge_trace)
    fig.add_trace(go.Scatter(
        x=unique_entities['x'],
        y=unique_entities['y'],
        mode='markers+text',
        name='Entities',
        text=unique_entities['text'],
        textposition="top center",
        showlegend=False,
        marker=dict(
            # Node size grows with how often the entity occurs.
            size=unique_entities['frequency'] * 5 + 10,
            color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
            line_width=1,
            line_color='black',
            opacity=0.9
        ),
        textfont=dict(size=10),
        customdata=unique_entities[['label', 'score', 'frequency']],
        hovertemplate=(
            "<b>%{text}</b><br>" +
            "Label: %{customdata[0]}<br>" +
            "Score: %{customdata[1]:.2f}<br>" +
            "Frequency: %{customdata[2]}<extra></extra>"
        )
    ))

    # Legend: one invisible marker trace per label, in first-seen order.
    for label in unique_entities['label'].drop_duplicates():
        fig.add_trace(go.Scatter(
            x=[None], y=[None], mode='markers',
            marker=dict(size=10, color=entity_color_map.get(label, '#cccccc')),
            name=f"{label.capitalize()}", showlegend=True
        ))

    fig.update_layout(
        title='Entity Co-occurrence Network (Edges = Same Sentence)',
        showlegend=True,
        hovermode='closest',
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
        plot_bgcolor='#f9f9f9',
        paper_bgcolor='#f9f9f9',
        margin=dict(t=50, b=10, l=10, r=10),
        height=600
    )
    return fig
255
+ # --- NEW CSV GENERATION FUNCTION ---
256
def generate_entity_csv(df):
    """
    Serialise the extracted entities to CSV in memory.

    The export contains the columns text, label, category, score, start and
    end, in that order, without the index. The returned ``BytesIO`` holds the
    UTF-8 encoded CSV and is positioned at its beginning.
    """
    export_columns = ['text', 'label', 'category', 'score', 'start', 'end']
    csv_bytes = df[export_columns].to_csv(index=False).encode('utf-8')
    # A BytesIO built from initial bytes already starts at offset 0.
    return BytesIO(csv_bytes)
267
+ # -----------------------------------
268
+ # --- Existing App Functionality (HTML) ---
269
def generate_html_report(df, text_input, elapsed_time, df_topic_data):
    """
    Build a standalone HTML report with all analysis results and charts.

    Args:
        df: Entity table with 'text', 'label', 'score', 'start', 'end' and
            'category' columns.
        text_input: The analysed text (shown with highlighted entities).
        elapsed_time: Processing time in seconds, shown in the report header.
        df_topic_data: Topic-modeling table as returned by
            ``perform_topic_modeling``, or ``None`` when unavailable.

    Returns:
        str: The complete HTML document.

    Notes:
        plotly.js is loaded exactly once: the treemap, which is the first
        chart in the document, carries the version-matched CDN ``<script>``
        tag and every later chart is emitted without one. If the charts are
        reordered, the first chart in the page must keep the loader.
    """
    # 1. Generate Visualizations (Plotly HTML)
    # 1a. Treemap (first chart in the page: carries the plotly.js loader)
    fig_treemap = px.treemap(
        df,
        path=[px.Constant("All Entities"), 'category', 'label', 'text'],
        values='score',
        color='category',
        title="Entity Distribution by Category and Label",
        color_discrete_sequence=px.colors.qualitative.Dark24
    )
    fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
    treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
    # 1b. Pie Chart
    grouped_counts = df['category'].value_counts().reset_index()
    grouped_counts.columns = ['Category', 'Count']
    fig_pie = px.pie(grouped_counts, values='Count', names='Category', title='Distribution of Entities by Category', color_discrete_sequence=px.colors.sequential.Cividis)
    fig_pie.update_layout(margin=dict(t=50, b=10))
    pie_html = fig_pie.to_html(full_html=False, include_plotlyjs=False)
    # 1c. Bar Chart (Category Count)
    fig_bar_category = px.bar(grouped_counts, x='Category', y='Count', color='Category', title='Total Entities per Category', color_discrete_sequence=px.colors.qualitative.Pastel)
    fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=100))
    bar_category_html = fig_bar_category.to_html(full_html=False, include_plotlyjs=False)
    # 1d. Bar Chart (Most Frequent Entities)
    word_counts = df['text'].value_counts().reset_index()
    word_counts.columns = ['Entity', 'Count']
    repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
    bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
    if not repeating_entities.empty:
        fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count', color='Entity', title='Top 10 Most Frequent Entities', color_discrete_sequence=px.colors.sequential.Viridis)
        fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=100))
        bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs=False)
    # 1e. Network Graph HTML
    network_fig = generate_network_graph(df, text_input)
    network_html = network_fig.to_html(full_html=False, include_plotlyjs=False)
    # 1f. Topic Charts HTML
    topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
    if df_topic_data is not None and not df_topic_data.empty:
        bubble_figure = create_topic_word_bubbles(df_topic_data)
        if bubble_figure:
            topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs=False)}</div>'
        else:
            topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
    else:
        topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #888888;">'
        topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
        topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
        topic_charts_html += '</div>'
    # 2. Get Highlighted Text (tag the wrapper div so the report CSS applies)
    highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
    # 3. Entity Tables (Pandas to HTML)
    entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
        classes='table table-striped',
        index=False
    )
    # 4. Construct the Final HTML
    # Literal CSS braces are doubled because this is an f-string. No plotly
    # <script> tag in <head>: the treemap block brings its own loader.
    html_content = f"""<!DOCTYPE html><html lang="en"><head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Entity and Topic Analysis Report</title>
<style>
body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
.container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
h1 {{ color: #007bff; border-bottom: 3px solid #007bff; padding-bottom: 10px; margin-top: 0; }}
h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
h3 {{ color: #555; margin-top: 20px; }}
.metadata {{ background-color: #e6f0ff; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
.chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
table th {{ background-color: #f0f0f0; }}
.highlighted-text {{ border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
</style></head><body>
<div class="container">
<h1>Entity and Topic Analysis Report</h1>
<div class="metadata">
<p><strong>Generated on:</strong> {time.strftime('%Y-%m-%d')}</p>
<p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
</div>
<h2>1. Analyzed Text &amp; Extracted Entities</h2>
<h3>Original Text with Highlighted Entities</h3>
<div class="highlighted-text-container">
{highlighted_text_html}
</div>
<h2>2. Full Extracted Entities Table</h2>
{entity_table_html}
<h2>3. Data Visualizations</h2>
<h3>3.1 Entity Distribution Treemap</h3>
<div class="chart-box">{treemap_html}</div>
<h3>3.2 Comparative Charts (Pie, Category Count, Frequency) - *Stacked Vertically*</h3>
<div class="chart-box">{pie_html}</div>
<div class="chart-box">{bar_category_html}</div>
<div class="chart-box">{bar_freq_html}</div>
<h3>3.3 Entity Relationship Map (Edges = Same Sentence)</h3>
<div class="chart-box">{network_html}</div>
<h2>4. Topic Modelling</h2>
{topic_charts_html}
</div></body></html>
"""
    return html_content
376
+ # --- Page Configuration and Styling (No Sidebar) ---
377
st.set_page_config(page_title="NER & Topic Report App", layout="wide")

# Global CSS overrides for the app background, text area, buttons and
# expanders; injected once as raw HTML.
_PAGE_CSS = """
<style>
/* Overall app container - NO SIDEBAR */
.main {
background-color: #f4f4f9; /* Changed from light pink */
color: #333333; /* Dark grey text for contrast */
}
.stApp {
background-color: #f4f4f9; /* Changed from light pink */
}
/* Text Area background and text color (input fields) */
.stTextArea textarea {
background-color: #ffffff; /* Changed from near white/pinkish */
color: #000000; /* Black text for input */
border: 1px solid #888888; /* Changed border from pink to grey */
}
/* Button styling */
.stButton > button {
background-color: #007bff; /* Changed from Deep Pink to Blue */
color: #FFFFFF; /* White text for contrast */
border: none;
padding: 10px 20px;
border-radius: 5px;
}
/* Expander header and content background */
.streamlit-expanderHeader, .streamlit-expanderContent {
background-color: #e9ecef; /* Changed from lighter pink to light grey/blue */
color: #333333;
}
</style>
"""
st.markdown(_PAGE_CSS, unsafe_allow_html=True)

# Page heading with a blue divider.
st.subheader("Entity and Topic Analysis Report Generator", divider="blue")
412
  st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
413
# Two informational tabs: an embeddable iframe snippet and the usage notes.
tab1, tab2 = st.tabs(["Embed", "Important Notes"])

with tab1:
    with st.expander("Embed"):
        st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
        code = '''
<iframe
src="https://aiecosystem-dataharvest.hf.space"
frameborder="0"
width="850"
height="450"
></iframe>
'''
        st.code(code, language="html")

with tab2:
    expander = st.expander("**Important Notes**")
    # Each section is separated by a blank line: in Markdown a single newline
    # does not start a new paragraph, so without it "Named Entities" and
    # "Results" would be rendered as one run-on paragraph.
    expander.write("""**Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"

**Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.

**How to Use:** Type or paste your text into the text area below, press Ctrl + Enter, and then click the 'Results' button.

**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.""")

st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
441
 
442
+ # --- Comet ML Setup (Placeholder/Conditional) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
443
# Comet ML credentials are read from the environment. Experiment logging is
# enabled only when all three settings are present and non-empty.
COMET_API_KEY, COMET_WORKSPACE, COMET_PROJECT_NAME = (
    os.environ.get(setting)
    for setting in ("COMET_API_KEY", "COMET_WORKSPACE", "COMET_PROJECT_NAME")
)
comet_initialized = all((COMET_API_KEY, COMET_WORKSPACE, COMET_PROJECT_NAME))
 
 
 
 
 
 
 
 
 
 
 
 
 
447
  # --- Model Loading ---
448
@st.cache_resource
def load_ner_model():
    """Load the GLiNER checkpoint; the result is cached by ``st.cache_resource``.

    If loading raises, the error is shown on the page and ``st.stop()`` is
    called instead of returning a model.
    """
    try:
        ner_model = GLiNER.from_pretrained(
            "knowledgator/gliner-multitask-large-v0.5",
            nested_ner=True,
            num_gen_sequences=2,
            gen_constraints=labels,
        )
    except Exception as e:
        st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
        st.stop()
    else:
        return ner_model


model = load_ner_model()
457
+ # --- LONG DEFAULT TEXT (211 Words) ---
458
+ DEFAULT_TEXT = (
459
+ "In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
460
+ "TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
461
+ "leap forward for commercial space technology across the entire European Union. The agreement, finalized "
462
+ "on Monday in Paris, France, focuses specifically on jointly developing the next generation of the 'Astra' "
463
+ "software platform. This platform is critical for processing and managing the vast amounts of data being sent "
464
+ "back from the recent Mars rover mission. The core team, including lead engineer Marcus Davies, will hold "
465
+ "their first collaborative workshop in Berlin, Germany, on August 15th. The community response on social "
466
+ "media platform X (under the username @TechSolutionsCEO) was overwhelmingly positive, with many major tech "
467
+ "publications, including Wired Magazine, predicting a major impact on the space technology industry by the "
468
+ "end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
469
+ "The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
470
+ "are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
471
+ "general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026.")
472
+ # -----------------------------------
473
+ # --- Session State Initialization (CRITICAL FIX) ---
474
# Seed every session-state key the script reads. Keys that already exist
# (from an earlier rerun) are left untouched.
_SESSION_DEFAULTS = {
    'show_results': False,
    'last_text': "",
    'results_df': pd.DataFrame(),
    'elapsed_time': 0.0,
    'topic_results': None,
    'my_text_area': DEFAULT_TEXT,
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
486
+ # --- Clear Button Function (MODIFIED) ---
487
def clear_text():
    """Blank the text area and drop every stored result of the previous run."""
    cleared_state = {
        'my_text_area': "",
        'show_results': False,
        'last_text': "",
        'results_df': pd.DataFrame(),
        'elapsed_time': 0.0,
        'topic_results': None,
    }
    for state_key, cleared_value in cleared_state.items():
        st.session_state[state_key] = cleared_value
495
  # --- Text Input and Clear Button ---
496
# Maximum number of whitespace-separated words accepted for analysis.
word_limit = 1000

# The widget is bound to st.session_state['my_text_area'] through `key`, and
# that entry is seeded during session-state initialization. No `value=` is
# passed: giving a keyed widget a default value while the same key is also set
# through Session State makes Streamlit emit a warning, and the stored value
# already serves as the initial content.
text = st.text_area(
    f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
    height=250,
    key='my_text_area',
)

# Live word counter shown under the input.
word_count = len(text.split())
st.markdown(f"**Word count:** {word_count}/{word_limit}")

# clear_text runs as a callback, before the next rerun redraws the widget.
st.button("Clear text", on_click=clear_text)
505
+ # --- Results Trigger and Processing (Updated Logic) ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
506
# Validate the input, run entity extraction and topic modelling, and store
# everything in session state so the display section can render it.
if st.button("Results"):
    if not text.strip():
        st.warning("Please enter some text to extract entities.")
        st.session_state.show_results = False
    elif word_count > word_limit:
        st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
        st.session_state.show_results = False
    else:
        with st.spinner("Extracting entities and generating report data...", show_time=True):
            # Only recompute when the text changed since the last run;
            # otherwise the results already held in session state are reused.
            if text != st.session_state.last_text:
                st.session_state.last_text = text
                start_time = time.time()
                # --- Model Prediction & Dataframe Creation ---
                entities = model.predict_entities(text, labels)
                df = pd.DataFrame(entities)
                if not df.empty:
                    df['text'] = df['text'].apply(remove_trailing_punctuation)
                    df['category'] = df['label'].map(reverse_category_mapping)
                    st.session_state.results_df = df
                    # Never request more top words than there are unique entities.
                    unique_entity_count = len(df['text'].unique())
                    N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
                    st.session_state.topic_results = perform_topic_modeling(
                        df,
                        num_topics=2,
                        num_top_words=N_TOP_WORDS_TO_USE
                    )
                    if comet_initialized:
                        experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
                        try:
                            experiment.log_parameter("input_text", text)
                            experiment.log_table("predicted_entities", df)
                        finally:
                            # Always close the experiment, even when a logging
                            # call raises, so it is not left open.
                            experiment.end()
                else:
                    st.session_state.results_df = pd.DataFrame()
                    st.session_state.topic_results = None
                end_time = time.time()
                st.session_state.elapsed_time = end_time - start_time
                st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
            st.session_state.show_results = True
544
+ # --- Display Download Link and Results ---
545
# Render the stored analysis: highlighted text, per-category tables, charts,
# the relationship graph, the topic chart and the two download buttons.
# Everything is read from session state.
if st.session_state.show_results:
    df = st.session_state.results_df
    df_topic_data = st.session_state.topic_results
    if df.empty:
        st.warning("No entities were found in the provided text.")
    else:
        st.subheader("Analysis Results", divider="blue")
        # 1. Highlighted Text
        st.markdown("### 1. Analyzed Text with Highlighted Entities")
        # highlight_entities is defined elsewhere in the file; its output is
        # rendered as raw HTML.
        st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)

        # 2. Detailed Entity Analysis Tabs
        st.markdown("### 2. Detailed Entity Analysis")
        tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
        with tab_category_details:
            st.markdown("#### Detailed Entities Table (Grouped by Category)")
            with st.expander("See Glossary of tags"):
                st.write('''
- **text**: ['entity extracted from your text data']
- **label**: ['label (tag) assigned to a given extracted entity']
- **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
- **start**: ['index of the start of the corresponding entity']
- **end**: ['index of the end of the corresponding entity']
''')
            # One sub-tab per key of category_mapping, including categories
            # with no matches (those show an info message instead of a table).
            unique_categories = list(category_mapping.keys())
            tabs_category = st.tabs(unique_categories)
            for category, tab in zip(unique_categories, tabs_category):
                # Rows of this category, highest score first.
                df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
                with tab:
                    st.markdown(f"##### {category} Entities ({len(df_category)} total)")
                    if not df_category.empty:
                        st.dataframe(
                            df_category,
                            use_container_width=True,
                            column_config={'score': st.column_config.NumberColumn(format="%.4f")}
                        )
                    else:
                        st.info(f"No entities of category **{category}** were found in the text.")
        with tab_treemap_viz:
            st.markdown("#### Treemap: Entity Distribution")
            # Hierarchy: root -> category -> label -> entity text; tile size
            # comes from the 'score' column.
            fig_treemap = px.treemap(
                df,
                path=[px.Constant("All Entities"), 'category', 'label', 'text'],
                values='score',
                color='category',
                color_discrete_sequence=px.colors.qualitative.Dark24
            )
            fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
            st.plotly_chart(fig_treemap, use_container_width=True)
        # 3. Comparative Charts
        st.markdown("---")
        st.markdown("### 3. Comparative Charts")
        col1, col2, col3 = st.columns(3)
        # Number of extracted entities per category, shared by the pie and bar charts.
        grouped_counts = df['category'].value_counts().reset_index()
        grouped_counts.columns = ['Category', 'Count']
        with col1: # Pie Chart
            fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Cividis)
            fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
            st.plotly_chart(fig_pie, use_container_width=True)
        with col2: # Bar Chart (Category Count)
            fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
            fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
            st.plotly_chart(fig_bar_category, use_container_width=True)
        with col3: # Bar Chart (Most Frequent Entities)
            word_counts = df['text'].value_counts().reset_index()
            word_counts.columns = ['Entity', 'Count']
            # Only entities that occur more than once are charted, capped at ten.
            repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
            if not repeating_entities.empty:
                fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
                fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
                st.plotly_chart(fig_bar_freq, use_container_width=True)
            else:
                st.info("No entities repeat for frequency chart.")
        st.markdown("---")
        st.markdown("### 4. Entity Relationship Map")
        # generate_network_graph is defined elsewhere in the file; its result
        # is passed straight to st.plotly_chart.
        network_fig = generate_network_graph(df, st.session_state.last_text)
        st.plotly_chart(network_fig, use_container_width=True)
        st.markdown("---")
        st.markdown("### 5. Topic Modelling Analysis")
        # topic_results is None when no entities were found; it may also be
        # an empty frame, so both cases fall through to the info message.
        if df_topic_data is not None and not df_topic_data.empty:
            bubble_figure = create_topic_word_bubbles(df_topic_data)
            # NOTE(review): presumably the helper returns a falsy value on failure; confirm.
            if bubble_figure:
                st.plotly_chart(bubble_figure, use_container_width=True)
            else:
                st.error("Error generating Topic Word Bubble Chart.")
        else:
            st.info("Topic modeling requires more unique input (at least two unique entities).")
        # --- Report Download ---
        st.markdown("---")
        st.markdown("### Download Full Report Artifacts")
        # 1. HTML report download
        html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
        st.download_button(
            label="Download Comprehensive HTML Report",
            data=html_report,
            file_name="ner_topic_report.html",
            mime="text/html",
            type="primary"
        )

        # 2. CSV download of the extracted entities
        csv_buffer = generate_entity_csv(df)
        st.download_button(
            label="Download Extracted Entities (CSV)",
            data=csv_buffer,
            file_name="extracted_entities.csv",
            mime="text/csv",
            type="secondary"
        )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
656
 
657
+