AIEcosystem committed on
Commit
0b61c41
·
verified ·
1 Parent(s): 1f6b923

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +183 -162
src/streamlit_app.py CHANGED
@@ -33,21 +33,22 @@ expander.write("""**Named Entities:** This DataHarvest web app predicts nine (9)
33
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
34
 
35
  with st.sidebar:
36
- st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
37
- code = '''
38
- <iframe
39
  src="https://aiecosystem-dataharvest.hf.space"
40
  frameborder="0"
41
  width="850"
42
  height="450"
43
  ></iframe>
44
- '''
45
- st.code(code, language="html")
46
- st.text("")
47
- st.text("")
48
-
49
- st.subheader("🚀 Ready to build your own AI Web App?", divider="violet")
50
- st.link_button("AI Web App Builder", "https://nlpblogs.com/build-your-named-entity-recognition-app/", type="primary")
 
51
 
52
  # --- Comet ML Setup ---
53
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
@@ -55,39 +56,39 @@ COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
55
  COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
56
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
57
  if not comet_initialized:
58
- st.warning("Comet ML not initialized. Check environment variables.")
59
- print("Warning: Comet ML environment variables are not set. Logging will be disabled.")
60
 
61
  # --- Label Definitions ---
62
  labels = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
63
  category_mapping = {
64
- "People": ["person", "organization", "position"],
65
- "Locations": ["country", "city"],
66
- "Time": ["date", "time"],
67
- "Numbers": ["money", "cardinal"]
68
  }
69
 
70
  # --- Model Loading ---
71
  @st.cache_resource
72
  def load_ner_model():
73
- """Loads the GLiNER model and caches it."""
74
- try:
75
- return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
76
- except Exception as e:
77
- st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
78
- st.stop()
79
  model = load_ner_model()
80
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
81
 
82
  # --- Session State Initialization ---
83
  if 'show_results' not in st.session_state:
84
- st.session_state.show_results = False
85
  if 'last_text' not in st.session_state:
86
- st.session_state.last_text = ""
87
  if 'results_df' not in st.session_state:
88
- st.session_state.results_df = pd.DataFrame()
89
  if 'elapsed_time' not in st.session_state:
90
- st.session_state.elapsed_time = 0.0
91
 
92
  # --- Text Input and Clear Button ---
93
  word_limit = 200
@@ -96,148 +97,168 @@ word_count = len(text.split())
96
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
97
 
98
  def clear_text():
99
- """Clears the text area and hides results."""
100
- st.session_state['my_text_area'] = ""
101
- st.session_state.show_results = False
102
- st.session_state.last_text = ""
103
- st.session_state.results_df = pd.DataFrame()
104
- st.session_state.elapsed_time = 0.0
105
 
106
  st.button("Clear text", on_click=clear_text)
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  # --- Results Section ---
109
  if st.button("Results"):
110
- if not text.strip():
111
- st.warning("Please enter some text to extract entities.")
112
- st.session_state.show_results = False
113
- elif word_count > word_limit:
114
- st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
115
- st.session_state.show_results = False
116
- else:
117
- # Check if the text is different from the last time
118
- if text != st.session_state.last_text:
119
- st.session_state.show_results = True
120
- st.session_state.last_text = text
121
- start_time = time.time()
122
- with st.spinner("Extracting entities...", show_time=True):
123
- # Pass the raw text directly to the model
124
- entities = model.predict_entities(text, labels)
125
- df = pd.DataFrame(entities)
126
- st.session_state.results_df = df
127
- if not df.empty:
128
- df['category'] = df['label'].map(reverse_category_mapping)
129
- if comet_initialized:
130
- experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
131
- experiment.log_parameter("input_text", text)
132
- experiment.log_table("predicted_entities", df)
133
- experiment.end()
134
- end_time = time.time()
135
- st.session_state.elapsed_time = end_time - start_time
136
- # Place the message here, so it only runs once per button click
137
- st.info(f"Results processed in **{st.session_state.elapsed_time:.2f} seconds**.")
138
- # If the text is the same, do nothing but keep results displayed
139
- else:
140
- st.session_state.show_results = True
 
 
 
 
 
141
 
142
  # Display results if the state variable is True
143
  if st.session_state.show_results:
144
- df = st.session_state.results_df
145
- if not df.empty:
146
- df['category'] = df['label'].map(reverse_category_mapping)
147
- st.subheader("Grouped Entities by Category", divider="violet")
148
-
149
- category_names = sorted(list(category_mapping.keys()))
150
- category_tabs = st.tabs(category_names)
151
-
152
- for i, category_name in enumerate(category_names):
153
- with category_tabs[i]:
154
- df_category_filtered = df[df['category'] == category_name]
155
- if not df_category_filtered.empty:
156
- st.dataframe(df_category_filtered.drop(columns=['category']), use_container_width=True)
157
- else:
158
- st.info(f"No entities found for the '{category_name}' category.")
159
-
160
- with st.expander("See Glossary of tags"):
161
- st.write('''
162
- - **start**: ['index of the start of the corresponding entity']
163
- - **end**: ['index of the end of the corresponding entity']
164
- - **text**: ['entity extracted from your text data']
165
- - **label**: ['label (tag) assigned to a given extracted entity']
166
- - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
167
-
168
- ''')
169
-
170
- st.divider()
171
- # Tree map
172
- st.subheader("Tree map", divider="violet")
173
- fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'label', 'text'], values='score', color='category')
174
- fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
175
- expander = st.expander("**Download**")
176
- expander.write("""You can easily download the tree map by hovering over it. Look for the download icon that appears in the top right corner.
177
- """)
178
- st.plotly_chart(fig_treemap)
179
-
180
- # Pie and Bar charts
181
- grouped_counts = df['category'].value_counts().reset_index()
182
- grouped_counts.columns = ['category', 'count']
183
- col1, col2 = st.columns(2)
184
-
185
- with col1:
186
- st.subheader("Pie chart", divider="violet")
187
- fig_pie = px.pie(grouped_counts, values='count', names='category', hover_data=['count'], labels={'count': 'count'}, title='Percentage of predicted categories')
188
- fig_pie.update_traces(textposition='inside', textinfo='percent+label')
189
- expander = st.expander("**Download**")
190
- expander.write("""You can easily download the pie chart by hovering over it. Look for the download icon that appears in the top right corner.
191
- """)
192
- st.plotly_chart(fig_pie)
193
-
194
- with col2:
195
- st.subheader("Bar chart", divider="violet")
196
- fig_bar = px.bar(grouped_counts, x="count", y="category", color="category", text_auto=True, title='Occurrences of predicted categories')
197
- expander = st.expander("**Download**")
198
- expander.write("""You can easily download the bar chart by hovering over it. Look for the download icon that appears in the top right corner.
199
- """)
200
- st.plotly_chart(fig_bar)
201
-
202
- # Most Frequent Entities
203
- st.subheader("Most Frequent Entities", divider="violet")
204
- word_counts = df['text'].value_counts().reset_index()
205
- word_counts.columns = ['Entity', 'Count']
206
- repeating_entities = word_counts[word_counts['Count'] > 1]
207
-
208
- if not repeating_entities.empty:
209
- st.dataframe(repeating_entities, use_container_width=True)
210
- fig_repeating_bar = px.bar(repeating_entities, x='Entity', y='Count', color='Entity')
211
- fig_repeating_bar.update_layout(xaxis={'categoryorder': 'total descending'})
212
- expander = st.expander("**Download**")
213
- expander.write("""You can easily download the bar chart by hovering over it. Look for the download icon that appears in the top right corner.
214
- """)
215
- st.plotly_chart(fig_repeating_bar)
216
- else:
217
- st.warning("No entities were found that occur more than once.")
218
-
219
- # Download Section
220
- st.divider()
221
- dfa = pd.DataFrame(data={'Column Name': ['start', 'end', 'text', 'label', 'score'],
222
  'Description': ['index of the start of the corresponding entity', 'index of the end of the corresponding entity', 'entity extracted from your text data', 'label (tag) assigned to a given extracted entity', 'accuracy score; how accurately a tag has been assigned to a given entity']})
223
-
224
- buf = io.BytesIO()
225
- with zipfile.ZipFile(buf, "w") as myzip:
226
- myzip.writestr("Summary of the results.csv", df.to_csv(index=False))
227
- myzip.writestr("Most Frequent Entities.csv", repeating_entities.to_csv(index=False))
228
- myzip.writestr("Glossary of tags.csv", dfa.to_csv(index=False))
229
-
230
- with stylable_container(
231
- key="download_button",
232
- css_styles="""button { background-color: #8A2BE2; border: 1px solid black; padding: 5px; color: white; }""",
233
- ):
234
- st.download_button(
235
- label="Download results and glossary (zip)",
236
- data=buf.getvalue(),
237
- file_name="nlpblogs_results.zip",
238
- mime="application/zip"
239
- )
240
- st.text("")
241
- st.text("")
242
- else:
243
- st.warning("No entities were found in the provided text.")
 
 
 
33
  st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
34
 
35
  with st.sidebar:
36
+ st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
37
+ code = '''
38
+ <iframe
39
  src="https://aiecosystem-dataharvest.hf.space"
40
  frameborder="0"
41
  width="850"
42
  height="450"
43
  ></iframe>
44
+
45
+ '''
46
+ st.code(code, language="html")
47
+ st.text("")
48
+ st.text("")
49
+
50
+ st.subheader("🚀 Ready to build your own AI Web App?", divider="violet")
51
+ st.link_button("AI Web App Builder", "https://nlpblogs.com/build-your-named-entity-recognition-app/", type="primary")
52
 
53
  # --- Comet ML Setup ---
54
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
 
56
  COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
57
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
58
  if not comet_initialized:
59
+ st.warning("Comet ML not initialized. Check environment variables.")
60
+ print("Warning: Comet ML environment variables are not set. Logging will be disabled.")
61
 
62
  # --- Label Definitions ---
63
  labels = ["person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"]
64
  category_mapping = {
65
+ "People": ["person", "organization", "position"],
66
+ "Locations": ["country", "city"],
67
+ "Time": ["date", "time"],
68
+ "Numbers": ["money", "cardinal"]
69
  }
70
 
71
  # --- Model Loading ---
72
  @st.cache_resource
73
  def load_ner_model():
74
+ """Loads the GLiNER model and caches it."""
75
+ try:
76
+ return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
77
+ except Exception as e:
78
+ st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
79
+ st.stop()
80
  model = load_ner_model()
81
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
82
 
83
  # --- Session State Initialization ---
84
  if 'show_results' not in st.session_state:
85
+ st.session_state.show_results = False
86
  if 'last_text' not in st.session_state:
87
+ st.session_state.last_text = ""
88
  if 'results_df' not in st.session_state:
89
+ st.session_state.results_df = pd.DataFrame()
90
  if 'elapsed_time' not in st.session_state:
91
+ st.session_state.elapsed_time = 0.0
92
 
93
  # --- Text Input and Clear Button ---
94
  word_limit = 200
 
97
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
98
 
99
  def clear_text():
100
+ """Clears the text area and hides results."""
101
+ st.session_state['my_text_area'] = ""
102
+ st.session_state.show_results = False
103
+ st.session_state.last_text = ""
104
+ st.session_state.results_df = pd.DataFrame()
105
+ st.session_state.elapsed_time = 0.0
106
 
107
  st.button("Clear text", on_click=clear_text)
108
 
109
+ # --- Post-processing function to remove trailing punctuation ---
110
+ def remove_trailing_punctuation(text_string):
111
+ """
112
+ Removes trailing punctuation from a string.
113
+
114
+ Args:
115
+ text_string (str): The input string.
116
+
117
+ Returns:
118
+ str: The string with trailing punctuation removed.
119
+ """
120
+ return text_string.rstrip(string.punctuation)
121
+
122
  # --- Results Section ---
123
  if st.button("Results"):
124
+ if not text.strip():
125
+ st.warning("Please enter some text to extract entities.")
126
+ st.session_state.show_results = False
127
+ elif word_count > word_limit:
128
+ st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
129
+ st.session_state.show_results = False
130
+ else:
131
+ # Check if the text is different from the last time
132
+ if text != st.session_state.last_text:
133
+ st.session_state.show_results = True
134
+ st.session_state.last_text = text
135
+ start_time = time.time()
136
+ with st.spinner("Extracting entities...", show_time=True):
137
+ # Pass the raw text directly to the model
138
+ entities = model.predict_entities(text, labels)
139
+ df = pd.DataFrame(entities)
140
+
141
+ # Apply post-processing to remove punctuation
142
+ if not df.empty:
143
+ df['text'] = df['text'].apply(remove_trailing_punctuation)
144
+
145
+ st.session_state.results_df = df
146
+ if not df.empty:
147
+ df['category'] = df['label'].map(reverse_category_mapping)
148
+ if comet_initialized:
149
+ experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
150
+ experiment.log_parameter("input_text", text)
151
+ experiment.log_table("predicted_entities", df)
152
+ experiment.end()
153
+ end_time = time.time()
154
+ st.session_state.elapsed_time = end_time - start_time
155
+ # Place the message here, so it only runs once per button click
156
+ st.info(f"Results processed in **{st.session_state.elapsed_time:.2f} seconds**.")
157
+ # If the text is the same, do nothing but keep results displayed
158
+ else:
159
+ st.session_state.show_results = True
160
 
161
  # Display results if the state variable is True
162
  if st.session_state.show_results:
163
+ df = st.session_state.results_df
164
+ if not df.empty:
165
+ df['category'] = df['label'].map(reverse_category_mapping)
166
+ st.subheader("Grouped Entities by Category", divider="violet")
167
+
168
+ category_names = sorted(list(category_mapping.keys()))
169
+ category_tabs = st.tabs(category_names)
170
+
171
+ for i, category_name in enumerate(category_names):
172
+ with category_tabs[i]:
173
+ df_category_filtered = df[df['category'] == category_name]
174
+ if not df_category_filtered.empty:
175
+ st.dataframe(df_category_filtered.drop(columns=['category']), use_container_width=True)
176
+ else:
177
+ st.info(f"No entities found for the '{category_name}' category.")
178
+
179
+ with st.expander("See Glossary of tags"):
180
+ st.write('''
181
+ - **start**: ['index of the start of the corresponding entity']
182
+ - **end**: ['index of the end of the corresponding entity']
183
+ - **text**: ['entity extracted from your text data']
184
+ - **label**: ['label (tag) assigned to a given extracted entity']
185
+ - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
186
+
187
+ ''')
188
+
189
+ st.divider()
190
+ # Tree map
191
+ st.subheader("Tree map", divider="violet")
192
+ fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'label', 'text'], values='score', color='category')
193
+ fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
194
+ expander = st.expander("**Download**")
195
+ expander.write("""You can easily download the tree map by hovering over it. Look for the download icon that appears in the top right corner.
196
+ """)
197
+ st.plotly_chart(fig_treemap)
198
+
199
+ # Pie and Bar charts
200
+ grouped_counts = df['category'].value_counts().reset_index()
201
+ grouped_counts.columns = ['category', 'count']
202
+ col1, col2 = st.columns(2)
203
+
204
+ with col1:
205
+ st.subheader("Pie chart", divider="violet")
206
+ fig_pie = px.pie(grouped_counts, values='count', names='category', hover_data=['count'], labels={'count': 'count'}, title='Percentage of predicted categories')
207
+ fig_pie.update_traces(textposition='inside', textinfo='percent+label')
208
+ expander = st.expander("**Download**")
209
+ expander.write("""You can easily download the pie chart by hovering over it. Look for the download icon that appears in the top right corner.
210
+ """)
211
+ st.plotly_chart(fig_pie)
212
+
213
+ with col2:
214
+ st.subheader("Bar chart", divider="violet")
215
+ fig_bar = px.bar(grouped_counts, x="count", y="category", color="category", text_auto=True, title='Occurrences of predicted categories')
216
+ expander = st.expander("**Download**")
217
+ expander.write("""You can easily download the bar chart by hovering over it. Look for the download icon that appears in the top right corner.
218
+ """)
219
+ st.plotly_chart(fig_bar)
220
+
221
+ # Most Frequent Entities
222
+ st.subheader("Most Frequent Entities", divider="violet")
223
+ word_counts = df['text'].value_counts().reset_index()
224
+ word_counts.columns = ['Entity', 'Count']
225
+ repeating_entities = word_counts[word_counts['Count'] > 1]
226
+
227
+ if not repeating_entities.empty:
228
+ st.dataframe(repeating_entities, use_container_width=True)
229
+ fig_repeating_bar = px.bar(repeating_entities, x='Entity', y='Count', color='Entity')
230
+ fig_repeating_bar.update_layout(xaxis={'categoryorder': 'total descending'})
231
+ expander = st.expander("**Download**")
232
+ expander.write("""You can easily download the bar chart by hovering over it. Look for the download icon that appears in the top right corner.
233
+ """)
234
+ st.plotly_chart(fig_repeating_bar)
235
+ else:
236
+ st.warning("No entities were found that occur more than once.")
237
+
238
+ # Download Section
239
+ st.divider()
240
+ dfa = pd.DataFrame(data={'Column Name': ['start', 'end', 'text', 'label', 'score'],
241
  'Description': ['index of the start of the corresponding entity', 'index of the end of the corresponding entity', 'entity extracted from your text data', 'label (tag) assigned to a given extracted entity', 'accuracy score; how accurately a tag has been assigned to a given entity']})
242
+
243
+ buf = io.BytesIO()
244
+ with zipfile.ZipFile(buf, "w") as myzip:
245
+ myzip.writestr("Summary of the results.csv", df.to_csv(index=False))
246
+ myzip.writestr("Most Frequent Entities.csv", repeating_entities.to_csv(index=False))
247
+ myzip.writestr("Glossary of tags.csv", dfa.to_csv(index=False))
248
+
249
+ with stylable_container(
250
+ key="download_button",
251
+ css_styles="""button { background-color: #8A2BE2; border: 1px solid black; padding: 5px; color: white; }""",
252
+ ):
253
+ st.download_button(
254
+ label="Download results and glossary (zip)",
255
+ data=buf.getvalue(),
256
+ file_name="nlpblogs_results.zip",
257
+ mime="application/zip"
258
+ )
259
+ st.text("")
260
+ st.text("")
261
+ else:
262
+ st.warning("No entities were found in the provided text.")
263
+
264
+