AIEcosystem committed on
Commit
f7d7b8d
·
verified ·
1 Parent(s): afda445

Update src/streamlit_app.py

Browse files
Files changed (1) hide show
  1. src/streamlit_app.py +204 -140
src/streamlit_app.py CHANGED
@@ -12,6 +12,7 @@ from streamlit_extras.stylable_container import stylable_container
12
  from typing import Optional
13
  from gliner import GLiNER
14
  from comet_ml import Experiment
 
15
  st.markdown(
16
  """
17
  <style>
@@ -55,7 +56,9 @@ st.markdown(
55
  }
56
  </style>
57
  """,
58
- unsafe_allow_html=True)
 
 
59
  # --- Page Configuration and UI Elements ---
60
  st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
61
  st.subheader("Multilingual", divider="green")
@@ -63,11 +66,11 @@ st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
63
  expander = st.expander("**Important notes**")
64
  expander.write("""**Named Entities:** This Multilingual web app predicts fourteen (14) labels: "Person", "First_name", "Last_name", "Title", "Job_title", "Affiliation", "Gender", "Age", "Date", "Nationality", "Location", "Country", "Role", "Relationship"
65
 
66
- Results are presented in easy-to-read tables, visualized in an interactive tree map, pie chart and bar chart, and are available for download along with a Glossary of tags.
67
 
68
- **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.
69
 
70
- **Usage Limits:** You can request results unlimited times for one (1) month.
71
 
72
  **Supported Languages:** European, Asian, Indian, Arabic, African
73
 
@@ -76,6 +79,7 @@ Results are presented in easy-to-read tables, visualized in an interactive tree
76
  **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
77
 
78
  For any errors or inquiries, please contact us at info@nlpblogs.com""")
 
79
  with st.sidebar:
80
  st.write("Use the following code to embed the Multilingual web app on your website. Feel free to adjust the width and height values to fit your page.")
81
  code = '''
@@ -93,6 +97,7 @@ with st.sidebar:
93
  st.divider()
94
  st.subheader("🚀 Ready to build your own AI Web App?", divider="orange")
95
  st.link_button("AI Web App Builder", "https://nlpblogs.com/build-your-named-entity-recognition-app/", type="primary")
 
96
  # --- Comet ML Setup ---
97
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
98
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
@@ -100,177 +105,236 @@ COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
100
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
101
  if not comet_initialized:
102
  st.warning("Comet ML not initialized. Check environment variables.")
 
103
  # --- Label Definitions ---
104
- labels = [ "PERSON",
105
- "FIRST_NAME",
106
- "LAST_NAME",
107
- "TITLE", "JOB_TITLE",
108
- "AFFILIATION", "GENDER",
109
- "AGE",
110
- "DATE",
111
- "NATIONALITY", "LOCATION","COUNTRY", "ROLE",
112
- "RELATIONSHIP"]
113
- # Create a mapping dictionary for labels to categories
114
- category_mapping = { "Identity": [
115
  "PERSON",
116
  "FIRST_NAME",
117
  "LAST_NAME",
118
- "TITLE"
119
- ],
120
- "Professional": [
121
  "JOB_TITLE",
122
- "AFFILIATION"
123
- ],
124
- "Demographic": [
125
  "GENDER",
126
  "AGE",
127
  "DATE",
128
  "NATIONALITY",
129
- "LOCATION","COUNTRY"
130
- ],
131
- "Relational": [
132
  "ROLE",
133
  "RELATIONSHIP"
134
- ]}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
135
  # --- Model Loading ---
136
  @st.cache_resource
137
  def load_ner_model():
138
  """Loads the GLiNER model and caches it."""
139
  try:
140
- return GLiNER.from_pretrained("urchade/gliner_multi", nested_ner=True, num_gen_sequences=2, gen_constraints= labels)
141
  except Exception as e:
142
  st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
143
  st.stop()
144
  model = load_ner_model()
 
145
  # Flatten the mapping to a single dictionary
146
  reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
 
 
 
 
 
 
 
 
 
 
 
147
  # --- Text Input and Clear Button ---
148
  word_limit = 200
149
  text = st.text_area(f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter", height=250, key='my_text_area')
150
  word_count = len(text.split())
151
  st.markdown(f"**Word count:** {word_count}/{word_limit}")
 
152
  def clear_text():
153
- """Clears the text area."""
154
  st.session_state['my_text_area'] = ""
 
 
 
 
155
  st.button("Clear text", on_click=clear_text)
 
156
  # --- Results Section ---
157
  if st.button("Results"):
158
- start_time = time.time()
159
  if not text.strip():
160
  st.warning("Please enter some text to extract entities.")
 
161
  elif word_count > word_limit:
162
  st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
 
163
  else:
164
- with st.spinner("Extracting entities...", show_time=True):
165
- entities = model.predict_entities(text, labels)
166
- df = pd.DataFrame(entities)
167
- if not df.empty:
168
- df['category'] = df['label'].map(reverse_category_mapping)
169
- if comet_initialized:
170
- experiment = Experiment(
171
- api_key=COMET_API_KEY,
172
- workspace=COMET_WORKSPACE,
173
- project_name=COMET_PROJECT_NAME,
174
- )
175
- experiment.log_parameter("input_text", text)
176
- experiment.log_table("predicted_entities", df)
177
- st.subheader("Grouped Entities by Category", divider = "green")
178
- # Create tabs for each category
179
- category_names = sorted(list(category_mapping.keys()))
180
- category_tabs = st.tabs(category_names)
181
- for i, category_name in enumerate(category_names):
182
- with category_tabs[i]:
183
- df_category_filtered = df[df['category'] == category_name]
184
- if not df_category_filtered.empty:
185
- st.dataframe(df_category_filtered.drop(columns=['category']), use_container_width=True)
186
- else:
187
- st.info(f"No entities found for the '{category_name}' category.")
188
- with st.expander("See Glossary of tags"):
189
- st.write('''
190
- - **text**: ['entity extracted from your text data']
191
- - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
192
- - **label**: ['label (tag) assigned to a given extracted entity']
193
- - **category**: ['the high-level category for the label']
194
- - **start**: ['index of the start of the corresponding entity']
195
- - **end**: ['index of the end of the corresponding entity']
196
- ''')
197
- st.divider()
198
- # Tree map
199
- st.subheader("Tree map", divider = "green")
200
- fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'label', 'text'], values='score', color='category')
201
- fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25), paper_bgcolor='#F0F2F5', plot_bgcolor='#F0F2F5')
202
- st.plotly_chart(fig_treemap)
203
- # Pie and Bar charts
204
- grouped_counts = df['category'].value_counts().reset_index()
205
- grouped_counts.columns = ['category', 'count']
206
- col1, col2 = st.columns(2)
207
- with col1:
208
- st.subheader("Pie chart", divider = "green")
209
- fig_pie = px.pie(grouped_counts, values='count', names='category', hover_data=['count'], labels={'count': 'count'}, title='Percentage of predicted categories')
210
- fig_pie.update_traces(textposition='inside', textinfo='percent+label')
211
- fig_pie.update_layout(
212
- paper_bgcolor='#F0F2F5',
213
- plot_bgcolor='#F0F2F5'
214
- )
215
- st.plotly_chart(fig_pie)
216
- with col2:
217
- st.subheader("Bar chart", divider = "green")
218
- fig_bar = px.bar(grouped_counts, x="count", y="category", color="category", text_auto=True, title='Occurrences of predicted categories')
219
- fig_bar.update_layout(
220
- paper_bgcolor='#F0F2F5',
221
- plot_bgcolor='#F0F2F5'
222
- )
223
- st.plotly_chart(fig_bar)
224
- # Most Frequent Entities
225
- st.subheader("Most Frequent Entities", divider="green")
226
- word_counts = df['text'].value_counts().reset_index()
227
- word_counts.columns = ['Entity', 'Count']
228
- repeating_entities = word_counts[word_counts['Count'] > 1]
229
- if not repeating_entities.empty:
230
- st.dataframe(repeating_entities, use_container_width=True)
231
- fig_repeating_bar = px.bar(repeating_entities, x='Entity', y='Count', color='Entity')
232
- fig_repeating_bar.update_layout(xaxis={'categoryorder': 'total descending'},
233
- paper_bgcolor='#F0F2F5',
234
- plot_bgcolor='#F0F2F5')
235
- st.plotly_chart(fig_repeating_bar)
236
  else:
237
- st.warning("No entities were found that occur more than once.")
238
- # Download Section
239
- st.divider()
240
- dfa = pd.DataFrame(
241
- data={
242
- 'Column Name': ['text', 'label', 'score', 'start', 'end', 'category'],
243
- 'Description': [
244
- 'entity extracted from your text data',
245
- 'label (tag) assigned to a given extracted entity',
246
- 'accuracy score; how accurately a tag has been assigned to a given entity',
247
- 'index of the start of the corresponding entity',
248
- 'index of the end of the corresponding entity',
249
- 'the broader category the entity belongs to',
250
- ]
251
- }
252
- )
253
- buf = io.BytesIO()
254
- with zipfile.ZipFile(buf, "w") as myzip:
255
- myzip.writestr("Summary of the results.csv", df.to_csv(index=False))
256
- myzip.writestr("Glossary of tags.csv", dfa.to_csv(index=False))
257
- with stylable_container(
258
- key="download_button",
259
- css_styles="""button { background-color: red; border: 1px solid black; padding: 5px; color: white; }""",
260
- ):
261
- st.download_button(
262
- label="Download results and glossary (zip)",
263
- data=buf.getvalue(),
264
- file_name="nlpblogs_results.zip",
265
- mime="application/zip",
266
- )
267
- if comet_initialized:
268
- experiment.log_figure(figure=fig_treemap, figure_name="entity_treemap_categories")
269
- experiment.end()
270
- else: # If df is empty
271
- st.warning("No entities were found in the provided text.")
272
- end_time = time.time()
273
- elapsed_time = end_time - start_time
274
- st.text("")
275
- st.text("")
276
- st.info(f"Results processed in **{elapsed_time:.2f} seconds**.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  from typing import Optional
13
  from gliner import GLiNER
14
  from comet_ml import Experiment
15
+
16
  st.markdown(
17
  """
18
  <style>
 
56
  }
57
  </style>
58
  """,
59
+ unsafe_allow_html=True
60
+ )
61
+
62
  # --- Page Configuration and UI Elements ---
63
  st.set_page_config(layout="wide", page_title="Named Entity Recognition App")
64
  st.subheader("Multilingual", divider="green")
 
66
  expander = st.expander("**Important notes**")
67
  expander.write("""**Named Entities:** This Multilingual web app predicts fourteen (14) labels: "Person", "First_name", "Last_name", "Title", "Job_title", "Affiliation", "Gender", "Age", "Date", "Nationality", "Location", "Country", "Role", "Relationship"
68
 
69
+ Results are presented in easy-to-read tables, visualized in an interactive tree map, pie chart and bar chart, and are available for download along with a Glossary of tags.
70
 
71
+ **How to Use:** Type or paste your text into the text area below, then press Ctrl + Enter. Click the 'Results' button to extract and tag entities in your text data.
72
 
73
+ **Usage Limits:** You can request results unlimited times for one (1) month.
74
 
75
  **Supported Languages:** European, Asian, Indian, Arabic, African
76
 
 
79
  **Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
80
 
81
  For any errors or inquiries, please contact us at info@nlpblogs.com""")
82
+
83
  with st.sidebar:
84
  st.write("Use the following code to embed the Multilingual web app on your website. Feel free to adjust the width and height values to fit your page.")
85
  code = '''
 
97
  st.divider()
98
  st.subheader("🚀 Ready to build your own AI Web App?", divider="orange")
99
  st.link_button("AI Web App Builder", "https://nlpblogs.com/build-your-named-entity-recognition-app/", type="primary")
100
+
101
  # --- Comet ML Setup ---
102
  COMET_API_KEY = os.environ.get("COMET_API_KEY")
103
  COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
 
105
  comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
106
  if not comet_initialized:
107
  st.warning("Comet ML not initialized. Check environment variables.")
108
+
109
# --- Label Definitions ---
# The 14 entity labels the GLiNER model is asked to predict.
labels = [
    "PERSON",
    "FIRST_NAME",
    "LAST_NAME",
    "TITLE",
    "JOB_TITLE",
    "AFFILIATION",
    "GENDER",
    "AGE",
    "DATE",
    "NATIONALITY",
    "LOCATION",
    "COUNTRY",
    "ROLE",
    "RELATIONSHIP",
]
126
+
127
# Create a mapping dictionary for labels to categories.
# Each high-level display category owns a subset of the model labels.
category_mapping = {
    "Identity": ["PERSON", "FIRST_NAME", "LAST_NAME", "TITLE"],
    "Professional": ["JOB_TITLE", "AFFILIATION"],
    "Demographic": ["GENDER", "AGE", "DATE", "NATIONALITY", "LOCATION", "COUNTRY"],
    "Relational": ["ROLE", "RELATIONSHIP"],
}
152
+
153
# --- Model Loading ---
@st.cache_resource
def load_ner_model():
    """Load the multilingual GLiNER model once and cache it across reruns.

    Stops the Streamlit script with a user-facing error if the model
    cannot be downloaded or initialized.
    """
    try:
        # NOTE(review): num_gen_sequences / gen_constraints are forwarded to
        # from_pretrained — confirm the installed gliner version accepts them.
        return GLiNER.from_pretrained(
            "urchade/gliner_multi",
            nested_ner=True,
            num_gen_sequences=2,
            gen_constraints=labels,
        )
    except Exception as e:
        st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
        st.stop()

model = load_ner_model()
163
+
164
# Flatten the mapping to a single dictionary: label -> category,
# so each predicted entity can be tagged with its display category.
reverse_category_mapping = {
    label: category
    for category, members in category_mapping.items()
    for label in members
}
166
+
167
# --- Session State Initialization ---
# Seed defaults so cached results survive Streamlit reruns.
for _key, _default in (
    ("show_results", False),
    ("last_text", ""),
    ("results_df", pd.DataFrame()),
    ("elapsed_time", 0.0),
):
    if _key not in st.session_state:
        st.session_state[_key] = _default
176
+
177
# --- Text Input and Clear Button ---
word_limit = 200
text = st.text_area(
    f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
    height=250,
    key='my_text_area',
)
word_count = len(text.split())
st.markdown(f"**Word count:** {word_count}/{word_limit}")

def clear_text():
    """Clears the text area and hides results."""
    st.session_state['my_text_area'] = ""
    st.session_state.show_results = False
    st.session_state.last_text = ""
    st.session_state.results_df = pd.DataFrame()
    st.session_state.elapsed_time = 0.0

st.button("Clear text", on_click=clear_text)
191
+
192
# --- Results Section ---
if st.button("Results"):
    if not text.strip():
        st.warning("Please enter some text to extract entities.")
        st.session_state.show_results = False
    elif word_count > word_limit:
        st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
        st.session_state.show_results = False
    else:
        # Only re-run the model when the input changed since the last run;
        # otherwise reuse the results cached in session state.
        if text != st.session_state.last_text:
            st.session_state.show_results = True
            st.session_state.last_text = text
            start_time = time.time()
            with st.spinner("Extracting entities...", show_time=True):
                entities = model.predict_entities(text, labels)
                df = pd.DataFrame(entities)
                st.session_state.results_df = df
                if not df.empty:
                    df['category'] = df['label'].map(reverse_category_mapping)
                    if comet_initialized:
                        # Log this run's input and predictions to Comet ML.
                        experiment = Experiment(
                            api_key=COMET_API_KEY,
                            workspace=COMET_WORKSPACE,
                            project_name=COMET_PROJECT_NAME,
                        )
                        experiment.log_parameter("input_text", text)
                        experiment.log_table("predicted_entities", df)
                        experiment.end()
            end_time = time.time()
            st.session_state.elapsed_time = end_time - start_time
        else:
            # If the text is the same, just show the cached results without re-running
            st.session_state.show_results = True
226
+
227
# Display results if the state variable is True
if st.session_state.show_results:
    df = st.session_state.results_df
    if not df.empty:
        # Re-map categories for display
        df['category'] = df['label'].map(reverse_category_mapping)
        st.subheader("Grouped Entities by Category", divider="green")

        # One tab per high-level category, in alphabetical order.
        category_names = sorted(list(category_mapping.keys()))
        category_tabs = st.tabs(category_names)
        for tab, category_name in zip(category_tabs, category_names):
            with tab:
                subset = df[df['category'] == category_name]
                if subset.empty:
                    st.info(f"No entities found for the '{category_name}' category.")
                else:
                    st.dataframe(subset.drop(columns=['category']), use_container_width=True)

        with st.expander("See Glossary of tags"):
            st.write('''
    - **text**: ['entity extracted from your text data']
    - **score**: ['accuracy score; how accurately a tag has been assigned to a given entity']
    - **label**: ['label (tag) assigned to a given extracted entity']
    - **category**: ['the high-level category for the label']
    - **start**: ['index of the start of the corresponding entity']
    - **end**: ['index of the end of the corresponding entity']
    ''')
        st.divider()

        # Tree map
        st.subheader("Tree map", divider="green")
        fig_treemap = px.treemap(df, path=[px.Constant("all"), 'category', 'label', 'text'], values='score', color='category')
        fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25), paper_bgcolor='#F0F2F5', plot_bgcolor='#F0F2F5')
        st.plotly_chart(fig_treemap)

        # Pie and Bar charts
        grouped_counts = df['category'].value_counts().reset_index()
        grouped_counts.columns = ['category', 'count']
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Pie chart", divider="green")
            fig_pie = px.pie(grouped_counts, values='count', names='category', hover_data=['count'], labels={'count': 'count'}, title='Percentage of predicted categories')
            fig_pie.update_traces(textposition='inside', textinfo='percent+label')
            fig_pie.update_layout(paper_bgcolor='#F0F2F5', plot_bgcolor='#F0F2F5')
            st.plotly_chart(fig_pie)
        with col2:
            st.subheader("Bar chart", divider="green")
            fig_bar = px.bar(grouped_counts, x="count", y="category", color="category", text_auto=True, title='Occurrences of predicted categories')
            fig_bar.update_layout(paper_bgcolor='#F0F2F5', plot_bgcolor='#F0F2F5')
            st.plotly_chart(fig_bar)

        # Most Frequent Entities (only those appearing more than once)
        st.subheader("Most Frequent Entities", divider="green")
        word_counts = df['text'].value_counts().reset_index()
        word_counts.columns = ['Entity', 'Count']
        repeating_entities = word_counts[word_counts['Count'] > 1]
        if repeating_entities.empty:
            st.warning("No entities were found that occur more than once.")
        else:
            st.dataframe(repeating_entities, use_container_width=True)
            fig_repeating_bar = px.bar(repeating_entities, x='Entity', y='Count', color='Entity')
            fig_repeating_bar.update_layout(xaxis={'categoryorder': 'total descending'},
                                            paper_bgcolor='#F0F2F5',
                                            plot_bgcolor='#F0F2F5')
            st.plotly_chart(fig_repeating_bar)

        # Download Section: zip the results table plus a glossary CSV.
        st.divider()
        dfa = pd.DataFrame(
            data={
                'Column Name': ['text', 'label', 'score', 'start', 'end', 'category'],
                'Description': [
                    'entity extracted from your text data',
                    'label (tag) assigned to a given extracted entity',
                    'accuracy score; how accurately a tag has been assigned to a given entity',
                    'index of the start of the corresponding entity',
                    'index of the end of the corresponding entity',
                    'the broader category the entity belongs to',
                ]
            }
        )
        buf = io.BytesIO()
        with zipfile.ZipFile(buf, "w") as myzip:
            myzip.writestr("Summary of the results.csv", df.to_csv(index=False))
            myzip.writestr("Glossary of tags.csv", dfa.to_csv(index=False))

        with stylable_container(
            key="download_button",
            css_styles="""button { background-color: #6495ED; border: 1px solid black; padding: 5px; color: white; }""",
        ):
            st.download_button(
                label="Download results and glossary (zip)",
                data=buf.getvalue(),
                file_name="nlpblogs_results.zip",
                mime="application/zip",
            )

        st.text("")
        st.text("")
        st.info(f"Results processed in **{st.session_state.elapsed_time:.2f} seconds**.")

    else:  # If df is empty after the button click
        st.warning("No entities were found in the provided text.")