import os
# Point the Hugging Face cache at a writable path *before* importing gliner;
# deployment environments (e.g. Hugging Face Spaces) may not allow writes to the default cache.
os.environ['HF_HOME'] = '/tmp'
import time
import streamlit as st
import streamlit.components.v1 as components
import pandas as pd
import io
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import re
import string
import json
# --- PPTX Imports ---
from io import BytesIO
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
import plotly.io as pio # Required for image export
# ---------------------------
# --- Stable Scikit-learn LDA Imports ---
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# ------------------------------
from gliner import GLiNER
from streamlit_extras.stylable_container import stylable_container
# comet_ml is optional; fall back to a no-op stub when it is not installed
try:
    from comet_ml import Experiment
except ImportError:
    class Experiment:
        def __init__(self, **kwargs): pass
        def log_parameter(self, *args, **kwargs): pass
        def log_table(self, *args, **kwargs): pass
        def end(self): pass
# --- Color Map for Highlighting and Network Graph Nodes ---
entity_color_map = {
"person": "#10b981",
"country": "#3b82f6",
"city": "#4ade80",
"organization": "#f59e0b",
"date": "#8b5cf6",
"time": "#ec4899",
"cardinal": "#06b6d4",
"money": "#f43f5e",
"position": "#a855f7",
}
# --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
labels = list(entity_color_map.keys())
category_mapping = {
    "People": ["person", "organization", "position"],
    "Locations": ["country", "city"],
    "Time": ["date", "time"],
    "Numbers": ["money", "cardinal"],
}
reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
# --- Utility Functions for Analysis and Plotly ---
def extract_label(node_name):
"""Extracts the label from a node string like 'Text (Label)'."""
match = re.search(r'\(([^)]+)\)$', node_name)
return match.group(1) if match else "Unknown"
def remove_trailing_punctuation(text_string):
"""Removes trailing punctuation from a string."""
return text_string.rstrip(string.punctuation)
def highlight_entities(text, df_entities):
"""Generates HTML to display text with entities highlighted and colored."""
if df_entities.empty:
return text
# Sort entities by start index descending to insert highlights without affecting subsequent indices
entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
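    # Caveat: overlapping entity spans are not merged and may produce nested markup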
highlighted_text = text
for entity in entities:
start = entity['start']
end = entity['end']
label = entity['label']
entity_text = entity['text']
color = entity_color_map.get(label, '#000000')
# Create a span with background color and tooltip
highlight_html = f'<span style="background-color: {color}; color: white; padding: 2px 4px; border-radius: 3px; cursor: help;" title="{label}">{entity_text}</span>'
# Replace the original text segment with the highlighted HTML
highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
# Use a div to mimic the Streamlit input box style for the report
return f'<div style="border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px;">{highlighted_text}</div>'
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
"""
Performs basic Topic Modeling using LDA on the extracted entities
and returns structured data for visualization.
"""
documents = df_entities['text'].unique().tolist()
if len(documents) < 2:
return None
N = min(num_top_words, len(documents))
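    # TF-IDF vectorize the unique entity strings, then fit a small online LDA model;
    # entity mentions are short, so min_df=1 keeps every term in the vocabulary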
try:
tfidf_vectorizer = TfidfVectorizer(
max_df=0.95,
min_df=1,
stop_words='english'
)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
        lda = LatentDirichletAllocation(
            n_components=num_topics, max_iter=5, learning_method='online',
            random_state=42, n_jobs=-1
        )
lda.fit(tfidf)
topic_data_list = []
for topic_idx, topic in enumerate(lda.components_):
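            # argsort()[:-N - 1:-1] yields the indices of the N highest-weighted words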
top_words_indices = topic.argsort()[:-N - 1:-1]
top_words = [tfidf_feature_names[i] for i in top_words_indices]
word_weights = [topic[i] for i in top_words_indices]
for word, weight in zip(top_words, word_weights):
topic_data_list.append({
'Topic_ID': f'Topic #{topic_idx + 1}',
'Word': word,
'Weight': weight,
})
return pd.DataFrame(topic_data_list)
except Exception as e:
st.error(f"Topic modeling failed: {e}")
return None
def create_topic_word_bubbles(df_topic_data):
"""Generates a Plotly Bubble Chart for top words across all topics."""
    if df_topic_data is None or df_topic_data.empty:
        return None
    # Rename the columns produced by perform_topic_modeling to plotting-friendly names
    df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic', 'Word': 'word', 'Weight': 'weight'})
    df_topic_data['x_pos'] = df_topic_data.index  # row index doubles as the x-position
fig = px.scatter(
df_topic_data,
x='x_pos',
y='weight',
size='weight',
color='topic',
hover_name='word',
size_max=80,
title='Topic Word Weights (Bubble Chart)',
color_discrete_sequence=px.colors.qualitative.Bold,
labels={
'x_pos': 'Entity/Word Index',
'weight': 'Word Weight',
'topic': 'Topic ID'
},
custom_data=['word', 'weight', 'topic']
)
fig.update_layout(
xaxis_title="Entity/Word (Bubble size = Word Weight)",
yaxis_title="Word Weight",
xaxis={'tickangle': -45, 'showgrid': False},
yaxis={'showgrid': True},
showlegend=True,
        plot_bgcolor='#f9f9f9',  # light neutral background
        paper_bgcolor='#f9f9f9',
height=600,
margin=dict(t=50, b=100, l=50, r=10),
)
fig.update_traces(hovertemplate='<b>%{customdata[0]}</b><br>Weight: %{customdata[1]:.3f}<extra></extra>', marker=dict(line=dict(width=1, color='DarkSlateGrey')))
return fig
def generate_network_graph(df, raw_text):
"""
Generates a network graph visualization (Node Plot) with edges
based on entity co-occurrence in sentences. (Content omitted for brevity but assumed to be here).
"""
# Using the existing generate_network_graph logic from previous context...
entity_counts = df['text'].value_counts().reset_index()
entity_counts.columns = ['text', 'frequency']
unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
if unique_entities.shape[0] < 2:
return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
num_nodes = len(unique_entities)
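    # Lay nodes out evenly on a circle, with slight Gaussian jitter so labels are less likely to overlap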
thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
radius = 10
unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
edges = set()
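    # Split into sentences on ., ?, ! while skipping common abbreviations such as "Dr." or "U.S.";
    # every pair of entities appearing in the same sentence gets an edge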
sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', raw_text)
for sentence in sentences:
entities_in_sentence = []
for entity_text in unique_entities['text'].unique():
if entity_text.lower() in sentence.lower():
entities_in_sentence.append(entity_text)
unique_entities_in_sentence = list(set(entities_in_sentence))
for i in range(len(unique_entities_in_sentence)):
for j in range(i + 1, len(unique_entities_in_sentence)):
node1 = unique_entities_in_sentence[i]
node2 = unique_entities_in_sentence[j]
edge_tuple = tuple(sorted((node1, node2)))
edges.add(edge_tuple)
edge_x = []
edge_y = []
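    # A None between coordinate pairs makes Plotly draw each edge as a disconnected segment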
for edge in edges:
n1, n2 = edge
if n1 in pos_map and n2 in pos_map:
edge_x.extend([pos_map[n1]['x'], pos_map[n2]['x'], None])
edge_y.extend([pos_map[n1]['y'], pos_map[n2]['y'], None])
fig = go.Figure()
edge_trace = go.Scatter(
x=edge_x, y=edge_y,
line=dict(width=0.5, color='#888'),
hoverinfo='none',
mode='lines',
name='Co-occurrence Edges',
showlegend=False
)
fig.add_trace(edge_trace)
fig.add_trace(go.Scatter(
x=unique_entities['x'],
y=unique_entities['y'],
mode='markers+text',
name='Entities',
text=unique_entities['text'],
textposition="top center",
showlegend=False,
marker=dict(
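            # marker size grows 5 px per mention on top of a 10 px baseline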
size=unique_entities['frequency'] * 5 + 10,
color=[entity_color_map.get(label, '#cccccc') for label in unique_entities['label']],
line_width=1,
line_color='black',
opacity=0.9
),
textfont=dict(size=10),
customdata=unique_entities[['label', 'score', 'frequency']],
hovertemplate=(
"<b>%{text}</b><br>" +
"Label: %{customdata[0]}<br>" +
"Score: %{customdata[1]:.2f}<br>" +
"Frequency: %{customdata[2]}<extra></extra>"
)
))
    # One invisible marker per label, so the legend shows the color coding
    legend_traces = []
    seen_labels = set()
    for label in unique_entities['label']:
        if label not in seen_labels:
            seen_labels.add(label)
            color = entity_color_map.get(label, '#cccccc')
            legend_traces.append(go.Scatter(
                x=[None], y=[None], mode='markers',
                marker=dict(size=10, color=color),
                name=label.capitalize(), showlegend=True
            ))
for trace in legend_traces:
fig.add_trace(trace)
fig.update_layout(
title='Entity Co-occurrence Network (Edges = Same Sentence)',
showlegend=True,
hovermode='closest',
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
plot_bgcolor='#f9f9f9',
paper_bgcolor='#f9f9f9',
margin=dict(t=50, b=10, l=10, r=10),
height=600
)
return fig
# --- CSV Generation ---
def generate_entity_csv(df):
"""
Generates a CSV file of the extracted entities in an in-memory buffer,
including text, label, category, score, start, and end indices.
"""
csv_buffer = BytesIO()
# Select desired columns and write to buffer
df_export = df[['text', 'label', 'category', 'score', 'start', 'end']]
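    # to_csv returns a str; encode to UTF-8 bytes before writing to the binary buffer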
csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
csv_buffer.seek(0)
return csv_buffer
# -----------------------------------
# --- HTML Report Generation ---
def generate_html_report(df, text_input, elapsed_time, df_topic_data):
"""
Generates a full HTML report containing all analysis results and visualizations.
(Content omitted for brevity but assumed to be here).
"""
# 1. Generate Visualizations (Plotly HTML)
# 1a. Treemap
fig_treemap = px.treemap(
df,
path=[px.Constant("All Entities"), 'category', 'label', 'text'],
values='score',
color='category',
title="Entity Distribution by Category and Label",
color_discrete_sequence=px.colors.qualitative.Dark24
)
fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
# 1b. Pie Chart
grouped_counts = df['category'].value_counts().reset_index()
grouped_counts.columns = ['Category', 'Count']
    # Cividis palette keeps the pie chart free of red/pink hues
    fig_pie = px.pie(grouped_counts, values='Count', names='Category',
                     title='Distribution of Entities by Category',
                     color_discrete_sequence=px.colors.sequential.Cividis)
fig_pie.update_layout(margin=dict(t=50, b=10))
pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
# 1c. Bar Chart (Category Count)
    fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',
                              color='Category', title='Total Entities per Category',
                              color_discrete_sequence=px.colors.qualitative.Pastel)
    fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=100))
    bar_category_html = fig_bar_category.to_html(full_html=False, include_plotlyjs='cdn')
# 1d. Bar Chart (Most Frequent Entities)
word_counts = df['text'].value_counts().reset_index()
word_counts.columns = ['Entity', 'Count']
repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
if not repeating_entities.empty:
        # Viridis palette avoids the pink/magenta hues of Plasma
        fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',
                              color='Entity', title='Top 10 Most Frequent Entities',
                              color_discrete_sequence=px.colors.sequential.Viridis)
        fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'}, margin=dict(t=50, b=100))
bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
# 1e. Network Graph HTML
network_fig = generate_network_graph(df, text_input)
network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
# 1f. Topic Charts HTML
topic_charts_html = '<h3>Topic Word Weights (Bubble Chart)</h3>'
if df_topic_data is not None and not df_topic_data.empty:
bubble_figure = create_topic_word_bubbles(df_topic_data)
if bubble_figure:
topic_charts_html += f'<div class="chart-box">{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn")}</div>'
else:
topic_charts_html += '<p style="color: red;">Error: Topic modeling data was available but visualization failed.</p>'
else:
topic_charts_html += '<div class="chart-box" style="text-align: center; padding: 50px; background-color: #fff; border: 1px dashed #888888;">' # Changed border color
topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
topic_charts_html += '</div>'
# 2. Get Highlighted Text
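    # Reuse the highlighting helper, tagging its wrapper div with the report's CSS class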
highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
# 3. Entity Tables (Pandas to HTML)
entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
classes='table table-striped',
index=False
)
# 4. Construct the Final HTML
html_content = f"""<!DOCTYPE html><html lang="en"><head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Entity and Topic Analysis Report</title>
<script src="https://cdn.plot.ly/plotly-latest.min.js"></script>
<style>
body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: #f4f4f9; color: #333; }}
.container {{ max-width: 1200px; margin: 0 auto; background-color: #ffffff; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); }}
h1 {{ color: #007bff; border-bottom: 3px solid #007bff; padding-bottom: 10px; margin-top: 0; }}
h2 {{ color: #007bff; margin-top: 30px; border-bottom: 1px solid #ddd; padding-bottom: 5px; }}
h3 {{ color: #555; margin-top: 20px; }}
.metadata {{ background-color: #e6f0ff; padding: 15px; border-radius: 8px; margin-bottom: 20px; font-size: 0.9em; }}
.chart-box {{ background-color: #f9f9f9; padding: 15px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); min-width: 0; margin-bottom: 20px; }}
table {{ width: 100%; border-collapse: collapse; margin-top: 15px; }}
table th, table td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
table th {{ background-color: #f0f0f0; }}
.highlighted-text {{ border: 1px solid #888888; padding: 15px; border-radius: 5px; background-color: #ffffff; font-family: monospace; white-space: pre-wrap; margin-bottom: 20px; }}
</style></head><body>
<div class="container">
<h1>Entity and Topic Analysis Report</h1>
<div class="metadata">
<p><strong>Generated on:</strong> {time.strftime('%Y-%m-%d')}</p>
<p><strong>Processing Time:</strong> {elapsed_time:.2f} seconds</p>
</div>
<h2>1. Analyzed Text & Extracted Entities</h2>
<h3>Original Text with Highlighted Entities</h3>
<div class="highlighted-text-container">
{highlighted_text_html}
</div>
<h2>2. Full Extracted Entities Table</h2>
{entity_table_html}
<h2>3. Data Visualizations</h2>
<h3>3.1 Entity Distribution Treemap</h3>
<div class="chart-box">{treemap_html}</div>
    <h3>3.2 Comparative Charts (Pie, Category Count, Frequency), stacked vertically</h3>
<div class="chart-box">{pie_html}</div>
<div class="chart-box">{bar_category_html}</div>
<div class="chart-box">{bar_freq_html}</div>
<h3>3.3 Entity Relationship Map (Edges = Same Sentence)</h3>
<div class="chart-box">{network_html}</div>
<h2>4. Topic Modelling</h2>
{topic_charts_html}
</div></body></html>
"""
return html_content
# --- Page Configuration and Styling (No Sidebar) ---
st.set_page_config(layout="wide", page_title="NER & Topic Report App")
st.markdown(
"""
    <style>
    /* Overall app container (no sidebar) */
    .main {
        background-color: #f4f4f9;
        color: #333333; /* dark grey text for contrast */
    }
    .stApp {
        background-color: #f4f4f9;
    }
    /* Text area (input field) */
    .stTextArea textarea {
        background-color: #ffffff;
        color: #000000;
        border: 1px solid #888888;
    }
    /* Buttons */
    .stButton > button {
        background-color: #007bff;
        color: #FFFFFF;
        border: none;
        padding: 10px 20px;
        border-radius: 5px;
    }
    /* Expander header and content */
    .streamlit-expanderHeader, .streamlit-expanderContent {
        background-color: #e9ecef;
        color: #333333;
    }
    </style>
""",
unsafe_allow_html=True)
st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue"
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
tab1, tab2 = st.tabs(["Embed", "Important Notes"])
with tab1:
with st.expander("Embed"):
st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
code = '''
<iframe
src="https://aiecosystem-dataharvest.hf.space"
frameborder="0"
width="850"
height="450"
></iframe>
'''
st.code(code, language="html")
with tab2:
expander = st.expander("**Important Notes**")
expander.write("""**Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
**Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
**How to Use:** Type or paste your text into the text area below, press Ctrl + Enter, and then click the 'Results' button.
**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.""")
st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
# --- Comet ML Setup (Placeholder/Conditional) ---
COMET_API_KEY = os.environ.get("COMET_API_KEY")
COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
# --- Model Loading ---
@st.cache_resource
def load_ner_model():
"""Loads the GLiNER model and caches it."""
try:
        return GLiNER.from_pretrained(
            "knowledgator/gliner-multitask-large-v0.5",
            nested_ner=True, num_gen_sequences=2, gen_constraints=labels
        )
except Exception as e:
st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
st.stop()
model = load_ner_model()
# --- Default demonstration text ---
DEFAULT_TEXT = (
"In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
"TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
"leap forward for commercial space technology across the entire European Union. The agreement, finalized "
"on Monday in Paris, France, focuses specifically on jointly developing the next generation of the 'Astra' "
"software platform. This platform is critical for processing and managing the vast amounts of data being sent "
"back from the recent Mars rover mission. The core team, including lead engineer Marcus Davies, will hold "
"their first collaborative workshop in Berlin, Germany, on August 15th. The community response on social "
"media platform X (under the username @TechSolutionsCEO) was overwhelmingly positive, with many major tech "
"publications, including Wired Magazine, predicting a major impact on the space technology industry by the "
"end of the year. The platform is designed to be compatible with both Windows and Linux operating systems. "
"The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
"are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
"general public by October 1st. The goal is to deploy the Astra v2 platform before the next solar eclipse event in 2026.")
# -----------------------------------
# --- Session State Initialization ---
if 'show_results' not in st.session_state:
st.session_state.show_results = False
if 'last_text' not in st.session_state:
st.session_state.last_text = ""
if 'results_df' not in st.session_state:
st.session_state.results_df = pd.DataFrame()
if 'elapsed_time' not in st.session_state:
st.session_state.elapsed_time = 0.0
if 'topic_results' not in st.session_state:
st.session_state.topic_results = None
if 'my_text_area' not in st.session_state:
st.session_state.my_text_area = DEFAULT_TEXT
# --- Clear Button Function ---
def clear_text():
"""Clears the text area (sets it to an empty string) and hides results."""
st.session_state['my_text_area'] = ""
st.session_state.show_results = False
st.session_state.last_text = ""
st.session_state.results_df = pd.DataFrame()
st.session_state.elapsed_time = 0.0
st.session_state.topic_results = None
# --- Text Input and Clear Button ---
word_limit = 1000
# The widget reads its value from st.session_state['my_text_area'] (seeded above);
# passing value= as well would trigger Streamlit's double-initialization warning.
text = st.text_area(
    f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
    height=250,
    key='my_text_area')
word_count = len(text.split())
st.markdown(f"**Word count:** {word_count}/{word_limit}")
st.button("Clear text", on_click=clear_text)
# --- Results Trigger and Processing ---
if st.button("Results"):
if not text.strip():
st.warning("Please enter some text to extract entities.")
st.session_state.show_results = False
elif word_count > word_limit:
st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
st.session_state.show_results = False
else:
with st.spinner("Extracting entities and generating report data...", show_time=True):
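            # Re-run the model only when the input text changed; otherwise reuse the cached results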
if text != st.session_state.last_text:
st.session_state.last_text = text
start_time = time.time()
# --- Model Prediction & Dataframe Creation ---
entities = model.predict_entities(text, labels)
df = pd.DataFrame(entities)
if not df.empty:
df['text'] = df['text'].apply(remove_trailing_punctuation)
df['category'] = df['label'].map(reverse_category_mapping)
st.session_state.results_df = df
unique_entity_count = len(df['text'].unique())
N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
st.session_state.topic_results = perform_topic_modeling(
df,
num_topics=2,
num_top_words=N_TOP_WORDS_TO_USE
)
if comet_initialized:
experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
experiment.log_parameter("input_text", text)
experiment.log_table("predicted_entities", df)
experiment.end()
else:
st.session_state.results_df = pd.DataFrame()
st.session_state.topic_results = None
end_time = time.time()
st.session_state.elapsed_time = end_time - start_time
st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
st.session_state.show_results = True
# --- Display Download Link and Results ---
if st.session_state.show_results:
df = st.session_state.results_df
df_topic_data = st.session_state.topic_results
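    # df and df_topic_data were cached in session state by the "Results" handler above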
if df.empty:
st.warning("No entities were found in the provided text.")
else:
st.subheader("Analysis Results", divider="blue")
# 1. Highlighted Text
st.markdown("### 1. Analyzed Text with Highlighted Entities")
st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
# 2. Detailed Entity Analysis Tabs
st.markdown("### 2. Detailed Entity Analysis")
        tab_category_details, tab_treemap_viz = st.tabs(["📊 Entities Grouped by Category", "🗺️ Treemap Distribution"])
with tab_category_details:
st.markdown("#### Detailed Entities Table (Grouped by Category)")
with st.expander("See Glossary of tags"):
                st.write('''
                - **text**: the entity extracted from your text
                - **label**: the tag assigned to a given extracted entity
                - **score**: how accurately a tag has been assigned to a given entity
                - **start**: index of the start of the corresponding entity
                - **end**: index of the end of the corresponding entity
                ''')
unique_categories = list(category_mapping.keys())
tabs_category = st.tabs(unique_categories)
for category, tab in zip(unique_categories, tabs_category):
df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
with tab:
st.markdown(f"##### {category} Entities ({len(df_category)} total)")
if not df_category.empty:
st.dataframe(
df_category,
use_container_width=True,
column_config={'score': st.column_config.NumberColumn(format="%.4f")}
)
else:
st.info(f"No entities of category **{category}** were found in the text.")
with tab_treemap_viz:
st.markdown("#### Treemap: Entity Distribution")
fig_treemap = px.treemap(
df,
path=[px.Constant("All Entities"), 'category', 'label', 'text'],
values='score',
color='category',
color_discrete_sequence=px.colors.qualitative.Dark24
)
fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
st.plotly_chart(fig_treemap, use_container_width=True)
# 3. Comparative Charts
st.markdown("---")
st.markdown("### 3. Comparative Charts")
col1, col2, col3 = st.columns(3)
grouped_counts = df['category'].value_counts().reset_index()
grouped_counts.columns = ['Category', 'Count']
        with col1:  # pie chart: entity share per category
            fig_pie = px.pie(grouped_counts, values='Count', names='Category',
                             title='Distribution of Entities by Category',
                             color_discrete_sequence=px.colors.sequential.Cividis)
            fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
            st.plotly_chart(fig_pie, use_container_width=True)
        with col2:  # bar chart: total entities per category
            fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',
                                      color='Category', title='Total Entities per Category',
                                      color_discrete_sequence=px.colors.qualitative.Pastel)
            fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},
                                           margin=dict(t=30, b=10, l=10, r=10), height=350)
            st.plotly_chart(fig_bar_category, use_container_width=True)
        with col3:  # bar chart: most frequent entities
            word_counts = df['text'].value_counts().reset_index()
            word_counts.columns = ['Entity', 'Count']
            repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
            if not repeating_entities.empty:
                fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',
                                      color='Entity', title='Top 10 Most Frequent Entities',
                                      color_discrete_sequence=px.colors.sequential.Viridis)
                fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},
                                           margin=dict(t=30, b=10, l=10, r=10), height=350)
                st.plotly_chart(fig_bar_freq, use_container_width=True)
else:
st.info("No entities repeat for frequency chart.")
st.markdown("---")
st.markdown("### 4. Entity Relationship Map")
network_fig = generate_network_graph(df, st.session_state.last_text)
st.plotly_chart(network_fig, use_container_width=True)
st.markdown("---")
st.markdown("### 5. Topic Modelling Analysis")
if df_topic_data is not None and not df_topic_data.empty:
bubble_figure = create_topic_word_bubbles(df_topic_data)
if bubble_figure:
st.plotly_chart(bubble_figure, use_container_width=True)
else:
st.error("Error generating Topic Word Bubble Chart.")
else:
st.info("Topic modeling requires more unique input (at least two unique entities).")
# --- Report Download ---
st.markdown("---")
st.markdown("### Download Full Report Artifacts")
        # 1. HTML Report Download
html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
st.download_button(
label="Download Comprehensive HTML Report",
data=html_report,
file_name="ner_topic_report.html",
mime="text/html",
type="primary"
)
        # 2. CSV Data Download
csv_buffer = generate_entity_csv(df)
st.download_button(
label="Download Extracted Entities (CSV)",
data=csv_buffer,
file_name="extracted_entities.csv",
mime="text/csv",
type="secondary"
)