import os
# Set HF_HOME to a writable path before any model libraries are imported
# (fix for read-only deployment environments).
os.environ['HF_HOME'] = '/tmp'
import time
import streamlit as st
import streamlit.components.v1 as components
import pandas as pd
import io
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import re
import string
import json
# --- PPTX Imports ---
from io import BytesIO
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.enum.text import MSO_ANCHOR, MSO_AUTO_SIZE
import plotly.io as pio # Required for image export
# ---------------------------
# --- Stable Scikit-learn LDA Imports ---
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# ------------------------------
from gliner import GLiNER
from streamlit_extras.stylable_container import stylable_container
# Use a try/except so the app still runs when comet_ml is not installed.
try:
from comet_ml import Experiment
except ImportError:
class Experiment:
def __init__(self, **kwargs): pass
def log_parameter(self, *args): pass
def log_table(self, *args): pass
def end(self): pass
# --- Color Map for Highlighting and Network Graph Nodes ---
entity_color_map = {
"person": "#10b981",
"country": "#3b82f6",
"city": "#4ade80",
"organization": "#f59e0b",
"date": "#8b5cf6",
"time": "#ec4899",
"cardinal": "#06b6d4",
"money": "#f43f5e",
"position": "#a855f7",
}
# --- Label Definitions and Category Mapping (Used by the App and PPTX) ---
labels = list(entity_color_map.keys())
category_mapping = {
"People": ["person", "organization", "position"],
"Locations": ["country", "city"],
"Time": ["date", "time"],
"Numbers": ["money", "cardinal"]}
reverse_category_mapping = {label: category for category, label_list in category_mapping.items() for label in label_list}
# --- Utility Functions for Analysis and Plotly ---
def extract_label(node_name):
"""Extracts the label from a node string like 'Text (Label)'."""
match = re.search(r'\(([^)]+)\)$', node_name)
return match.group(1) if match else "Unknown"
def remove_trailing_punctuation(text_string):
"""Removes trailing punctuation from a string."""
return text_string.rstrip(string.punctuation)
def highlight_entities(text, df_entities):
"""Generates HTML to display text with entities highlighted and colored."""
if df_entities.empty:
return text
# Sort entities by start index descending to insert highlights without affecting subsequent indices
entities = df_entities.sort_values(by='start', ascending=False).to_dict('records')
highlighted_text = text
for entity in entities:
start = entity['start']
end = entity['end']
label = entity['label']
entity_text = entity['text']
color = entity_color_map.get(label, '#000000')
        # Create a span with background color and a tooltip showing the label
        highlight_html = (
            f'<span style="background-color: {color}; color: white; padding: 2px 4px; '
            f'border-radius: 4px;" title="{label}">{entity_text}</span>'
        )
# Replace the original text segment with the highlighted HTML
highlighted_text = highlighted_text[:start] + highlight_html + highlighted_text[end:]
    # Use a div to mimic the Streamlit input box style for the report
    return (
        f'<div style="border: 1px solid #ccc; border-radius: 8px; padding: 16px; '
        f'line-height: 1.8; background-color: #ffffff;">{highlighted_text}</div>'
    )
def perform_topic_modeling(df_entities, num_topics=2, num_top_words=10):
"""
Performs basic Topic Modeling using LDA on the extracted entities,
allowing for n-grams to capture multi-word entities like 'Dr. Emily Carter'.
"""
# 1. Prepare Documents: Use unique entities (they are short, clean documents)
documents = df_entities['text'].unique().tolist()
if len(documents) < 2:
return None
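    # Cap the number of top words at the number of available entity "documents".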
N = min(num_top_words, len(documents))
try:
# 2. Vectorizer: Use TfidfVectorizer, but allow unigrams, bigrams, and trigrams (ngram_range)
# to capture multi-word entities. We keep stop_words='english' for the *components* of the entity.
tfidf_vectorizer = TfidfVectorizer(
max_df=0.95,
min_df=2, # Only consider words/phrases that appear at least twice to find topics
stop_words='english',
ngram_range=(1, 3) # This is the KEY to capturing "Dr. Emily Carter" as a single token (if it appears enough times)
)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
# Check if the vocabulary is too small after tokenization/ngram generation
if len(tfidf_feature_names) < num_topics:
# Re-run with min_df=1 if vocab is too small
tfidf_vectorizer = TfidfVectorizer(
max_df=1.0, min_df=1, stop_words='english', ngram_range=(1, 3)
)
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
if len(tfidf_feature_names) < num_topics:
return None
# 3. LDA Model Fit
lda = LatentDirichletAllocation(
n_components=num_topics, max_iter=5, learning_method='online',
random_state=42, n_jobs=-1
)
lda.fit(tfidf)
# 4. Extract Topic Data
topic_data_list = []
for topic_idx, topic in enumerate(lda.components_):
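            # argsort()[:-N - 1:-1] yields the indices of the N largest weights, descending.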
top_words_indices = topic.argsort()[:-N - 1:-1]
# These top_words will now include phrases like 'emily carter' or 'european space agency'
top_words = [tfidf_feature_names[i] for i in top_words_indices]
word_weights = [topic[i] for i in top_words_indices]
for word, weight in zip(top_words, word_weights):
topic_data_list.append({
'Topic_ID': f'Topic #{topic_idx + 1}',
'Word': word,
'Weight': weight,
})
return pd.DataFrame(topic_data_list)
    except Exception as e:
        # Broad catch for robustness: any vectorizer/LDA failure simply disables topic modeling.
        # st.error(f"Topic modeling failed: {e}")  # Keep commented out for a cleaner app
        return None
def create_topic_word_bubbles(df_topic_data):
"""Generates a Plotly Bubble Chart for top words across
all topics, displaying the word directly on the bubble."""
    # Rename columns to match the output of perform_topic_modeling
    df_topic_data = df_topic_data.rename(columns={'Topic_ID': 'topic',
                                                  'Word': 'word', 'Weight': 'weight'})
    if df_topic_data.empty:
        return None
    df_topic_data['x_pos'] = df_topic_data.index  # Use the row index for x-position
fig = px.scatter(
df_topic_data,
x='x_pos',
y='weight',
size='weight',
color='topic',
# Set text to the word
text='word',
hover_name='word',
size_max=40,
title='Topic Word Weights (Bubble Chart)',
color_discrete_sequence=px.colors.qualitative.Bold,
labels={
'x_pos': 'Entity/Word Index',
'weight': 'Word Weight',
'topic': 'Topic ID'
},
custom_data=['word', 'weight', 'topic']
)
fig.update_layout(
xaxis_title="Entity/Word",
yaxis_title="Word Weight",
# Hide x-axis labels since words are now labels
xaxis={'tickangle': -45, 'showgrid': False, 'showticklabels': False, 'zeroline': False, 'showline': False},
yaxis={'showgrid': True},
showlegend=True,
plot_bgcolor='#f9f9f9',
paper_bgcolor='#f9f9f9',
height=600,
margin=dict(t=50, b=100, l=50, r=10),
)
    # Show each word on its bubble, centered, in white for contrast
    # against the dark bubble colors.
    fig.update_traces(
        textposition='middle center',
        textfont=dict(color='white', size=10),
        hovertemplate='%{customdata[0]}<br>Weight: %{customdata[1]:.3f}',
        marker=dict(line=dict(width=1, color='DarkSlateGrey'))
    )
return fig
def generate_network_graph(df, raw_text):
    """
    Generates a network graph (node plot) whose edges connect entities
    that co-occur in the same sentence.
    """
entity_counts = df['text'].value_counts().reset_index()
entity_counts.columns = ['text', 'frequency']
unique_entities = df.drop_duplicates(subset=['text', 'label']).merge(entity_counts, on='text')
if unique_entities.shape[0] < 2:
return go.Figure().update_layout(title="Not enough unique entities for a meaningful graph.")
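    # Place nodes evenly on a circle and add Gaussian jitter so markers do not overlap.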
num_nodes = len(unique_entities)
thetas = np.linspace(0, 2 * np.pi, num_nodes, endpoint=False)
radius = 10
unique_entities['x'] = radius * np.cos(thetas) + np.random.normal(0, 0.5, num_nodes)
unique_entities['y'] = radius * np.sin(thetas) + np.random.normal(0, 0.5, num_nodes)
pos_map = unique_entities.set_index('text')[['x', 'y']].to_dict('index')
    # Build edges between entities that co-occur in the same sentence.
    edges = set()
    sentences = re.split(r'(?<=[.!?])\s+', raw_text)
    for sentence in sentences:
        present = [name for name in pos_map if name in sentence]
        for i in range(len(present)):
            for j in range(i + 1, len(present)):
                edges.add(tuple(sorted((present[i], present[j]))))
    # Edge segments: coordinate pairs separated by None so Plotly draws disjoint lines.
    edge_x, edge_y = [], []
    for source, target in edges:
        edge_x += [pos_map[source]['x'], pos_map[target]['x'], None]
        edge_y += [pos_map[source]['y'], pos_map[target]['y'], None]
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=edge_x, y=edge_y, mode='lines', hoverinfo='none',
                             line=dict(width=0.5, color='#888'), showlegend=False))
    fig.add_trace(go.Scatter(
        x=unique_entities['x'], y=unique_entities['y'],
        mode='markers+text', text=unique_entities['text'], textposition='top center',
        marker=dict(size=10 + 2 * unique_entities['frequency'],
                    color=[entity_color_map.get(lbl, '#cccccc') for lbl in unique_entities['label']]),
        customdata=unique_entities[['label', 'score', 'frequency']],
        showlegend=False,
        hovertemplate=(
            "<b>%{text}</b><br>" +
            "Label: %{customdata[0]}<br>" +
            "Score: %{customdata[1]:.2f}<br>" +
            "Frequency: %{customdata[2]}"
        )
    ))
legend_traces = []
seen_labels = set()
for index, row in unique_entities.iterrows():
label = row['label']
if label not in seen_labels:
seen_labels.add(label)
color = entity_color_map.get(label, '#cccccc')
legend_traces.append(go.Scatter(
x=[None], y=[None], mode='markers', marker=dict(size=10, color=color), name=f"{label.capitalize()}", showlegend=True
))
for trace in legend_traces:
fig.add_trace(trace)
fig.update_layout(
title='Entity Co-occurrence Network (Edges = Same Sentence)',
showlegend=True,
hovermode='closest',
xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, range=[-15, 15]),
plot_bgcolor='#f9f9f9',
paper_bgcolor='#f9f9f9',
margin=dict(t=50, b=10, l=10, r=10),
height=600
)
return fig
# --- CSV Generation Function ---
def generate_entity_csv(df):
"""
Generates a CSV file of the extracted entities in an in-memory buffer,
including text, label, category, score, start, and end indices.
"""
csv_buffer = BytesIO()
# Select desired columns and write to buffer
df_export = df[['text', 'label', 'category', 'score', 'start', 'end']]
csv_buffer.write(df_export.to_csv(index=False).encode('utf-8'))
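    # Rewind the buffer so the download starts reading from the beginning.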
csv_buffer.seek(0)
return csv_buffer
# -----------------------------------
# --- HTML Report Generation ---
def generate_html_report(df, text_input, elapsed_time, df_topic_data):
    """
    Generates a full HTML report containing all analysis results and visualizations.
    """
# 1. Generate Visualizations (Plotly HTML)
# 1a. Treemap
fig_treemap = px.treemap(
df,
path=[px.Constant("All Entities"), 'category', 'label', 'text'],
values='score',
color='category',
title="Entity Distribution by Category and Label",
color_discrete_sequence=px.colors.qualitative.Dark24
)
fig_treemap.update_layout(margin=dict(t=50, l=25, r=25, b=25))
treemap_html = fig_treemap.to_html(full_html=False, include_plotlyjs='cdn')
# 1b. Pie Chart
grouped_counts = df['category'].value_counts().reset_index()
grouped_counts.columns = ['Category', 'Count']
    # Cividis palette avoids red hues
fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Cividis)
fig_pie.update_layout(margin=dict(t=50, b=10))
pie_html = fig_pie.to_html(full_html=False, include_plotlyjs='cdn')
# 1c. Bar Chart (Category Count)
fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
bar_category_html = fig_bar_category.to_html(full_html=False,include_plotlyjs='cdn')
# 1d. Bar Chart (Most Frequent Entities)
word_counts = df['text'].value_counts().reset_index()
word_counts.columns = ['Entity', 'Count']
repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
    bar_freq_html = '<p>No entities appear more than once in the text for visualization.</p>'
if not repeating_entities.empty:
        # Viridis palette avoids pink/magenta hues
fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=50, b=100))
bar_freq_html = fig_bar_freq.to_html(full_html=False, include_plotlyjs='cdn')
# 1e. Network Graph HTML
network_fig = generate_network_graph(df, text_input)
network_html = network_fig.to_html(full_html=False, include_plotlyjs='cdn')
# 1f. Topic Charts HTML
    topic_charts_html = '<h2>Topic Word Weights (Bubble Chart)</h2>'
    if df_topic_data is not None and not df_topic_data.empty:
        bubble_figure = create_topic_word_bubbles(df_topic_data)
        if bubble_figure:
            topic_charts_html += f'<div>{bubble_figure.to_html(full_html=False, include_plotlyjs="cdn", config={"responsive": True})}</div>'
        else:
            topic_charts_html += '<p>Error: Topic modeling data was available but visualization failed.</p>'
    else:
        topic_charts_html += '<div style="border: 1px solid #3b82f6; border-radius: 8px; padding: 16px;">'
        topic_charts_html += '<p><strong>Topic Modeling requires more unique input.</strong></p>'
        topic_charts_html += '<p>Please enter text containing at least two unique entities to generate the Topic Bubble Chart.</p>'
        topic_charts_html += '</div>'
# 2. Get Highlighted Text
highlighted_text_html = highlight_entities(text_input, df).replace("div style", "div class='highlighted-text' style")
# 3. Entity Tables (Pandas to HTML)
entity_table_html = df[['text', 'label', 'score', 'start', 'end', 'category']].to_html(
classes='table table-striped',
index=False
)
    # 4. Construct the Final HTML
    html_content = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Entity and Topic Analysis Report</title>
<style>
body {{ font-family: sans-serif; margin: 40px; background-color: #f9f9f9; }}
.table {{ border-collapse: collapse; }}
.table th, .table td {{ border: 1px solid #ddd; padding: 6px; }}
</style>
</head>
<body>
<h1>Entity and Topic Analysis Report</h1>
<p>Report data generated in {elapsed_time:.2f} seconds.</p>
<h2>1. Analyzed Text &amp; Extracted Entities</h2>
<h3>Original Text with Highlighted Entities</h3>
{highlighted_text_html}
<h2>2. Full Extracted Entities Table</h2>
{entity_table_html}
<h2>3. Data Visualizations</h2>
<h3>3.1 Entity Distribution Treemap</h3>
{treemap_html}
<h3>3.2 Comparative Charts (Pie, Category Count, Frequency) - <em>Stacked Vertically</em></h3>
{pie_html}
{bar_category_html}
{bar_freq_html}
<h3>3.3 Entity Relationship Map (Edges = Same Sentence)</h3>
{network_html}
<h2>4. Topic Modelling</h2>
{topic_charts_html}
</body>
</html>"""
return html_content
# --- Page Configuration and Styling (No Sidebar) ---
st.set_page_config(layout="wide", page_title="NER & Topic Report App")
# --- Mobile Warning ---
st.markdown(
"""
⚠️ **Tip for Mobile Users:** For the best viewing experience of the charts and tables, please switch your browser to **"Desktop Site"** view.
""",
unsafe_allow_html=True
)
# ----------------------------------
st.subheader("Entity and Topic Analysis Report Generator", divider="blue") # Changed divider from "rainbow" (often includes red/pink) to "blue"
st.link_button("by nlpblogs", "https://nlpblogs.com", type="tertiary")
tab1, tab2 = st.tabs(["Embed", "Important Notes"])
with tab1:
with st.expander("Embed"):
st.write("Use the following code to embed the DataHarvest web app on your website. Feel free to adjust the width and height values to fit your page.")
        # The iframe src below is a placeholder; replace YOUR_APP_URL with the deployed app URL.
        code = '''<iframe src="YOUR_APP_URL" width="850" height="1000" frameborder="0"></iframe>'''
        st.code(code, language="html")  # Keeps the copy icon, as intended for tab1
with tab2:
expander = st.expander("**Important Notes**")
    # Display the notes via expander.markdown(), which shows the styled text
    # without a copy-to-clipboard icon.
expander.markdown("""
**Named Entities:** This DataHarvest web app predicts nine (9) labels: "person", "country", "city", "organization", "date", "time", "cardinal", "money", "position"
**Results:** Results are compiled into a single, comprehensive **HTML report** and a **CSV file** for easy download and sharing.
**How to Use:** Type or paste your text into the text area below, press Ctrl + Enter, and then click the 'Results' button.
**Technical issues:** If your connection times out, please refresh the page or reopen the app's URL.
""")
st.markdown("For any errors or inquiries, please contact us at [info@nlpblogs.com](mailto:info@nlpblogs.com)")
# --- Comet ML Setup (Placeholder/Conditional) ---
COMET_API_KEY = os.environ.get("COMET_API_KEY")
COMET_WORKSPACE = os.environ.get("COMET_WORKSPACE")
COMET_PROJECT_NAME = os.environ.get("COMET_PROJECT_NAME")
comet_initialized = bool(COMET_API_KEY and COMET_WORKSPACE and COMET_PROJECT_NAME)
# --- Model Loading ---
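# st.cache_resource loads the model once per server process and reuses it across sessions.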
@st.cache_resource
def load_ner_model():
"""Loads the GLiNER model and caches it."""
try:
return GLiNER.from_pretrained("knowledgator/gliner-multitask-large-v0.5", nested_ner=True, num_gen_sequences=2, gen_constraints=labels)
except Exception as e:
st.error(f"Failed to load NER model. Please check your internet connection or model availability: {e}")
st.stop()
model = load_ner_model()
# --- LONG DEFAULT TEXT ---
DEFAULT_TEXT = (
"In June 2024, the founder, Dr. Emily Carter, officially announced a new, expansive partnership between "
"TechSolutions Inc. and the European Space Agency (ESA). This strategic alliance represents a significant "
"leap forward for commercial space technology across the entire **European Union**. The agreement, finalized "
"on Monday in Paris, France, focuses specifically on jointly developing the next generation of the 'Astra' "
"software platform. This version of the **Astra** platform is critical for processing and managing the vast amounts of data being sent "
"back from the recent Mars rover mission. This project underscores the ESA's commitment to advancing "
"space capabilities within the **European Union**. The core team, including lead engineer Marcus Davies, will hold "
"their first collaborative workshop in Berlin, Germany, on August 15th. The community response on social "
"media platform X (under the username @TechCEO) was overwhelmingly positive, with many major tech "
"publications, including Wired Magazine, predicting a major impact on the space technology industry by the "
"end of the year, further strengthening the technological standing of the **European Union**. The platform is designed to be compatible with both Windows and Linux operating systems. "
"The initial funding, secured via a Series B round, totaled $50 million. Financial analysts from Morgan Stanley "
"are closely monitoring the impact on TechSolutions Inc.'s Q3 financial reports, expected to be released to the "
"general public by October 1st. The goal is to deploy the **Astra** v2 platform before the next solar eclipse event in 2026."
)
# -----------------------------------
# --- Session State Initialization (CRITICAL FIX) ---
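# Streamlit re-runs this script on every interaction; session state keeps results alive across reruns.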
if 'show_results' not in st.session_state:
st.session_state.show_results = False
if 'last_text' not in st.session_state:
st.session_state.last_text = ""
if 'results_df' not in st.session_state:
st.session_state.results_df = pd.DataFrame()
if 'elapsed_time' not in st.session_state:
st.session_state.elapsed_time = 0.0
if 'topic_results' not in st.session_state:
st.session_state.topic_results = None
if 'my_text_area' not in st.session_state:
st.session_state.my_text_area = DEFAULT_TEXT
# --- Clear Button Function ---
def clear_text():
"""Clears the text area (sets it to an empty string) and hides results."""
st.session_state['my_text_area'] = ""
st.session_state.show_results = False
st.session_state.last_text = ""
st.session_state.results_df = pd.DataFrame()
st.session_state.elapsed_time = 0.0
st.session_state.topic_results = None
# --- Text Input and Clear Button ---
word_limit = 1000
text = st.text_area(
f"Type or paste your text below (max {word_limit} words), and then press Ctrl + Enter",
height=250,
key='my_text_area',
)
word_count = len(text.split())
st.markdown(f"**Word count:** {word_count}/{word_limit}")
st.button("Clear text", on_click=clear_text)
# --- Results Trigger and Processing ---
if st.button("Results"):
if not text.strip():
st.warning("Please enter some text to extract entities.")
st.session_state.show_results = False
elif word_count > word_limit:
st.warning(f"Your text exceeds the {word_limit} word limit. Please shorten it to continue.")
st.session_state.show_results = False
else:
with st.spinner("Extracting entities and generating report data...", show_time=True):
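            # Re-run the model only when the input text has actually changed;
            # otherwise the results cached in session state are reused.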
if text != st.session_state.last_text:
st.session_state.last_text = text
start_time = time.time()
# --- Model Prediction & Dataframe Creation ---
entities = model.predict_entities(text, labels)
df = pd.DataFrame(entities)
if not df.empty:
df['text'] = df['text'].apply(remove_trailing_punctuation)
df['category'] = df['label'].map(reverse_category_mapping)
st.session_state.results_df = df
unique_entity_count = len(df['text'].unique())
N_TOP_WORDS_TO_USE = min(10, unique_entity_count)
st.session_state.topic_results = perform_topic_modeling(
df,
num_topics=2,
num_top_words=N_TOP_WORDS_TO_USE
)
if comet_initialized:
experiment = Experiment(api_key=COMET_API_KEY, workspace=COMET_WORKSPACE, project_name=COMET_PROJECT_NAME)
experiment.log_parameter("input_text", text)
experiment.log_table("predicted_entities", df)
experiment.end()
else:
st.session_state.results_df = pd.DataFrame()
st.session_state.topic_results = None
end_time = time.time()
st.session_state.elapsed_time = end_time - start_time
st.info(f"Report data generated in **{st.session_state.elapsed_time:.2f} seconds**.")
st.session_state.show_results = True
# --- Display Download Link and Results ---
if st.session_state.show_results:
df = st.session_state.results_df
df_topic_data = st.session_state.topic_results
if df.empty:
st.warning("No entities were found in the provided text.")
else:
st.subheader("Analysis Results", divider="blue")
# 1. Highlighted Text
st.markdown("### 1. Analyzed Text with Highlighted Entities")
st.markdown(highlight_entities(st.session_state.last_text, df), unsafe_allow_html=True)
# 2. Detailed Entity Analysis Tabs
st.markdown("### 2. Detailed Entity Analysis")
tab_category_details, tab_treemap_viz = st.tabs(["📑 Entities Grouped by Category", "🗺️ Treemap Distribution"])
with tab_category_details:
st.markdown("#### Detailed Entities Table (Grouped by Category)")
unique_categories = list(category_mapping.keys())
tabs_category = st.tabs(unique_categories)
for category, tab in zip(unique_categories, tabs_category):
df_category = df[df['category'] == category][['text', 'label', 'score', 'start', 'end']].sort_values(by='score', ascending=False)
with tab:
st.markdown(f"##### {category} Entities ({len(df_category)} total)")
if not df_category.empty:
st.dataframe(
df_category,
use_container_width=True,
column_config={'score': st.column_config.NumberColumn(format="%.4f")}
)
else:
st.info(f"No entities of category **{category}** were found in the text.")
with st.expander("See Glossary of tags"):
                st.write('''
                - **text**: the entity extracted from your text
                - **label**: the tag assigned to the extracted entity
                - **score**: confidence score indicating how reliably the tag was assigned to the entity
                - **start**: character index where the entity starts
                - **end**: character index where the entity ends
                ''')
with tab_treemap_viz:
st.markdown("#### Treemap: Entity Distribution")
fig_treemap = px.treemap(
df,
path=[px.Constant("All Entities"), 'category', 'label', 'text'],
values='score',
color='category',
color_discrete_sequence=px.colors.qualitative.Dark24
)
fig_treemap.update_layout(margin=dict(t=10, l=10, r=10, b=10))
st.plotly_chart(fig_treemap, use_container_width=True)
# 3. Comparative Charts
st.markdown("---")
st.markdown("### 3. Comparative Charts")
col1, col2, col3 = st.columns(3)
grouped_counts = df['category'].value_counts().reset_index()
grouped_counts.columns = ['Category', 'Count']
with col1: # Pie Chart
        # Cividis palette avoids red hues
fig_pie = px.pie(grouped_counts, values='Count', names='Category',title='Distribution of Entities by Category',color_discrete_sequence=px.colors.sequential.Cividis)
fig_pie.update_layout(margin=dict(t=30, b=10, l=10, r=10), height=350)
st.plotly_chart(fig_pie, use_container_width=True)
with col2: # Bar Chart (Category Count)
fig_bar_category = px.bar(grouped_counts, x='Category', y='Count',color='Category', title='Total Entities per Category',color_discrete_sequence=px.colors.qualitative.Pastel)
fig_bar_category.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
st.plotly_chart(fig_bar_category, use_container_width=True)
with col3: # Bar Chart (Most Frequent Entities)
word_counts = df['text'].value_counts().reset_index()
word_counts.columns = ['Entity', 'Count']
repeating_entities = word_counts[word_counts['Count'] > 1].head(10)
if not repeating_entities.empty:
            # Viridis palette avoids pink/magenta hues
fig_bar_freq = px.bar(repeating_entities, x='Entity', y='Count',color='Entity', title='Top 10 Most Frequent Entities',color_discrete_sequence=px.colors.sequential.Viridis)
fig_bar_freq.update_layout(xaxis={'categoryorder': 'total descending'},margin=dict(t=30, b=10, l=10, r=10), height=350)
st.plotly_chart(fig_bar_freq, use_container_width=True)
else:
st.info("No entities repeat for frequency chart.")
st.markdown("---")
st.markdown("### 4. Entity Relationship Map")
network_fig = generate_network_graph(df, st.session_state.last_text)
st.plotly_chart(network_fig, use_container_width=True)
st.markdown("---")
st.markdown("### 5. Topic Modelling Analysis")
if df_topic_data is not None and not df_topic_data.empty:
bubble_figure = create_topic_word_bubbles(df_topic_data)
if bubble_figure:
st.plotly_chart(bubble_figure, use_container_width=True)
else:
st.error("Error generating Topic Word Bubble Chart.")
else:
st.info("Topic modeling requires more unique input (at least two unique entities).")
# --- Report Download ---
st.markdown("---")
st.markdown("### Download Full Report Artifacts")
        # 1. HTML Report Download
html_report = generate_html_report(df, st.session_state.last_text, st.session_state.elapsed_time, df_topic_data)
st.download_button(
label="Download Comprehensive HTML Report",
data=html_report,
file_name="ner_topic_report.html",
mime="text/html",
type="primary"
)
        # 2. CSV Data Download
csv_buffer = generate_entity_csv(df)
st.download_button(
label="Download Extracted Entities (CSV)",
data=csv_buffer,
file_name="extracted_entities.csv",
mime="text/csv",
type="secondary"
)