from urlextract import URLExtract
from wordcloud import WordCloud
import pandas as pd
from collections import Counter
import emoji
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

extract = URLExtract()
					
						
def fetch_stats(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    num_messages = df.shape[0]
    words = sum(len(msg.split()) for msg in df['message'])
    num_media_messages = df[df['unfiltered_messages'] == '<media omitted>\n'].shape[0]
    links = sum(len(extract.find_urls(msg)) for msg in df['unfiltered_messages'])
    return num_messages, words, num_media_messages, links
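
# Usage sketch (illustrative; assumes a preprocessed DataFrame with 'user',
# 'message' and 'unfiltered_messages' columns, as produced elsewhere in this
# project):
#
#     num_messages, words, num_media, num_links = fetch_stats('Overall', df)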
					
						
def most_busy_users(df):
    x = df['user'].value_counts().head()
    # NOTE: assumes pandas < 2.0, where value_counts().reset_index() yields an
    # 'index' column (user names) and a 'user' column (counts); the original
    # rename mapped the names column to 'percentage' by mistake.
    df = round((df['user'].value_counts() / df.shape[0]) * 100, 2).reset_index().rename(
        columns={'index': 'Name', 'user': 'percentage'})
    return x, df
					
						
def create_wordcloud(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]

    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')
    df_wc = wc.generate(temp['message'].str.cat(sep=" "))
    return df_wc
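
# Rendering sketch (the wordcloud docs display the WordCloud object directly
# with plt.imshow):
#
#     df_wc = create_wordcloud('Overall', df)
#     plt.imshow(df_wc, interpolation='bilinear')
#     plt.axis('off')
#     plt.show()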
					
						
					
						
def monthly_timeline(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    timeline = df.groupby(['year', 'month']).count()['message'].reset_index()
    timeline['time'] = timeline['month'] + "-" + timeline['year'].astype(str)
    return timeline


def daily_timeline(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    return df.groupby('date').count()['message'].reset_index()
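
# Plotting sketch (column names follow the functions above; plotly is used
# since it is already imported):
#
#     timeline = monthly_timeline('Overall', df)
#     px.line(timeline, x='time', y='message', title='Monthly Timeline').show()
#
#     daily = daily_timeline('Overall', df)
#     px.line(daily, x='date', y='message', title='Daily Timeline').show()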
					
						
def week_activity_map(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    return df['day_of_week'].value_counts()


def month_activity_map(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    return df['month'].value_counts()
					
						
def plot_topic_distribution_plotly(df):
    # Suffixed '_plotly' so it is not shadowed by the matplotlib
    # plot_topic_distribution defined below.
    topic_counts = df['topic'].value_counts().sort_index()
    # 'viridis' is a colorscale name, not a CSS color, so it cannot go in
    # color_discrete_sequence; map the bar heights onto the scale instead.
    fig = px.bar(x=topic_counts.index, y=topic_counts.values,
                 color=topic_counts.values, color_continuous_scale='viridis',
                 title="Topic Distribution")
    return fig


def plot_clusters_plotly(reduced_features, clusters):
    # Suffixed '_plotly' so it is not shadowed by the matplotlib plot_clusters
    # defined below.
    fig = px.scatter(x=reduced_features[:, 0], y=reduced_features[:, 1],
                     color=clusters, title="Message Clusters (t-SNE)")
    return fig
					
						
def most_common_words(selected_user, df):
    # The original `stop_words = df` compared words against the DataFrame
    # itself (effectively its column names); a stop-word file is assumed
    # instead (path hypothetical, one word per line).
    with open('stop_hinglish.txt', 'r') as f:
        stop_words = f.read().split()

    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    temp = df[df['user'] != 'group_notification']
    temp = temp[~temp['message'].str.lower().str.contains('<media omitted>')]

    words = []
    for message in temp['message']:
        for word in message.lower().split():
            if word not in stop_words:
                words.append(word)

    most_common_df = pd.DataFrame(Counter(words).most_common(20))
    return most_common_df
					
						
def emoji_helper(selected_user, df):
    if selected_user != 'Overall':
        df = df[df['user'] == selected_user]

    emojis = []
    for message in df['unfiltered_messages']:
        emojis.extend([c for c in message if c in emoji.EMOJI_DATA])

    # most_common() with no argument returns all entries, most frequent first.
    emoji_df = pd.DataFrame(Counter(emojis).most_common())
    return emoji_df
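
# Example (illustrative): the result has integer columns 0 (emoji character)
# and 1 (count):
#
#     emoji_df = emoji_helper('Overall', df)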
					
						
def plot_topic_distribution(df):
    """
    Plots the distribution of topics in the chat data.
    """
    topic_counts = df['topic'].value_counts().sort_index()
    fig, ax = plt.subplots()
    sns.barplot(x=topic_counts.index, y=topic_counts.values, ax=ax, palette="viridis")
    ax.set_title("Topic Distribution")
    ax.set_xlabel("Topic")
    ax.set_ylabel("Number of Messages")
    return fig
					
						
def most_frequent_keywords(messages, top_n=10):
    """
    Extracts the most frequent keywords from a list of messages.
    """
    words = [word for msg in messages for word in msg.split()]
    word_freq = Counter(words)
    return word_freq.most_common(top_n)
					
						
def plot_topic_distribution_over_time(topic_distribution):
    """
    Plots the distribution of topics over time using a line chart.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    for topic in topic_distribution.columns:
        ax.plot(topic_distribution.index.to_timestamp(), topic_distribution[topic], label=f"Topic {topic}")

    ax.set_title("Topic Distribution Over Time")
    ax.set_xlabel("Time Period")
    ax.set_ylabel("Number of Messages")
    ax.legend(title="Topics", bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)
    plt.tight_layout()
    return fig
					
						
def plot_most_frequent_keywords(keywords):
    """
    Plots the most frequent keywords.
    """
    words, counts = zip(*keywords)
    fig, ax = plt.subplots()
    sns.barplot(x=list(counts), y=list(words), ax=ax, palette="viridis")
    ax.set_title("Most Frequent Keywords")
    ax.set_xlabel("Frequency")
    ax.set_ylabel("Keyword")
    return fig
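
# Chaining sketch: feed most_frequent_keywords into the plot above:
#
#     keywords = most_frequent_keywords(df['message'].tolist(), top_n=10)
#     fig = plot_most_frequent_keywords(keywords)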
					
						
def topic_distribution_over_time(df, time_freq='M'):
    """
    Analyzes the distribution of topics over time.
    """
    df['time_period'] = df['date'].dt.to_period(time_freq)
    topic_distribution = df.groupby(['time_period', 'topic']).size().unstack(fill_value=0)
    return topic_distribution
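
# Example (assumes a datetime64 'date' column and a 'topic' label column):
#
#     dist = topic_distribution_over_time(df, time_freq='M')
#     fig = plot_topic_distribution_over_time(dist)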
					
						
					
						
def plot_topic_distribution_over_time_plotly(topic_distribution):
    """
    Plots the distribution of topics over time using Plotly.
    """
    topic_distribution = topic_distribution.reset_index()
    topic_distribution['time_period'] = topic_distribution['time_period'].dt.to_timestamp()
    topic_distribution = topic_distribution.melt(id_vars='time_period', var_name='topic', value_name='count')

    fig = px.line(topic_distribution, x='time_period', y='count', color='topic',
                  title="Topic Distribution Over Time",
                  labels={'time_period': 'Time Period', 'count': 'Number of Messages'})
    fig.update_layout(legend_title_text='Topics', xaxis_tickangle=-45)
    return fig
					
						
def plot_clusters(reduced_features, clusters):
    """
    Visualize clusters using t-SNE.

    Args:
        reduced_features (np.array): 2D array of reduced features.
        clusters (np.array): Cluster labels.

    Returns:
        fig (plt.Figure): Matplotlib figure object.
    """
    plt.figure(figsize=(10, 8))
    sns.scatterplot(
        x=reduced_features[:, 0],
        y=reduced_features[:, 1],
        hue=clusters,
        palette="viridis",
        legend="full"
    )
    plt.title("Message Clusters (t-SNE Visualization)")
    plt.xlabel("t-SNE Component 1")
    plt.ylabel("t-SNE Component 2")
    plt.tight_layout()
    return plt.gcf()
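
# Minimal end-to-end sketch for producing the inputs above (standard
# scikit-learn calls; hyperparameters and the 'message' column are this
# sketch's assumptions, not fixed by the project):
#
#     from sklearn.feature_extraction.text import TfidfVectorizer
#     from sklearn.cluster import KMeans
#     from sklearn.manifold import TSNE
#
#     X = TfidfVectorizer(max_features=5000, stop_words='english').fit_transform(df['message'])
#     clusters = KMeans(n_clusters=5, random_state=42, n_init=10).fit_predict(X)
#     reduced = TSNE(n_components=2, random_state=42).fit_transform(X.toarray())
#     fig = plot_clusters(reduced, clusters)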
					
						
def get_cluster_labels(df, n_clusters):
    """
    Generate descriptive labels for each cluster based on top keywords.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    import numpy as np

    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(df['lemmatized_message'])

    cluster_labels = {}
    for cluster_id in range(n_clusters):
        # Row labels slice the TF-IDF matrix positionally, so df is assumed to
        # carry a default RangeIndex aligned with the vectorizer's rows.
        cluster_indices = df[df['cluster'] == cluster_id].index
        if len(cluster_indices) > 0:
            cluster_tfidf = tfidf_matrix[cluster_indices]
            # Top three keywords by summed TF-IDF weight, highest first.
            top_keywords = np.argsort(cluster_tfidf.sum(axis=0).A1)[-3:][::-1]
            cluster_labels[cluster_id] = ", ".join(vectorizer.get_feature_names_out()[top_keywords])
        else:
            cluster_labels[cluster_id] = "No dominant theme"
    return cluster_labels
					
						
def get_temporal_trends(df):
    """
    Analyze temporal trends for each cluster (peak day and time).
    """
    temporal_trends = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            peak_day = cluster_data['day_of_week'].mode()[0]
            peak_time = cluster_data['hour'].mode()[0]
            temporal_trends[cluster_id] = {"peak_day": peak_day, "peak_time": f"{peak_time}:00"}
    return temporal_trends
					
						
def get_user_contributions(df):
    """
    Identify top contributors for each cluster.
    """
    user_contributions = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            top_users = cluster_data['user'].value_counts().head(3).index.tolist()
            user_contributions[cluster_id] = top_users
    return user_contributions
					
						
def get_sentiment_by_cluster(df):
    """
    Analyze sentiment distribution for each cluster.
    """
    sentiment_by_cluster = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            sentiment_counts = cluster_data['sentiment'].value_counts(normalize=True) * 100
            sentiment_by_cluster[cluster_id] = {
                "positive": round(sentiment_counts.get('positive', 0)),
                "neutral": round(sentiment_counts.get('neutral', 0)),
                "negative": round(sentiment_counts.get('negative', 0))
            }
    return sentiment_by_cluster
					
						
def detect_anomalies(df):
    """
    Detect anomalies in each cluster (e.g., high link or media share).
    """
    anomalies = {}
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            link_share = (cluster_data['message'].str.contains('http').mean()) * 100
            media_share = (cluster_data['message'].str.contains('<media omitted>').mean()) * 100
            if link_share > 50:
                anomalies[cluster_id] = f"{round(link_share)}% of messages contain links."
            elif media_share > 50:
                anomalies[cluster_id] = f"{round(media_share)}% of messages are media files."
    return anomalies
					
						
def generate_recommendations(df):
    """
    Generate actionable recommendations based on cluster insights.
    """
    recommendations = []
    for cluster_id in df['cluster'].unique():
        cluster_data = df[df['cluster'] == cluster_id]
        if not cluster_data.empty:
            sentiment_counts = cluster_data['sentiment'].value_counts(normalize=True) * 100
            if sentiment_counts.get('negative', 0) > 50:
                recommendations.append(f"Address negative sentiment in Cluster {cluster_id} by revisiting feedback processes.")
            if cluster_data['message'].str.contains('http').mean() > 0.5:
                recommendations.append(f"Pin resources from Cluster {cluster_id} (most-shared links) for easy access.")
    return recommendations
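

# Illustrative smoke test with synthetic data. Real DataFrames come from this
# project's preprocessing and clustering steps, so the column values below are
# assumptions for demonstration only.
if __name__ == "__main__":
    demo = pd.DataFrame({
        'user': ['alice', 'bob', 'alice', 'bob'],
        'message': ['hello world', 'see http://example.com', 'lunch?', 'ok'],
        'unfiltered_messages': ['hello world', 'see http://example.com', 'lunch? 😀', 'ok 😀'],
        'cluster': [0, 0, 1, 1],
        'sentiment': ['positive', 'neutral', 'negative', 'negative'],
        'day_of_week': ['Monday', 'Monday', 'Friday', 'Friday'],
        'hour': [9, 9, 13, 13],
    })
    print(fetch_stats('Overall', demo))
    print(get_temporal_trends(demo))
    print(get_user_contributions(demo))
    print(get_sentiment_by_cluster(demo))
    print(detect_anomalies(demo))
    print(generate_recommendations(demo))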