Update app.py
app.py CHANGED
@@ -7,6 +7,8 @@ import torch
 import numpy as np
 from collections import Counter
 import os
+from wordcloud import WordCloud
+import matplotlib.pyplot as plt
 # Add Arabic stop words
 ARABIC_STOP_WORDS = {
     'في', 'من', 'إلى', 'على', 'عن', 'مع', 'خلال', 'حتى', 'إذا', 'ثم',
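Note: wordcloud and matplotlib are new runtime dependencies. Assuming this Space installs packages from a requirements.txt (that file is not part of this diff), both would need to be declared there, e.g.:

wordcloud
matplotlib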
@@ -69,6 +71,21 @@ def split_text(text, max_length=512):
 
     return chunks
 
+def create_arabic_wordcloud(text, title):
+    wordcloud = WordCloud(
+        width=1200,
+        height=600,
+        background_color='white',
+        font_path='arial',  # must resolve to an installed font file that includes Arabic glyphs
+        max_words=200
+    ).generate(text)
+
+    fig, ax = plt.subplots(figsize=(15, 8))
+    ax.imshow(wordcloud, interpolation='bilinear')
+    ax.axis('off')
+    ax.set_title(title, fontsize=16, pad=20)
+    return fig
+
 def clean_arabic_text(text):
     """Clean Arabic text by removing stop words and normalizing."""
     words = text.split()
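Note: matplotlib and WordCloud typically draw raw Arabic strings with disconnected letters in left-to-right order. A minimal pre-shaping sketch, assuming the optional arabic-reshaper and python-bidi packages are installed (neither appears in this diff):

import arabic_reshaper
from bidi.algorithm import get_display

def shape_arabic(text):
    # Join letters into their contextual forms, then apply the bidi
    # algorithm so the string displays right-to-left.
    return get_display(arabic_reshaper.reshape(text))

Feeding shape_arabic(text) into WordCloud(...).generate(...) instead of the raw string would avoid the broken rendering.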
@@ -202,18 +219,22 @@ def format_emotions(emotion_counts):
         })
     return formatted_emotions
 
-def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=
+def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, top_n=50, topic_strategy="Auto", n_topics=None, min_topic_size=1):
     """Process the data and generate summaries with flexible topic configuration."""
     summaries = []
 
     topic_model_params = {
         "language": "arabic",
         "calculate_probabilities": True,
-        "min_topic_size":
+        "min_topic_size": 1,
         "n_gram_range": (1, 2),
         "top_n_words": 15,
-        "verbose": True
+        "verbose": True,
+        "seed_topic_list": None
     }
+    st.write(f"Total documents: {len(df)}")
+    st.write(f"Topic strategy: {topic_strategy}")
+    st.write(f"Min topic size: {min_topic_size}")
 
     if topic_strategy == "Manual":
         topic_model_params["nr_topics"] = n_topics
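Note: the site where topic_model_params is consumed falls outside this diff. Every key, including the new seed_topic_list, matches a keyword argument of BERTopic's constructor, so presumably the dict is unpacked along these lines:

from bertopic import BERTopic

# Sketch only; the actual construction call in app.py is not shown in this diff.
topic_model = BERTopic(**topic_model_params)
# Caveat: HDBSCAN, BERTopic's default clusterer, expects a minimum cluster
# size of at least 2, so "min_topic_size": 1 may fail at fit time.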
@@ -250,7 +271,8 @@ def process_and_summarize(df, bert_tokenizer, bert_model, emotion_classifier, to
 
     try:
         topics, probs = topic_model.fit_transform(texts, embeddings)
-
+        st.write(f"Number of unique topics: {len(set(topics))}")
+        st.write(f"Topic distribution: {Counter(topics)}")
         topic_counts = Counter(topics)
         if -1 in topic_counts:
             del topic_counts[-1]
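Note: the two new st.write calls render into the app for every user on every rerun. If they are meant as diagnostics, a collapsible container keeps the main view clean; a sketch using Streamlit's standard st.expander (labels mirror the diff above):

with st.expander("Topic modeling diagnostics"):
    st.write(f"Number of unique topics: {len(set(topics))}")
    st.write(f"Topic distribution: {Counter(topics)}")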
@@ -385,6 +407,12 @@ if uploaded_file is not None:
                 st.subheader("Emotions")
                 for emotion in summary['top_emotions']:
                     st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
+
+                st.subheader("Word Cloud Visualization")
+                country_poems = df[df['country'] == summary['country']]['poem']
+                combined_text = ' '.join(country_poems)
+                wordcloud_fig = create_arabic_wordcloud(combined_text, f"Most Common Words in {summary['country']} Poems")
+                st.pyplot(wordcloud_fig)
 
         with tab2:
             st.subheader("Global Topic Distribution")