FLS

Sleeping

App Files Files Community

kambris committed on Nov 23, 2024

Commit

4ec5d16

verified ·

1 Parent(s): 6bd6b44

Update app.py

Browse files

Files changed (1) hide show

app.py +89 -81

app.py CHANGED Viewed

@@ -11,94 +11,87 @@ bert_tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv2")
 bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
 # Load AraBERT model for emotion classification
-emotion_model = AutoModelForSequenceClassification.from_pretrained("aubmindlab/bert-base-arabertv2")
 emotion_classifier = pipeline("text-classification", model=emotion_model, tokenizer=bert_tokenizer)
-# Function to generate embeddings using AraBERT
-def generate_embeddings(texts):
-    all_embeddings = []
-    for text in texts:
-        # Tokenize with truncation to handle long sequences
-        inputs = bert_tokenizer(
-            text,
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=512
-        )
-        # Generate embeddings
-        with torch.no_grad():
-            outputs = bert_model(**inputs)
-        # Get the mean of the last hidden state as the embedding
-        embedding = outputs.last_hidden_state.mean(dim=1).numpy()
-        all_embeddings.append(embedding[0])  # Remove batch dimension
-    return np.array(all_embeddings)
-# Function to perform emotion classification with proper truncation
-def classify_emotions(texts):
-    emotions = []
-    for text in texts:
-        # Process text in chunks if it's too long
-        if len(bert_tokenizer.encode(text)) > 512:
-            chunks = [text[i:i + 512] for i in range(0, len(text), 512)]
-            # Take the emotion of the first chunk (usually contains the most relevant information)
-            emotion = emotion_classifier(chunks[0])[0]['label']
         else:
-            emotion = emotion_classifier(text)[0]['label']
-        emotions.append(emotion)
-    return emotions
-# Function to process the uploaded file and summarize by country
 def process_and_summarize(uploaded_file, top_n=50):
-    # Determine the file type
-    if uploaded_file.name.endswith(".csv"):
-        df = pd.read_csv(uploaded_file)
-    elif uploaded_file.name.endswith(".xlsx"):
-        df = pd.read_excel(uploaded_file)
-    else:
-        st.error("Unsupported file format.")
-        return None, None
-    # Validate required columns
-    required_columns = ['country', 'poem']
-    missing_columns = [col for col in required_columns if col not in df.columns]
-    if missing_columns:
-        st.error(f"Missing columns: {', '.join(missing_columns)}")
-        return None, None
-    # Parse and preprocess the file
-    df['country'] = df['country'].str.strip()
-    df = df.dropna(subset=['country', 'poem'])
-    # Initialize BERTopic
-    topic_model = BERTopic(language="arabic")
     # Group by country
     summaries = []
     for country, group in df.groupby('country'):
         st.info(f"Processing poems for {country}...")
-        # Get texts for this country
         texts = group['poem'].dropna().tolist()
-        # Classify emotions
-        st.info(f"Classifying emotions for {country}...")
-        emotions = classify_emotions(texts)
-        # Generate embeddings and fit topic model
-        st.info(f"Generating embeddings and topics for {country}...")
-        embeddings = generate_embeddings(texts)
         try:
             topics, _ = topic_model.fit_transform(texts, embeddings)
-            # Aggregate topics and emotions
-            top_topics = Counter(topics).most_common(top_n)
-            top_emotions = Counter(emotions).most_common(top_n)
             summaries.append({
                 'country': country,
@@ -120,7 +113,8 @@ uploaded_file = st.file_uploader("Choose a file", type=["csv", "xlsx"])
 if uploaded_file is not None:
     try:
-        top_n = st.number_input("Select the number of top topics/emotions to display:", min_value=1, max_value=100, value=50)
         summaries, topic_model = process_and_summarize(uploaded_file, top_n=top_n)
         if summaries is not None:
@@ -130,13 +124,27 @@ if uploaded_file is not None:
             for summary in summaries:
                 st.write(f"### {summary['country']}")
                 st.write(f"Total Poems: {summary['total_poems']}")
-                st.write(f"Top {top_n} Topics:")
-                st.write(summary['top_topics'])
-                st.write(f"Top {top_n} Emotions:")
-                st.write(summary['top_emotions'])
-            # Display overall topics
             st.write("### Global Topic Information:")
-            st.write(topic_model.get_topic_info())
     except Exception as e:
-        st.error(f"Error: {e}")

 bert_model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv2")
 # Load AraBERT model for emotion classification
+emotion_model = AutoModelForSequenceClassification.from_pretrained("CAMeL-Lab/bert-base-arabic-camelbert-msa-sentiment")
 emotion_classifier = pipeline("text-classification", model=emotion_model, tokenizer=bert_tokenizer)
+# Define emotion labels mapping
+EMOTION_LABELS = {
+    'LABEL_0': 'Negative',
+    'LABEL_1': 'Positive',
+    'LABEL_2': 'Neutral'
+}
+def format_topics(topic_model, topic_counts):
+    """Convert topic numbers to readable labels."""
+    formatted_topics = []
+    for topic_num, count in topic_counts:
+        if topic_num == -1:
+            topic_label = "Miscellaneous"
         else:
+            # Get the top words for this topic
+            words = topic_model.get_topic(topic_num)
+            # Take the top 3 words to form a topic label
+            topic_label = " | ".join([word for word, _ in words[:3]])
+        formatted_topics.append({
+            'topic': topic_label,
+            'count': count
+        })
+    return formatted_topics
+def format_emotions(emotion_counts):
+    """Convert emotion labels to readable text."""
+    formatted_emotions = []
+    for label, count in emotion_counts:
+        emotion = EMOTION_LABELS.get(label, label)
+        formatted_emotions.append({
+            'emotion': emotion,
+            'count': count
+        })
+    return formatted_emotions
+# [Previous functions remain the same until process_and_summarize]
 def process_and_summarize(uploaded_file, top_n=50):
+    # [Previous code remains the same until the summaries loop]
+    # Initialize BERTopic with specific parameters
+    topic_model = BERTopic(
+        language="arabic",
+        calculate_probabilities=True,
+        verbose=True
+    )
     # Group by country
     summaries = []
     for country, group in df.groupby('country'):
         st.info(f"Processing poems for {country}...")
         texts = group['poem'].dropna().tolist()
+        batch_size = 10
+        all_emotions = []
+        all_embeddings = []
+        for i in range(0, len(texts), batch_size):
+            batch_texts = texts[i:i + batch_size]
+            st.info(f"Generating embeddings for batch {i//batch_size + 1}...")
+            batch_embeddings = generate_embeddings(batch_texts)
+            all_embeddings.extend(batch_embeddings)
+            st.info(f"Classifying emotions for batch {i//batch_size + 1}...")
+            batch_emotions = [classify_emotion(text) for text in batch_texts]
+            all_emotions.extend(batch_emotions)
         try:
+            embeddings = np.array(all_embeddings)
+            st.info(f"Fitting topic model for {country}...")
             topics, _ = topic_model.fit_transform(texts, embeddings)
+            # Format topics and emotions with readable labels
+            top_topics = format_topics(topic_model, Counter(topics).most_common(top_n))
+            top_emotions = format_emotions(Counter(all_emotions).most_common(top_n))
             summaries.append({
                 'country': country,
 if uploaded_file is not None:
     try:
+        top_n = st.number_input("Select the number of top topics/emotions to display:",
+                               min_value=1, max_value=100, value=10)
         summaries, topic_model = process_and_summarize(uploaded_file, top_n=top_n)
         if summaries is not None:
             for summary in summaries:
                 st.write(f"### {summary['country']}")
                 st.write(f"Total Poems: {summary['total_poems']}")
+                st.write(f"\nTop {top_n} Topics:")
+                for topic in summary['top_topics']:
+                    st.write(f"• {topic['topic']}: {topic['count']} poems")
+                st.write(f"\nTop {top_n} Emotions:")
+                for emotion in summary['top_emotions']:
+                    st.write(f"• {emotion['emotion']}: {emotion['count']} poems")
+                st.write("---")
+            # Display overall topics in a more readable format
             st.write("### Global Topic Information:")
+            topic_info = topic_model.get_topic_info()
+            for _, row in topic_info.iterrows():
+                if row['Topic'] == -1:
+                    topic_name = "Miscellaneous"
+                else:
+                    words = topic_model.get_topic(row['Topic'])
+                    topic_name = " | ".join([word for word, _ in words[:3]])
+                st.write(f"• Topic {row['Topic']}: {topic_name} ({row['Count']} poems)")
     except Exception as e:
+        st.error(f"Error: {str(e)}")