# -*- coding: utf-8 -*-
"""emotion-matcher.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1LTiGD09mHJRCtPkBO3f3XnYPACLCB_ro

## 1. Dataset
"""

import pandas as pd

# Define the file paths for each dataset split
splits = {
    'train': 'simplified/train-00000-of-00001.parquet',
    'validation': 'simplified/validation-00000-of-00001.parquet',
    'test': 'simplified/test-00000-of-00001.parquet',
}
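
# Note: reading hf:// paths with pandas requires the huggingface_hub package
# (preinstalled on Colab; otherwise: pip install huggingface_hub).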

# Load the training set from the Hugging Face Hub using the hf:// protocol
df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["train"])

# Preview the first few rows of the dataset
print(df.head())

# View dataset shape
print("Dataset shape:", df.shape)

# View basic column information
print("\nColumn names:", df.columns.tolist())

# View detailed info
df.info()

# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# Check for duplicated rows (convert the unhashable list column to strings first)
print("\nNumber of duplicated rows:")
print(df.astype(str).duplicated().sum())
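
# Optional: drop exact duplicate texts. A sketch, assuming keeping the first
# occurrence is acceptable (left commented out to preserve the counts above):
# df = df.drop_duplicates(subset="text", keep="first").reset_index(drop=True)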

# Check how many unique combinations of emotion labels exist
print("\nNumber of unique label combinations:")
print(df["labels"].apply(tuple).nunique())

# Compute text lengths in number of words
df["text_length"] = df["text"].apply(lambda x: len(x.split()))

# Plot histogram of text lengths
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.hist(df["text_length"], bins=50)
plt.title("Distribution of Text Lengths (in words)")
plt.xlabel("Number of words")
plt.ylabel("Number of samples")
plt.grid(True)
plt.show()
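
# Summary statistics complement the histogram (added as a quick sanity check,
# not in the original notebook):
print(df["text_length"].describe())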

# Count how many emotion labels each text has
df["num_labels"] = df["labels"].apply(len)

# Plot the distribution of label counts per sample
plt.figure(figsize=(8, 5))
df["num_labels"].value_counts().sort_index().plot(kind="bar")
plt.xlabel("Number of emotion labels")
plt.ylabel("Number of samples")
plt.title("Distribution of Emotion Labels per Sample")
plt.show()

# Count the frequency of each individual emotion label
from collections import Counter

# Flatten the list of labels across all samples
all_labels = [label for labels in df["labels"] for label in labels]
label_counts = Counter(all_labels)

# Convert to a DataFrame for plotting
emotion_freq = pd.DataFrame.from_dict(label_counts, orient='index', columns=['count'])
emotion_freq = emotion_freq.sort_values(by='count', ascending=False)

# Plot the frequency of each emotion
emotion_freq.plot(kind='bar', figsize=(15, 5), legend=False)
plt.title("Frequency of Each Emotion Label")
plt.xlabel("Emotion Label ID")
plt.ylabel("Number of Occurrences")
plt.show()

# Create a binary indicator matrix: one row per sample, one column per emotion
import numpy as np
import seaborn as sns

num_emotions = max(all_labels) + 1  # 28 emotion IDs in GoEmotions (27 emotions + neutral)
emotion_matrix = np.zeros((len(df), num_emotions), dtype=int)
for i, labels in enumerate(df["labels"]):
    for label in labels:
        emotion_matrix[i, label] = 1

# Compute the label co-occurrence matrix
co_occurrence = np.dot(emotion_matrix.T, emotion_matrix)

# Plot heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(co_occurrence, cmap="Blues", linewidths=0.5)
plt.title("Emotion Co-occurrence Heatmap")
plt.xlabel("Emotion Label ID")
plt.ylabel("Emotion Label ID")
plt.show()

# Display 5 random rows
print("Sample text examples with emotion labels:")
print(df.sample(5)[["text", "labels"]])

# Map emotion label IDs to names (order taken from the GoEmotions documentation)
id2label = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval',
    'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
    'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
    'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise',
    'neutral'
]

def decode_labels(label_ids):
    return [id2label[i] for i in label_ids]

# Display 5 random samples with readable label names
print("Sample text examples with emotion label names:")
sample_df = df.sample(5).copy()  # .copy() avoids a SettingWithCopyWarning below
sample_df["label_names"] = sample_df["labels"].apply(decode_labels)
print(sample_df[["text", "label_names"]])
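
# With id2label available, the co-occurrence heatmap above becomes easier to
# read with emotion names on the axes. A minimal sketch (not in the original):
plt.figure(figsize=(12, 10))
sns.heatmap(co_occurrence, cmap="Blues", linewidths=0.5,
            xticklabels=id2label, yticklabels=id2label)
plt.title("Emotion Co-occurrence Heatmap (named labels)")
plt.show()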

# Word cloud of the raw text
from wordcloud import WordCloud

all_text = " ".join(df["text"])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Most Frequent Words in All Text Samples")
plt.show()

# Clean the text data
import re
import string

def clean_text(text):
    text = text.lower()                                               # lowercase
    text = re.sub(r"\[.*?\]", "", text)                               # drop bracketed tokens such as [NAME]
    text = text.translate(str.maketrans('', '', string.punctuation))  # strip punctuation
    text = re.sub(r"\d+", "", text)                                   # strip digits
    text = re.sub(r"\s+", " ", text).strip()                          # collapse whitespace
    return text

df["clean_text"] = df["text"].apply(clean_text)

print("Sample cleaned texts:")
print(df[["text", "clean_text"]].sample(5))

# Plot the label distribution with readable emotion names
label_counts = Counter([label for sublist in df["labels"] for label in sublist])
label_df = pd.DataFrame.from_dict(label_counts, orient="index", columns=["count"])
label_df.index.name = "label_id"
label_df = label_df.sort_index()
label_df["label_name"] = label_df.index.map(lambda i: id2label[i])

plt.figure(figsize=(14, 6))
sns.barplot(x="label_name", y="count", data=label_df)
plt.xticks(rotation=45, ha="right")
plt.title("Distribution of Emotion Labels in Training Set")
plt.xlabel("Emotion")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

# Sentence embeddings
from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Embed a 3,000-row sample to keep encoding time manageable
sample_df = df.sample(n=3000, random_state=42).reset_index(drop=True)
embeddings = model.encode(sample_df["clean_text"].tolist(), show_progress_bar=True, device=device)
sample_df["embedding"] = embeddings.tolist()

# t-SNE visualization
from sklearn.manifold import TSNE

X = np.array(sample_df["embedding"].tolist())
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_embedded = tsne.fit_transform(X)
sample_df["x"] = X_embedded[:, 0]
sample_df["y"] = X_embedded[:, 1]

plt.figure(figsize=(10, 6))
plt.scatter(sample_df["x"], sample_df["y"], alpha=0.5)
plt.title("t-SNE Projection of Text Embeddings")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.show()
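
# The unlabeled scatter above hides emotion structure. As a rough sketch (not
# in the original notebook), color each point by its first emotion label;
# multi-label texts are reduced to their first label, which is a simplification:
first_labels = sample_df["labels"].apply(lambda l: l[0] if len(l) > 0 else -1)
plt.figure(figsize=(10, 6))
plt.scatter(sample_df["x"], sample_df["y"], c=first_labels, cmap="tab20", alpha=0.5)
plt.title("t-SNE Projection Colored by First Emotion Label")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.colorbar(label="Emotion Label ID")
plt.show()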

# K-Means clustering on the embeddings (the 2-D t-SNE map is only used for display)
from sklearn.cluster import KMeans

num_clusters = 8
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)  # n_init pinned for reproducibility across sklearn versions
sample_df["cluster"] = kmeans.fit_predict(X)

plt.figure(figsize=(10, 6))
scatter = plt.scatter(sample_df["x"], sample_df["y"], c=sample_df["cluster"], cmap='tab10', alpha=0.6)
plt.title(f"K-Means Clustering (k={num_clusters}) on t-SNE Projection")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.colorbar(scatter, label="Cluster")
plt.show()
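
# k=8 is an arbitrary choice; the silhouette score is one quick way to compare
# cluster counts (a sketch scanning a few values of k, not from the original):
from sklearn.metrics import silhouette_score
for k in range(4, 13, 2):
    cluster_labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X)
    print(f"k={k}: silhouette score = {silhouette_score(X, cluster_labels):.3f}")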

# Recommendation function
from sentence_transformers import util

EMBEDDINGS = torch.tensor(sample_df['embedding'].tolist(), device=device)

def recommend_similar_emotions(user_input):
    if not user_input.strip():
        return "Please enter some text."
    # Embed the query and rank the sampled texts by cosine similarity
    user_embedding = model.encode(user_input, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(user_embedding, EMBEDDINGS)[0]
    top_indices = similarities.argsort(descending=True)[:5]
    results = []
    for idx in top_indices:
        row = sample_df.iloc[idx.item()]
        results.append(f"{row['text']}\nEmotions: {decode_labels(row['labels'])}")  # show readable names
    return "\n\n".join(results)

# Gradio app
import gradio as gr

demo = gr.Interface(
    fn=recommend_similar_emotions,
    inputs=gr.Textbox(lines=2, placeholder="Type your situation or feeling..."),
    outputs="text",
    title="Emotion Matcher",
    description="Describe how you feel, and get similar examples with emotion labels.",
)

demo.launch()
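
# In Colab, demo.launch(share=True) creates a temporary public link if the
# inline interface is not sufficient.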