# -*- coding: utf-8 -*-
"""emotion-matcher.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1LTiGD09mHJRCtPkBO3f3XnYPACLCB_ro

## 1. Dataset
"""

import pandas as pd

# Define the file paths for each dataset split
splits = {
    'train': 'simplified/train-00000-of-00001.parquet',
    'validation': 'simplified/validation-00000-of-00001.parquet',
    'test': 'simplified/test-00000-of-00001.parquet',
}
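
# Note: reading hf:// paths with pandas requires the huggingface_hub package
# (preinstalled on Colab; otherwise: pip install huggingface_hub).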

# Load the training set from the Hugging Face Hub using the hf:// protocol
df = pd.read_parquet("hf://datasets/google-research-datasets/go_emotions/" + splits["train"])

# Preview the first few rows of the dataset
print(df.head())

# View dataset shape
print("Dataset shape:", df.shape)

# View basic column information
print("\nColumn names:", df.columns.tolist())

# View detailed info
df.info()

# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# Check for duplicated rows (convert the unhashable list column to strings first)
print("\nNumber of duplicated rows:")
print(df.astype(str).duplicated().sum())
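
# Optional: drop exact duplicate texts. A sketch, assuming keeping the first
# occurrence is acceptable (left commented out to preserve the counts above):
# df = df.drop_duplicates(subset="text", keep="first").reset_index(drop=True)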

# Check how many unique combinations of emotion labels exist
print("\nNumber of unique label combinations:")
print(df["labels"].apply(tuple).nunique())

# Compute text lengths in number of words
df["text_length"] = df["text"].apply(lambda x: len(x.split()))

# Plot histogram of text lengths
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.hist(df["text_length"], bins=50)
plt.title("Distribution of Text Lengths (in words)")
plt.xlabel("Number of words")
plt.ylabel("Number of samples")
plt.grid(True)
plt.show()
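
# Summary statistics complement the histogram (added as a quick sanity check,
# not in the original notebook):
print(df["text_length"].describe())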

# Count how many emotion labels each text has
df["num_labels"] = df["labels"].apply(len)

# Plot the distribution of label counts per sample
plt.figure(figsize=(8, 5))
df["num_labels"].value_counts().sort_index().plot(kind="bar")
plt.xlabel("Number of emotion labels")
plt.ylabel("Number of samples")
plt.title("Distribution of Emotion Labels per Sample")
plt.show()

# Count the frequency of each individual emotion label
from collections import Counter

# Flatten the list of labels across all samples
all_labels = [label for labels in df["labels"] for label in labels]
label_counts = Counter(all_labels)

# Convert to a DataFrame for plotting
emotion_freq = pd.DataFrame.from_dict(label_counts, orient='index', columns=['count'])
emotion_freq = emotion_freq.sort_values(by='count', ascending=False)

# Plot the frequency of each emotion
emotion_freq.plot(kind='bar', figsize=(15, 5), legend=False)
plt.title("Frequency of Each Emotion Label")
plt.xlabel("Emotion Label ID")
plt.ylabel("Number of Occurrences")
plt.show()

# Create a binary indicator matrix: one row per sample, one column per emotion
import numpy as np
import seaborn as sns

num_emotions = max(all_labels) + 1  # 28 emotion IDs in GoEmotions (27 emotions + neutral)
emotion_matrix = np.zeros((len(df), num_emotions), dtype=int)
for i, labels in enumerate(df["labels"]):
    for label in labels:
        emotion_matrix[i, label] = 1

# Compute the label co-occurrence matrix
co_occurrence = np.dot(emotion_matrix.T, emotion_matrix)

# Plot heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(co_occurrence, cmap="Blues", linewidths=0.5)
plt.title("Emotion Co-occurrence Heatmap")
plt.xlabel("Emotion Label ID")
plt.ylabel("Emotion Label ID")
plt.show()

# Display 5 random rows
print("Sample text examples with emotion labels:")
print(df.sample(5)[["text", "labels"]])

# Map emotion label IDs to names (order taken from the GoEmotions documentation)
id2label = [
    'admiration', 'amusement', 'anger', 'annoyance', 'approval',
    'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
    'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
    'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
    'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise',
    'neutral'
]

def decode_labels(label_ids):
    return [id2label[i] for i in label_ids]

# Display 5 random samples with readable label names
print("Sample text examples with emotion label names:")
sample_df = df.sample(5).copy()  # .copy() avoids a SettingWithCopyWarning below
sample_df["label_names"] = sample_df["labels"].apply(decode_labels)
print(sample_df[["text", "label_names"]])
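
# With id2label available, the co-occurrence heatmap above becomes easier to
# read with emotion names on the axes. A minimal sketch (not in the original):
plt.figure(figsize=(12, 10))
sns.heatmap(co_occurrence, cmap="Blues", linewidths=0.5,
            xticklabels=id2label, yticklabels=id2label)
plt.title("Emotion Co-occurrence Heatmap (named labels)")
plt.show()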

# Word cloud of the raw text
from wordcloud import WordCloud

all_text = " ".join(df["text"])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Most Frequent Words in All Text Samples")
plt.show()

# Clean the text data
import re
import string

def clean_text(text):
    text = text.lower()                                               # lowercase
    text = re.sub(r"\[.*?\]", "", text)                               # drop bracketed tokens such as [NAME]
    text = text.translate(str.maketrans('', '', string.punctuation))  # strip punctuation
    text = re.sub(r"\d+", "", text)                                   # strip digits
    text = re.sub(r"\s+", " ", text).strip()                          # collapse whitespace
    return text

df["clean_text"] = df["text"].apply(clean_text)

print("Sample cleaned texts:")
print(df[["text", "clean_text"]].sample(5))

# Plot the label distribution with readable emotion names
label_counts = Counter([label for sublist in df["labels"] for label in sublist])
label_df = pd.DataFrame.from_dict(label_counts, orient="index", columns=["count"])
label_df.index.name = "label_id"
label_df = label_df.sort_index()
label_df["label_name"] = label_df.index.map(lambda i: id2label[i])

plt.figure(figsize=(14, 6))
sns.barplot(x="label_name", y="count", data=label_df)
plt.xticks(rotation=45, ha="right")
plt.title("Distribution of Emotion Labels in Training Set")
plt.xlabel("Emotion")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()

# Sentence embeddings
from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Embed a 3,000-row sample to keep encoding time manageable
sample_df = df.sample(n=3000, random_state=42).reset_index(drop=True)
embeddings = model.encode(sample_df["clean_text"].tolist(), show_progress_bar=True, device=device)
sample_df["embedding"] = embeddings.tolist()

# t-SNE visualization
from sklearn.manifold import TSNE

X = np.array(sample_df["embedding"].tolist())
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_embedded = tsne.fit_transform(X)
sample_df["x"] = X_embedded[:, 0]
sample_df["y"] = X_embedded[:, 1]

plt.figure(figsize=(10, 6))
plt.scatter(sample_df["x"], sample_df["y"], alpha=0.5)
plt.title("t-SNE Projection of Text Embeddings")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.show()
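
# The unlabeled scatter above hides emotion structure. As a rough sketch (not
# in the original notebook), color each point by its first emotion label;
# multi-label texts are reduced to their first label, which is a simplification:
first_labels = sample_df["labels"].apply(lambda l: l[0] if len(l) > 0 else -1)
plt.figure(figsize=(10, 6))
plt.scatter(sample_df["x"], sample_df["y"], c=first_labels, cmap="tab20", alpha=0.5)
plt.title("t-SNE Projection Colored by First Emotion Label")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.colorbar(label="Emotion Label ID")
plt.show()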

# K-Means clustering on the embeddings (the 2-D t-SNE map is only used for display)
from sklearn.cluster import KMeans

num_clusters = 8
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)  # n_init pinned for reproducibility across sklearn versions
sample_df["cluster"] = kmeans.fit_predict(X)

plt.figure(figsize=(10, 6))
scatter = plt.scatter(sample_df["x"], sample_df["y"], c=sample_df["cluster"], cmap='tab10', alpha=0.6)
plt.title(f"K-Means Clustering (k={num_clusters}) on t-SNE Projection")
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.colorbar(scatter, label="Cluster")
plt.show()
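
# k=8 is an arbitrary choice; the silhouette score is one quick way to compare
# cluster counts (a sketch scanning a few values of k, not from the original):
from sklearn.metrics import silhouette_score
for k in range(4, 13, 2):
    cluster_labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X)
    print(f"k={k}: silhouette score = {silhouette_score(X, cluster_labels):.3f}")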

# Recommendation function
from sentence_transformers import util

EMBEDDINGS = torch.tensor(sample_df['embedding'].tolist(), device=device)

def recommend_similar_emotions(user_input):
    if not user_input.strip():
        return "Please enter some text."
    # Embed the query and rank the sampled texts by cosine similarity
    user_embedding = model.encode(user_input, convert_to_tensor=True, device=device)
    similarities = util.cos_sim(user_embedding, EMBEDDINGS)[0]
    top_indices = similarities.argsort(descending=True)[:5]
    results = []
    for idx in top_indices:
        row = sample_df.iloc[idx.item()]
        results.append(f"{row['text']}\nEmotions: {decode_labels(row['labels'])}")  # show readable names
    return "\n\n".join(results)

# Gradio app
import gradio as gr

demo = gr.Interface(
    fn=recommend_similar_emotions,
    inputs=gr.Textbox(lines=2, placeholder="Type your situation or feeling..."),
    outputs="text",
    title="Emotion Matcher",
    description="Describe how you feel, and get similar examples with emotion labels.",
)

demo.launch()
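
# In Colab, demo.launch(share=True) creates a temporary public link if the
# inline interface is not sufficient.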