Spaces:

Kushagra13
/

InsightLoop

Sleeping

App Files Files Community

InsightLoop / src /pages /persona.py

Kushagra13

Update src/pages/persona.py

d85164d verified 4 months ago

raw

history blame contribute delete

18.7 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import os
	import re
	from groq import Groq
	import plotly.graph_objs as go
	from collections import defaultdict
	from itertools import cycle
	import json
	from dotenv import load_dotenv
	# Load the .env file FIRST so that every environment lookup below can see the
	# values it defines. (By default load_dotenv does not override variables that
	# are already set in the real environment.)
	load_dotenv()

	# Location of the JSON file the generated personas are written to.
	PERSONA_PATH = os.getenv("PERSONA_PATH", "/tmp/personas.json")

	# Set HuggingFace cache directories to /tmp for cloud hosting (permission safe).
	# setdefault keeps any value that is already present in the environment.
	os.environ.setdefault("TRANSFORMERS_CACHE", "/tmp/hf_cache")
	os.environ.setdefault("HF_HOME", "/tmp/huggingface")
	os.environ.setdefault("HF_DATASETS_CACHE", "/tmp/huggingface")

	# --- THEME COLORS ---
	neon_blue = "#00fff7"
	neon_green = "#7CFC00"
	neon_pink = "#F72585"
	neon_yellow = "#FFF600"
	neon_bg = "#181830"
	neon_orange = "#FFB347"
	neon_dark = "#202037"

	GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

	# --- CONFIG ---
	GROQ_MODEL = "llama3-70b-8192"
	groq_client = Groq(api_key=GROQ_API_KEY)
	# System prompt shared by every chat completion request in this page.
	PRODUCT_CONTEXT = (
	    "You are an AI market research expert analyzing customer reviews for a chocolate-flavoured whey protein powder. "
	    "Generate user personas based on patterns and diversity in the reviews."
	)
	# Review data set read by load_reviews().
	CSV_PATH = "src/data_with_text.csv"

	# Page configuration (wide layout, sidebar collapsed) followed by the page title.
	# No other st.* call appears before set_page_config in the visible script.
	st.set_page_config(page_title="Persona Lab", layout="wide", initial_sidebar_state="collapsed")
	st.markdown(
	"<h1 style='color:#00fff7;font-size:2.6rem;font-weight:900;letter-spacing:0.01em;margin-bottom:5px;'>🎭 Persona Lab</h1>",
	unsafe_allow_html=True
	)

	# Set dark theme programmatically
	# (injects a <style> block that forces background and text colours on body/.main/.stApp)
	st.markdown(
	"""
	<style>
	body, .main, .stApp {
	background: #14151A !important;
	color: #fff !important;
	}
	</style>
	""",
	unsafe_allow_html=True
	)


	# Introductory tagline shown under the title.
	# NOTE(review): the f-string below has no placeholders, so the f prefix has no effect.
	st.markdown(
	f"""
	<div style="font-size:1.21rem; color:#AC7CFF; font-weight:600; margin-top:-13px; margin-bottom:14px; line-height:1.5;">
	Ready to peek inside the minds of your customers?
	This is your sandbox for uncovering who buys, why they rave, and what they crave—powered by real reviews and sharp AI.
	Dive in, explore the personas that drive your market, and see your brand through their eyes (and taste buds)!
	</div>
	""",
	unsafe_allow_html=True
	)

	# --- NAVIGATION BUTTONS ---
	# CSS classes for the two link-style buttons rendered just below.
	st.markdown("""
	<style>
	.neon-btn {
	display:inline-block;
	font-weight:bold;
	padding:14px 32px;
	border:none;
	border-radius:12px;
	font-size:1.1em;
	margin-right:18px;
	cursor:pointer;
	box-shadow:0 0 14px #00fff777;
	color:#222 !important;
	background:linear-gradient(90deg,#7CFC00,#00fff7);
	text-decoration:none !important;
	transition: transform 0.08s;
	}
	.neon-btn-pink {
	background:linear-gradient(90deg,#F72585,#00fff7);
	color:#fff !important;
	box-shadow:0 0 14px #F7258577;
	}
	.neon-btn:hover {
	transform:scale(1.04);
	box-shadow:0 0 24px #00fff799;
	}
	.neon-btn-pink:hover {
	box-shadow:0 0 24px #F7258599;
	}
	</style>
	""", unsafe_allow_html=True)

	# Plain anchor tags styled as buttons; they link to the /prt111 and /newprod paths
	# and open in the same tab (target="_self").
	# NOTE(review): there is no space between the class and target attributes in the
	# markup below — browsers usually tolerate this, but confirm it is intentional.
	st.markdown("""
	<div style="display:flex;gap:2em;justify-content:flex-start;">
	<a href="/prt111" class="neon-btn"target="_self">🏠 Home</a>
	<a href="/newprod" class="neon-btn neon-btn-pink"target="_self">🚀 New Product Launch</a>
	</div>
	<br>
	""", unsafe_allow_html=True)


	def block_markdown(text, color):
	    """Wrap ``text`` in a gradient-styled HTML ``<div>``.

	    Args:
	        text: Plain text to display. ``&``, ``<`` and ``>`` are HTML-escaped and
	            newlines are turned into ``<br>`` tags.
	        color: CSS colour for the gradient and the shadow. Two-digit alpha
	            suffixes (``22`` / ``19``) are appended to it, so a 6-digit hex
	            value such as ``"#00fff7"`` is expected.

	    Returns:
	        An HTML string meant for ``st.markdown(..., unsafe_allow_html=True)``.
	    """
	    import html

	    # Callers pass LLM output here; escape it so stray markup in the model's
	    # reply cannot break the layout or inject HTML into the page.
	    safe_text = html.escape(text, quote=False).replace('\n', '<br>')
	    return (
	        f'<div style="background:linear-gradient(90deg,{color}22,#181830 90%);'
	        f'padding:16px 22px;border-radius:16px;margin:10px 0 24px 0;'
	        f'font-weight:600;color:#fff;font-size:1.04em;line-height:1.6;box-shadow:0 2px 24px {color}19;">'
	        f'{safe_text}</div>'
	    )

	@st.cache_data(show_spinner=True)
	def load_reviews(csv_path):
	    """Load the review CSV and make sure the derived columns exist.

	    The result is cached by ``st.cache_data``.

	    Args:
	        csv_path: Path of the CSV file; it must contain a ``review_text`` column.

	    Returns:
	        A DataFrame with at least ``review_text``, ``polarity`` and
	        ``review_length``. An empty DataFrame is returned (after an
	        ``st.error``) when the file or the ``review_text`` column is missing.

	        ``polarity`` is taken from the CSV when present; otherwise it is
	        computed as 1 (POSITIVE) / -1 (otherwise) per review, 0 for missing
	        text, or 0 everywhere if the sentiment model cannot be used.
	        ``review_length`` is the whitespace-separated word count.
	    """
	    if not os.path.exists(csv_path):
	        st.error(f"CSV file not found: {csv_path}")
	        return pd.DataFrame()
	    df = pd.read_csv(csv_path)
	    if "review_text" not in df.columns:
	        # Everything downstream indexes this column; fail with a message
	        # instead of a KeyError.
	        st.error(f"CSV file has no 'review_text' column: {csv_path}")
	        return pd.DataFrame()

	    if "polarity" not in df.columns:
	        try:
	            from transformers import pipeline
	            sa = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
	            has_text = df["review_text"].notna()
	            polarity = pd.Series(0, index=df.index)
	            if has_text.any():
	                texts = df.loc[has_text, "review_text"].astype(str).tolist()
	                # truncation=True asks the tokenizer to cut over-long reviews
	                # so that one long review cannot fail the whole batch.
	                results = sa(texts, truncation=True)
	                polarity.loc[has_text] = [1 if r["label"] == "POSITIVE" else -1 for r in results]
	            df["polarity"] = polarity
	        except Exception as e:
	            # Best effort: the page still works without sentiment, but say why.
	            st.warning(f"Could not compute sentiment scores ({e}). All reviews set to neutral (0).")
	            df["polarity"] = 0

	    if "review_length" not in df.columns:
	        # Missing reviews count as 0 words (not as the 1-word string "nan").
	        df["review_length"] = df["review_text"].fillna("").astype(str).str.split().str.len()
	    return df

	def generate_personas(review_texts, n_personas=4):
	prompt = (
	f"Read the following customer reviews for a chocolate-flavored whey protein powder. "
	f"Based on the language, interests, and context, segment these users into {n_personas} distinct personas. "
	"For each persona, provide:\n"
	"1. Persona Name starting with emoji\n"
	"2. A one-line summary\n"
	"3. Five detailed bullet points describing their characteristics, needs, goals, or behaviors (each bullet should be specific and insightful, not generic).\n"
	"Give the answer as a numbered list, one for each persona. Format:\n"
	"1. [Emoji] Persona Name\nSummary: ...\n- ...\n- ...\n- ...\n- ...\n- ...\n"
	"\nREVIEWS:\n" +
	"\n".join(review_texts[:120])[:3600]
	)
	try:
	chat_completion = groq_client.chat.completions.create(
	model=GROQ_MODEL,
	messages=[
	{"role": "system", "content": PRODUCT_CONTEXT},
	{"role": "user", "content": prompt}
	],
	max_tokens=900,
	temperature=0.6,
	)
	return chat_completion.choices[0].message.content.strip()
	except Exception as e:
	return f"Error generating personas: {e}"

	def parse_personas_bulletproof(llm_output, n=4):
	    """Parse the LLM's persona list into structured dictionaries.

	    A persona starts at a header line whose first visible character is an
	    emoji. The header may be preceded by list numbering (``1.``, ``2)``,
	    with or without a following space), markdown bold markers (``**``) or a
	    markdown heading (``##``). Within a persona, the first non-bullet line
	    (or a line labelled ``Summary``) becomes the summary, and lines starting
	    with ``-``, ``•``, ``*``, ``+`` or a number are collected as bullets.

	    Args:
	        llm_output: Raw text returned by the model. ``None`` or an empty
	            string yields an empty list.
	        n: Maximum number of personas to return.

	    Returns:
	        A list of dicts with keys ``"name"`` (str), ``"summary"`` (str) and
	        ``"bullets"`` (list of at most five str).
	    """
	    if not llm_output:
	        return []

	    bullet_marks = ("-", "•", "*", "+")
	    # Optional decoration in front of the emoji: heading marks, bold marks and
	    # list numbering. A single "*" (a markdown bullet) is deliberately NOT
	    # accepted, so bullets that begin with an emoji are not taken for headers.
	    prefix = r"(?:#{1,6}\s*)?(?:\*\*)?\s*(?:[0-9]{1,2}[.)-]?\s*)?(?:\*\*)?\s*"
	    # Main emoji planes plus the older "miscellaneous symbols" block and the star.
	    emoji = r"[\U0001F300-\U0001FAFF\u2600-\u26FF\u2B50]"
	    header_re = re.compile("^" + prefix + emoji)
	    prefix_re = re.compile("^" + prefix)
	    summary_re = re.compile(r"^[-•*+\s]*summary\b[\s:*-]*", flags=re.I)
	    numbered_re = re.compile(r"^[0-9]{1,2}[.)-]")
	    # Strip only the list marker itself, never digits that belong to the
	    # bullet's content ("- 3 scoops daily" must keep the "3").
	    marker_re = re.compile(r"^(?:[-•*+]+|[0-9]{1,2}[.)-])\s*")

	    lines = llm_output.splitlines()
	    header_idx = [i for i, line in enumerate(lines) if header_re.match(line.strip())]
	    block_ends = header_idx[1:] + [len(lines)]

	    personas = []
	    for start, end in list(zip(header_idx, block_ends))[:n]:
	        block = lines[start:end]
	        name = prefix_re.sub("", block[0].strip()).replace("*", "").strip()
	        summary = ""
	        bullets = []
	        for raw in block[1:]:
	            line = raw.strip()
	            if not line:
	                continue
	            is_bullet = line.startswith(bullet_marks)
	            is_labelled = summary_re.match(line) is not None
	            if not summary and (is_labelled or not is_bullet):
	                summary = summary_re.sub("", line).replace("**", "").strip()
	            elif is_bullet or numbered_re.match(line):
	                item = marker_re.sub("", line).replace("**", "").strip()
	                if item:
	                    bullets.append(item)
	        personas.append({
	            "name": name,
	            "summary": summary,
	            "bullets": bullets[:5],
	        })
	    return personas

	def assign_review_to_persona_tfidf(df, persona_defs):
	    """Assign every review to the persona whose description it resembles most.

	    Reviews and persona descriptions (summary plus bullets) are vectorised
	    together with TF-IDF, and each review gets the persona with the largest
	    dot product. This is faster than asking the LLM for large data.

	    Args:
	        df: DataFrame with a ``review_text`` column. Missing values are
	            treated as empty text.
	        persona_defs: List of dicts with ``"name"``, ``"summary"`` and
	            ``"bullets"`` keys.

	    Returns:
	        A list with one persona name per row of ``df``, in row order. A review
	        that shares no vocabulary with any persona falls to the first persona
	        (argmax of an all-zero row). Returns ``[]`` when there are no rows or
	        no personas.
	    """
	    if not persona_defs or df.empty:
	        return []

	    from sklearn.feature_extraction.text import TfidfVectorizer

	    # TfidfVectorizer rejects NaN documents, so coerce every review to a string.
	    review_texts = df["review_text"].fillna("").astype(str).tolist()
	    persona_texts = [p["summary"] + " " + " ".join(p["bullets"]) for p in persona_defs]

	    tfidf = TfidfVectorizer(stop_words='english')
	    X = tfidf.fit_transform(review_texts + persona_texts)
	    n_reviews = len(review_texts)
	    review_vecs = X[:n_reviews]
	    persona_vecs = X[n_reviews:]

	    # One sparse product for all reviews; the result is only
	    # (n_reviews x n_personas), so converting it to a dense array is cheap.
	    sims = review_vecs.dot(persona_vecs.T).toarray()
	    best = np.argmax(sims, axis=1)
	    return [persona_defs[i]["name"] for i in best]

	def groq_bullets_persona(chart_desc, chart_data_text):
	    """Ask the Groq chat model for a two-bullet summary of a chart.

	    Args:
	        chart_desc: Description of the chart, embedded in the prompt.
	        chart_data_text: The chart's data; it is formatted into the prompt
	            with an f-string, so any object with a useful ``str()`` works.

	    Returns:
	        The first two reply lines that start with ``-`` or ``•`` joined by a
	        newline; if the reply has fewer than two such lines, the whole reply
	        prefixed with ``"- "``; on any error, a fixed two-line fallback text.
	    """
	    request_parts = [
	        f"Summarize as exactly two bullet points the main insights for this chart: {chart_desc}. ",
	        f"Here is the data: {chart_data_text}. ",
	        "Provide a percentage if applicable. Just facts.",
	    ]
	    user_prompt = "".join(request_parts)
	    try:
	        response = groq_client.chat.completions.create(
	            model=GROQ_MODEL,
	            messages=[
	                {"role": "system", "content": PRODUCT_CONTEXT},
	                {"role": "user", "content": user_prompt},
	            ],
	            max_tokens=80,
	            temperature=0.5,
	        )
	        answer = response.choices[0].message.content.strip()
	        marked = []
	        for row in answer.splitlines():
	            if row.strip().startswith(("-", "•")):
	                marked.append(row)
	        if len(marked) < 2:
	            return "- " + answer
	        return "\n".join(marked[:2])
	    except Exception:
	        return "- Summary not available.\n- (LLM error)"

	# --- EMOTION PIPELINE (optional) ---
	def emotion_pipeline(df):
	    """Add a ``main_emotion`` column holding each review's top emotion label.

	    Args:
	        df: DataFrame with a ``review_text`` column. It is modified in place.

	    Returns:
	        The same DataFrame with ``main_emotion`` set. Every row is
	        ``"neutral"`` when the model cannot be loaded; a single row is
	        ``"neutral"`` when its text is missing or blank or when the
	        classifier fails on it.
	    """
	    try:
	        from transformers import pipeline
	        emo = pipeline(
	            "text-classification",
	            model="finiteautomata/bertweet-base-emotion-analysis",  # much smaller than roberta-base!
	            top_k=None,
	            device=-1  # always use CPU, avoid meta-tensor bug
	        )
	    except Exception as e:
	        st.warning(f"Could not load emotion model, skipping emotion analysis: {e}")
	        df["main_emotion"] = "neutral"
	        return df

	    all_emotions = []
	    for t in df["review_text"]:
	        # Missing or blank text has no emotion to classify.
	        if not isinstance(t, str) or not t.strip():
	            all_emotions.append("neutral")
	            continue
	        try:
	            # The 512-character slice bounds the work per review; truncation=True
	            # additionally asks the tokenizer to cut the input to the model's
	            # token limit, so a long review is classified instead of raising
	            # and being labelled "neutral".
	            emotions = emo(t[:512], truncation=True)
	            if isinstance(emotions, list) and len(emotions) and isinstance(emotions[0], list):
	                # Sometimes returns list of lists
	                emotions = emotions[0]
	            main_emo = max(emotions, key=lambda x: x["score"])["label"]
	        except Exception:
	            # Best effort per review: one bad input must not abort the analysis.
	            main_emo = "neutral"
	        all_emotions.append(main_emo)
	    df["main_emotion"] = all_emotions
	    return df


	# ========== MAIN PIPELINE ========== #

	with st.spinner("🔎 Analyzing your data... Please wait a few moments."):
	    df = load_reviews(CSV_PATH)
	    # Defined up front so every path below leaves `personas` bound.
	    personas = []
	    has_reviews = (not df.empty) and "review_text" in df.columns
	    reviews = df["review_text"].dropna().tolist() if has_reviews else []
	    # Keep only real, non-blank text; placeholder rows containing
	    # "unreadable" or "missing" are not sent to the LLM.
	    reviews = [
	        t for t in reviews
	        if isinstance(t, str) and t.strip() and "unreadable" not in t and "missing" not in t
	    ]
	    if reviews:
	        personas_raw = generate_personas(reviews, 4)
	        personas = parse_personas_bulletproof(personas_raw, 4)
	        if personas:
	            st.session_state['personas'] = personas
	            try:
	                with open(PERSONA_PATH, "w", encoding="utf-8") as f:
	                    json.dump(personas, f, ensure_ascii=False, indent=2)
	            except OSError as e:
	                # Saving is a convenience; an unwritable path must not take the page down.
	                st.warning(f"Personas could not be saved to {PERSONA_PATH}: {e}")
	            else:
	                st.success(f"{len(personas)} personas saved for next use.")
	        elif personas_raw.startswith("Error generating personas:"):
	            # Show the LLM failure instead of silently rendering an empty page.
	            st.error(personas_raw)
	        else:
	            st.warning("The model's reply could not be parsed into personas.")
	# Colour palette for persona cards; cycled when there are more personas than colours.
	persona_colors = [neon_green, neon_blue, neon_pink, neon_orange]
	persona_cycler = cycle(persona_colors)
	persona_blocks = []
	persona_names = []

	# Persona grid (left-right)
	if personas:
	    import html  # local import: escapes LLM text before it is embedded in raw HTML

	    st.markdown("<br>", unsafe_allow_html=True)
	    grid_cols = st.columns(2)
	    for i, p in enumerate(personas):
	        c = next(persona_cycler)
	        # Persona fields come from the LLM; escape them so stray "<" or "&"
	        # cannot break the card markup or inject HTML.
	        safe_name = html.escape(p["name"], quote=False)
	        safe_summary = html.escape(p["summary"], quote=False)
	        bullet_items = "".join(f"<li>{html.escape(b, quote=False)}</li>" for b in p["bullets"])
	        # Alternate between the left and right column.
	        with grid_cols[i % 2]:
	            st.markdown(
	                f"<div style='background:linear-gradient(90deg,{c}18,#181830 95%);"
	                "padding:24px 26px 16px 26px;border-radius:18px;margin-bottom:24px;"
	                f"box-shadow:0 2px 22px {c}22;'>"
	                f"<h2 style='color:{c};margin-bottom:0.18em'>{safe_name}</h2>"
	                f"<div style='color:#fff;font-size:1.15em;font-weight:500;margin-bottom:10px'>Summary: {safe_summary}</div>"
	                f"<div style='color:{neon_pink};font-weight:700;font-size:1.08em;margin-bottom:2px'>Characteristics</div>"
	                f"<ul style='font-size:1.02em;margin-top:3px'>{bullet_items}</ul>"
	                "</div>", unsafe_allow_html=True
	            )
	        persona_names.append(p["name"])
	    st.markdown("<hr>", unsafe_allow_html=True)

	if personas and len(reviews) > 0:
	    import html  # local import: escapes LLM text before it is embedded in raw HTML

	    # Work on a copy whose review text is always a string, so that missing
	    # values cannot break the TF-IDF assignment or the text joins below.
	    df_reviews = df.copy()
	    df_reviews["review_text"] = df_reviews["review_text"].fillna("").astype(str)

	    # Assign reviews to persona via TF-IDF (fast); one name per row.
	    df_reviews["persona"] = assign_review_to_persona_tfidf(df_reviews, personas)

	    # --- Generate all summary stats for new graphs
	    # 1. Persona Review Share
	    persona_counts = df_reviews["persona"].value_counts()
	    # 2. Persona Sentiment
	    avg_sentiment = df_reviews.groupby("persona")["polarity"].mean()
	    # 3. Persona Review Length
	    avg_length = df_reviews.groupby("persona")["review_length"].mean()
	    # 4. Persona Emotion (optional)
	    if "main_emotion" not in df_reviews.columns:
	        df_reviews = emotion_pipeline(df_reviews)
	    emo_dist = df_reviews.groupby("persona")["main_emotion"].value_counts().unstack().fillna(0)

	    # One colour per bar, cycling through the palette so any number of
	    # personas gets a colour (not just exactly four).
	    palette = [neon_green, neon_blue, neon_pink, neon_orange]
	    sentiment_colors = [palette[i % len(palette)] for i in range(len(avg_sentiment))]
	    length_colors = [palette[i % len(palette)] for i in range(len(avg_length))]

	    # --- Row 1: Pie and Sentiment Bar
	    c1, c2 = st.columns(2)
	    with c1:
	        st.markdown("<h3 style='color:#fff;font-size:2rem;font-weight:700;'>Sales/Review Share by Persona</h3>", unsafe_allow_html=True)
	        fig = go.Figure(data=[go.Pie(labels=persona_counts.index, values=persona_counts.values, hole=0.45)])
	        fig.update_traces(textinfo='percent+label')
	        st.plotly_chart(fig, use_container_width=True)
	        st.markdown(block_markdown(
	            groq_bullets_persona("Sales/Review Share by Persona", persona_counts.to_dict()), neon_green
	        ), unsafe_allow_html=True)

	    with c2:
	        st.markdown("<h3 style='color:#fff;font-size:2rem;font-weight:700;'>Average Sentiment by Persona</h3>", unsafe_allow_html=True)
	        fig2 = go.Figure(data=[go.Bar(x=avg_sentiment.index, y=avg_sentiment.values, marker=dict(color=sentiment_colors))])
	        fig2.update_layout(xaxis_title="Persona", yaxis_title="Avg Sentiment", font=dict(size=15))
	        st.plotly_chart(fig2, use_container_width=True)
	        st.markdown(block_markdown(
	            groq_bullets_persona("Average Sentiment by Persona", avg_sentiment.to_dict()), neon_blue
	        ), unsafe_allow_html=True)

	    # --- Row 2: Review Length and Emotion Distribution
	    c3, c4 = st.columns(2)
	    with c3:
	        st.markdown("<h3 style='color:#fff;font-size:2rem;font-weight:700;'>Persona vs. Review Length Distribution</h3>", unsafe_allow_html=True)
	        fig3 = go.Figure(data=[go.Bar(x=avg_length.index, y=avg_length.values, marker=dict(color=length_colors))])
	        fig3.update_layout(xaxis_title="Persona", yaxis_title="Avg Review Length", font=dict(size=15))
	        st.plotly_chart(fig3, use_container_width=True)
	        st.markdown(block_markdown(
	            groq_bullets_persona("Average review length (words) by persona", avg_length.to_dict()), neon_orange
	        ), unsafe_allow_html=True)

	    with c4:
	        st.markdown("<h3 style='color:#fff;font-size:2rem;font-weight:700;'>Persona vs. Emotion Distribution</h3>", unsafe_allow_html=True)
	        fig4 = go.Figure()
	        # One stacked bar trace per emotion label.
	        for em in emo_dist.columns:
	            fig4.add_trace(go.Bar(name=em, x=emo_dist.index, y=emo_dist[em].values))
	        fig4.update_layout(barmode='stack', xaxis_title="Persona", yaxis_title="Emotion Count", font=dict(size=15))
	        st.plotly_chart(fig4, use_container_width=True)
	        st.markdown(block_markdown(
	            groq_bullets_persona("Distribution of primary emotions per persona", emo_dist.to_dict()), neon_pink
	        ), unsafe_allow_html=True)

	    # --- Persona-wise Highlights, grouped by persona with headings ---
	    st.markdown("<hr><h2 style='color:#fff'>Persona-wise Sentiment Highlights & Recommendations</h2>", unsafe_allow_html=True)
	    persona_grid = st.columns(2)

	    for idx, p in enumerate(personas):
	        persona_df = df_reviews[df_reviews["persona"] == p["name"]]
	        # The first two positive / negative reviews of this persona, in file order.
	        top_pos = persona_df[persona_df["polarity"] > 0]["review_text"].head(2).tolist()
	        top_neg = persona_df[persona_df["polarity"] < 0]["review_text"].head(2).tolist()
	        pos_summary = groq_bullets_persona(
	            f"Summarize two main positive sentiment points, with percentage, for persona '{p['name']}'.",
	            " ".join(top_pos)
	        ) if top_pos else "No positive reviews."
	        neg_summary = groq_bullets_persona(
	            f"Summarize two main negative sentiment points, with percentage, for persona '{p['name']}'.",
	            " ".join(top_neg)
	        ) if top_neg else "No negative reviews."

	        # The prompt refers to "review highlights and persona details", so the
	        # request must actually contain them — otherwise the model only ever
	        # sees the persona's name.
	        rec_prompt = (
	            "You are a product marketing strategist. "
	            f"Persona: {p['name']}\n"
	            f"Summary: {p['summary']}\n"
	            f"Characteristics: {'; '.join(p['bullets'])}\n"
	            f"Positive review highlights:\n{pos_summary}\n"
	            f"Negative review highlights:\n{neg_summary}\n\n"
	            "Based on the review highlights and persona details above "
	            "(do not repeat the characteristics), write one concise, actionable product or marketing recommendation "
	            "for the company to better engage this persona. You may refer to the persona by name. "
	            "Do not use asterisks or any markdown formatting. "
	            "Focus on practical actions the business can take (such as messaging, offers, features, or campaigns). "
	            "Reply with 1-2 sentences, avoid restating the persona’s traits."
	        )

	        try:
	            rec_out = groq_client.chat.completions.create(
	                model=GROQ_MODEL,
	                messages=[
	                    {"role": "system", "content": PRODUCT_CONTEXT},
	                    {"role": "user", "content": rec_prompt}
	                ],
	                max_tokens=80, temperature=0.5
	            ).choices[0].message.content.strip()
	        except Exception:
	            # Best effort; `Exception` (not a bare except) so that
	            # KeyboardInterrupt / SystemExit still propagate.
	            rec_out = "No recommendation available."

	        card_color = palette[idx % len(palette)]
	        # Persona fields come from the LLM; escape them before embedding in HTML.
	        safe_name = html.escape(p["name"], quote=False)
	        safe_summary = html.escape(p["summary"], quote=False)
	        with persona_grid[idx % 2]:
	            st.markdown(
	                f"<div style='margin-bottom:38px;padding:18px 20px 8px 20px;border-radius:18px;"
	                f"background:linear-gradient(90deg,{card_color}22,#181830 100%);box-shadow:0 2px 22px {card_color}18;'>"
	                f"<h2 style='color:{card_color};font-size:1.35em;margin-bottom:0.3em'>{safe_name}</h2>"
	                f"<div style='color:#fff;font-size:1.13em;font-weight:400;margin-bottom:14px;'>{safe_summary}</div>"
	                "<div style='margin-bottom:16px'>"
	                f"<b style='color:{neon_green};font-size:1.1em;'>Top Positive Sentiments:</b><br>{block_markdown(pos_summary, neon_green)}"
	                "</div>"
	                "<div style='margin-bottom:16px'>"
	                f"<b style='color:{neon_pink};font-size:1.1em;'>Top Negative Sentiments:</b><br>{block_markdown(neg_summary, neon_pink)}"
	                "</div>"
	                "<div>"
	                f"<b style='color:{neon_yellow};font-size:1.1em;'>Recommendation:</b><br>{block_markdown(rec_out, neon_yellow)}"
	                "</div>"
	                "</div>", unsafe_allow_html=True
	            )


	st.markdown("---")