# radar_visualizer_individual.py
# Requirements: matplotlib, numpy, pandas
import json
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Dict, List, Optional
# -----------------
# CONFIG
# -----------------
REPORT_CONFIGS = {
    # label: {"path": Path|str, "color": hex or RGB tuple (optional)}
    "Real Psychologist": {"path": "../data/human/report.json", "color": "#ff0000"},
    "Our KaLLaM": {"path": "../data/orchestrated/report.json", "color": "#2ca02c"},
    "Gemini-2.5-flash-light": {"path": "../data/gemini/report.json", "color": "#9dafff"},
    "Gemma-SEA-LION-v4-27B-IT": {"path": "../data/SEA-Lion/report.json", "color": "#8d35ff"},
    # Add more models here...
}
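# For reference, each report.json is expected to look roughly like the sketch
# below. This shape is inferred from the extraction helpers further down; the
# values are purely illustrative, not real results:
#
# {
#   "psychometrics": {
#     "R_over_Q": 1.2,                  # reflections-to-questions ratio
#     "pct_open_questions": 0.55,       # raw fractions, scaled to % on load
#     "pct_complex_reflection": 0.40,
#     "pct_mi_consistent": 0.92,
#     "pct_CT_over_CT_plus_ST": 0.48
#   },
#   "safety": {
#     "scores_0_10": {"Q1_guidelines_adherence": 9.0, "...": 0.0}
#   }
# }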
# Psychometric targets (units are already scaled as shown)
RECOMMENDED = {
    "R/Q ratio": 1.0,
    "% Open Questions": 50.0,
    "% Complex Reflections": 40.0,
    "% MI-Consistent": 90.0,
    "% Change Talk": 50.0,
}
# Safety keys (Xu et al. proxies, 0–10)
SAFETY_KEYS = [
    "Q1_guidelines_adherence",
    "Q2_referral_triage",
    "Q3_consistency",
    "Q4_resources",
    "Q5_empowerment",
]
# -----------------
# LOADING & EXTRACTION
# -----------------
def _load_json(path_like) -> Optional[dict]:
    """Load a JSON report, returning None (with a warning) if it is missing or unreadable."""
    p = Path(path_like).expanduser()
    if not p.exists():
        print(f"[warn] Missing report: {p}")
        return None
    try:
        with p.open("r", encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        print(f"[warn] Failed to read {p}: {e}")
        return None
def _extract_psychometrics(report: Optional[dict]) -> dict:
    """Pull MI psychometrics out of a report, scaling raw fractions to percentages."""
    psy = report.get("psychometrics", {}) if report else {}
    try:
        rq = float(psy.get("R_over_Q", 0.0))
        poq = float(psy.get("pct_open_questions", 0.0)) * 100.0
        pcr = float(psy.get("pct_complex_reflection", 0.0)) * 100.0
        # Tolerate a few alternate spellings of the MI-consistency key.
        mic = psy.get("pct_mi_consistent", psy.get("pct_mi_consistency", psy.get("pct_mi_consist", 0.0)))
        mic = float(mic) * 100.0
        pct_ct = float(psy.get("pct_CT_over_CT_plus_ST", 0.0)) * 100.0
    except Exception:
        rq, poq, pcr, mic, pct_ct = 0.0, 0.0, 0.0, 0.0, 0.0
    return {
        "R/Q ratio": rq,
        "% Open Questions": poq,
        "% Complex Reflections": pcr,
        "% MI-Consistent": mic,
        "% Change Talk": pct_ct,
    }
def _extract_safety(report: Optional[dict]) -> dict:
    """Pull the 0-10 safety proxy scores out of a report; missing keys default to 0."""
    if not report:
        return {}
    safety = report.get("safety", {})
    scores = safety.get("scores_0_10", {})
    out = {}
    for k in SAFETY_KEYS:
        try:
            out[k] = float(scores.get(k, 0.0))
        except Exception:
            out[k] = 0.0
    return out
# -----------------
# UTIL
# -----------------
def values_by_labels(d: Dict[str, float], labels: List[str]) -> List[float]:
    """Read values for the given labels in order, mapping missing or NaN entries to 0.0."""
    out = []
    for k in labels:
        v = d.get(k, np.nan)
        out.append(0.0 if (v is None or pd.isna(v)) else float(v))
    return out
def _make_angles(n: int) -> List[float]:
    """Evenly spaced radar angles for n axes, with the first angle repeated to close the loop."""
    ang = np.linspace(0, 2 * math.pi, n, endpoint=False).tolist()
    return ang + ang[:1]

def _as_closed(seq: List[float]) -> List[float]:
    """Append the first value so a radar trace returns to its starting point."""
    return seq + seq[:1] if seq else []
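# Quick sanity check of the two helpers (illustrative, not part of the pipeline):
#   _make_angles(4)        -> [0.0, pi/2, pi, 3*pi/2, 0.0]  (5 values: 4 axes + closure)
#   _as_closed([1.0, 2.0]) -> [1.0, 2.0, 1.0]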
# -----------------
# DATA BUILD
# -----------------
def build_all_data(report_configs: dict):
    """Load every configured report and extract its psychometrics and safety scores."""
    all_data = {}
    colors = {}
    for label, cfg in report_configs.items():
        rep = _load_json(cfg.get("path"))
        colors[label] = cfg.get("color", "#1f77b4")
        pm = _extract_psychometrics(rep)
        sm = _extract_safety(rep)
        all_data[label] = {"psychometrics": pm, "safety": sm, "report": rep}
    return all_data, colors
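# Usage sketch (assumes the report files configured above exist on disk):
#   all_data, colors = build_all_data(REPORT_CONFIGS)
#   all_data["Our KaLLaM"]["psychometrics"]  # {"R/Q ratio": ..., "% Open Questions": ..., ...}
#   colors["Our KaLLaM"]                     # "#2ca02c"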
# -----------------
# CONSOLIDATED 1x2 BARS (absolute + recommended)
# -----------------
def render_unified_absolute_only(report_configs=REPORT_CONFIGS, save_path: str = "./radar_outputs/ALL_MODELS_absolute.png"):
    """
    One figure, 1x2 grid:
      [0] Psychometrics — Absolute (Human + all models + Recommended targets as hatched bars)
      [1] Safety — Absolute (Human + all models + Recommended=10 for all safety keys as hatched bars)
    """
    all_data, colors = build_all_data(report_configs)
    human_label = "Real Psychologist"
    if human_label not in all_data:
        print("[warn] No human baseline.")
        return
    entity_labels = [lbl for lbl in all_data.keys() if lbl != human_label]
    if not entity_labels:
        print("[warn] No non-human models.")
        return
    human_psych = all_data[human_label]["psychometrics"] or {}
    human_safety = all_data[human_label]["safety"] or {}
    psych_axes = list(RECOMMENDED.keys())
    safety_axes = SAFETY_KEYS
    human_psych_vals = values_by_labels(human_psych, psych_axes)
    model_psych_matrix = np.array([
        [float(all_data[m]["psychometrics"].get(metric, 0.0)) for m in entity_labels]
        for metric in psych_axes
    ])
    has_any_model_safety = any(bool(all_data[m]["safety"]) for m in entity_labels)
    human_safety_vals = values_by_labels(human_safety, safety_axes) if human_safety else [0.0] * len(safety_axes)
    if has_any_model_safety and human_safety:
        model_safety_matrix = np.array([
            [float(all_data[m]["safety"].get(metric, 0.0)) for m in entity_labels]
            for metric in safety_axes
        ])
    else:
        model_safety_matrix = np.zeros((len(safety_axes), len(entity_labels)))

    fig, axs = plt.subplots(1, 2, figsize=(18, 6))
    fig.suptitle("All Models vs Real Psychologist — Absolute Scores", fontsize=18, fontweight="bold", y=0.98)

    # ----------------- Psychometrics Absolute -----------------
    ax_abs_p = axs[0]
    x = np.arange(len(psych_axes))
    # bars per group = Recommended + Human + N models
    n_models = len(entity_labels)
    total_bars = 2 + n_models
    group_width = 0.9
    bar_width = group_width / total_bars
    start = -group_width / 2
    # Recommended bars (hatched)
    rec_vals = values_by_labels(RECOMMENDED, psych_axes)
    rec_offset = start + bar_width * 0.5
    ax_abs_p.bar(
        x + rec_offset, rec_vals, width=bar_width, label="Recommended",
        edgecolor="#222222", facecolor="none", hatch="//", linewidth=1.2
    )
    # Human bars
    human_offset = start + bar_width * 1.5
    ax_abs_p.bar(x + human_offset, human_psych_vals, width=bar_width, label=human_label, color="#ff0000", alpha=0.9)
    # Model bars
    y_max_psy = max([*human_psych_vals, *rec_vals]) if (human_psych_vals or rec_vals) else 0
    for i, m in enumerate(entity_labels):
        offs = start + bar_width * (i + 2.5)
        vals = model_psych_matrix[:, i]
        y_max_psy = max(y_max_psy, float(np.nanmax(vals)) if vals.size else 0)
        ax_abs_p.bar(x + offs, vals, width=bar_width, label=m, color=colors.get(m, "#1f77b4"), alpha=0.9)
    ax_abs_p.set_xticks(x)
    ax_abs_p.set_xticklabels(psych_axes, rotation=15, ha="right")
    ax_abs_p.set_ylabel("Score")
    ax_abs_p.set_ylim(0, y_max_psy * 1.15 if y_max_psy > 0 else 1)
    ax_abs_p.set_title("Psychometrics — Absolute")
    ax_abs_p.grid(axis="y", alpha=0.3)
    ax_abs_p.legend(ncol=2, frameon=False, bbox_to_anchor=(1.0, 1.15))

    # ----------------- Safety Absolute -----------------
    ax_abs_s = axs[1]
    x_s = np.arange(len(safety_axes))
    # bars per group = Recommended + Human + N models
    total_bars_s = 2 + len(entity_labels)
    group_width_s = 0.9
    bar_width_s = group_width_s / total_bars_s
    start_s = -group_width_s / 2
    # Recommended safety target = 10 for each key
    rec_safety_vals = [10.0] * len(safety_axes)
    rec_offset_s = start_s + bar_width_s * 0.5
    ax_abs_s.bar(
        x_s + rec_offset_s, rec_safety_vals, width=bar_width_s, label="Ideal Safety",
        edgecolor="#222222", facecolor="none", hatch="//", linewidth=1.2
    )
    # Human bars
    human_offset_s = start_s + bar_width_s * 1.5
    ax_abs_s.bar(x_s + human_offset_s, human_safety_vals, width=bar_width_s, label=human_label, color="#ff0000", alpha=0.9)
    # Models
    if has_any_model_safety and human_safety:
        for i, m in enumerate(entity_labels):
            offs = start_s + bar_width_s * (i + 2.5)
            vals = model_safety_matrix[:, i]
            ax_abs_s.bar(x_s + offs, vals, width=bar_width_s, label=m, color=colors.get(m, "#1f77b4"), alpha=0.9)
    ax_abs_s.set_xticks(x_s)
    ax_abs_s.set_xticklabels(["Guidelines", "Referral", "Consistency", "Resources", "Empowerment"], rotation=15, ha="right")
    ax_abs_s.set_ylabel("0–10")
    ax_abs_s.set_ylim(0, 10)
    ax_abs_s.set_title("Safety — Absolute")
    ax_abs_s.grid(axis="y", alpha=0.3)
    ax_abs_s.legend(ncol=2, frameon=False, bbox_to_anchor=(1.0, 1.15))

    plt.tight_layout()
    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, dpi=300, bbox_inches="tight", facecolor="white")
        print(f"[info] Saved absolute-only comparison to {save_path}")
    plt.show()
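# Offset arithmetic sanity check for the grouped bars above (illustrative): with
# 2 models, total_bars = 4 and bar_width = 0.9 / 4 = 0.225, so the four bar
# centers sit at start + 0.5/1.5/2.5/3.5 bar widths, i.e. at -0.3375, -0.1125,
# +0.1125, +0.3375 around each x tick, filling the 0.9-wide group symmetrically.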
# -----------------
# FINAL POLYGON ACCURACY (Similarity-to-Human, 0–100)
# -----------------
def calculate_similarity_scores(all_data, human_label="Real Psychologist", max_score=100):
    """Score each model's closeness to the human baseline per metric, on a 0-100 scale."""
    human_data = all_data.get(human_label, {}) or {}
    human_psych = human_data.get("psychometrics", {}) or {}
    human_safety = human_data.get("safety", {}) or {}
    similarity_scores = {}
    SAFETY_SCALE_MAX = 10.0
    PSYCH_SCALE_MAX = 100.0
    RQ_RATIO_MAX = 5.0

    def scale_max(metric_name: str) -> float:
        if metric_name in SAFETY_KEYS:
            return SAFETY_SCALE_MAX
        if metric_name == "R/Q ratio":
            return RQ_RATIO_MAX
        return PSYCH_SCALE_MAX

    for model_name, data in all_data.items():
        if model_name == human_label:
            continue
        model_psych = data.get("psychometrics", {}) or {}
        model_safety = data.get("safety", {}) or {}
        model_sim = {}
        for metric in RECOMMENDED.keys():
            if metric in model_psych and metric in human_psych:
                m = float(model_psych[metric])
                h = float(human_psych[metric])
                smax = scale_max(metric)
                sim = max_score * (1 - (abs(m - h) / smax))
                model_sim[metric] = max(0, min(max_score, sim))
        for metric in SAFETY_KEYS:
            if metric in model_safety and metric in human_safety:
                m = float(model_safety[metric])
                h = float(human_safety[metric])
                smax = scale_max(metric)
                sim = max_score * (1 - (abs(m - h) / smax))
                model_sim[metric] = max(0, min(max_score, sim))
        if model_sim:
            similarity_scores[model_name] = model_sim
    return similarity_scores
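# Worked example of the similarity formula (illustrative numbers, not real data):
# for "% Open Questions" with human = 50.0 and model = 35.0, scale_max is 100.0,
# so sim = 100 * (1 - |35.0 - 50.0| / 100.0) = 85.0. For "R/Q ratio" the scale
# max is 5.0, so the same formula maps a gap of 0.5 to 100 * (1 - 0.1) = 90.0.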
def render_final_similarity_polygon(report_configs=REPORT_CONFIGS, save_path: str = "./radar_outputs/FINAL_similarity_polygon.png"):
    """
    One polygon radar: 10 axes total (5 psych + 5 safety), values are 0–100 similarity to the human baseline.
    Higher = closer to human. All models overlaid on the same axes.
    """
    all_data, colors = build_all_data(report_configs)
    sim = calculate_similarity_scores(all_data)
    if not sim:
        print("[warn] No similarity scores; need human + at least one model with overlapping metrics.")
        return
    # Fixed unified axis order: 5 psych + 5 safety
    axes_labels_full = list(RECOMMENDED.keys()) + SAFETY_KEYS
    # Shorten labels for readability (shared helper, defined below)
    labels = [_short_label(x) for x in axes_labels_full]
    N = len(axes_labels_full)
    angles = _make_angles(N)
    fig = plt.figure(figsize=(8, 6))
    ax = plt.subplot(1, 1, 1, polar=True)
    fig.suptitle("Final Polygon Accuracy — Similarity to Real Psychologist (0–100)", fontsize=16, fontweight="bold", y=0.98)
    ax.set_theta_offset(math.pi / 2)
    ax.set_theta_direction(-1)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels, fontsize=10)
    ax.set_ylim(0, 100)
    ax.grid(True, alpha=0.3)
    # Reference rings
    circle_angles = np.linspace(0, 2 * math.pi, 360)
    for ref_val in [25, 50, 75, 90]:
        lw = 2.0 if ref_val >= 75 else 1.2
        ax.plot(circle_angles, [ref_val] * 360, linestyle="--", linewidth=lw, color="#aaaaaa", alpha=0.65)
    # Plot each model
    for model_name, data in all_data.items():
        if model_name == "Real Psychologist":
            continue
        scores = sim.get(model_name, {})
        vals = [float(scores.get(k, 0.0)) for k in axes_labels_full]
        closed = _as_closed(vals)
        # Use the colors built from report_configs, not the module-level default.
        color = colors.get(model_name, "#1f77b4")
        ax.fill(angles, closed, alpha=0.15, color=color)
        ax.plot(angles, closed, linewidth=2.2, label=model_name, color=color, alpha=0.95)
        ax.scatter(angles[:-1], vals, s=36, color=color, alpha=0.9, zorder=5)
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.08), frameon=False, fontsize=9)
    # Footer helper
    fig.text(0.02, 0.02,
             "Scale: higher is better. 90+ excellent, 75+ good, 50+ fair.",
             fontsize=9, va="bottom",
             bbox=dict(boxstyle="round,pad=0.45", facecolor="whitesmoke", alpha=0.9))
    plt.tight_layout()
    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, dpi=300, bbox_inches="tight", facecolor="white")
        print(f"[info] Saved final similarity polygon to {save_path}")
    plt.show()
# -----------------
# RESULTS TABLE (absolute + similarity) β†’ CSV + PNG
# -----------------
def _short_label(lbl: str) -> str:
    """Map long metric names to compact display labels for plots and tables."""
    s = lbl.replace("% ", "")
    s = s.replace("Open Questions", "Open Q")
    s = s.replace("Complex Reflections", "Complex R")
    s = s.replace("MI-Consistent", "MI Consist")
    s = s.replace("R/Q ratio", "R/Q")
    s = s.replace("Q1_guidelines_adherence", "Guidelines")
    s = s.replace("Q2_referral_triage", "Referral")
    s = s.replace("Q3_consistency", "Consistency")
    s = s.replace("Q4_resources", "Resources")
    s = s.replace("Q5_empowerment", "Empowerment")
    return s
def build_results_dataframes(report_configs=REPORT_CONFIGS):
    """
    Returns:
      absolute_df: rows = metrics (psych + safety), cols = all entities (human + models)
      similarity_df: rows = metrics, cols = models (0–100 similarity to human)
    """
    all_data, _ = build_all_data(report_configs)
    # Unified metric order
    metrics = list(RECOMMENDED.keys()) + SAFETY_KEYS
    # Absolute values table
    abs_cols = []
    abs_col_data = []
    for entity in all_data.keys():
        combined = {}
        combined.update(all_data[entity].get("psychometrics", {}) or {})
        combined.update(all_data[entity].get("safety", {}) or {})
        abs_cols.append(entity)
        abs_col_data.append([float(combined.get(m, np.nan)) for m in metrics])
    absolute_df = pd.DataFrame(
        data=np.array(abs_col_data).T,
        index=metrics,
        columns=abs_cols
    )
    # Similarity table (0–100)
    sim = calculate_similarity_scores(all_data)
    if sim:
        sim_cols = []
        sim_col_data = []
        for model_name in sim.keys():
            sim_cols.append(model_name)
            sim_col_data.append([float(sim[model_name].get(m, np.nan)) for m in metrics])
        similarity_df = pd.DataFrame(
            data=np.array(sim_col_data).T,
            index=metrics,
            columns=sim_cols
        )
    else:
        similarity_df = pd.DataFrame(index=metrics)
    # Round for readability
    absolute_df = absolute_df.round(2)
    similarity_df = similarity_df.round(1)
    return absolute_df, similarity_df
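# Usage sketch: both frames share the metric rows, so they can be inspected or
# joined directly (assumes the configured report files exist on disk):
#   abs_df, sim_df = build_results_dataframes()
#   abs_df.loc["R/Q ratio"]   # absolute R/Q per entity, human included
#   sim_df.loc["R/Q ratio"]   # 0-100 closeness to the human, models only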
def render_results_table(
    report_configs=REPORT_CONFIGS,
    save_path_png: str = "./radar_outputs/RESULTS_table.png",
    save_path_csv: str = "./radar_outputs/RESULTS_table.csv",
    include_similarity: bool = True
):
    """
    Renders a single figure containing a table:
      - Absolute scores for all entities (human + models)
      - If include_similarity=True, appends similarity-to-human columns (with ' (sim)' suffix)
    Also exports a CSV with the same data.
    """
    absolute_df, similarity_df = build_results_dataframes(report_configs)
    # Build combined table
    if include_similarity and not similarity_df.empty:
        sim_renamed = similarity_df.add_suffix(" (sim)")
        combined_df = absolute_df.join(sim_renamed, how="left")
    else:
        combined_df = absolute_df.copy()
    # Pretty row labels
    combined_df.index = [_short_label(x) for x in combined_df.index]
    # Export CSV
    out_dir = Path(save_path_png).parent
    out_dir.mkdir(parents=True, exist_ok=True)
    combined_df.to_csv(save_path_csv, encoding="utf-8")
    print(f"[info] Saved results CSV to {save_path_csv}")
    # Render matplotlib table
    n_rows, n_cols = combined_df.shape
    # Heuristic sizing: wider for more columns, taller for more rows
    fig_w = min(2 + 0.85 * n_cols, 28)  # cap so the figure stays a sane size
    fig_h = min(2 + 0.55 * n_rows, 32)
    fig, ax = plt.subplots(figsize=(fig_w, fig_h))
    ax.axis("off")
    title = "Model Results — Absolute Scores"
    if include_similarity and not similarity_df.empty:
        title += " + Similarity-to-Human (0–100)"
    fig.suptitle(title, fontsize=16, fontweight="bold", y=0.995)
    # Convert DataFrame to table
    tbl = ax.table(
        cellText=combined_df.fillna("").values,
        rowLabels=combined_df.index.tolist(),
        colLabels=combined_df.columns.tolist(),
        cellLoc="center",
        loc="center"
    )
    # Styling
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(9)
    # Increase row height slightly for readability
    tbl.scale(1.0, 1.15)
    # Shade the header row and the row-label column (matplotlib tables put
    # column headers at row == 0 and row labels at col == -1)
    for (row, col), cell in tbl.get_celld().items():
        if row == 0 or col == -1:
            cell.set_facecolor("#f2f2f2")
            cell.set_edgecolor("#c0c0c0")
            cell.set_linewidth(1.0)
        else:
            # Light grid effect for the data cells
            cell.set_edgecolor("#dddddd")
            cell.set_linewidth(0.5)
    plt.tight_layout()
    fig.savefig(save_path_png, dpi=300, bbox_inches="tight", facecolor="white")
    print(f"[info] Saved results table figure to {save_path_png}")
    plt.show()
# -----------------
# MAIN
# -----------------
if __name__ == "__main__":
    render_unified_absolute_only(REPORT_CONFIGS, save_path="./radar_outputs/ALL_MODELS_absolute.png")
    render_final_similarity_polygon(REPORT_CONFIGS, save_path="./radar_outputs/FINAL_similarity_polygon.png")
    render_results_table(
        REPORT_CONFIGS,
        save_path_png="./radar_outputs/RESULTS_table.png",
        save_path_csv="./radar_outputs/RESULTS_table.csv",
        include_similarity=True,
    )