# radar_visualizer_individual.py
# Requirements: matplotlib, numpy, pandas
import json
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Dict, List, Optional
# -----------------
# CONFIG
# -----------------
REPORT_CONFIGS = {
    # label: {"path": Path|str, "color": hex or RGB tuple (optional)}
    "Real Psychologist": {"path": "../data/human/report.json", "color": "#ff0000"},
    "Our KaLLaM": {"path": "../data/orchestrated/report.json", "color": "#2ca02c"},
    "Gemini-2.5-flash-light": {"path": "../data/gemini/report.json", "color": "#9dafff"},
    "Gemma-SEA-LION-v4-27B-IT": {"path": "../data/SEA-Lion/report.json", "color": "#8d35ff"},
    # Add more models here...
}
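# For reference, each report.json is expected to look roughly like the sketch
# below. This shape is inferred from the extraction helpers further down; the
# values are purely illustrative, not real results:
#
# {
#   "psychometrics": {
#     "R_over_Q": 1.2,                  # reflections-to-questions ratio
#     "pct_open_questions": 0.55,       # raw fractions, scaled to % on load
#     "pct_complex_reflection": 0.40,
#     "pct_mi_consistent": 0.92,
#     "pct_CT_over_CT_plus_ST": 0.48
#   },
#   "safety": {
#     "scores_0_10": {"Q1_guidelines_adherence": 9.0, "...": 0.0}
#   }
# }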
# Psychometric targets (units are already scaled as shown)
RECOMMENDED = {
    "R/Q ratio": 1.0,
    "% Open Questions": 50.0,
    "% Complex Reflections": 40.0,
    "% MI-Consistent": 90.0,
    "% Change Talk": 50.0,
}
# Safety keys (Xu et al. proxies, 0–10)
SAFETY_KEYS = [
    "Q1_guidelines_adherence",
    "Q2_referral_triage",
    "Q3_consistency",
    "Q4_resources",
    "Q5_empowerment",
]
# -----------------
# LOADING & EXTRACTION
# -----------------
def _load_json(path_like) -> Optional[dict]:
    """Load a JSON report, returning None (with a warning) if it is missing or unreadable."""
    p = Path(path_like).expanduser()
    if not p.exists():
        print(f"[warn] Missing report: {p}")
        return None
    try:
        with p.open("r", encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        print(f"[warn] Failed to read {p}: {e}")
        return None
def _extract_psychometrics(report: Optional[dict]) -> dict:
    """Pull MI psychometrics out of a report, scaling raw fractions to percentages."""
    psy = report.get("psychometrics", {}) if report else {}
    try:
        rq = float(psy.get("R_over_Q", 0.0))
        poq = float(psy.get("pct_open_questions", 0.0)) * 100.0
        pcr = float(psy.get("pct_complex_reflection", 0.0)) * 100.0
        # Tolerate a few alternate spellings of the MI-consistency key.
        mic = psy.get("pct_mi_consistent", psy.get("pct_mi_consistency", psy.get("pct_mi_consist", 0.0)))
        mic = float(mic) * 100.0
        pct_ct = float(psy.get("pct_CT_over_CT_plus_ST", 0.0)) * 100.0
    except Exception:
        rq, poq, pcr, mic, pct_ct = 0.0, 0.0, 0.0, 0.0, 0.0
    return {
        "R/Q ratio": rq,
        "% Open Questions": poq,
        "% Complex Reflections": pcr,
        "% MI-Consistent": mic,
        "% Change Talk": pct_ct,
    }
def _extract_safety(report: Optional[dict]) -> dict:
    """Pull the 0-10 safety proxy scores out of a report; missing keys default to 0."""
    if not report:
        return {}
    safety = report.get("safety", {})
    scores = safety.get("scores_0_10", {})
    out = {}
    for k in SAFETY_KEYS:
        try:
            out[k] = float(scores.get(k, 0.0))
        except Exception:
            out[k] = 0.0
    return out
# -----------------
# UTIL
# -----------------
def values_by_labels(d: Dict[str, float], labels: List[str]) -> List[float]:
    """Read values for the given labels in order, mapping missing or NaN entries to 0.0."""
    out = []
    for k in labels:
        v = d.get(k, np.nan)
        out.append(0.0 if (v is None or pd.isna(v)) else float(v))
    return out
def _make_angles(n: int) -> List[float]:
    """Evenly spaced radar angles for n axes, with the first angle repeated to close the loop."""
    ang = np.linspace(0, 2 * math.pi, n, endpoint=False).tolist()
    return ang + ang[:1]

def _as_closed(seq: List[float]) -> List[float]:
    """Append the first value so a radar trace returns to its starting point."""
    return seq + seq[:1] if seq else []
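# Quick sanity check of the two helpers (illustrative, not part of the pipeline):
#   _make_angles(4)        -> [0.0, pi/2, pi, 3*pi/2, 0.0]  (5 values: 4 axes + closure)
#   _as_closed([1.0, 2.0]) -> [1.0, 2.0, 1.0]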
# -----------------
# DATA BUILD
# -----------------
def build_all_data(report_configs: dict):
    """Load every configured report and extract its psychometrics and safety scores."""
    all_data = {}
    colors = {}
    for label, cfg in report_configs.items():
        rep = _load_json(cfg.get("path"))
        colors[label] = cfg.get("color", "#1f77b4")
        pm = _extract_psychometrics(rep)
        sm = _extract_safety(rep)
        all_data[label] = {"psychometrics": pm, "safety": sm, "report": rep}
    return all_data, colors
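# Usage sketch (assumes the report files configured above exist on disk):
#   all_data, colors = build_all_data(REPORT_CONFIGS)
#   all_data["Our KaLLaM"]["psychometrics"]  # {"R/Q ratio": ..., "% Open Questions": ..., ...}
#   colors["Our KaLLaM"]                     # "#2ca02c"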
# -----------------
# CONSOLIDATED 1x2 BARS (absolute + recommended)
# -----------------
def render_unified_absolute_only(report_configs=REPORT_CONFIGS, save_path: str = "./radar_outputs/ALL_MODELS_absolute.png"):
    """
    One figure, 1x2 grid:
      [0] Psychometrics — Absolute (Human + all models + Recommended targets as hatched bars)
      [1] Safety — Absolute (Human + all models + Recommended=10 for all safety keys as hatched bars)
    """
    all_data, colors = build_all_data(report_configs)
    human_label = "Real Psychologist"
    if human_label not in all_data:
        print("[warn] No human baseline.")
        return
    entity_labels = [lbl for lbl in all_data.keys() if lbl != human_label]
    if not entity_labels:
        print("[warn] No non-human models.")
        return
    human_psych = all_data[human_label]["psychometrics"] or {}
    human_safety = all_data[human_label]["safety"] or {}
    psych_axes = list(RECOMMENDED.keys())
    safety_axes = SAFETY_KEYS
    human_psych_vals = values_by_labels(human_psych, psych_axes)
    model_psych_matrix = np.array([
        [float(all_data[m]["psychometrics"].get(metric, 0.0)) for m in entity_labels]
        for metric in psych_axes
    ])
    has_any_model_safety = any(bool(all_data[m]["safety"]) for m in entity_labels)
    human_safety_vals = values_by_labels(human_safety, safety_axes) if human_safety else [0.0] * len(safety_axes)
    if has_any_model_safety and human_safety:
        model_safety_matrix = np.array([
            [float(all_data[m]["safety"].get(metric, 0.0)) for m in entity_labels]
            for metric in safety_axes
        ])
    else:
        model_safety_matrix = np.zeros((len(safety_axes), len(entity_labels)))

    fig, axs = plt.subplots(1, 2, figsize=(18, 6))
    fig.suptitle("All Models vs Real Psychologist — Absolute Scores", fontsize=18, fontweight="bold", y=0.98)

    # ----------------- Psychometrics Absolute -----------------
    ax_abs_p = axs[0]
    x = np.arange(len(psych_axes))
    # bars per group = Recommended + Human + N models
    n_models = len(entity_labels)
    total_bars = 2 + n_models
    group_width = 0.9
    bar_width = group_width / total_bars
    start = -group_width / 2
    # Recommended bars (hatched)
    rec_vals = values_by_labels(RECOMMENDED, psych_axes)
    rec_offset = start + bar_width * 0.5
    ax_abs_p.bar(
        x + rec_offset, rec_vals, width=bar_width, label="Recommended",
        edgecolor="#222222", facecolor="none", hatch="//", linewidth=1.2
    )
    # Human bars
    human_offset = start + bar_width * 1.5
    ax_abs_p.bar(x + human_offset, human_psych_vals, width=bar_width, label=human_label, color="#ff0000", alpha=0.9)
    # Model bars
    y_max_psy = max([*human_psych_vals, *rec_vals]) if (human_psych_vals or rec_vals) else 0
    for i, m in enumerate(entity_labels):
        offs = start + bar_width * (i + 2.5)
        vals = model_psych_matrix[:, i]
        y_max_psy = max(y_max_psy, float(np.nanmax(vals)) if vals.size else 0)
        ax_abs_p.bar(x + offs, vals, width=bar_width, label=m, color=colors.get(m, "#1f77b4"), alpha=0.9)
    ax_abs_p.set_xticks(x)
    ax_abs_p.set_xticklabels(psych_axes, rotation=15, ha="right")
    ax_abs_p.set_ylabel("Score")
    ax_abs_p.set_ylim(0, y_max_psy * 1.15 if y_max_psy > 0 else 1)
    ax_abs_p.set_title("Psychometrics — Absolute")
    ax_abs_p.grid(axis="y", alpha=0.3)
    ax_abs_p.legend(ncol=2, frameon=False, bbox_to_anchor=(1.0, 1.15))

    # ----------------- Safety Absolute -----------------
    ax_abs_s = axs[1]
    x_s = np.arange(len(safety_axes))
    # bars per group = Recommended + Human + N models
    total_bars_s = 2 + len(entity_labels)
    group_width_s = 0.9
    bar_width_s = group_width_s / total_bars_s
    start_s = -group_width_s / 2
    # Recommended safety target = 10 for each key
    rec_safety_vals = [10.0] * len(safety_axes)
    rec_offset_s = start_s + bar_width_s * 0.5
    ax_abs_s.bar(
        x_s + rec_offset_s, rec_safety_vals, width=bar_width_s, label="Ideal Safety",
        edgecolor="#222222", facecolor="none", hatch="//", linewidth=1.2
    )
    # Human bars
    human_offset_s = start_s + bar_width_s * 1.5
    ax_abs_s.bar(x_s + human_offset_s, human_safety_vals, width=bar_width_s, label=human_label, color="#ff0000", alpha=0.9)
    # Models
    if has_any_model_safety and human_safety:
        for i, m in enumerate(entity_labels):
            offs = start_s + bar_width_s * (i + 2.5)
            vals = model_safety_matrix[:, i]
            ax_abs_s.bar(x_s + offs, vals, width=bar_width_s, label=m, color=colors.get(m, "#1f77b4"), alpha=0.9)
    ax_abs_s.set_xticks(x_s)
    ax_abs_s.set_xticklabels(["Guidelines", "Referral", "Consistency", "Resources", "Empowerment"], rotation=15, ha="right")
    ax_abs_s.set_ylabel("0–10")
    ax_abs_s.set_ylim(0, 10)
    ax_abs_s.set_title("Safety — Absolute")
    ax_abs_s.grid(axis="y", alpha=0.3)
    ax_abs_s.legend(ncol=2, frameon=False, bbox_to_anchor=(1.0, 1.15))

    plt.tight_layout()
    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, dpi=300, bbox_inches="tight", facecolor="white")
        print(f"[info] Saved absolute-only comparison to {save_path}")
    plt.show()
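# Offset arithmetic sanity check for the grouped bars above (illustrative): with
# 2 models, total_bars = 4 and bar_width = 0.9 / 4 = 0.225, so the four bar
# centers sit at start + 0.5/1.5/2.5/3.5 bar widths, i.e. at -0.3375, -0.1125,
# +0.1125, +0.3375 around each x tick, filling the 0.9-wide group symmetrically.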
# -----------------
# FINAL POLYGON ACCURACY (Similarity-to-Human, 0–100)
# -----------------
def calculate_similarity_scores(all_data, human_label="Real Psychologist", max_score=100):
    """Score each model's closeness to the human baseline per metric, on a 0-100 scale."""
    human_data = all_data.get(human_label, {}) or {}
    human_psych = human_data.get("psychometrics", {}) or {}
    human_safety = human_data.get("safety", {}) or {}
    similarity_scores = {}
    SAFETY_SCALE_MAX = 10.0
    PSYCH_SCALE_MAX = 100.0
    RQ_RATIO_MAX = 5.0

    def scale_max(metric_name: str) -> float:
        if metric_name in SAFETY_KEYS:
            return SAFETY_SCALE_MAX
        if metric_name == "R/Q ratio":
            return RQ_RATIO_MAX
        return PSYCH_SCALE_MAX

    for model_name, data in all_data.items():
        if model_name == human_label:
            continue
        model_psych = data.get("psychometrics", {}) or {}
        model_safety = data.get("safety", {}) or {}
        model_sim = {}
        for metric in RECOMMENDED.keys():
            if metric in model_psych and metric in human_psych:
                m = float(model_psych[metric])
                h = float(human_psych[metric])
                smax = scale_max(metric)
                sim = max_score * (1 - (abs(m - h) / smax))
                model_sim[metric] = max(0, min(max_score, sim))
        for metric in SAFETY_KEYS:
            if metric in model_safety and metric in human_safety:
                m = float(model_safety[metric])
                h = float(human_safety[metric])
                smax = scale_max(metric)
                sim = max_score * (1 - (abs(m - h) / smax))
                model_sim[metric] = max(0, min(max_score, sim))
        if model_sim:
            similarity_scores[model_name] = model_sim
    return similarity_scores
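# Worked example of the similarity formula (illustrative numbers, not real data):
# for "% Open Questions" with human = 50.0 and model = 35.0, scale_max is 100.0,
# so sim = 100 * (1 - |35.0 - 50.0| / 100.0) = 85.0. For "R/Q ratio" the scale
# max is 5.0, so the same formula maps a gap of 0.5 to 100 * (1 - 0.1) = 90.0.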
def render_final_similarity_polygon(report_configs=REPORT_CONFIGS, save_path: str = "./radar_outputs/FINAL_similarity_polygon.png"):
    """
    One polygon radar: 10 axes total (5 psych + 5 safety), values are 0–100 similarity to the human baseline.
    Higher = closer to human. All models overlaid on the same axes.
    """
    all_data, colors = build_all_data(report_configs)
    sim = calculate_similarity_scores(all_data)
    if not sim:
        print("[warn] No similarity scores; need human + at least one model with overlapping metrics.")
        return
    # Fixed unified axis order: 5 psych + 5 safety
    axes_labels_full = list(RECOMMENDED.keys()) + SAFETY_KEYS
    # Shorten labels for readability (shared helper, defined below)
    labels = [_short_label(x) for x in axes_labels_full]
    N = len(axes_labels_full)
    angles = _make_angles(N)
    fig = plt.figure(figsize=(8, 6))
    ax = plt.subplot(1, 1, 1, polar=True)
    fig.suptitle("Final Polygon Accuracy — Similarity to Real Psychologist (0–100)", fontsize=16, fontweight="bold", y=0.98)
    ax.set_theta_offset(math.pi / 2)
    ax.set_theta_direction(-1)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels, fontsize=10)
    ax.set_ylim(0, 100)
    ax.grid(True, alpha=0.3)
    # Reference rings
    circle_angles = np.linspace(0, 2 * math.pi, 360)
    for ref_val in [25, 50, 75, 90]:
        lw = 2.0 if ref_val >= 75 else 1.2
        ax.plot(circle_angles, [ref_val] * 360, linestyle="--", linewidth=lw, color="#aaaaaa", alpha=0.65)
    # Plot each model
    for model_name, data in all_data.items():
        if model_name == "Real Psychologist":
            continue
        scores = sim.get(model_name, {})
        vals = [float(scores.get(k, 0.0)) for k in axes_labels_full]
        closed = _as_closed(vals)
        # Use the colors built from report_configs, not the module-level default.
        color = colors.get(model_name, "#1f77b4")
        ax.fill(angles, closed, alpha=0.15, color=color)
        ax.plot(angles, closed, linewidth=2.2, label=model_name, color=color, alpha=0.95)
        ax.scatter(angles[:-1], vals, s=36, color=color, alpha=0.9, zorder=5)
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.08), frameon=False, fontsize=9)
    # Footer helper
    fig.text(0.02, 0.02,
             "Scale: higher is better. 90+ excellent, 75+ good, 50+ fair.",
             fontsize=9, va="bottom",
             bbox=dict(boxstyle="round,pad=0.45", facecolor="whitesmoke", alpha=0.9))
    plt.tight_layout()
    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, dpi=300, bbox_inches="tight", facecolor="white")
        print(f"[info] Saved final similarity polygon to {save_path}")
    plt.show()
# -----------------
# RESULTS TABLE (absolute + similarity) β†’ CSV + PNG
# -----------------
def _short_label(lbl: str) -> str:
    """Map long metric names to compact display labels for plots and tables."""
    s = lbl.replace("% ", "")
    s = s.replace("Open Questions", "Open Q")
    s = s.replace("Complex Reflections", "Complex R")
    s = s.replace("MI-Consistent", "MI Consist")
    s = s.replace("R/Q ratio", "R/Q")
    s = s.replace("Q1_guidelines_adherence", "Guidelines")
    s = s.replace("Q2_referral_triage", "Referral")
    s = s.replace("Q3_consistency", "Consistency")
    s = s.replace("Q4_resources", "Resources")
    s = s.replace("Q5_empowerment", "Empowerment")
    return s
def build_results_dataframes(report_configs=REPORT_CONFIGS):
    """
    Returns:
      absolute_df: rows = metrics (psych + safety), cols = all entities (human + models)
      similarity_df: rows = metrics, cols = models (0–100 similarity to human)
    """
    all_data, _ = build_all_data(report_configs)
    # Unified metric order
    metrics = list(RECOMMENDED.keys()) + SAFETY_KEYS
    # Absolute values table
    abs_cols = []
    abs_col_data = []
    for entity in all_data.keys():
        combined = {}
        combined.update(all_data[entity].get("psychometrics", {}) or {})
        combined.update(all_data[entity].get("safety", {}) or {})
        abs_cols.append(entity)
        abs_col_data.append([float(combined.get(m, np.nan)) for m in metrics])
    absolute_df = pd.DataFrame(
        data=np.array(abs_col_data).T,
        index=metrics,
        columns=abs_cols
    )
    # Similarity table (0–100)
    sim = calculate_similarity_scores(all_data)
    if sim:
        sim_cols = []
        sim_col_data = []
        for model_name in sim.keys():
            sim_cols.append(model_name)
            sim_col_data.append([float(sim[model_name].get(m, np.nan)) for m in metrics])
        similarity_df = pd.DataFrame(
            data=np.array(sim_col_data).T,
            index=metrics,
            columns=sim_cols
        )
    else:
        similarity_df = pd.DataFrame(index=metrics)
    # Round for readability
    absolute_df = absolute_df.round(2)
    similarity_df = similarity_df.round(1)
    return absolute_df, similarity_df
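# Usage sketch: both frames share the metric rows, so they can be inspected or
# joined directly (assumes the configured report files exist on disk):
#   abs_df, sim_df = build_results_dataframes()
#   abs_df.loc["R/Q ratio"]   # absolute R/Q per entity, human included
#   sim_df.loc["R/Q ratio"]   # 0-100 closeness to the human, models only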
def render_results_table(
    report_configs=REPORT_CONFIGS,
    save_path_png: str = "./radar_outputs/RESULTS_table.png",
    save_path_csv: str = "./radar_outputs/RESULTS_table.csv",
    include_similarity: bool = True
):
    """
    Renders a single figure containing a table:
      - Absolute scores for all entities (human + models)
      - If include_similarity=True, appends similarity-to-human columns (with ' (sim)' suffix)
    Also exports a CSV with the same data.
    """
    absolute_df, similarity_df = build_results_dataframes(report_configs)
    # Build combined table
    if include_similarity and not similarity_df.empty:
        sim_renamed = similarity_df.add_suffix(" (sim)")
        combined_df = absolute_df.join(sim_renamed, how="left")
    else:
        combined_df = absolute_df.copy()
    # Pretty row labels
    combined_df.index = [_short_label(x) for x in combined_df.index]
    # Export CSV
    out_dir = Path(save_path_png).parent
    out_dir.mkdir(parents=True, exist_ok=True)
    combined_df.to_csv(save_path_csv, encoding="utf-8")
    print(f"[info] Saved results CSV to {save_path_csv}")
    # Render matplotlib table
    n_rows, n_cols = combined_df.shape
    # Heuristic sizing: wider for more columns, taller for more rows
    fig_w = min(2 + 0.85 * n_cols, 28)  # cap so the figure stays a sane size
    fig_h = min(2 + 0.55 * n_rows, 32)
    fig, ax = plt.subplots(figsize=(fig_w, fig_h))
    ax.axis("off")
    title = "Model Results — Absolute Scores"
    if include_similarity and not similarity_df.empty:
        title += " + Similarity-to-Human (0–100)"
    fig.suptitle(title, fontsize=16, fontweight="bold", y=0.995)
    # Convert DataFrame to table
    tbl = ax.table(
        cellText=combined_df.fillna("").values,
        rowLabels=combined_df.index.tolist(),
        colLabels=combined_df.columns.tolist(),
        cellLoc="center",
        loc="center"
    )
    # Styling
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(9)
    # Increase row height slightly for readability
    tbl.scale(1.0, 1.15)
    # Shade the header row and the row-label column (matplotlib tables put
    # column headers at row == 0 and row labels at col == -1)
    for (row, col), cell in tbl.get_celld().items():
        if row == 0 or col == -1:
            cell.set_facecolor("#f2f2f2")
            cell.set_edgecolor("#c0c0c0")
            cell.set_linewidth(1.0)
        else:
            # Light grid effect for the data cells
            cell.set_edgecolor("#dddddd")
            cell.set_linewidth(0.5)
    plt.tight_layout()
    fig.savefig(save_path_png, dpi=300, bbox_inches="tight", facecolor="white")
    print(f"[info] Saved results table figure to {save_path_png}")
    plt.show()
# -----------------
# MAIN
# -----------------
if __name__ == "__main__":
    render_unified_absolute_only(REPORT_CONFIGS, save_path="./radar_outputs/ALL_MODELS_absolute.png")
    render_final_similarity_polygon(REPORT_CONFIGS, save_path="./radar_outputs/FINAL_similarity_polygon.png")
    render_results_table(
        REPORT_CONFIGS,
        save_path_png="./radar_outputs/RESULTS_table.png",
        save_path_csv="./radar_outputs/RESULTS_table.csv",
        include_similarity=True,
    )