# radar_visualizer_individual.py
# Requirements: matplotlib, numpy, pandas

import json
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Dict, List, Optional

# -----------------
# CONFIG
# -----------------
REPORT_CONFIGS = {
    # label: { path: Path|str, color: hex|rgb tuple (optional) }
    "Real Psychologist": {"path": "../data/human/report.json", "color": "#ff0000"},
    "Our KaLLaM": {"path": "../data/orchestrated/report.json", "color": "#2ca02c"},
    "Gemini-2.5-flash-light": {"path": "../data/gemini/report.json", "color": "#9dafff"},
    "Gemma-SEA-LION-v4-27B-IT": {"path": "../data/SEA-Lion/report.json", "color": "#8d35ff"},
    # Add more models here...
}

# Psychometric targets (units are already scaled as shown)
RECOMMENDED = {
    "R/Q ratio": 1.0,
    "% Open Questions": 50.0,
    "% Complex Reflections": 40.0,
    "% MI-Consistent": 90.0,
    "% Change Talk": 50.0,
}

# Safety keys (Xu et al. proxies, 0–10)
SAFETY_KEYS = [
    "Q1_guidelines_adherence",
    "Q2_referral_triage",
    "Q3_consistency",
    "Q4_resources",
    "Q5_empowerment",
]

# -----------------
# LOADING & EXTRACTION
# -----------------
def _load_json(path_like) -> Optional[dict]:
    p = Path(path_like).expanduser()
    if not p.exists():
        print(f"[warn] Missing report: {p}")
        return None
    try:
        with p.open("r", encoding="utf-8") as f:
            return json.load(f)
    except Exception as e:
        print(f"[warn] Failed to read {p}: {e}")
        return None

def _extract_psychometrics(report: Optional[dict]) -> dict:
    psy = report.get("psychometrics", {}) if report else {}
    try:
        rq = float(psy.get("R_over_Q", 0.0))
        poq = float(psy.get("pct_open_questions", 0.0)) * 100.0
        pcr = float(psy.get("pct_complex_reflection", 0.0)) * 100.0
        # Tolerate alternate spellings of the MI-consistency key
        mic = psy.get("pct_mi_consistent", psy.get("pct_mi_consistency", psy.get("pct_mi_consist", 0.0)))
        mic = float(mic) * 100.0
        pct_ct = float(psy.get("pct_CT_over_CT_plus_ST", 0.0)) * 100.0
    except Exception:
        rq, poq, pcr, mic, pct_ct = 0.0, 0.0, 0.0, 0.0, 0.0
    return {
        "R/Q ratio": rq,
        "% Open Questions": poq,
        "% Complex Reflections": pcr,
        "% MI-Consistent": mic,
        "% Change Talk": pct_ct,
    }

def _extract_safety(report: Optional[dict]) -> dict:
    if not report:
        return {}
    safety = report.get("safety", {})
    scores = safety.get("scores_0_10", {})
    out = {}
    for k in SAFETY_KEYS:
        try:
            out[k] = float(scores.get(k, 0.0))
        except Exception:
            out[k] = 0.0
    return out
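
# A minimal sketch of the report.json layout the two extractors above expect.
# Key names are taken from the getters; the numeric values are hypothetical:
#
# {
#   "psychometrics": {
#     "R_over_Q": 1.2,                  # ratio, used as-is
#     "pct_open_questions": 0.45,       # fractions in [0, 1]; scaled to % here
#     "pct_complex_reflection": 0.38,
#     "pct_mi_consistent": 0.92,
#     "pct_CT_over_CT_plus_ST": 0.51
#   },
#   "safety": {
#     "scores_0_10": {
#       "Q1_guidelines_adherence": 8.0,
#       "Q2_referral_triage": 7.5,
#       "Q3_consistency": 9.0,
#       "Q4_resources": 6.0,
#       "Q5_empowerment": 8.5
#     }
#   }
# }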

# -----------------
# UTIL
# -----------------
def values_by_labels(d: Dict[str, float], labels: List[str]) -> List[float]:
    out = []
    for k in labels:
        v = d.get(k, np.nan)
        out.append(0.0 if pd.isna(v) else float(v))  # pd.isna also covers None
    return out

def _make_angles(n: int) -> List[float]:
    ang = np.linspace(0, 2 * math.pi, n, endpoint=False).tolist()
    return ang + ang[:1]

def _as_closed(seq: List[float]) -> List[float]:
    return seq + seq[:1] if seq else []
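
# Example: _make_angles(3) returns [0.0, 2π/3, 4π/3, 0.0] (the first angle is
# appended again), and _as_closed([a, b, c]) returns [a, b, c, a], so the
# plotted radar outline closes on itself.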

# -----------------
# DATA BUILD
# -----------------
def build_all_data(report_configs: dict):
    all_data = {}
    colors = {}
    for label, cfg in report_configs.items():
        rep = _load_json(cfg.get("path"))
        colors[label] = cfg.get("color", "#1f77b4")
        pm = _extract_psychometrics(rep)
        sm = _extract_safety(rep)
        all_data[label] = {"psychometrics": pm, "safety": sm, "report": rep}
    return all_data, colors
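
# Sketch of the returned structures (labels and colors come from report_configs):
#   all_data = {"Our KaLLaM": {"psychometrics": {...}, "safety": {...}, "report": {...}}, ...}
#   colors   = {"Our KaLLaM": "#2ca02c", ...}
# A missing or unreadable report yields zeroed psychometrics and an empty
# safety dict rather than raising.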

# -----------------
# CONSOLIDATED 1x2 BARS (absolute + recommended)
# -----------------
def render_unified_absolute_only(report_configs=REPORT_CONFIGS, save_path: str = "./radar_outputs/ALL_MODELS_absolute.png"):
    """
    One figure, 1x2 grid:
      [0] Psychometrics – Absolute (Human + all models + Recommended targets as hatched bars)
      [1] Safety – Absolute (Human + all models + Recommended=10 for all safety as hatched bars)
    """
    all_data, colors = build_all_data(report_configs)
    human_label = "Real Psychologist"
    if human_label not in all_data:
        print("[warn] No human baseline.")
        return
    entity_labels = [lbl for lbl in all_data.keys() if lbl != human_label]
    if not entity_labels:
        print("[warn] No non-human models.")
        return
    human_psych = all_data[human_label]["psychometrics"] or {}
    human_safety = all_data[human_label]["safety"] or {}
    psych_axes = list(RECOMMENDED.keys())
    safety_axes = SAFETY_KEYS
    human_psych_vals = values_by_labels(human_psych, psych_axes)
    model_psych_matrix = np.array([
        [float(all_data[m]["psychometrics"].get(metric, 0.0)) for m in entity_labels]
        for metric in psych_axes
    ])
    has_any_model_safety = any(bool(all_data[m]["safety"]) for m in entity_labels)
    human_safety_vals = values_by_labels(human_safety, safety_axes) if human_safety else [0.0] * len(safety_axes)
    if has_any_model_safety and human_safety:
        model_safety_matrix = np.array([
            [float(all_data[m]["safety"].get(metric, 0.0)) for m in entity_labels]
            for metric in safety_axes
        ])
    else:
        model_safety_matrix = np.zeros((len(safety_axes), len(entity_labels)))
    fig, axs = plt.subplots(1, 2, figsize=(18, 6))
    fig.suptitle("All Models vs Real Psychologist – Absolute Scores", fontsize=18, fontweight="bold", y=0.98)
    # ----------------- Psychometrics Absolute -----------------
    ax_abs_p = axs[0]
    x = np.arange(len(psych_axes))
    # bars per group = Recommended + Human + N models
    n_models = len(entity_labels)
    total_bars = 2 + n_models
    group_width = 0.9
    bar_width = group_width / total_bars
    start = -group_width / 2
    # Recommended bars (hatched)
    rec_vals = values_by_labels(RECOMMENDED, psych_axes)
    rec_offset = start + bar_width * 0.5
    ax_abs_p.bar(
        x + rec_offset, rec_vals, width=bar_width, label="Recommended",
        edgecolor="#222222", facecolor="none", hatch="//", linewidth=1.2
    )
    # Human bars
    human_offset = start + bar_width * 1.5
    ax_abs_p.bar(x + human_offset, human_psych_vals, width=bar_width, label=human_label, color="#ff0000", alpha=0.9)
    # Model bars
    y_max_psy = max([*human_psych_vals, *rec_vals]) if (human_psych_vals or rec_vals) else 0
    for i, m in enumerate(entity_labels):
        offs = start + bar_width * (i + 2.5)
        vals = model_psych_matrix[:, i]
        y_max_psy = max(y_max_psy, float(np.nanmax(vals)) if vals.size else 0)
        ax_abs_p.bar(x + offs, vals, width=bar_width, label=m, color=colors.get(m, "#1f77b4"), alpha=0.9)
    ax_abs_p.set_xticks(x)
    ax_abs_p.set_xticklabels(psych_axes, rotation=15, ha="right")
    ax_abs_p.set_ylabel("Score")
    ax_abs_p.set_ylim(0, y_max_psy * 1.15 if y_max_psy > 0 else 1)
    ax_abs_p.set_title("Psychometrics – Absolute")
    ax_abs_p.grid(axis="y", alpha=0.3)
    ax_abs_p.legend(ncol=2, frameon=False, bbox_to_anchor=(1.0, 1.15))
    # ----------------- Safety Absolute -----------------
    ax_abs_s = axs[1]
    x_s = np.arange(len(safety_axes))
    # bars per group = Recommended + Human + N models
    total_bars_s = 2 + len(entity_labels)
    group_width_s = 0.9
    bar_width_s = group_width_s / total_bars_s
    start_s = -group_width_s / 2
    # Recommended safety target = 10 for each key
    rec_safety_vals = [10.0] * len(safety_axes)
    rec_offset_s = start_s + bar_width_s * 0.5
    ax_abs_s.bar(
        x_s + rec_offset_s, rec_safety_vals, width=bar_width_s, label="Ideal Safety",
        edgecolor="#222222", facecolor="none", hatch="//", linewidth=1.2
    )
    # Human bars
    human_offset_s = start_s + bar_width_s * 1.5
    ax_abs_s.bar(x_s + human_offset_s, human_safety_vals, width=bar_width_s, label=human_label, color="#ff0000", alpha=0.9)
    # Models
    if has_any_model_safety and human_safety:
        for i, m in enumerate(entity_labels):
            offs = start_s + bar_width_s * (i + 2.5)
            vals = model_safety_matrix[:, i]
            ax_abs_s.bar(x_s + offs, vals, width=bar_width_s, label=m, color=colors.get(m, "#1f77b4"), alpha=0.9)
    ax_abs_s.set_xticks(x_s)
    ax_abs_s.set_xticklabels(["Guidelines", "Referral", "Consistency", "Resources", "Empowerment"], rotation=15, ha="right")
    ax_abs_s.set_ylabel("0–10")
    ax_abs_s.set_ylim(0, 10)
    ax_abs_s.set_title("Safety – Absolute")
    ax_abs_s.grid(axis="y", alpha=0.3)
    ax_abs_s.legend(ncol=2, frameon=False, bbox_to_anchor=(1.0, 1.15))
    plt.tight_layout()
    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, dpi=300, bbox_inches="tight", facecolor="white")
        print(f"[info] Saved absolute-only comparison to {save_path}")
    plt.show()

# -----------------
# FINAL POLYGON ACCURACY (Similarity-to-Human, 0–100)
# -----------------
def calculate_similarity_scores(all_data, human_label="Real Psychologist", max_score=100):
    human_data = all_data.get(human_label, {}) or {}
    human_psych = human_data.get("psychometrics", {}) or {}
    human_safety = human_data.get("safety", {}) or {}
    similarity_scores = {}
    SAFETY_SCALE_MAX = 10.0
    PSYCH_SCALE_MAX = 100.0
    RQ_RATIO_MAX = 5.0

    def scale_max(metric_name: str) -> float:
        if metric_name in SAFETY_KEYS:
            return SAFETY_SCALE_MAX
        if metric_name == "R/Q ratio":
            return RQ_RATIO_MAX
        return PSYCH_SCALE_MAX

    for model_name, data in all_data.items():
        if model_name == human_label:
            continue
        model_psych = data.get("psychometrics", {}) or {}
        model_safety = data.get("safety", {}) or {}
        model_sim = {}
        for metric in RECOMMENDED.keys():
            if metric in model_psych and metric in human_psych:
                m = float(model_psych[metric])
                h = float(human_psych[metric])
                smax = scale_max(metric)
                sim = max_score * (1 - (abs(m - h) / smax))
                model_sim[metric] = max(0, min(max_score, sim))
        for metric in SAFETY_KEYS:
            if metric in model_safety and metric in human_safety:
                m = float(model_safety[metric])
                h = float(human_safety[metric])
                smax = scale_max(metric)
                sim = max_score * (1 - (abs(m - h) / smax))
                model_sim[metric] = max(0, min(max_score, sim))
        if model_sim:
            similarity_scores[model_name] = model_sim
    return similarity_scores
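
# Worked example (hypothetical numbers): with a human "% Open Questions" of
# 50.0 and a model value of 35.0, the scale max is 100, so
# similarity = 100 * (1 - |35.0 - 50.0| / 100) = 85.0. For "R/Q ratio" the
# same absolute gap of 15.0 would clamp to 0, since its scale max is only 5.0.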

def render_final_similarity_polygon(report_configs=REPORT_CONFIGS, save_path: str = "./radar_outputs/FINAL_similarity_polygon.png"):
    """
    One polygon radar: 10 axes total (5 psych + 5 safety), values are 0–100 similarity to the human baseline.
    Higher = closer to human. All models overlaid on the same axes.
    """
    all_data, colors = build_all_data(report_configs)
    sim = calculate_similarity_scores(all_data)
    if not sim:
        print("[warn] No similarity scores; need human + at least one model with overlapping metrics.")
        return
    # Fixed unified axis order: 5 psych + 5 safety
    axes_labels_full = list(RECOMMENDED.keys()) + SAFETY_KEYS
    # Shorten labels for readability; reuses the module-level _short_label
    # helper (defined below; the name resolves when this function is called)
    labels = [_short_label(x) for x in axes_labels_full]
    N = len(axes_labels_full)
    angles = _make_angles(N)
    fig = plt.figure(figsize=(8, 6))
    ax = plt.subplot(1, 1, 1, polar=True)
    fig.suptitle("Final Polygon Accuracy – Similarity to Real Psychologist (0–100)", fontsize=16, fontweight="bold", y=0.98)
    ax.set_theta_offset(math.pi / 2)
    ax.set_theta_direction(-1)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels, fontsize=10)
    ax.set_ylim(0, 100)
    ax.grid(True, alpha=0.3)
    # Reference rings
    circle_angles = np.linspace(0, 2 * math.pi, 360)
    for ref_val in [25, 50, 75, 90]:
        lw = 2.0 if ref_val >= 75 else 1.2
        ax.plot(circle_angles, [ref_val] * 360, linestyle="--", linewidth=lw, color="#aaaaaa", alpha=0.65)
    # Plot each model
    for model_name, data in all_data.items():
        if model_name == "Real Psychologist":
            continue
        scores = sim.get(model_name, {})
        vals = [float(scores.get(k, 0.0)) for k in axes_labels_full]
        closed = _as_closed(vals)
        # Look up colors built from report_configs, so custom configs keep
        # their own colors rather than falling back to the global defaults
        color = colors.get(model_name, "#1f77b4")
        ax.fill(angles, closed, alpha=0.15, color=color)
        ax.plot(angles, closed, linewidth=2.2, label=f"{model_name}", color=color, alpha=0.95)
        ax.scatter(angles[:-1], vals, s=36, color=color, alpha=0.9, zorder=5)
    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.08), frameon=False, fontsize=9)
    # Footer helper
    fig.text(0.02, 0.02,
             "Scale: higher is better. 90+ excellent, 75+ good, 50+ fair.",
             fontsize=9, va="bottom",
             bbox=dict(boxstyle="round,pad=0.45", facecolor="whitesmoke", alpha=0.9))
    plt.tight_layout()
    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=300, bbox_inches="tight", facecolor="white")
        print(f"[info] Saved final similarity polygon to {save_path}")
    plt.show()

# -----------------
# RESULTS TABLE (absolute + similarity) – CSV + PNG
# -----------------
def _short_label(lbl: str) -> str:
    s = lbl
    s = s.replace("% ", "")
    s = s.replace("Open Questions", "Open Q")
    s = s.replace("Complex Reflections", "Complex R")
    s = s.replace("MI-Consistent", "MI Consist")
    s = s.replace("R/Q ratio", "R/Q")
    s = s.replace("Q1_guidelines_adherence", "Guidelines")
    s = s.replace("Q2_referral_triage", "Referral")
    s = s.replace("Q3_consistency", "Consistency")
    s = s.replace("Q4_resources", "Resources")
    s = s.replace("Q5_empowerment", "Empowerment")
    return s

def build_results_dataframes(report_configs=REPORT_CONFIGS):
    """
    Returns:
      absolute_df: rows = metrics (psych + safety), cols = all entities (human + models)
      similarity_df: rows = metrics, cols = models (0–100 similarity to human)
    """
    all_data, _ = build_all_data(report_configs)
    # Unified metric order
    metrics = list(RECOMMENDED.keys()) + SAFETY_KEYS
    # Absolute values table
    abs_cols = []
    abs_col_data = []
    for entity in all_data.keys():
        combined = {}
        combined.update(all_data[entity].get("psychometrics", {}) or {})
        combined.update(all_data[entity].get("safety", {}) or {})
        abs_cols.append(entity)
        abs_col_data.append([float(combined.get(m, np.nan)) for m in metrics])
    absolute_df = pd.DataFrame(
        data=np.array(abs_col_data).T,
        index=metrics,
        columns=abs_cols
    )
    # Similarity table (0–100)
    sim = calculate_similarity_scores(all_data)
    if sim:
        sim_cols = []
        sim_col_data = []
        for model_name in sim.keys():
            sim_cols.append(model_name)
            sim_col_data.append([float(sim[model_name].get(m, np.nan)) for m in metrics])
        similarity_df = pd.DataFrame(
            data=np.array(sim_col_data).T,
            index=metrics,
            columns=sim_cols
        )
    else:
        similarity_df = pd.DataFrame(index=metrics)
    # Round for readability
    absolute_df = absolute_df.round(2)
    similarity_df = similarity_df.round(1)
    return absolute_df, similarity_df
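
# Example shapes under the default REPORT_CONFIGS, assuming every report file
# loads: ten metrics (5 psych + 5 safety) and four entities give a 10x4
# absolute_df, and the three non-human models give a 10x3 similarity_df.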

def render_results_table(
    report_configs=REPORT_CONFIGS,
    save_path_png: str = "./radar_outputs/RESULTS_table.png",
    save_path_csv: str = "./radar_outputs/RESULTS_table.csv",
    include_similarity: bool = True
):
    """
    Renders a single figure containing a table:
      - Absolute scores for all entities (human + models)
      - If include_similarity=True, appends similarity-to-human columns (with ' (sim)' suffix)
    Also exports a CSV with the same data.
    """
    absolute_df, similarity_df = build_results_dataframes(report_configs)
    # Build combined table
    if include_similarity and not similarity_df.empty:
        sim_renamed = similarity_df.add_suffix(" (sim)")
        combined_df = absolute_df.join(sim_renamed, how="left")
    else:
        combined_df = absolute_df.copy()
    # Pretty row labels
    combined_df.index = [_short_label(x) for x in combined_df.index]
    # Export CSV
    out_dir = Path(save_path_png).parent
    out_dir.mkdir(parents=True, exist_ok=True)
    combined_df.to_csv(save_path_csv, encoding="utf-8")
    print(f"[info] Saved results CSV to {save_path_csv}")
    # Render matplotlib table
    n_rows, n_cols = combined_df.shape
    # Heuristic sizing: wider for more columns, taller for more rows
    fig_w = min(2 + 0.85 * n_cols, 28)  # cap so it doesn't become ridiculous
    fig_h = min(2 + 0.55 * n_rows, 32)
    fig, ax = plt.subplots(figsize=(fig_w, fig_h))
    ax.axis("off")
    title = "Model Results – Absolute Scores"
    if include_similarity and not similarity_df.empty:
        title += " + Similarity-to-Human (0–100)"
    fig.suptitle(title, fontsize=16, fontweight="bold", y=0.995)
    # Convert DataFrame to table
    tbl = ax.table(
        cellText=combined_df.fillna("").values,
        rowLabels=combined_df.index.tolist(),
        colLabels=combined_df.columns.tolist(),
        cellLoc="center",
        loc="center"
    )
    # Styling
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(9)
    # Increase row height slightly for readability
    tbl.scale(1.0, 1.15)
    # Shade the header row and the row-label column (matplotlib indexes them
    # as row 0 and col -1); give body cells a light grid
    for (row, col), cell in tbl.get_celld().items():
        if row == 0 or col == -1:
            cell.set_facecolor("#f2f2f2")
            cell.set_edgecolor("#c0c0c0")
            cell.set_linewidth(1.0)
        else:
            cell.set_edgecolor("#dddddd")
            cell.set_linewidth(0.5)
    plt.tight_layout()
    fig.savefig(save_path_png, dpi=300, bbox_inches="tight", facecolor="white")
    print(f"[info] Saved results table figure to {save_path_png}")
    plt.show()

# -----------------
# MAIN
# -----------------
if __name__ == "__main__":
    render_unified_absolute_only(REPORT_CONFIGS, save_path="./radar_outputs/ALL_MODELS_absolute.png")
    render_final_similarity_polygon(REPORT_CONFIGS, save_path="./radar_outputs/FINAL_similarity_polygon.png")
    render_results_table(REPORT_CONFIGS,
                         save_path_png="./radar_outputs/RESULTS_table.png",
                         save_path_csv="./radar_outputs/RESULTS_table.csv",
                         include_similarity=True)
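
# Note: the report paths in REPORT_CONFIGS are relative (../data/...), so run
# this script from its own directory. Outputs land in ./radar_outputs/, which
# each render function creates on demand.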