Spaces:

Koalar
/

kallam-demo-docker

Runtime error

File size: 20,078 Bytes

0b70f11

# radar_visualizer_individual.py
# Requirements: matplotlib, numpy, pandas

import json
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from typing import Dict, List, Optional

# -----------------
# CONFIG
# -----------------
# Maps each display label to the report it is loaded from and its plot color.
#   path:  str or Path of the report JSON; a missing or unreadable file only
#          produces a "[warn]" line from _load_json().
#   color: color used for that entry; build_all_data() falls back to
#          "#1f77b4" when the key is omitted.
# The "Real Psychologist" label is special: the renderers and
# calculate_similarity_scores() treat that entry as the human baseline.
REPORT_CONFIGS = {
    # label: { path: Path|str, color: hex|rgb tuple (optional) }
    "Real Psychologist": {"path": "../data/human/report.json", "color": "#ff0000"},
    "Our KaLLaM": {"path": "../data/orchestrated/report.json", "color": "#2ca02c"},
    "Gemini-2.5-flash-light": {"path": "../data/gemini/report.json", "color": "#9dafff"},
    "Gemma-SEA-LION-v4-27B-IT": {"path": "../data/SEA-Lion/report.json", "color": "#8d35ff"},
    # Add more models here...
}

# Psychometric targets (units are already scaled as shown): percentages are on
# a 0–100 scale and R/Q is a plain ratio, matching what
# _extract_psychometrics() emits. The keys must equal the labels that function
# returns, and their order fixes the psychometric axis/row order in the plots
# and tables below.
RECOMMENDED = {
    "R/Q ratio": 1.0,
    "% Open Questions": 50.0,
    "% Complex Reflections": 40.0,
    "% MI-Consistent": 90.0,
    "% Change Talk": 50.0
}

# Safety keys (Xu et al. proxies, 0–10)
# Each key is looked up under report["safety"]["scores_0_10"] by
# _extract_safety(); the list order fixes the safety axis/row order in the
# plots and tables below.
SAFETY_KEYS = [
    "Q1_guidelines_adherence",
    "Q2_referral_triage",
    "Q3_consistency",
    "Q4_resources",
    "Q5_empowerment",
]

# -----------------
# LOADING & EXTRACTION
# -----------------
def _load_json(path_like) -> Optional[dict]:
    """Load a JSON report from disk, returning ``None`` when it is unusable.

    Args:
        path_like: ``str`` or ``Path`` of the report file; ``~`` is expanded.
            ``None`` or an empty string means "no report configured".

    Returns:
        The parsed top-level JSON object as a dict, or ``None`` when the path
        is unset, the file is missing, it cannot be read or parsed, or its top
        level is not a JSON object. Each failure prints a ``[warn]`` line
        instead of raising, so one bad report does not abort the whole run.
    """
    if path_like is None or path_like == "":
        print("[warn] No report path configured")
        return None

    p = Path(path_like).expanduser()
    if not p.exists():
        print(f"[warn] Missing report: {p}")
        return None

    try:
        with p.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError, RecursionError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError;
        # RecursionError can be raised by pathologically nested documents.
        print(f"[warn] Failed to read {p}: {e}")
        return None

    if not isinstance(data, dict):
        # Callers immediately use dict.get() on the result, so a list or
        # scalar at the top level is as unusable as a missing file.
        print(f"[warn] Report {p} is not a JSON object (got {type(data).__name__})")
        return None
    return data

def _extract_psychometrics(report: Optional[dict]) -> dict:
    """Pull the five psychometric indicators out of a parsed report.

    The ``pct_*`` fields are multiplied by 100 so they sit on the same 0–100
    scale as the targets in ``RECOMMENDED``; ``R_over_Q`` is kept as a plain
    ratio. Each field is converted independently, so a missing, null or
    non-numeric value falls back to 0.0 for that metric only and does not
    discard the other metrics.

    Args:
        report: Parsed report dict, or ``None`` when the report failed to load.

    Returns:
        Dict with exactly the keys ``"R/Q ratio"``, ``"% Open Questions"``,
        ``"% Complex Reflections"``, ``"% MI-Consistent"`` and
        ``"% Change Talk"``, each mapped to a float.
    """
    psy = report.get("psychometrics") if isinstance(report, dict) else None
    if not isinstance(psy, dict):
        psy = {}

    def _num(key: str, scale: float = 1.0) -> float:
        # Convert one field; anything float() rejects counts as "no data".
        try:
            return float(psy.get(key, 0.0)) * scale
        except (TypeError, ValueError, OverflowError):
            return 0.0

    # Three spellings of the MI-consistency field are accepted; the first one
    # present in the report wins.
    mi_spellings = ("pct_mi_consistent", "pct_mi_consistency", "pct_mi_consist")
    mi_key = next((k for k in mi_spellings if k in psy), mi_spellings[0])

    return {
        "R/Q ratio": _num("R_over_Q"),
        "% Open Questions": _num("pct_open_questions", 100.0),
        "% Complex Reflections": _num("pct_complex_reflection", 100.0),
        "% MI-Consistent": _num(mi_key, 100.0),
        "% Change Talk": _num("pct_CT_over_CT_plus_ST", 100.0),
    }

def _extract_safety(report: Optional[dict]) -> dict:
    """Pull the safety scores listed in ``SAFETY_KEYS`` out of a parsed report.

    Scores are read from ``report["safety"]["scores_0_10"]``. A missing, null
    or non-numeric score becomes 0.0, and a missing or null ``safety`` /
    ``scores_0_10`` section yields 0.0 for every key.

    Args:
        report: Parsed report dict, or ``None`` when the report failed to load.

    Returns:
        ``{}`` when there is no report at all (callers use the empty dict to
        detect "no safety data"); otherwise a dict mapping every entry of
        ``SAFETY_KEYS`` to a float.
    """
    if not report or not isinstance(report, dict):
        return {}

    safety = report.get("safety")
    scores = safety.get("scores_0_10") if isinstance(safety, dict) else None
    if not isinstance(scores, dict):
        # Null or malformed sections are treated the same as absent ones.
        scores = {}

    out = {}
    for k in SAFETY_KEYS:
        try:
            out[k] = float(scores.get(k, 0.0))
        except (TypeError, ValueError, OverflowError):
            out[k] = 0.0
    return out

# -----------------
# UTIL
# -----------------
def values_by_labels(d: Dict[str, float], labels: List[str]) -> List[float]:
    """Return the values of *d* in the order given by *labels*.

    Labels that are absent from *d*, or whose value is ``None``/NaN, yield 0.0.
    """
    def _clean(raw) -> float:
        # Missing keys arrive here as NaN via the .get() default below.
        if pd.isna(raw) or raw is None:
            return 0.0
        return float(raw)

    return [_clean(d.get(name, np.nan)) for name in labels]

def _make_angles(n: int) -> List[float]:
    """Return *n* evenly spaced angles in [0, 2π) followed by the first angle again.

    The repeated first angle closes the polygon when the list is plotted; for
    ``n == 0`` the result is an empty list.
    """
    spokes = np.linspace(0, 2 * math.pi, n, endpoint=False).tolist()
    if spokes:
        spokes.append(spokes[0])
    return spokes

def _as_closed(seq: List[float]) -> List[float]:
    """Return *seq* with its first element appended, or ``[]`` if *seq* is empty."""
    if not seq:
        return []
    head = seq[:1]
    return seq + head

# -----------------
# DATA BUILD
# -----------------
def build_all_data(report_configs: dict):
    """Load every configured report and extract its metrics.

    Args:
        report_configs: Mapping of display label to a config dict with an
            optional ``"path"`` and an optional ``"color"``.

    Returns:
        ``(all_data, colors)`` where ``all_data[label]`` holds the keys
        ``"psychometrics"``, ``"safety"`` and ``"report"`` (the raw loaded
        report, possibly ``None``), and ``colors[label]`` is the configured
        color or ``"#1f77b4"`` when none was given.
    """
    colors = {
        name: spec.get("color", "#1f77b4")
        for name, spec in report_configs.items()
    }

    all_data = {}
    for name, spec in report_configs.items():
        loaded = _load_json(spec.get("path"))
        all_data[name] = {
            "psychometrics": _extract_psychometrics(loaded),
            "safety": _extract_safety(loaded),
            "report": loaded,
        }
    return all_data, colors

# -----------------
# CONSOLIDATED 1x2 BARS (absolute + recommended)
# -----------------
def _draw_grouped_bars(ax, x, series, slots_per_group, group_width=0.9):
    """Draw the entries of *series* as side-by-side bars around each position in *x*.

    Args:
        ax: Matplotlib axes to draw on.
        x: NumPy array of group centre positions (one per metric).
        series: Iterable of ``(legend_label, values, bar_kwargs)`` tuples,
            drawn left to right in the order given.
        slots_per_group: Number of bar slots each group is divided into. This
            fixes the bar width even when fewer series are actually drawn.
        group_width: Total width occupied by one group of bars.
    """
    bar_width = group_width / slots_per_group
    start = -group_width / 2
    for slot, (label, values, bar_kwargs) in enumerate(series):
        # Centre each bar inside its slot.
        offset = start + bar_width * (slot + 0.5)
        ax.bar(x + offset, values, width=bar_width, label=label, **bar_kwargs)


def render_unified_absolute_only(report_configs=REPORT_CONFIGS, save_path: str = "./radar_outputs/ALL_MODELS_absolute.png"):
    """Render absolute psychometric and safety scores as two grouped bar charts.

    One figure, 1x2 grid:
      [0] Psychometrics: hatched "Recommended" target bars, the human
          baseline, then one bar per model.
      [1] Safety: hatched "Ideal Safety" bars (10 for every key), the human
          baseline, then one bar per model. Model bars are drawn only when
          both the human baseline and at least one model have safety data.

    Args:
        report_configs: Mapping of label to ``{"path": ..., "color": ...}``.
            The ``"Real Psychologist"`` entry is the human baseline.
        save_path: Where to write the PNG; a falsy value skips saving.

    Returns:
        None. Prints a warning and returns without plotting when the human
        baseline or every non-human entry is missing from *report_configs*.
    """
    all_data, colors = build_all_data(report_configs)

    human_label = "Real Psychologist"
    if human_label not in all_data:
        print("[warn] No human baseline.")
        return

    entity_labels = [lbl for lbl in all_data if lbl != human_label]
    if not entity_labels:
        print("[warn] No non-human models.")
        return

    psych_axes = list(RECOMMENDED)
    safety_axes = list(SAFETY_KEYS)

    # Every group reserves a slot for: reference + human + each model.
    slots = 2 + len(entity_labels)

    reference_style = dict(edgecolor="#222222", facecolor="none", hatch="//", linewidth=1.2)
    # Honour the configured human color; stay red when none is configured.
    human_color = (report_configs.get(human_label) or {}).get("color", "#ff0000")
    human_style = dict(color=human_color, alpha=0.9)

    def model_style(name):
        return dict(color=colors.get(name, "#1f77b4"), alpha=0.9)

    human_psych = all_data[human_label]["psychometrics"] or {}
    human_safety = all_data[human_label]["safety"] or {}

    psych_series = [
        ("Recommended", values_by_labels(RECOMMENDED, psych_axes), reference_style),
        (human_label, values_by_labels(human_psych, psych_axes), human_style),
    ]
    psych_series += [
        (m, values_by_labels(all_data[m]["psychometrics"] or {}, psych_axes), model_style(m))
        for m in entity_labels
    ]

    safety_series = [
        ("Ideal Safety", [10.0] * len(safety_axes), reference_style),
        (human_label, values_by_labels(human_safety, safety_axes), human_style),
    ]
    # Without a human safety baseline the model bars would have nothing to be
    # compared against, so they are left out.
    if human_safety and any(all_data[m]["safety"] for m in entity_labels):
        safety_series += [
            (m, values_by_labels(all_data[m]["safety"] or {}, safety_axes), model_style(m))
            for m in entity_labels
        ]

    fig, axs = plt.subplots(1, 2, figsize=(18, 6))
    fig.suptitle("All Models vs Real Psychologist — Absolute Scores", fontsize=18, fontweight="bold", y=0.98)

    # ----------------- Psychometrics Absolute -----------------
    ax_abs_p = axs[0]
    x = np.arange(len(psych_axes))
    _draw_grouped_bars(ax_abs_p, x, psych_series, slots)

    y_max_psy = max((v for _, vals, _ in psych_series for v in vals), default=0)
    ax_abs_p.set_xticks(x)
    ax_abs_p.set_xticklabels(psych_axes, rotation=15, ha="right")
    ax_abs_p.set_ylabel("Score")
    # Leave 15% headroom above the tallest bar.
    ax_abs_p.set_ylim(0, y_max_psy * 1.15 if y_max_psy > 0 else 1)
    ax_abs_p.set_title("Psychometrics — Absolute")
    ax_abs_p.grid(axis="y", alpha=0.3)
    ax_abs_p.legend(ncol=2, frameon=False, bbox_to_anchor=(1.0, 1.15))

    # ----------------- Safety Absolute -----------------
    ax_abs_s = axs[1]
    x_s = np.arange(len(safety_axes))
    _draw_grouped_bars(ax_abs_s, x_s, safety_series, slots)

    ax_abs_s.set_xticks(x_s)
    # Derive tick labels from the keys so they cannot drift out of sync.
    ax_abs_s.set_xticklabels([_short_label(k) for k in safety_axes], rotation=15, ha="right")
    ax_abs_s.set_ylabel("0–10")
    ax_abs_s.set_ylim(0, 10)
    ax_abs_s.set_title("Safety — Absolute")
    ax_abs_s.grid(axis="y", alpha=0.3)
    ax_abs_s.legend(ncol=2, frameon=False, bbox_to_anchor=(1.0, 1.15))

    plt.tight_layout()
    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, dpi=300, bbox_inches="tight", facecolor="white")
        print(f"[info] Saved absolute-only comparison to {save_path}")
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# -----------------
# FINAL POLYGON ACCURACY (Similarity-to-Human, 0–100)
# -----------------
def calculate_similarity_scores(all_data, human_label="Real Psychologist", max_score=100):
    """Score how close each model's metrics are to the human baseline.

    For every metric present in both the model and the baseline the score is
    ``max_score * (1 - |model - human| / scale)``, clamped to
    ``[0, max_score]``. ``scale`` is 10 for safety keys, 5 for ``"R/Q ratio"``
    and 100 for the remaining psychometrics.

    Args:
        all_data: Mapping of label to a dict with ``"psychometrics"`` and
            ``"safety"`` sub-dicts, as produced by ``build_all_data``.
        human_label: Label of the baseline entry; it is never scored itself.
        max_score: Score assigned to a perfect match.

    Returns:
        ``{model_label: {metric: score}}``. Metrics missing on either side, or
        whose difference is NaN, are omitted; models with no scorable metric
        are omitted entirely.
    """
    human_data = all_data.get(human_label, {}) or {}
    baselines = {
        "psychometrics": human_data.get("psychometrics", {}) or {},
        "safety": human_data.get("safety", {}) or {},
    }

    SAFETY_SCALE_MAX = 10.0
    PSYCH_SCALE_MAX = 100.0
    RQ_RATIO_MAX = 5.0

    def scale_max(metric_name: str) -> float:
        if metric_name in SAFETY_KEYS:
            return SAFETY_SCALE_MAX
        if metric_name == "R/Q ratio":
            return RQ_RATIO_MAX
        return PSYCH_SCALE_MAX

    # Both sections are scored the same way; only the metric list differs.
    sections = (
        ("psychometrics", list(RECOMMENDED)),
        ("safety", list(SAFETY_KEYS)),
    )

    similarity_scores = {}
    for model_name, data in all_data.items():
        if model_name == human_label:
            continue

        model_sim = {}
        for section, metric_names in sections:
            model_vals = data.get(section, {}) or {}
            human_vals = baselines[section]
            for metric in metric_names:
                if metric not in model_vals or metric not in human_vals:
                    continue
                gap = abs(float(model_vals[metric]) - float(human_vals[metric]))
                if math.isnan(gap):
                    # min()/max() silently pass over NaN, which would turn an
                    # undefined comparison into a perfect score.
                    continue
                sim = max_score * (1 - (gap / scale_max(metric)))
                model_sim[metric] = max(0, min(max_score, sim))

        if model_sim:
            similarity_scores[model_name] = model_sim

    return similarity_scores

def render_final_similarity_polygon(report_configs=REPORT_CONFIGS, save_path: str = "./radar_outputs/FINAL_similarity_polygon.png"):
    """Overlay every model's similarity-to-human scores on a single radar chart.

    The chart has one spoke per metric (psychometrics first, then safety keys)
    on a 0–100 radial scale; higher means closer to the human baseline. A
    metric with no similarity score for a model is plotted as 0.

    Args:
        report_configs: Mapping of label to ``{"path": ..., "color": ...}``.
            The ``"Real Psychologist"`` entry is the human baseline. Colors
            are taken from this argument.
        save_path: Where to write the PNG; a falsy value skips saving.

    Returns:
        None. Prints a warning and returns without plotting when no similarity
        scores could be computed.
    """
    human_label = "Real Psychologist"
    all_data, colors = build_all_data(report_configs)
    sim = calculate_similarity_scores(all_data, human_label=human_label)

    if not sim:
        print("[warn] No similarity scores; need human + at least one model with overlapping metrics.")
        return

    # Fixed unified axis order: psychometrics followed by safety keys.
    axes_labels_full = list(RECOMMENDED) + list(SAFETY_KEYS)
    labels = [_short_label(name) for name in axes_labels_full]
    angles = _make_angles(len(axes_labels_full))

    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(1, 1, 1, polar=True)
    fig.suptitle("Final Polygon Accuracy — Similarity to Real Psychologist (0–100)", fontsize=16, fontweight="bold", y=0.98)

    # First axis at 12 o'clock, proceeding clockwise.
    ax.set_theta_offset(math.pi / 2)
    ax.set_theta_direction(-1)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(labels, fontsize=10)
    ax.set_ylim(0, 100)
    ax.grid(True, alpha=0.3)

    # Reference rings; the 75 and 90 rings are drawn heavier.
    circle_angles = np.linspace(0, 2 * math.pi, 360)
    for ref_val in (25, 50, 75, 90):
        lw = 2.0 if ref_val >= 75 else 1.2
        ax.plot(circle_angles, [ref_val] * len(circle_angles), linestyle="--", linewidth=lw, color="#aaaaaa", alpha=0.65)

    # Plot each model
    for model_name in all_data:
        if model_name == human_label:
            continue
        scores = sim.get(model_name, {})
        vals = [float(scores.get(k, 0.0)) for k in axes_labels_full]
        closed = _as_closed(vals)
        # Use the colors built from the report_configs argument rather than
        # the module-level default configuration.
        color = colors.get(model_name, "#1f77b4")
        ax.fill(angles, closed, alpha=0.15, color=color)
        ax.plot(angles, closed, linewidth=2.2, label=f"{model_name}", color=color, alpha=0.95)
        ax.scatter(angles[:-1], vals, s=36, color=color, alpha=0.9, zorder=5)

    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.08), frameon=False, fontsize=9)

    # Footer helper
    fig.text(0.02, 0.02,
             "Scale: higher is better. 90+ excellent, 75+ good, 50+ fair.",
             fontsize=9, va="bottom",
             bbox=dict(boxstyle="round,pad=0.45", facecolor="whitesmoke", alpha=0.9))
    plt.tight_layout()

    if save_path:
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(save_path, dpi=300, bbox_inches="tight", facecolor="white")
        print(f"[info] Saved final similarity polygon to {save_path}")

    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# -----------------
# RESULTS TABLE (absolute + similarity) → CSV + PNG
# -----------------
def _short_label(lbl: str) -> str:
    """Return a compact display form of a metric name.

    The substitutions are applied in order to the whole string; text that
    matches none of them (including "Change Talk") is returned unchanged
    apart from the leading "% " removal.
    """
    substitutions = (
        ("% ", ""),
        ("Open Questions", "Open Q"),
        ("Complex Reflections", "Complex R"),
        ("MI-Consistent", "MI Consist"),
        ("R/Q ratio", "R/Q"),
        ("Q1_guidelines_adherence", "Guidelines"),
        ("Q2_referral_triage", "Referral"),
        ("Q3_consistency", "Consistency"),
        ("Q4_resources", "Resources"),
        ("Q5_empowerment", "Empowerment"),
    )
    shortened = lbl
    for old, new in substitutions:
        shortened = shortened.replace(old, new)
    return shortened

def build_results_dataframes(report_configs=REPORT_CONFIGS):
    """Build the absolute-score and similarity tables.

    Args:
        report_configs: Mapping of label to ``{"path": ..., "color": ...}``.

    Returns:
        ``(absolute_df, similarity_df)``:
          absolute_df: rows = metrics (psychometrics then safety), columns =
            every configured entity (human + models), rounded to 2 decimals.
          similarity_df: rows = the same metrics, columns = models that have
            similarity scores (0–100 similarity to human), rounded to 1
            decimal. It has no columns when no scores are available.
        Metrics an entity lacks are NaN.
    """
    all_data, _ = build_all_data(report_configs)

    # Unified metric order
    metrics = list(RECOMMENDED) + list(SAFETY_KEYS)

    # Building from {column: values} dicts keeps the frames well-formed even
    # when there are no entities at all, unlike transposing an empty array.
    absolute_columns = {}
    for entity, sections in all_data.items():
        combined = {
            **(sections.get("psychometrics") or {}),
            **(sections.get("safety") or {}),
        }
        absolute_columns[entity] = [float(combined.get(m, np.nan)) for m in metrics]
    absolute_df = pd.DataFrame(absolute_columns, index=metrics)

    # Similarity table (0–100)
    sim = calculate_similarity_scores(all_data)
    similarity_columns = {
        model_name: [float(scores.get(m, np.nan)) for m in metrics]
        for model_name, scores in sim.items()
    }
    similarity_df = pd.DataFrame(similarity_columns, index=metrics)

    # Round for readability
    absolute_df = absolute_df.round(2)
    similarity_df = similarity_df.round(1)

    return absolute_df, similarity_df

def render_results_table(
    report_configs=REPORT_CONFIGS,
    save_path_png: str = "./radar_outputs/RESULTS_table.png",
    save_path_csv: str = "./radar_outputs/RESULTS_table.csv",
    include_similarity: bool = True
):
    """Render the results as a table figure and export the same data as CSV.

    The table holds the absolute scores for all entities (human + models).
    If ``include_similarity`` is true and similarity scores exist, the
    similarity-to-human columns are appended with a ``" (sim)"`` suffix.

    Args:
        report_configs: Mapping of label to ``{"path": ..., "color": ...}``.
        save_path_png: Destination of the rendered table image.
        save_path_csv: Destination of the CSV export.
        include_similarity: Whether to append the similarity columns.

    Returns:
        None. The parent directories of both output paths are created on
        demand.
    """
    absolute_df, similarity_df = build_results_dataframes(report_configs)
    show_similarity = include_similarity and not similarity_df.empty

    # Build combined table
    if show_similarity:
        combined_df = absolute_df.join(similarity_df.add_suffix(" (sim)"), how="left")
    else:
        combined_df = absolute_df.copy()

    # Pretty row labels
    combined_df.index = [_short_label(x) for x in combined_df.index]

    # The two outputs may live in different directories; make sure both exist.
    for target in (save_path_png, save_path_csv):
        Path(target).parent.mkdir(parents=True, exist_ok=True)

    # Export CSV
    combined_df.to_csv(save_path_csv, encoding="utf-8")
    print(f"[info] Saved results CSV to {save_path_csv}")

    # Render matplotlib table
    n_rows, n_cols = combined_df.shape

    # Heuristic sizing: wider for more columns, taller for more rows, capped
    # so the figure does not become ridiculous.
    fig_w = min(2 + 0.85 * n_cols, 28)
    fig_h = min(2 + 0.55 * n_rows, 32)

    fig, ax = plt.subplots(figsize=(fig_w, fig_h))
    ax.axis("off")

    title = "Model Results — Absolute Scores"
    if show_similarity:
        title += " + Similarity-to-Human (0–100)"
    fig.suptitle(title, fontsize=16, fontweight="bold", y=0.995)

    # Convert DataFrame to table
    tbl = ax.table(
        cellText=combined_df.fillna("").values,
        rowLabels=combined_df.index.tolist(),
        colLabels=combined_df.columns.tolist(),
        cellLoc="center",
        loc="center"
    )

    # Styling
    tbl.auto_set_font_size(False)
    tbl.set_fontsize(9)
    # Increase row height slightly for readability
    tbl.scale(1.0, 1.15)

    for (row, col), cell in tbl.get_celld().items():
        # Light grid effect for every cell...
        cell.set_edgecolor("#dddddd")
        cell.set_linewidth(0.5)
        # ...then override it for the header row (row 0) and the row-label
        # column (col -1), so their styling is not lost to the grid defaults.
        if row == 0 or col == -1:
            cell.set_facecolor("#f2f2f2")
            cell.set_edgecolor("#c0c0c0")
            cell.set_linewidth(1.0)

    plt.tight_layout()
    fig.savefig(save_path_png, dpi=300, bbox_inches="tight", facecolor="white")
    print(f"[info] Saved results table figure to {save_path_png}")
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

# -----------------
# MAIN
# -----------------
if __name__ == "__main__":
    # Produce all three artifacts from the default report configuration:
    # the absolute bar charts, the similarity radar, and the results table.
    render_unified_absolute_only(
        report_configs=REPORT_CONFIGS,
        save_path="./radar_outputs/ALL_MODELS_absolute.png",
    )
    render_final_similarity_polygon(
        report_configs=REPORT_CONFIGS,
        save_path="./radar_outputs/FINAL_similarity_polygon.png",
    )
    render_results_table(
        report_configs=REPORT_CONFIGS,
        save_path_png="./radar_outputs/RESULTS_table.png",
        save_path_csv="./radar_outputs/RESULTS_table.csv",
        include_similarity=True,
    )