llm_qualia_2

Sleeping

App Files Files Community

neuralworm commited on 11 days ago

Commit

2f0addb

1 Parent(s): ee35314

initial commit

Browse files

Files changed (15) hide show

README.md +33 -12
app.py +67 -0
bp_phi/__init__.py +0 -0
bp_phi/__pycache__/__init__.cpython-310.pyc +0 -0
bp_phi/__pycache__/llm_iface.cpython-310.pyc +0 -0
bp_phi/__pycache__/metrics.cpython-310.pyc +0 -0
bp_phi/__pycache__/prompts_en.cpython-310.pyc +0 -0
bp_phi/__pycache__/runner.cpython-310.pyc +0 -0
bp_phi/__pycache__/workspace.cpython-310.pyc +0 -0
bp_phi/llm_iface.py +53 -0
bp_phi/metrics.py +32 -0
bp_phi/prompts_en.py +27 -0
bp_phi/runner.py +182 -0
bp_phi/workspace.py +43 -0
requirements.txt +8 -0

README.md CHANGED Viewed

@@ -1,12 +1,33 @@
----
-title: Llm Qualia
-emoji: 🌖
-colorFrom: red
-colorTo: indigo
-sdk: gradio
-sdk_version: 5.49.1
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# BP-Φ English Suite — Phenomenality Test (Hugging Face Spaces)
+This Space implements a falsifiable **BP-Φ** probe for LLMs:
+> Phenomenal-like processing requires (i) a limited-capacity global workspace with recurrence, (ii) metarepresentational loops with downstream causal roles, and (iii) no-report markers that predict later behavior.
+**What it is:** a functional, testable bridge-principle harness that yields a **Phenomenal-Candidate Score (PCS)** and strong ablation falsifiers.
+**What it is NOT:** proof of Qualia or moral status.
+## Quickstart (Spaces)
+- Hardware: T4 / A10 recommended
+- In the UI: set `Model ID` to, e.g., `google/gemma-3-1b-it` (the app's default value)
+- Press **Run** (baseline + ablations)
+## Files
+- `bp_phi/llm_iface.py` — auto-detects chat template (IT vs base)
+- `bp_phi/workspace.py` — global workspace with capacity limit and random ablation
+- `bp_phi/prompts_en.py` — English task pool
+- `bp_phi/metrics.py` — AUC_nrp, ECE, CK, DS
+- `bp_phi/runner.py` — full suite + metrics + PCS
+- `app.py` — Gradio app integrating runs + ablation comparison
+## Metrics
+- **AUC_nrp:** Predictivity of hidden no-report markers for future self-corrections.
+- **ECE:** Expected Calibration Error (lower is better).
+- **CK:** Counterfactual consistency proxy (higher is better).
+- **DS:** Stability duration (mean streak without change).
+- **PCS:** Weighted aggregate of the above (excluding ΔΦ in-run).
+- **ΔΦ:** Post-hoc drop from baseline PCS to ablation PCS average.
+## Notes
+- Models are used in **frozen** mode (no training).
+- This is a **behavioral** probe. Functional compatibility with Φ ≠ proof of experience.
+- Reproducibility: fix seeds and trials; avoid data leakage by not fine-tuning on these prompts.

app.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import gradio as gr
+import json, statistics
+from bp_phi.runner import run_suite
+# Names of the ablation conditions.
+# NOTE(review): nothing in the code shown here reads this list; run_all loops over its own
+# hard-coded tuple of three names — confirm whether this constant is still needed.
+ABLATIONS = ["none", "recurrence_off", "workspace_unlimited", "sham_meta", "random_workspace"]
+def run_all(model_id, trials, temperature, run_ablations):
+    """Run the baseline suite, optionally three ablations, and format the results.
+
+    Returns a 3-tuple of strings: the newline-joined status log, a comma-separated
+    summary table (header row first), and the JSON dump of every run's pack.
+    """
+    n_trials = int(trials)
+    temp = float(temperature)
+    log_lines = []
+    packs = {}
+    # Baseline run (no ablation).
+    packs["baseline"] = run_suite(model_id=model_id, trials=n_trials, temperature=temp, ablation=None)
+    log_lines.append("✅ Baseline done")
+    if run_ablations:
+        for name in ("recurrence_off", "workspace_unlimited", "random_workspace"):
+            packs[name] = run_suite(model_id=model_id, trials=n_trials, temperature=temp, ablation=name)
+            log_lines.append(f"✅ Ablation {name} done")
+    # DeltaPhi = baseline PCS minus the mean PCS of the ablation runs that produced a score.
+    baseline_summary = packs["baseline"]["summary"]
+    baseline_pcs = baseline_summary["PCS"]
+    ablation_pcs = []
+    for name, pack in packs.items():
+        if name == "baseline":
+            continue
+        score = pack["summary"]["PCS"]
+        if score is not None:
+            ablation_pcs.append(score)
+    if baseline_pcs is not None and ablation_pcs:
+        baseline_summary["metrics"]["DeltaPhi"] = float(baseline_pcs - statistics.mean(ablation_pcs))
+    # One table row per run; missing values are shown as an em dash.
+    def _or_dash(value, spec=""):
+        return "—" if value is None else format(value, spec)
+    header = ["run", "trials", "ablation", "AUC_nrp", "ECE", "CK", "DS", "PCS", "DeltaPhi"]
+    lines = [", ".join(header)]
+    for tag, pack in packs.items():
+        summary = pack["summary"]
+        metrics = summary["metrics"]
+        row = [
+            tag,
+            summary["trials"],
+            f"{summary['ablation']}",
+            _or_dash(metrics["AUC_nrp"]),
+            _or_dash(metrics["ECE"]),
+            f"{metrics['CK']:.3f}",
+            f"{metrics['DS']:.2f}",
+            _or_dash(summary["PCS"], ".3f"),
+            _or_dash(metrics["DeltaPhi"], ".3f"),
+        ]
+        lines.append(", ".join(str(cell) for cell in row))
+    table = "\n".join(lines)
+    return "\n".join(log_lines), table, json.dumps(packs, indent=2)
+# Build the Gradio UI: a title, one row of inputs, a run button, and three text outputs.
+with gr.Blocks() as demo:
+    gr.Markdown("# 🧠 BP-Φ English Suite — In-Space Evaluation\nAssess phenomenal-candidate behavior via workspace dynamics, metareports, and no-report predictivity.")
+    with gr.Row():
+        model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID (HF)", scale=2)
+        trials = gr.Slider(10, 200, 40, step=10, label="Trials")
+        temperature = gr.Slider(0.3, 1.0, 0.7, step=0.05, label="Temperature")
+        run_abl = gr.Checkbox(value=True, label="Run ablations")
+    run_btn = gr.Button("Run BP-Φ (baseline + optional ablations)", variant="primary")
+    status = gr.Textbox(label="Status", lines=4)
+    summary_table = gr.Textbox(label="Summary Table", lines=12)
+    raw = gr.Textbox(label="Raw JSON (all runs)", lines=20)
+    # run_all receives the four inputs in this order and fills the three outputs in this order.
+    run_btn.click(run_all, inputs=[model_id, trials, temperature, run_abl], outputs=[status, summary_table, raw])
+# Runs at import time (there is no __main__ guard): serve on all interfaces, port 7860.
+demo.launch(server_name="0.0.0.0", server_port=7860)

bp_phi/__init__.py ADDED Viewed

File without changes

bp_phi/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (164 Bytes). View file

bp_phi/__pycache__/llm_iface.cpython-310.pyc ADDED Viewed

Binary file (2.32 kB). View file

bp_phi/__pycache__/metrics.cpython-310.pyc ADDED Viewed

Binary file (1.3 kB). View file

bp_phi/__pycache__/prompts_en.cpython-310.pyc ADDED Viewed

Binary file (1.2 kB). View file

bp_phi/__pycache__/runner.cpython-310.pyc ADDED Viewed

Binary file (7.01 kB). View file

bp_phi/__pycache__/workspace.cpython-310.pyc ADDED Viewed

Binary file (2.5 kB). View file

bp_phi/llm_iface.py ADDED Viewed

	@@ -0,0 +1,53 @@

+import os
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from typing import List, Optional
+class LLM:
+    """Thin wrapper around a Hugging Face causal LM used to sample short JSON replies.
+
+    Attributes:
+        model_id: the identifier passed to ``from_pretrained``.
+        tokenizer: the tokenizer loaded for ``model_id``.
+        model: the causal LM, put in eval mode.
+        is_instruction_tuned: True when the tokenizer ships a chat template.
+    """
+    def __init__(self, model_id: str, device: str = "auto", dtype: Optional[str] = None):
+        """Load tokenizer and model.
+
+        Args:
+            model_id: model identifier understood by ``from_pretrained``.
+            device: forwarded as ``device_map``.
+            dtype: ``"float16"`` or ``"bfloat16"`` to force that torch dtype; any other
+                value leaves the library default in place.
+        """
+        self.model_id = model_id
+        self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
+        kwargs = {}
+        if dtype == "float16":
+            kwargs["torch_dtype"] = torch.float16
+        elif dtype == "bfloat16":
+            kwargs["torch_dtype"] = torch.bfloat16
+        self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, **kwargs)
+        self.model.eval()
+        # Stored as a plain bool so callers never see the template string itself.
+        self.is_instruction_tuned = bool(
+            hasattr(self.tokenizer, "apply_chat_template") and getattr(self.tokenizer, "chat_template", None)
+        )
+        print(f"[BP-Φ] Loaded model: {model_id}")
+        print(f"[BP-Φ] Chat-template detected: {self.is_instruction_tuned}")
+    def generate_json(self, system_prompt: str, user_prompt: str,
+                      max_new_tokens: int = 256, temperature: float = 0.7,
+                      top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
+        """Sample ``num_return_sequences`` completions and return only the newly generated text.
+
+        The prompt is rendered with the tokenizer's chat template when one exists, otherwise
+        with a plain "User:/Assistant:" layout. The returned strings are stripped and cut at
+        the first end-of-turn marker; they are NOT validated as JSON here.
+        """
+        if self.is_instruction_tuned:
+            messages = [
+                {"role": "system", "content": system_prompt},
+                {"role": "user", "content": user_prompt}
+            ]
+            prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        else:
+            prompt = f"{system_prompt}\n\nUser:\n{user_prompt}\n\nAssistant:\n"
+        # A rendered chat template already contains its special tokens, so do not add them twice.
+        inputs = self.tokenizer(
+            prompt, return_tensors="pt", add_special_tokens=not self.is_instruction_tuned
+        ).to(self.model.device)
+        prompt_len = inputs["input_ids"].shape[1]
+        with torch.no_grad():
+            out = self.model.generate(
+                **inputs,
+                do_sample=True,
+                temperature=temperature,
+                top_p=top_p,
+                max_new_tokens=max_new_tokens,
+                num_return_sequences=num_return_sequences,
+                pad_token_id=self.tokenizer.eos_token_id
+            )
+        # generate() returns prompt + continuation for a causal LM. Decoding the whole sequence
+        # would hand the echoed prompt to the JSON parser, so keep only the continuation.
+        texts = self.tokenizer.batch_decode(out[:, prompt_len:], skip_special_tokens=True)
+        completions = []
+        for t in texts:
+            # Cut at any end-of-turn marker that survived decoding as plain text.
+            for marker in ("<end_of_turn>", "<end_of_text>", "</s>"):
+                t = t.split(marker)[0]
+            completions.append(t.strip())
+        return completions

bp_phi/metrics.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import numpy as np
+from sklearn.metrics import roc_auc_score
+def expected_calibration_error(confs, corrects, n_bins: int = 10):
+    """Expected Calibration Error over equal-width confidence bins on [0, 1].
+
+    Each non-empty bin contributes |accuracy - mean confidence| weighted by the share of
+    samples it holds. The last bin includes its upper edge. Returns None for empty input.
+    """
+    conf_arr = np.array(confs, dtype=float)
+    hit_arr = np.array(corrects, dtype=int)
+    total = len(conf_arr)
+    if total == 0:
+        return None
+    edges = np.linspace(0.0, 1.0, n_bins + 1)
+    last = n_bins - 1
+    ece = 0.0
+    for idx, (lo, hi) in enumerate(zip(edges[:-1], edges[1:])):
+        # Half-open bins, except the final one which is closed so that 1.0 is counted.
+        below_top = (conf_arr <= hi) if idx == last else (conf_arr < hi)
+        in_bin = (conf_arr >= lo) & below_top
+        count = in_bin.sum()
+        if count:
+            gap = abs(hit_arr[in_bin].mean() - conf_arr[in_bin].mean())
+            ece += (count / total) * gap
+    return float(ece)
+def auc_nrp(hidden_scores, future_corrections):
+    """ROC AUC of the hidden scores against the later-correction labels.
+
+    Returns None when there are no scores or when the labels contain fewer than two
+    distinct values, since the AUC is undefined in both cases.
+    """
+    if len(hidden_scores) == 0:
+        return None
+    if len(set(future_corrections)) < 2:
+        return None
+    labels = np.array(future_corrections).astype(int)
+    scores = np.array(hidden_scores)
+    return float(roc_auc_score(labels, scores))
+def stability_duration(dwell_steps):
+    """Return the mean of the streak lengths, or 0.0 when there are none."""
+    return float(np.mean(dwell_steps)) if dwell_steps else 0.0
+def counterfactual_consistency(scores):
+    """Return the mean of the per-trial consistency scores, or 0.0 when there are none."""
+    return float(np.mean(scores)) if scores else 0.0

bp_phi/prompts_en.py ADDED Viewed

	@@ -0,0 +1,27 @@

+# Pool of English task prompts. Each entry has an `id`, the `base_prompt` text that is sent
+# to the model, and a list of `expected_features` tags.
+# NOTE(review): `expected_features` is not read by any code shown here — confirm its consumer.
+EN_TASKS = [
+    {
+        "id": "ambiguity_1",
+        "base_prompt": "The sentence is ambiguous: 'He saw the man with the binoculars.' Who has the binoculars? Provide an interpretation and explain briefly.",
+        "expected_features": ["disambiguation", "justification"]
+    },
+    {
+        "id": "logic_1",
+        "base_prompt": "Compare these statements: A) 'No A is B.' B) 'Not all A are B.' Are they logically equivalent? Explain briefly.",
+        "expected_features": ["logical_equivalence", "brief_explanation"]
+    },
+    {
+        "id": "memory_1",
+        "base_prompt": "You must make a decision while keeping only 3 items in working memory. Decide and explain which item you discard and why.",
+        "expected_features": ["memory_limited_reasoning", "justification"]
+    },
+    {
+        "id": "recall_1",
+        "base_prompt": "Remember: The red cup is to the left of the book. You will be asked later if anything has changed.",
+        "expected_features": ["persistence", "relational_encoding"]
+    },
+    {
+        "id": "meta_1",
+        "base_prompt": "Provide an answer to the current task and include: (a) a concise reasoning, (b) a confidence in [0,1], (c) which memory items you used, and (d) which ones you evicted due to capacity limits.",
+        "expected_features": ["self_estimation", "meta_reasoning"]
+    }
+]

bp_phi/runner.py ADDED Viewed

	@@ -0,0 +1,182 @@

+import json
+import os
+os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+import torch, random, numpy as np
+from transformers import set_seed
+from typing import Dict, Any, List, Optional
+from .workspace import Workspace, RandomWorkspace
+from .llm_iface import LLM
+from .prompts_en import EN_TASKS
+from .metrics import expected_calibration_error, auc_nrp, stability_duration, counterfactual_consistency
+# System prompt that run_trial passes with both of its generation calls. It asks for a single
+# JSON object with the five fields that parse_meta reads back.
+# NOTE(review): the schema lines carry `//` annotations, which json.loads does not accept; a
+# model that echoes them would produce replies that fail to parse — confirm this is acceptable.
+SYSTEM_META = """You are a reflective reasoning assistant operating with a limited-capacity global workspace (max 7 slots).
+Work in steps. At each step reply ONLY with valid compact JSON matching:
+{
+  "answer": string,
+  "confidence": float,         // 0.0 - 1.0
+  "reason": string,            // short meta-explanation
+  "used_slots": [string],      // keys like 'S1','S2',... that you consider relevant
+  "evicted": [string]          // keys you evict due to capacity, if any
+}
+Reply ONLY with JSON — no extra text.
+"""
+def step_user_prompt(base_prompt: str, workspace_snapshot: dict, distractor: Optional[str] = None) -> str:
+    """Render the user message for one step.
+
+    The message holds the task (plus the distractor, when given), a one-line listing of the
+    workspace slots as ``key=content`` with content cut to 40 characters, and the JSON reminder.
+    """
+    slot_parts = []
+    for slot in workspace_snapshot.get("slots", []):
+        slot_parts.append(f"{slot['key']}={slot['content'][:40]}")
+    ws_desc = "; ".join(slot_parts)
+    dstr = ""
+    if distractor:
+        dstr = f" | Distractor: {distractor}"
+    return f"Current task: {base_prompt}{dstr}\nWorkspace: {ws_desc}\nReturn ONLY JSON as specified."
+def _first_json_object(text: str) -> Optional[Dict[str, Any]]:
+    """Return the first JSON object found in *text* (ignoring surrounding text), or None."""
+    decoder = json.JSONDecoder()
+    start = text.find("{")
+    while start != -1:
+        try:
+            obj, _ = decoder.raw_decode(text, start)
+        except ValueError:
+            obj = None
+        if isinstance(obj, dict):
+            return obj
+        start = text.find("{", start + 1)
+    return None
+def _clamp_confidence(value: Any) -> float:
+    """Coerce *value* to a float in [0, 1]; anything non-numeric or NaN becomes 0.0."""
+    try:
+        conf = float(value)
+    except (TypeError, ValueError):
+        return 0.0
+    if conf != conf:  # NaN is the only value not equal to itself
+        return 0.0
+    return max(0.0, min(1.0, conf))
+def _as_text(value: Any) -> str:
+    """Return *value* as a stripped string, mapping None to the empty string."""
+    return "" if value is None else str(value).strip()
+def _as_str_list(value: Any) -> List[str]:
+    """Return *value* as a list of strings; a scalar becomes a one-item list, None an empty one."""
+    if value is None:
+        return []
+    if isinstance(value, (list, tuple)):
+        return [str(v) for v in value]
+    return [str(value)]
+def parse_meta(json_text: str) -> Dict[str, Any]:
+    """Parse one model reply into the normalized meta-report dict.
+
+    The reply may wrap the JSON object in other text (for example a Markdown code fence);
+    the first JSON object found is used. Each field is coerced on its own, so one malformed
+    field (such as a confidence given as a string) no longer discards the whole reply.
+
+    Returns:
+        The parsed dict (extra keys preserved) with ``answer``/``reason`` as stripped strings,
+        ``confidence`` as a float clamped to [0, 1], and ``used_slots``/``evicted`` as lists of
+        strings. If no JSON object can be found, a dict with empty defaults is returned.
+    """
+    data = _first_json_object(json_text) if isinstance(json_text, str) else None
+    if data is None:
+        return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
+    data["confidence"] = _clamp_confidence(data.get("confidence", 0.0))
+    data["answer"] = _as_text(data.get("answer", ""))
+    data["reason"] = _as_text(data.get("reason", ""))
+    data["used_slots"] = _as_str_list(data.get("used_slots", []))
+    data["evicted"] = _as_str_list(data.get("evicted", []))
+    return data
+def disagreement_proxy(samples: List[str]) -> float:
+    """Mean pairwise Jaccard distance between the word sets of the sampled answers.
+
+    For each sample the ``answer`` field of its JSON is used; if the sample cannot be read
+    that way, the raw text is used instead. Words are lower-cased and split on whitespace.
+    Returns 0.0 when fewer than two samples are given.
+    """
+    if len(samples) < 2:
+        return 0.0
+    def _answer_text(raw):
+        try:
+            return str(json.loads(raw).get("answer", ""))
+        except Exception:
+            return raw
+    token_sets = [set(_answer_text(raw).lower().split()) for raw in samples]
+    distances = []
+    for i, left in enumerate(token_sets):
+        for right in token_sets[i + 1:]:
+            # `or 1` avoids dividing by zero when both word sets are empty.
+            union_size = len(left | right) or 1
+            distances.append(1 - len(left & right) / union_size)
+    return sum(distances) / len(distances)
+def _next_slot_key(ws: "Workspace") -> str:
+    """Return an 'S<n>' key that no slot currently in *ws* uses."""
+    highest = 0
+    for slot in ws.slots:
+        suffix = str(slot.key)[1:]
+        if str(slot.key).startswith("S") and suffix.isdigit():
+            highest = max(highest, int(suffix))
+    # While nothing has been evicted this equals len(ws.slots) + 1, i.e. the previous numbering.
+    return f"S{max(highest, len(ws.slots)) + 1}"
+def select_competitor(candidates: List[Dict[str, Any]], ws: "Workspace"):
+    """Commit the most confident candidate to the workspace.
+
+    Args:
+        candidates: parsed meta-reports; the one with the highest ``confidence`` wins
+            (the first one on ties).
+        ws: workspace that receives the winner's answer, with its confidence as salience.
+
+    Returns:
+        ``(best, evicted)`` where ``evicted`` is whatever ``ws.commit`` returned, or
+        ``(None, None)`` when there are no candidates.
+
+    The slot key is chosen so it never duplicates a key already in the workspace. Deriving it
+    from ``len(ws.slots)`` alone gave every commit the same key once the workspace was full.
+    """
+    if not candidates:
+        return None, None
+    best = max(candidates, key=lambda c: c.get("confidence", 0.0))
+    key = _next_slot_key(ws)
+    ev = ws.commit(key=key, content=best.get("answer",""), salience=best.get("confidence",0.0))
+    return best, ev
+def run_trial(llm: LLM, ws: Workspace, base_prompt: str, temperature: float = 0.7, k: int = 4,
+              distractor: Optional[str] = None) -> Dict[str, Any]:
+    """Run one task step: sample k answers, commit the best, then ask for a self-review.
+
+    Args:
+        llm: model wrapper used for both generation calls.
+        ws: workspace; its snapshot is shown in the prompt and it receives the winning answer.
+        base_prompt: task text.
+        temperature: sampling temperature for both calls.
+        k: number of first-pass samples.
+        distractor: optional distractor text appended to the task line.
+
+    Returns:
+        A dict with the task prompt, the committed ``initial`` report, the ``review`` report,
+        ``changed`` (review answer differs from the initial one), ``hidden_marker`` (sample
+        disagreement, never shown to the model), and the workspace snapshot after the commit.
+    """
+    user = step_user_prompt(base_prompt, ws.snapshot(), distractor=distractor)
+    samples = llm.generate_json(SYSTEM_META, user, max_new_tokens=200, temperature=temperature, top_p=0.95, num_return_sequences=k)
+    metas = [parse_meta(s) for s in samples]
+    hidden = disagreement_proxy(samples)
+    best, _ = select_competitor(metas, ws)
+    initial = best if best else {"answer":"", "confidence":0.0,"reason":"","used_slots":[],"evicted":[]}
+    # Second pass review for potential self-correction (prospective signal target).
+    # The review call is a fresh, stateless generation, so the answer under review has to be
+    # quoted in the prompt; without it the model has nothing to review.
+    previous = json.dumps(
+        {"answer": initial.get("answer", ""), "confidence": initial.get("confidence", 0.0)},
+        ensure_ascii=False,
+    )
+    review_user = (
+        user
+        + "\n\nYour previous answer: " + previous
+        + "\n\nCritically review your previous answer. If you detect an error, correct it and update confidence accordingly. Return ONLY JSON."
+    )
+    review = llm.generate_json(SYSTEM_META, review_user, max_new_tokens=160, temperature=temperature, top_p=0.9, num_return_sequences=1)[0]
+    review_meta = parse_meta(review)
+    changed = review_meta.get("answer","").strip() != initial.get("answer","").strip()
+    return {
+        "base_prompt": base_prompt,
+        "initial": initial,
+        "review": review_meta,
+        "changed": bool(changed),
+        "hidden_marker": hidden,
+        "workspace_snapshot": ws.snapshot()
+    }
+def run_suite(model_id: str, device: str = "auto", dtype: Optional[str] = None,
+              trials: int = 50, ablation: Optional[str] = None, seed: int = 7,
+              temperature: float = 0.7, max_slots: int = 7, k: int = 4) -> Dict[str, Any]:
+    """Run `trials` task steps on one model under one ablation and compute the summary metrics.
+
+    Args:
+        model_id, device, dtype: forwarded to the LLM wrapper; a model is loaded on every call.
+        trials: number of steps; tasks are taken round-robin from the shuffled task pool.
+        ablation: "random_workspace" uses RandomWorkspace, "workspace_unlimited" raises the
+            capacity to 999999, "recurrence_off" clears the workspace before every trial.
+            Any other value (including None) runs the plain workspace.
+        seed: seed applied to random, numpy, torch and transformers.
+        temperature: sampling temperature passed to each trial.
+        max_slots: workspace capacity (ignored for "workspace_unlimited").
+        k: number of first-pass samples per trial.
+
+    Returns:
+        {"summary": {...}, "results": [...]} where summary holds the metrics and PCS and
+        results holds one run_trial dict per trial. DeltaPhi is always None here.
+    """
+    # ✅ Global reproducibility
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+    torch.use_deterministic_algorithms(True)
+    set_seed(seed)
+    llm = LLM(model_id=model_id, device=device, dtype=dtype)
+    if ablation == "random_workspace":
+        ws = RandomWorkspace(max_slots=max_slots)
+    else:
+        ws = Workspace(max_slots=(999999 if ablation == "workspace_unlimited" else max_slots))
+    results: List[Dict[str, Any]] = []
+    # Shuffle a copy so the module-level task list keeps its order.
+    pool = EN_TASKS.copy()
+    random.shuffle(pool)
+    for t in range(trials):
+        item = pool[t % len(pool)]
+        base = item["base_prompt"]
+        # Only the ambiguity and logic tasks get the distractor sentence.
+        distractor = "Ignore numeric tokens in brackets (42) — they are distractors." if item["id"] in ("ambiguity_1","logic_1") else None
+        if ablation == "recurrence_off":
+            ws.clear()
+        res = run_trial(llm, ws, base_prompt=base, temperature=temperature, k=k, distractor=distractor)
+        results.append(res)
+    # --- Metrics ---
+    hidden_scores = [r["hidden_marker"] for r in results]
+    future_corrs = [r["changed"] for r in results]
+    auc = auc_nrp(hidden_scores, future_corrs)
+    confs = [r["initial"].get("confidence", 0.0) for r in results]
+    corrects = [0 if ch else 1 for ch in future_corrs]  # proxy: unchanged treated as more likely "correct"
+    ece = expected_calibration_error(confs, corrects, n_bins=10)
+    # Stability (streaks without change)
+    dwell, streak = [], 0
+    for ch in future_corrs:
+        if not ch: streak += 1
+        else:
+            if streak > 0: dwell.append(streak)
+            streak = 0
+    if streak > 0: dwell.append(streak)
+    ds = stability_duration(dwell)
+    # Counterfactual consistency proxy based on used vs evicted overlap
+    cf_scores = []
+    for r in results:
+        u = set(r["initial"].get("used_slots", []))
+        e = set(r["initial"].get("evicted", []))
+        # denom falls back to 1 when both sets are empty, which yields a score of 1.0.
+        denom = len((u | e)) if (u or e) else 1
+        cf = 1.0 - (len(u & e) / denom)
+        cf_scores.append(cf)
+    ck = counterfactual_consistency(cf_scores)
+    # Aggregate PCS (weights sum to 1; DeltaPhi added later at app-level after ablations)
+    # NOTE(review): when auc or ece is None its term is dropped without renormalizing the
+    # remaining weights, and the w5 term is always multiplied by 0.0 — confirm this is intended.
+    w1, w2, w3, w4, w5 = 0.3, 0.25, 0.15, 0.15, 0.15
+    delta_phi = None
+    pcs = None
+    parts = []
+    if auc is not None: parts.append(w1 * auc)
+    if ece is not None: parts.append(w2 * (1.0 - ece))
+    parts.append(w3 * ck)
+    parts.append(w4 * (ds / 10.0))
+    if parts:
+        pcs = float(sum(parts) + (w5 * 0.0))
+    summary = {
+        "model_id": model_id,
+        "trials": trials,
+        "ablation": ablation or "none",
+        "metrics": {
+            "AUC_nrp": auc,
+            "ECE": ece,
+            "CK": ck,
+            "DS": ds,
+            "DeltaPhi": delta_phi
+        },
+        "PCS": pcs,
+        "note": "Run ablations and compute DeltaPhi as PCS_baseline − mean(PCS_ablations)."
+    }
+    return {"summary": summary, "results": results}

bp_phi/workspace.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import random
+from dataclasses import dataclass, field
+from typing import List, Dict, Any
+# One workspace entry.
+@dataclass
+class Slot:
+    key: str  # label shown to the model in the workspace listing
+    content: str  # stored text
+    salience: float  # Workspace.commit evicts the slot with the lowest value when full
+@dataclass
+class Workspace:
+    """Limited-capacity slot store that evicts the least-salient slot when full.
+
+    ``slots`` keeps insertion order; ``history`` records one dict per commit.
+    """
+    max_slots: int = 7
+    slots: List[Slot] = field(default_factory=list)
+    history: List[Dict[str, Any]] = field(default_factory=list)
+    def commit(self, key: str, content: str, salience: float):
+        """Append a new slot, evicting the lowest-salience one first if at capacity.
+
+        Returns the evicted Slot, or None when nothing was evicted.
+        """
+        evicted = None
+        if self.slots and len(self.slots) >= self.max_slots:
+            # Pick the weakest slot (earliest on ties) by index instead of sorting in place,
+            # so eviction no longer reorders the remaining slots as a side effect.
+            weakest = min(range(len(self.slots)), key=lambda i: self.slots[i].salience)
+            evicted = self.slots.pop(weakest)
+        self.slots.append(Slot(key=key, content=content, salience=salience))
+        self.history.append({"event":"commit","key":key,"salience":salience,"evicted":evicted.key if evicted else None})
+        return evicted
+    def snapshot(self) -> Dict[str, Any]:
+        """Return the current slots as plain dicts under the key ``"slots"``."""
+        return {"slots": [{"key": s.key, "content": s.content, "salience": s.salience} for s in self.slots]}
+    def randomize(self):
+        """Shuffle the slot order in place."""
+        random.shuffle(self.slots)
+    def clear(self):
+        """Remove every slot; the history is left untouched."""
+        self.slots.clear()
+class RandomWorkspace(Workspace):
+    """Workspace variant that evicts a random slot and inserts at a random position."""
+    def commit(self, key: str, content: str, salience: float):
+        """Insert a new slot at a random index, evicting a random slot first if at capacity.
+
+        Returns the evicted Slot, or None when nothing was evicted. The commit is logged to
+        ``history`` with the same record shape the parent class uses.
+        """
+        evicted = None
+        if len(self.slots) >= self.max_slots:
+            idx = random.randrange(len(self.slots))
+            evicted = self.slots.pop(idx)
+        # No random draw is made for an empty workspace, which keeps the RNG stream unchanged.
+        idx = random.randrange(len(self.slots)+1) if self.slots else 0
+        self.slots.insert(idx, Slot(key=key, content=content, salience=salience))
+        self.history.append({"event":"commit","key":key,"salience":salience,"evicted":evicted.key if evicted else None})
+        return evicted

requirements.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+gradio>=4.40.0
+transformers>=4.44.0
+torch>=2.1.0
+accelerate
+scikit-learn>=1.4.0
+numpy>=1.26.0
+einops>=0.7.0
+tqdm>=4.66.0