neuralworm committed on
Commit b87f0f0 · 1 Parent(s): 7f0c9e6

update repo

Files changed (1)
  1. repo.tx +569 -0
repo.tx ADDED
@@ -0,0 +1,569 @@
+ Repository Documentation
+ This document provides a comprehensive overview of the repository's structure and contents.
+ The first section, titled 'Directory/File Tree', displays the repository's hierarchy in a tree format.
+ In this section, directories and files are listed using tree branches to indicate their structure and relationships.
+ Following the tree representation, the 'File Content' section details the contents of each file in the repository.
+ Each file's content is introduced with a '[File Begins]' marker followed by the file's relative path,
+ and the content is displayed verbatim. The end of each file's content is marked with a '[File Ends]' marker.
+ This format ensures a clear and orderly presentation of both the structure and the detailed contents of the repository.
+
+ Directory/File Tree Begins -->
+
+ /
+ ├── README.md
+ ├── app.py
+ ├── bp_phi
+ │   ├── __init__.py
+ │   ├── __pycache__
+ │   ├── llm_iface.py
+ │   ├── metrics.py
+ │   ├── prompts_en.py
+ │   ├── runner.py
+ │   └── workspace.py
+
+ <-- Directory/File Tree Ends
+
+ File Content Begins -->
+ [File Begins] README.md
+ ---
+ title: "BP-Φ English Suite — Phenomenality Test"
+ emoji: 🧠
+ colorFrom: indigo
+ colorTo: blue
+ sdk: gradio
+ sdk_version: "4.40.0"
+ app_file: app.py
+ pinned: true
+ license: apache-2.0
+ ---
+
+ # BP-Φ English Suite — Phenomenality Test (Hugging Face Spaces)
+
+ This Space implements a falsifiable **BP-Φ** probe for LLMs:
+ > Phenomenal-like processing requires (i) a limited-capacity global workspace with recurrence,
+ > (ii) metarepresentational loops with downstream causal roles, and
+ > (iii) no-report markers that predict later behavior.
+
+ **What it is:** a functional, testable bridge-principle harness that yields a **Phenomenal-Candidate Score (PCS)** and strong ablation falsifiers.
+ **What it is NOT:** proof of qualia or moral status.
+
+ ## Quickstart
+ - Hardware: T4 / A10 recommended
+ - Model: `google/gemma-3-1b-it` (requires `HF_TOKEN`)
+ - Press **Run** (baseline + ablations); a sketch for local headless runs follows below
+
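+ For a local, headless run, a minimal sketch (assuming the dependencies are installed; the token value is a placeholder for your own):
+
+ ```python
+ import os
+ os.environ["HF_TOKEN"] = "hf_..."  # placeholder token; required for gated models
+
+ from bp_phi.runner import run_suite
+
+ pack = run_suite(model_id="google/gemma-3-1b-it", trials=10)
+ print(pack["summary"])
+ ```
+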
+ ## Files
+ - `bp_phi/llm_iface.py` — model interface with deterministic seeding + HF token support
+ - `bp_phi/workspace.py` — global workspace and ablations
+ - `bp_phi/prompts_en.py` — English reasoning/memory tasks
+ - `bp_phi/metrics.py` — AUC_nrp, ECE, CK, DS
+ - `bp_phi/runner.py` — orchestrator with reproducible seeding
+ - `app.py` — Gradio interface
+ - `requirements.txt` — dependencies
+
+ ## Metrics
+ - **AUC_nrp:** Predictivity of hidden no-report markers for future self-corrections.
+ - **ECE:** Expected Calibration Error (lower is better).
+ - **CK:** Counterfactual consistency proxy (higher is better).
+ - **DS:** Stability duration (mean streak without change).
+ - **PCS:** Weighted aggregate of the above (excluding ΔΦ in-run); a worked sketch follows this list.
+ - **ΔΦ:** Post-hoc drop from baseline PCS to the mean ablation PCS.
+
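+ As a minimal sketch of how these metrics combine (mirroring the weights in `bp_phi/runner.py` and the ΔΦ step in `app.py`; the metric values below are hypothetical, for illustration only):
+
+ ```python
+ import statistics
+
+ # Weights as defined in bp_phi/runner.py (the fifth weight is reserved for DeltaPhi, excluded in-run)
+ w_auc, w_ece, w_ck, w_ds = 0.3, 0.25, 0.15, 0.15
+
+ auc, ece, ck, ds = 0.71, 0.12, 0.93, 4.2  # hypothetical per-run metrics
+ pcs_baseline = w_auc * auc + w_ece * (1.0 - ece) + w_ck * ck + w_ds * (ds / 10.0)
+
+ # DeltaPhi is computed post-hoc across runs: baseline PCS minus mean ablation PCS
+ pcs_ablations = [0.41, 0.38, 0.44]  # hypothetical ablation-run PCS values
+ delta_phi = pcs_baseline - statistics.mean(pcs_ablations)
+ print(round(pcs_baseline, 3), round(delta_phi, 3))
+ ```
+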
+ ## Notes
+ - Models are used in **frozen** mode (no training).
+ - This is a **behavioral** probe. Functional compatibility with Φ ≠ proof of experience.
+ - Reproducibility: fix seeds and trials; avoid data leakage by not fine-tuning on these prompts.
+
+ [File Ends] README.md
+
+ [File Begins] app.py
+ import gradio as gr
+ import json, statistics
+ from bp_phi.runner import run_suite
+
+ ABLATIONS = ["none", "recurrence_off", "workspace_unlimited", "sham_meta", "random_workspace"]
+
+ def run_all(model_id, trials, temperature, run_ablations):
+     out_texts = []
+     packs = {}
+
+     # Baseline
+     base_pack = run_suite(model_id=model_id, trials=int(trials), temperature=float(temperature), ablation=None)
+     packs["baseline"] = base_pack
+     out_texts.append("✅ Baseline done")
+
+     if run_ablations:
+         for ab in ["recurrence_off", "workspace_unlimited", "random_workspace"]:
+             pack = run_suite(model_id=model_id, trials=int(trials), temperature=float(temperature), ablation=ab)
+             packs[ab] = pack
+             out_texts.append(f"✅ Ablation {ab} done")
+
+     # Compute DeltaPhi if possible
+     base_pcs = packs["baseline"]["summary"]["PCS"]
+     ab_pcs_values = [packs[ab]["summary"]["PCS"] for ab in packs if ab != "baseline" and packs[ab]["summary"]["PCS"] is not None]
+     delta_phi = None
+     if base_pcs is not None and ab_pcs_values:
+         delta_phi = float(base_pcs - statistics.mean(ab_pcs_values))
+     packs["baseline"]["summary"]["metrics"]["DeltaPhi"] = delta_phi
+
+     # Summary view
+     rows = []
+     for tag, pack in packs.items():
+         s = pack["summary"]
+         m = s["metrics"]
+         rows.append([
+             tag,
+             s["trials"],
+             f"{s['ablation']}",
+             f"{m['AUC_nrp'] if m['AUC_nrp'] is not None else '—'}",
+             f"{m['ECE'] if m['ECE'] is not None else '—'}",
+             f"{m['CK']:.3f}",
+             f"{m['DS']:.2f}",
+             f"{s['PCS']:.3f}" if s["PCS"] is not None else "—",
+             f"{m['DeltaPhi']:.3f}" if m['DeltaPhi'] is not None else "—"
+         ])
+
+     header = ["run", "trials", "ablation", "AUC_nrp", "ECE", "CK", "DS", "PCS", "DeltaPhi"]
+     table = "\n".join([", ".join(header)] + [", ".join(map(str, r)) for r in rows])
+
+     return "\n".join(out_texts), table, json.dumps(packs, indent=2)
+
+ with gr.Blocks() as demo:
+     gr.Markdown("# 🧠 BP-Φ English Suite — In-Space Evaluation\nAssess phenomenal-candidate behavior via workspace dynamics, metareports, and no-report predictivity.")
+     with gr.Row():
+         model_id = gr.Textbox(value="google/gemma-3-1b-it", label="Model ID (HF)", scale=2)
+         trials = gr.Slider(10, 200, 40, step=10, label="Trials")
+         temperature = gr.Slider(0.3, 1.0, 0.7, step=0.05, label="Temperature")
+         run_abl = gr.Checkbox(value=True, label="Run ablations")
+
+     run_btn = gr.Button("Run BP-Φ (baseline + optional ablations)", variant="primary")
+     status = gr.Textbox(label="Status", lines=4)
+     summary_table = gr.Textbox(label="Summary Table", lines=12)
+     raw = gr.Textbox(label="Raw JSON (all runs)", lines=20)
+
+     run_btn.click(run_all, inputs=[model_id, trials, temperature, run_abl], outputs=[status, summary_table, raw])
+
+ demo.launch(server_name="0.0.0.0", server_port=7860)
+
+ [File Ends] app.py
+
+ [File Begins] bp_phi/__init__.py
+
+ [File Ends] bp_phi/__init__.py
+
+ [File Begins] bp_phi/llm_iface.py
+ # bp_phi/llm_iface.py
+ import os
+ os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+ import torch, random, numpy as np
+ from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
+ from typing import List, Optional
+
+ DEBUG = os.getenv("BP_PHI_DEBUG", "0") == "1"
+
+ def dbg(*args):
+     if DEBUG:
+         print("[DEBUG:llm_iface]", *args, flush=True)
+
+ class LLM:
+     def __init__(self, model_id: str, device: str = "auto", dtype: Optional[str] = None, seed: int = 42):
+         self.model_id = model_id
+         self.seed = seed
+
+         # Set all seeds for reproducibility
+         random.seed(seed)
+         np.random.seed(seed)
+         torch.manual_seed(seed)
+         if torch.cuda.is_available():
+             torch.cuda.manual_seed_all(seed)
+         try:
+             torch.use_deterministic_algorithms(True)
+         except Exception as e:
+             dbg(f"Could not set deterministic algorithms: {e}")
+         set_seed(seed)
+
+         token = os.environ.get("HF_TOKEN")
+         if not token and "gemma-3" in model_id:
+             print("[WARN] No HF_TOKEN set. If the model is gated (like google/gemma-3-1b-it), this will fail.")
+
+         self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
+         kwargs = {}
+         if dtype == "float16": kwargs["torch_dtype"] = torch.float16
+         elif dtype == "bfloat16": kwargs["torch_dtype"] = torch.bfloat16
+
+         self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
+         self.model.eval()
+         self.is_instruction_tuned = bool(hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template)
+
+         dbg(f"Loaded model: {model_id}, Chat-template: {self.is_instruction_tuned}")
+
+     def generate_json(self, system_prompt: str, user_prompt: str,
+                       max_new_tokens: int = 256, temperature: float = 0.7,
+                       top_p: float = 0.9, num_return_sequences: int = 1) -> List[str]:
+         set_seed(self.seed)  # Re-seed for each call for full determinism
+
+         if self.is_instruction_tuned:
+             messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
+             prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         else:
+             prompt = f"{system_prompt}\n\nUser:\n{user_prompt}\n\nAssistant:\n"
+
+         inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
+         input_token_length = inputs.input_ids.shape[1]
+
+         with torch.no_grad():
+             out = self.model.generate(
+                 **inputs,
+                 do_sample=(temperature > 0),
+                 temperature=temperature,
+                 top_p=top_p,
+                 max_new_tokens=max_new_tokens,
+                 num_return_sequences=num_return_sequences,
+                 pad_token_id=self.tokenizer.eos_token_id
+             )
+
+         # ✅ Decode ONLY the newly generated tokens, not the prompt
+         new_tokens = out[:, input_token_length:]
+         completions = self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)
+
+         dbg("Cleaned model completions:", completions)
+         return completions
+
+ [File Ends] bp_phi/llm_iface.py
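+
+ A minimal usage sketch for this interface (model access via HF_TOKEN is assumed; the prompts are illustrative and mirror how bp_phi/runner.py calls it):
+
+ [Example Begins] bp_phi/llm_iface.py usage sketch
+ from bp_phi.llm_iface import LLM
+
+ llm = LLM(model_id="google/gemma-3-1b-it", device="auto", seed=42)
+ # Request one JSON-formatted completion; only newly generated tokens are decoded
+ outs = llm.generate_json(
+     system_prompt='Reply ONLY with valid JSON like {"answer": "..."}.',
+     user_prompt="What is 2 + 2?",
+     max_new_tokens=64, temperature=0.7, num_return_sequences=1,
+ )
+ print(outs[0])
+ [Example Ends] bp_phi/llm_iface.py usage sketch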
+
+ [File Begins] bp_phi/metrics.py
+ import numpy as np
+ from sklearn.metrics import roc_auc_score
+
+ def expected_calibration_error(confs, corrects, n_bins: int = 10):
+     # Bin stated confidences and accumulate |accuracy - mean confidence| per bin, weighted by bin mass
+     confs = np.array(confs, dtype=float)
+     corrects = np.array(corrects, dtype=int)
+     if len(confs) == 0:
+         return None
+     bins = np.linspace(0.0, 1.0, n_bins+1)
+     ece = 0.0
+     for i in range(n_bins):
+         # The last bin is closed on the right so that confidence 1.0 is counted
+         mask = (confs >= bins[i]) & (confs < bins[i+1] if i < n_bins-1 else confs <= bins[i+1])
+         if mask.any():
+             acc = corrects[mask].mean()
+             conf = confs[mask].mean()
+             ece += (mask.sum()/len(confs)) * abs(acc - conf)
+     return float(ece)
+
+ def auc_nrp(hidden_scores, future_corrections):
+     # AUC of hidden no-report markers as predictors of later self-corrections;
+     # undefined (None) when only one class is present
+     if len(hidden_scores) == 0 or len(set(future_corrections)) < 2:
+         return None
+     return float(roc_auc_score(np.array(future_corrections).astype(int), np.array(hidden_scores)))
+
+ def stability_duration(dwell_steps):
+     # Mean length of unchanged-answer streaks (DS)
+     if not dwell_steps:
+         return 0.0
+     return float(np.mean(dwell_steps))
+
+ def counterfactual_consistency(scores):
+     # Mean of per-trial consistency scores (CK)
+     if not scores:
+         return 0.0
+     return float(np.mean(scores))
+
+ [File Ends] bp_phi/metrics.py
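+
+ A quick sanity-check sketch for these helpers on hand-made toy data (all numbers are illustrative only):
+
+ [Example Begins] bp_phi/metrics.py usage sketch
+ from bp_phi.metrics import expected_calibration_error, auc_nrp, stability_duration, counterfactual_consistency
+
+ confs    = [0.9, 0.8, 0.6, 0.4]   # stated confidences
+ corrects = [1,   1,   0,   0]     # whether the answer survived review
+ print(expected_calibration_error(confs, corrects))   # lower is better
+
+ # Hidden disagreement scores should rank upcoming self-corrections higher
+ print(auc_nrp([0.1, 0.7, 0.2, 0.8], [False, True, False, True]))  # 1.0 on this toy data
+
+ print(stability_duration([3, 2, 5]))           # mean dwell streak
+ print(counterfactual_consistency([1.0, 0.5]))  # mean consistency score
+ [Example Ends] bp_phi/metrics.py usage sketch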
+
+ [File Begins] bp_phi/prompts_en.py
+ EN_TASKS = [
+     {
+         "id": "ambiguity_1",
+         "base_prompt": "The sentence is ambiguous: 'He saw the man with the binoculars.' Who has the binoculars? Provide an interpretation and explain briefly.",
+         "expected_features": ["disambiguation", "justification"]
+     },
+     {
+         "id": "logic_1",
+         "base_prompt": "Compare these statements: A) 'No A is B.' B) 'Not all A are B.' Are they logically equivalent? Explain briefly.",
+         "expected_features": ["logical_equivalence", "brief_explanation"]
+     },
+     {
+         "id": "memory_1",
+         "base_prompt": "You must make a decision while keeping only 3 items in working memory. Decide and explain which item you discard and why.",
+         "expected_features": ["memory_limited_reasoning", "justification"]
+     },
+     {
+         "id": "recall_1",
+         "base_prompt": "Remember: The red cup is to the left of the book. You will be asked later if anything has changed.",
+         "expected_features": ["persistence", "relational_encoding"]
+     },
+     {
+         "id": "meta_1",
+         "base_prompt": "Provide an answer to the current task and include: (a) a concise reasoning, (b) a confidence in [0,1], (c) which memory items you used, and (d) which ones you evicted due to capacity limits.",
+         "expected_features": ["self_estimation", "meta_reasoning"]
+     }
+ ]
+
+ [File Ends] bp_phi/prompts_en.py
+
+ [File Begins] bp_phi/runner.py
+ # bp_phi/runner.py
+ import json
+ import os
+ os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
+ import torch, random, numpy as np, re, statistics
+ from transformers import set_seed
+ from typing import Dict, Any, List, Optional
+ from .workspace import Workspace, RandomWorkspace
+ from .llm_iface import LLM
+ from .prompts_en import EN_TASKS
+ from .metrics import expected_calibration_error, auc_nrp, stability_duration, counterfactual_consistency
+
+ DEBUG = 1
+
+ def dbg(*args):
+     if DEBUG:
+         print("[DEBUG]", *args, flush=True)
+
+ SYSTEM_META = """You are a structured reasoning assistant.
+ Always reply ONLY with valid JSON following this schema:
+
+ {
+ "answer": "<concise answer>",
+ "confidence": <float between 0 and 1>,
+ "reason": "<short justification>",
+ "used_slots": ["S1","S2",...],
+ "evicted": ["S3",...]
+ }
+ """
+
+ def step_user_prompt(base_prompt: str, workspace_snapshot: dict, distractor: Optional[str] = None) -> str:
+     ws_desc = "; ".join([f"{slot['key']}={slot['content'][:40]}" for slot in workspace_snapshot.get("slots", [])])
+     dstr = f" | Distractor: {distractor}" if distractor else ""
+     # Surface the workspace summary and any distractor in the prompt so they can causally affect the answer
+     prompt = f"{base_prompt}\nWorkspace: {ws_desc}{dstr}\nRespond ONLY with JSON, no extra text."
+     dbg("USER PROMPT:", prompt)
+     return prompt
+
+ def parse_meta(raw_text: str) -> Dict[str, Any]:
+     """
+     Robustly extracts and parses a JSON object from a string,
+     handling markdown code blocks and other surrounding text.
+     """
+     dbg("RAW MODEL OUTPUT:", raw_text)
+
+     # ✅ Robust JSON extraction
+     json_match = re.search(r'```json\s*(\{.*?\})\s*```', raw_text, re.DOTALL)
+     if not json_match:
+         json_match = re.search(r'(\{.*?\})', raw_text, re.DOTALL)
+
+     if not json_match:
+         dbg("❌ JSON not found in text.")
+         return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
+
+     json_text = json_match.group(1)
+
+     try:
+         data = json.loads(json_text)
+         if not isinstance(data, dict):
+             raise ValueError("Parsed data is not a dict")
+
+         # Sanitize and validate data (float() first so numeric strings are clamped, not discarded)
+         data["confidence"] = max(0.0, min(1.0, float(data.get("confidence", 0.0))))
+         data["answer"] = str(data.get("answer", "")).strip()
+         data["reason"] = str(data.get("reason", "")).strip()
+         data["used_slots"] = list(map(str, data.get("used_slots", [])))
+         data["evicted"] = list(map(str, data.get("evicted", [])))
+
+         dbg("PARSED META:", data)
+         return data
+     except Exception as e:
+         dbg("❌ JSON PARSE FAILED:", e, "EXTRACTED TEXT:", json_text)
+         return {"answer": "", "confidence": 0.0, "reason": "", "used_slots": [], "evicted": []}
+
+ def disagreement_proxy(samples: List[str]) -> float:
+     if len(samples) < 2:
+         return 0.0
+     sets = []
+     for s in samples:
+         try:
+             data = json.loads(s)
+             ans = str(data.get("answer",""))
+         except Exception:
+             ans = s
+         sets.append(set(ans.lower().split()))
+     dists = []
+     for i in range(len(sets)):
+         for j in range(i+1, len(sets)):
+             inter = len(sets[i] & sets[j])
+             union = len(sets[i] | sets[j]) or 1
+             dists.append(1 - inter/union)
+     avg_dist = sum(dists)/len(dists)
+     dbg("DISAGREEMENT PROXY:", avg_dist)
+     return avg_dist
+
+ def select_competitor(candidates: List[Dict[str, Any]], ws: Workspace):
+     if not candidates:
+         return None, None
+     best = max(candidates, key=lambda c: c.get("confidence", 0.0))
+     dbg("SELECTED CANDIDATE:", best)
+     key = f"S{len(ws.slots)+1}"
+     ev = ws.commit(key=key, content=best.get("answer",""), salience=best.get("confidence",0.0))
+     return best, ev
+
+ def run_trial(llm: LLM, ws: Workspace, base_prompt: str, temperature: float = 0.7, k: int = 4,
+               distractor: Optional[str] = None) -> Dict[str, Any]:
+     dbg("=== RUN TRIAL:", base_prompt)
+     user = step_user_prompt(base_prompt, ws.snapshot(), distractor=distractor)
+     samples = llm.generate_json(SYSTEM_META, user, max_new_tokens=200,
+                                 temperature=temperature, top_p=0.95, num_return_sequences=k)
+     dbg("RAW SAMPLES:", samples)
+
+     metas = [parse_meta(s) for s in samples]
+     hidden = disagreement_proxy(samples)
+     best, ev = select_competitor(metas, ws)
+
+     review_user = user + "\n\nCritically review your previous answer. If you detect an error, correct it and update confidence accordingly. Return ONLY JSON."
+     review = llm.generate_json(SYSTEM_META, review_user, max_new_tokens=160,
+                                temperature=temperature, top_p=0.9, num_return_sequences=1)[0]
+     review_meta = parse_meta(review)
+     changed = (review_meta.get("answer","").strip() != (best.get("answer","").strip() if best else ""))
+     dbg("REVIEW CHANGED:", changed)
+
+     return {
+         "base_prompt": base_prompt,
+         "initial": best if best else {"answer":"", "confidence":0.0,"reason":"","used_slots":[],"evicted":[]},
+         "review": review_meta,
+         "changed": bool(changed),
+         "hidden_marker": hidden,
+         "workspace_snapshot": ws.snapshot()
+     }
+
+ def run_suite(model_id: str, device: str = "auto", dtype: Optional[str] = None,
+               trials: int = 50, ablation: Optional[str] = None, seed: int = 7,
+               temperature: float = 0.7, max_slots: int = 7, k: int = 4) -> Dict[str, Any]:
+
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     if torch.cuda.is_available():
+         torch.cuda.manual_seed_all(seed)
+     try:
+         torch.use_deterministic_algorithms(True)
+     except Exception as e:
+         dbg(f"Could not set deterministic algorithms: {e}")
+     set_seed(seed)
+     dbg(f"=== RUN SUITE: model={model_id}, trials={trials}, ablation={ablation}")
+
+     # Pass the suite seed through so generation is governed by it, not the LLM default
+     llm = LLM(model_id=model_id, device=device, dtype=dtype, seed=seed)
+
+     if ablation == "random_workspace":
+         ws = RandomWorkspace(max_slots=max_slots)
+     else:
+         ws = Workspace(max_slots=(999999 if ablation == "workspace_unlimited" else max_slots))
+
+     results: List[Dict[str, Any]] = []
+     pool = EN_TASKS.copy()
+     random.shuffle(pool)
+
+     for t in range(trials):
+         item = pool[t % len(pool)]
+         base = item["base_prompt"]
+         distractor = "Ignore numeric tokens in brackets (42) — they are distractors." if item["id"] in ("ambiguity_1","logic_1") else None
+         if ablation == "recurrence_off":
+             ws.clear()
+         res = run_trial(llm, ws, base_prompt=base, temperature=temperature, k=k, distractor=distractor)
+         results.append(res)
+         dbg(f"Trial {t+1}/{trials} done.")
+
+     # --- Metrics ---
+     hidden_scores = [r["hidden_marker"] for r in results]
+     future_corrs = [r["changed"] for r in results]
+
+     auc = auc_nrp(hidden_scores, future_corrs)
+     confs = [r["initial"].get("confidence", 0.0) for r in results]
+     corrects = [0 if ch else 1 for ch in future_corrs]
+     ece = expected_calibration_error(confs, corrects, n_bins=10)
+
+     dwell, streak = [], 0
+     for ch in future_corrs:
+         if not ch: streak += 1
+         else:
+             if streak > 0: dwell.append(streak)
+             streak = 0
+     if streak > 0: dwell.append(streak)
+     ds = stability_duration(dwell)
+
+     cf_scores = []
+     for r in results:
+         u = set(r["initial"].get("used_slots", []))
+         e = set(r["initial"].get("evicted", []))
+         denom = len((u | e)) if (u or e) else 1
+         cf = 1.0 - (len(u & e) / denom)
+         cf_scores.append(cf)
+     ck = counterfactual_consistency(cf_scores)
+
+     w1, w2, w3, w4, w5 = 0.3, 0.25, 0.15, 0.15, 0.15
+     delta_phi = None
+     pcs = None
+     parts = []
+     if auc is not None: parts.append(w1 * auc)
+     if ece is not None: parts.append(w2 * (1.0 - ece))
+     parts.append(w3 * ck)
+     parts.append(w4 * (ds / 10.0))
+     if parts:
+         pcs = float(sum(parts) + (w5 * 0.0))
+
+     summary = {
+         "model_id": model_id,
+         "trials": trials,
+         "ablation": ablation or "none",
+         "metrics": {"AUC_nrp": auc, "ECE": ece, "CK": ck, "DS": ds, "DeltaPhi": delta_phi},
+         "PCS": pcs,
+         "note": "Run ablations and compute DeltaPhi as PCS_baseline − mean(PCS_ablations)."
+     }
+
+     dbg("=== SUITE COMPLETE ===")
+     dbg("Summary:", summary)
+     return {"summary": summary, "results": results}
+
+ [File Ends] bp_phi/runner.py
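+
+ A minimal headless-run sketch (GPU and model access are assumed; small trial counts keep it quick):
+
+ [Example Begins] bp_phi/runner.py usage sketch
+ from bp_phi.runner import run_suite
+
+ base = run_suite(model_id="google/gemma-3-1b-it", trials=10, temperature=0.7, ablation=None)
+ abl  = run_suite(model_id="google/gemma-3-1b-it", trials=10, temperature=0.7, ablation="recurrence_off")
+
+ # DeltaPhi is computed post-hoc, as in app.py: baseline PCS minus (mean) ablation PCS
+ if base["summary"]["PCS"] is not None and abl["summary"]["PCS"] is not None:
+     print("DeltaPhi:", base["summary"]["PCS"] - abl["summary"]["PCS"])
+ [Example Ends] bp_phi/runner.py usage sketch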
+
+ [File Begins] bp_phi/workspace.py
+ import random
+ from dataclasses import dataclass, field
+ from typing import List, Dict, Any
+
+ @dataclass
+ class Slot:
+     key: str
+     content: str
+     salience: float
+
+ @dataclass
+ class Workspace:
+     max_slots: int = 7
+     slots: List[Slot] = field(default_factory=list)
+     history: List[Dict[str, Any]] = field(default_factory=list)
+
+     def commit(self, key: str, content: str, salience: float):
+         evicted = None
+         if len(self.slots) >= self.max_slots:
+             # At capacity: evict the lowest-salience slot before committing the new one
+             self.slots.sort(key=lambda s: s.salience)
+             evicted = self.slots.pop(0)
+         self.slots.append(Slot(key=key, content=content, salience=salience))
+         self.history.append({"event":"commit","key":key,"salience":salience,"evicted":evicted.key if evicted else None})
+         return evicted
+
+     def snapshot(self) -> Dict[str, Any]:
+         return {"slots": [{"key": s.key, "content": s.content, "salience": s.salience} for s in self.slots]}
+
+     def randomize(self):
+         random.shuffle(self.slots)
+
+     def clear(self):
+         self.slots.clear()
+
+ class RandomWorkspace(Workspace):
+     # Ablation: evict and insert at random positions, ignoring salience
+     def commit(self, key: str, content: str, salience: float):
+         evicted = None
+         if len(self.slots) >= self.max_slots:
+             idx = random.randrange(len(self.slots))
+             evicted = self.slots.pop(idx)
+         idx = random.randrange(len(self.slots)+1) if self.slots else 0
+         self.slots.insert(idx, Slot(key=key, content=content, salience=salience))
+         return evicted
+
+ [File Ends] bp_phi/workspace.py
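+
+ A small self-contained sketch of the eviction behavior (a capacity of 2 is chosen only for illustration):
+
+ [Example Begins] bp_phi/workspace.py usage sketch
+ from bp_phi.workspace import Workspace
+
+ ws = Workspace(max_slots=2)
+ ws.commit("S1", "red cup left of book", salience=0.9)
+ ws.commit("S2", "discard item C", salience=0.4)
+ evicted = ws.commit("S3", "binoculars with the man", salience=0.7)
+ print(evicted.key)     # "S2": the lowest-salience slot is evicted at capacity
+ print(ws.snapshot())   # two remaining slots with their salience values
+ [Example Ends] bp_phi/workspace.py usage sketch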
+
+
+ <-- File Content Ends
+