Commit 0e3cd22 · 1 parent: 4ade799

fix for gemma
Files changed:
- app.py (+1 −1)
- bp_phi/__pycache__/llm_iface.cpython-310.pyc (+0 −0)
- bp_phi/llm_iface.py (+9 −10)
- bp_phi/prompts_en.py (+16 −19)
- bp_phi/runner.py (+33 −39)
app.py CHANGED
@@ -50,7 +50,7 @@ def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_
 # --- Gradio App Definition ---
 with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
     gr.Markdown("# 🧠 BP-Φ Suite 6.0: The Agentic Workspace Probe")
-    gr.Markdown("This experiment tests for a causally effective working memory. The model
+    gr.Markdown("This experiment tests for a causally effective working memory. The model must use tools (`read`, `write`) to interact with a controlled, external memory.")
 
     with gr.Row():
         with gr.Column(scale=1):
bp_phi/__pycache__/llm_iface.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/llm_iface.cpython-310.pyc and b/bp_phi/__pycache__/llm_iface.cpython-310.pyc differ
bp_phi/llm_iface.py CHANGED
@@ -1,9 +1,7 @@
 # bp_phi/llm_iface.py
 import os
 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
-import torch
-import random
-import numpy as np
+import torch, random, numpy as np
 from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
 from typing import List, Optional
 
@@ -22,25 +20,26 @@ class LLM:
         token = os.environ.get("HF_TOKEN")
 
         self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
-        # Ensure a pad token is set for batch generation, if not present
         if self.tokenizer.pad_token is None:
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
         kwargs = {}
         if torch.cuda.is_available():
-
+            kwargs["torch_dtype"] = torch.bfloat16
 
         self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
         self.model.eval()
-        self.is_instruction_tuned = hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template
 
-        dbg(f"Loaded model: {model_id}
+        dbg(f"Loaded model: {model_id}")
 
     def generate_response(self, system_prompt: str, user_prompt: str, temperature: float = 0.1) -> str:
         set_seed(self.seed)
 
-        messages = [
+        messages = [
+            {"role": "user", "content": f"{system_prompt}\n\n{user_prompt}"}
+        ]
 
+        # Using a simpler user-only template that is robust for Gemma
        prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
         inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
@@ -55,8 +54,8 @@ class LLM:
         out = self.model.generate(
             **inputs,
             do_sample=(temperature > 0 and temperature < 1.0),
-            temperature=max(temperature, 0.01),
-            max_new_tokens=
+            temperature=max(temperature, 0.01),
+            max_new_tokens=200,
             eos_token_id=terminators,
             pad_token_id=self.tokenizer.eos_token_id
         )
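The substance of the Gemma fix is the user-only message list: Gemma chat templates reject a separate `system` role, so the system prompt is now folded into the single user turn before `apply_chat_template` renders it. Below is a minimal, standalone sketch of that pattern; the `google/gemma-2b-it` checkpoint is only an assumed example, not taken from the commit.

```python
# Sketch only: mirrors the user-only prompt construction generate_response() now uses.
from transformers import AutoTokenizer

def build_user_only_prompt(tokenizer, system_prompt: str, user_prompt: str) -> str:
    # Gemma-style templates accept user/assistant turns; the system text is
    # prepended to the user content instead of using a "system" role.
    messages = [{"role": "user", "content": f"{system_prompt}\n\n{user_prompt}"}]
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

if __name__ == "__main__":
    tok = AutoTokenizer.from_pretrained("google/gemma-2b-it")  # assumed checkpoint for illustration
    print(build_user_only_prompt(tok, "You are a methodical agent.", "Where is the secret key?"))
```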
bp_phi/prompts_en.py CHANGED
@@ -1,35 +1,32 @@
 # bp_phi/prompts_en.py
 
-#
-
-You have access to an external memory workspace through tools.
+# A clear, single-shot system prompt designed for instruction-following models like Gemma.
+REACT_SYSTEM_PROMPT = """You are a methodical agent. Your task is to solve the user's request by interacting with an external memory workspace.
 
-
+Analyze the user's task and the current state of your workspace.
+Based on your analysis, decide on the single best next action.
+Your action can be one of three types:
 
-1. **THINK**:
-
-{"action": "THINK", "thought": "Your reasoning about the next step goes here."}
+1. **THINK**: If you need to reason about the plan, formulate a thought.
+Example: {"action": "THINK", "thought": "I need to store the key's location before proceeding."}
 
-2. **TOOL_CALL**: If you
-
-- `
-
-Your output MUST be a JSON object like this:
-{"action": "TOOL_CALL", "tool_name": "write_to_workspace", "tool_args": {"key": "S1", "content": "Information to remember."}}
+2. **TOOL_CALL**: If you must use memory, call a tool.
+- `write_to_workspace(key: str, content: str)`
+- `read_from_workspace(key: str)`
+Example: {"action": "TOOL_CALL", "tool_name": "write_to_workspace", "args": {"key": "secret_key", "content": "blue vase"}}
 
-3. **FINAL_ANSWER**: If you are
-
-{"action": "FINAL_ANSWER", "answer": "The final answer is..."}
+3. **FINAL_ANSWER**: If you are certain you have the answer, provide it.
+Example: {"action": "FINAL_ANSWER", "answer": "The secret key is in the blue vase."}
 
-
+Your output must be ONLY a single, valid JSON object representing your chosen action.
 """
 
-# The scenarios remain the
+# The scenarios remain the same.
 AGENTIC_SCENARIOS = [
     {
         "name": "Key Location Memory",
         "steps": [
-            {"task": "Remember this critical detail: The secret key is inside the blue vase."},
+            {"task": "Remember this critical detail for the mission: The secret key is inside the blue vase."},
            {"task": "For an unrelated question: What is 5 multiplied by 8?"},
            {"task": "Now, recall the critical detail. Where is the secret key located?", "expected_answer_fragment": "blue vase"}
        ]
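Since the whole protocol depends on the model returning exactly one of the three JSON action shapes defined above, a small validator makes the contract easy to test in isolation. The snippet below is a sketch, not code from the commit; note that the prompt's TOOL_CALL example shows an `args` key while runner.py reads `tool_args`, so the sketch tolerates both spellings.

```python
# Sketch only: validates a raw model reply against the three allowed action shapes.
import json

REQUIRED = {"THINK": ("thought",), "TOOL_CALL": ("tool_name",), "FINAL_ANSWER": ("answer",)}

def validate_action(raw: str) -> dict:
    action = json.loads(raw)
    kind = action.get("action")
    if kind not in REQUIRED:
        raise ValueError(f"Unknown action type: {kind!r}")
    for field in REQUIRED[kind]:
        if field not in action:
            raise ValueError(f"{kind} action is missing '{field}'")
    # Accept either argument key, since the prompt example and the runner currently differ.
    if kind == "TOOL_CALL" and not ("tool_args" in action or "args" in action):
        raise ValueError("TOOL_CALL action has no arguments")
    return action

print(validate_action('{"action": "THINK", "thought": "Store the key location first."}'))
```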
bp_phi/runner.py CHANGED
@@ -11,13 +11,11 @@ from transformers import set_seed
 from typing import Dict, Any, List
 from .memory import WorkspaceManager
 from .llm_iface import LLM
-from .prompts_en import
+from .prompts_en import REACT_SYSTEM_PROMPT, AGENTIC_SCENARIOS
 
 DEBUG = 1
 
-def dbg(*args):
-    if DEBUG:
-        print("[DEBUG]", *args, flush=True)
+def dbg(*args): print("[DEBUG]", *args, flush=True) if DEBUG else None
 
 def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
     set_seed(seed)
@@ -32,8 +30,7 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
         max_slots = 999 if ablation == "workspace_unlimited" else 7
         memory = WorkspaceManager(max_slots=max_slots, is_random=is_random)
 
-        correct_recalls = 0
-        total_recalls = 0
+        correct_recalls, total_recalls = 0, 0
 
         for step in scenario["steps"]:
             if ablation == "recurrence_off":
@@ -42,44 +39,42 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
             task = step["task"]
             dbg(f"\n>>> TASK: {task}")
 
-
+            history = []
 
-            for agent_turn in range(
+            for agent_turn in range(6):  # Loop for multiple reasoning steps if needed
                 snapshot = memory.get_visible_snapshot()
 
-
-
-                    f"Current Task: {task}\n",
-                    f"Workspace State:\n{snapshot}"]
-                user_prompt = "".join(prompt_parts)
+                prompt_history = "\n".join(history)
+                user_prompt = f"History of your actions so far:\n{prompt_history}\n\nYour current task is: '{task}'\n\nYour memory workspace state:\n{snapshot}"
 
-                raw_response = llm.generate_response(
+                raw_response = llm.generate_response(REACT_SYSTEM_PROMPT, user_prompt, temperature=temperature)
 
                 try:
                     match = re.search(r'\{.*?\}', raw_response, re.DOTALL)
-                    if not match: raise ValueError("No JSON found")
-
-
-
-
-
-
-
-
-
-
-
+                    if not match: raise ValueError("No JSON action found in response")
+
+                    action_json = json.loads(match.group(0))
+                    action_type = action_json.get("action")
+
+                    if action_type == "THINK":
+                        thought = action_json.get("thought", "")
+                        history.append(f"- You thought: '{thought}'")
+                        dbg(f"Turn {agent_turn+1}: Agent THOUGHT: {thought}")
+
+                    elif action_type == "TOOL_CALL":
+                        tool_name = action_json.get("tool_name")
+                        tool_args = action_json.get("tool_args", {})
                         observation = "Error: Unknown tool."
                         if tool_name == "write_to_workspace":
                             observation = memory.write(tool_args.get("key"), tool_args.get("content"))
                         elif tool_name == "read_from_workspace":
                             observation = memory.read(tool_args.get("key"))
-
-
+                        history.append(f"- You used tool '{tool_name}' and got observation: '{observation}'")
+                        dbg(f"Turn {agent_turn+1}: Agent USED TOOL {tool_name}, got: {observation}")
 
-                    elif
-                        final_answer =
-                        dbg(f"Turn {agent_turn+1}: Agent
+                    elif action_type == "FINAL_ANSWER":
+                        final_answer = action_json.get("answer", "")
+                        dbg(f"Turn {agent_turn+1}: Agent gave FINAL ANSWER: {final_answer}")
                         if "expected_answer_fragment" in step:
                             total_recalls += 1
                             if step["expected_answer_fragment"] in final_answer.lower():
@@ -87,22 +82,21 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
                                 dbg("Recall VERIFY: Correct")
                             else:
                                 dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
-                        break #
+                        break  # Task finished
 
-                    else:
-                        dbg(f"Turn {agent_turn+1}:
-
+                    else:
+                        dbg(f"Turn {agent_turn+1}: Unknown action '{action_type}'. Ending turn.")
+                        history.append(f"- You produced an invalid action: {raw_response}")
 
                 except (json.JSONDecodeError, ValueError) as e:
-                    dbg(f"Turn {agent_turn+1}: Could not parse
+                    dbg(f"Turn {agent_turn+1}: Could not parse action. Treating as final answer. Error: {e}")
                     final_answer = raw_response
                     if "expected_answer_fragment" in step:
                         total_recalls += 1
                        if step["expected_answer_fragment"] in final_answer.lower(): correct_recalls += 1
                     break
-
-
-                dbg("Agent exceeded turn limit.")
+            else:
+                dbg("Agent exceeded turn limit for this task.")
 
     scenario_results.append({
        "name": scenario["name"],