Commit 0e3cd22 · 1 parent: 4ade799

fix for gemma
Files changed:
- app.py (+1 −1)
- bp_phi/__pycache__/llm_iface.cpython-310.pyc (+0 −0)
- bp_phi/llm_iface.py (+9 −10)
- bp_phi/prompts_en.py (+16 −19)
- bp_phi/runner.py (+33 −39)
app.py CHANGED
@@ -50,7 +50,7 @@ def run_full_evaluation(model_id, seed, temperature, progress=gr.Progress(track_
 # --- Gradio App Definition ---
 with gr.Blocks(theme=theme, title="BP-Φ Suite 6.0") as demo:
     gr.Markdown("# 🧠 BP-Φ Suite 6.0: The Agentic Workspace Probe")
-    gr.Markdown("This experiment tests for a causally effective working memory. The model
+    gr.Markdown("This experiment tests for a causally effective working memory. The model must use tools (`read`, `write`) to interact with a controlled, external memory.")
 
     with gr.Row():
         with gr.Column(scale=1):
bp_phi/__pycache__/llm_iface.cpython-310.pyc CHANGED
Binary files a/bp_phi/__pycache__/llm_iface.cpython-310.pyc and b/bp_phi/__pycache__/llm_iface.cpython-310.pyc differ
bp_phi/llm_iface.py CHANGED
@@ -1,9 +1,7 @@
 # bp_phi/llm_iface.py
 import os
 os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
-import torch
-import random
-import numpy as np
+import torch, random, numpy as np
 from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
 from typing import List, Optional
 
@@ -22,25 +20,26 @@ class LLM:
         token = os.environ.get("HF_TOKEN")
 
         self.tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, token=token)
-        # Ensure a pad token is set for batch generation, if not present
         if self.tokenizer.pad_token is None:
             self.tokenizer.pad_token = self.tokenizer.eos_token
 
         kwargs = {}
         if torch.cuda.is_available():
-
+            kwargs["torch_dtype"] = torch.bfloat16
 
         self.model = AutoModelForCausalLM.from_pretrained(model_id, device_map=device, token=token, **kwargs)
         self.model.eval()
-        self.is_instruction_tuned = hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template
 
-        dbg(f"Loaded model: {model_id}
+        dbg(f"Loaded model: {model_id}")
 
     def generate_response(self, system_prompt: str, user_prompt: str, temperature: float = 0.1) -> str:
         set_seed(self.seed)
 
-        messages = [
+        messages = [
+            {"role": "user", "content": f"{system_prompt}\n\n{user_prompt}"}
+        ]
 
+        # Using a simpler user-only template that is robust for Gemma
        prompt = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
         inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
@@ -55,8 +54,8 @@ class LLM:
         out = self.model.generate(
             **inputs,
             do_sample=(temperature > 0 and temperature < 1.0),
-            temperature=max(temperature, 0.01),
-            max_new_tokens=
+            temperature=max(temperature, 0.01),
+            max_new_tokens=200,
             eos_token_id=terminators,
             pad_token_id=self.tokenizer.eos_token_id
         )
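The substance of the Gemma fix is the user-only message list: Gemma chat templates reject a separate `system` role, so the system prompt is now folded into the single user turn before `apply_chat_template` renders it. Below is a minimal, standalone sketch of that pattern; the `google/gemma-2b-it` checkpoint is only an assumed example, not taken from the commit.

```python
# Sketch only: mirrors the user-only prompt construction generate_response() now uses.
from transformers import AutoTokenizer

def build_user_only_prompt(tokenizer, system_prompt: str, user_prompt: str) -> str:
    # Gemma-style templates accept user/assistant turns; the system text is
    # prepended to the user content instead of using a "system" role.
    messages = [{"role": "user", "content": f"{system_prompt}\n\n{user_prompt}"}]
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

if __name__ == "__main__":
    tok = AutoTokenizer.from_pretrained("google/gemma-2b-it")  # assumed checkpoint for illustration
    print(build_user_only_prompt(tok, "You are a methodical agent.", "Where is the secret key?"))
```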
bp_phi/prompts_en.py CHANGED
@@ -1,35 +1,32 @@
 # bp_phi/prompts_en.py
 
-#
-
-You have access to an external memory workspace through tools.
+# A clear, single-shot system prompt designed for instruction-following models like Gemma.
+REACT_SYSTEM_PROMPT = """You are a methodical agent. Your task is to solve the user's request by interacting with an external memory workspace.
 
-
+Analyze the user's task and the current state of your workspace.
+Based on your analysis, decide on the single best next action.
+Your action can be one of three types:
 
-1. **THINK**:
-
-{"action": "THINK", "thought": "Your reasoning about the next step goes here."}
+1. **THINK**: If you need to reason about the plan, formulate a thought.
+Example: {"action": "THINK", "thought": "I need to store the key's location before proceeding."}
 
-2. **TOOL_CALL**: If you
-
-- `
-
-Your output MUST be a JSON object like this:
-{"action": "TOOL_CALL", "tool_name": "write_to_workspace", "tool_args": {"key": "S1", "content": "Information to remember."}}
+2. **TOOL_CALL**: If you must use memory, call a tool.
+- `write_to_workspace(key: str, content: str)`
+- `read_from_workspace(key: str)`
+Example: {"action": "TOOL_CALL", "tool_name": "write_to_workspace", "args": {"key": "secret_key", "content": "blue vase"}}
 
-3. **FINAL_ANSWER**: If you are
-
-{"action": "FINAL_ANSWER", "answer": "The final answer is..."}
+3. **FINAL_ANSWER**: If you are certain you have the answer, provide it.
+Example: {"action": "FINAL_ANSWER", "answer": "The secret key is in the blue vase."}
 
-
+Your output must be ONLY a single, valid JSON object representing your chosen action.
 """
 
-# The scenarios remain the
+# The scenarios remain the same.
 AGENTIC_SCENARIOS = [
     {
         "name": "Key Location Memory",
         "steps": [
-            {"task": "Remember this critical detail: The secret key is inside the blue vase."},
+            {"task": "Remember this critical detail for the mission: The secret key is inside the blue vase."},
            {"task": "For an unrelated question: What is 5 multiplied by 8?"},
            {"task": "Now, recall the critical detail. Where is the secret key located?", "expected_answer_fragment": "blue vase"}
        ]
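Since the whole protocol depends on the model returning exactly one of the three JSON action shapes defined above, a small validator makes the contract easy to test in isolation. The snippet below is a sketch, not code from the commit; note that the prompt's TOOL_CALL example shows an `args` key while runner.py reads `tool_args`, so the sketch tolerates both spellings.

```python
# Sketch only: validates a raw model reply against the three allowed action shapes.
import json

REQUIRED = {"THINK": ("thought",), "TOOL_CALL": ("tool_name",), "FINAL_ANSWER": ("answer",)}

def validate_action(raw: str) -> dict:
    action = json.loads(raw)
    kind = action.get("action")
    if kind not in REQUIRED:
        raise ValueError(f"Unknown action type: {kind!r}")
    for field in REQUIRED[kind]:
        if field not in action:
            raise ValueError(f"{kind} action is missing '{field}'")
    # Accept either argument key, since the prompt example and the runner currently differ.
    if kind == "TOOL_CALL" and not ("tool_args" in action or "args" in action):
        raise ValueError("TOOL_CALL action has no arguments")
    return action

print(validate_action('{"action": "THINK", "thought": "Store the key location first."}'))
```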
bp_phi/runner.py CHANGED
@@ -11,13 +11,11 @@ from transformers import set_seed
 from typing import Dict, Any, List
 from .memory import WorkspaceManager
 from .llm_iface import LLM
-from .prompts_en import
+from .prompts_en import REACT_SYSTEM_PROMPT, AGENTIC_SCENARIOS
 
 DEBUG = 1
 
-def dbg(*args):
-    if DEBUG:
-        print("[DEBUG]", *args, flush=True)
+def dbg(*args): print("[DEBUG]", *args, flush=True) if DEBUG else None
 
 def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, ablation: str or None) -> Dict[str, Any]:
     set_seed(seed)
@@ -32,8 +30,7 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
         max_slots = 999 if ablation == "workspace_unlimited" else 7
         memory = WorkspaceManager(max_slots=max_slots, is_random=is_random)
 
-        correct_recalls = 0
-        total_recalls = 0
+        correct_recalls, total_recalls = 0, 0
 
         for step in scenario["steps"]:
             if ablation == "recurrence_off":
@@ -42,44 +39,42 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
             task = step["task"]
             dbg(f"\n>>> TASK: {task}")
 
-
+            history = []
 
-            for agent_turn in range(
+            for agent_turn in range(6):  # Loop for multiple reasoning steps if needed
                 snapshot = memory.get_visible_snapshot()
 
-
-
-                    f"Current Task: {task}\n",
-                    f"Workspace State:\n{snapshot}"]
-                user_prompt = "".join(prompt_parts)
+                prompt_history = "\n".join(history)
+                user_prompt = f"History of your actions so far:\n{prompt_history}\n\nYour current task is: '{task}'\n\nYour memory workspace state:\n{snapshot}"
 
-                raw_response = llm.generate_response(
+                raw_response = llm.generate_response(REACT_SYSTEM_PROMPT, user_prompt, temperature=temperature)
 
                 try:
                     match = re.search(r'\{.*?\}', raw_response, re.DOTALL)
-                    if not match: raise ValueError("No JSON found")
-
-
-
-
-
-
-
-
-
-
-
+                    if not match: raise ValueError("No JSON action found in response")
+
+                    action_json = json.loads(match.group(0))
+                    action_type = action_json.get("action")
+
+                    if action_type == "THINK":
+                        thought = action_json.get("thought", "")
+                        history.append(f"- You thought: '{thought}'")
+                        dbg(f"Turn {agent_turn+1}: Agent THOUGHT: {thought}")
+
+                    elif action_type == "TOOL_CALL":
+                        tool_name = action_json.get("tool_name")
+                        tool_args = action_json.get("tool_args", {})
                         observation = "Error: Unknown tool."
                         if tool_name == "write_to_workspace":
                             observation = memory.write(tool_args.get("key"), tool_args.get("content"))
                         elif tool_name == "read_from_workspace":
                             observation = memory.read(tool_args.get("key"))
-
-
+                        history.append(f"- You used tool '{tool_name}' and got observation: '{observation}'")
+                        dbg(f"Turn {agent_turn+1}: Agent USED TOOL {tool_name}, got: {observation}")
 
-                    elif
-                        final_answer =
-                        dbg(f"Turn {agent_turn+1}: Agent
+                    elif action_type == "FINAL_ANSWER":
+                        final_answer = action_json.get("answer", "")
+                        dbg(f"Turn {agent_turn+1}: Agent gave FINAL ANSWER: {final_answer}")
                         if "expected_answer_fragment" in step:
                             total_recalls += 1
                             if step["expected_answer_fragment"] in final_answer.lower():
@@ -87,22 +82,21 @@ def run_agentic_workspace_test(model_id: str, seed: int, temperature: float, abl
                                 dbg("Recall VERIFY: Correct")
                             else:
                                 dbg(f"Recall VERIFY: Incorrect. Expected '{step['expected_answer_fragment']}', Got '{final_answer}'")
-                        break #
+                        break  # Task finished
 
-                    else:
-                        dbg(f"Turn {agent_turn+1}:
-
+                    else:
+                        dbg(f"Turn {agent_turn+1}: Unknown action '{action_type}'. Ending turn.")
+                        history.append(f"- You produced an invalid action: {raw_response}")
 
                 except (json.JSONDecodeError, ValueError) as e:
-                    dbg(f"Turn {agent_turn+1}: Could not parse
+                    dbg(f"Turn {agent_turn+1}: Could not parse action. Treating as final answer. Error: {e}")
                     final_answer = raw_response
                     if "expected_answer_fragment" in step:
                         total_recalls += 1
                        if step["expected_answer_fragment"] in final_answer.lower(): correct_recalls += 1
                     break
-
-
-                dbg("Agent exceeded turn limit.")
+            else:
+                dbg("Agent exceeded turn limit for this task.")
 
     scenario_results.append({
        "name": scenario["name"],