Spaces:

crossentropy-ai
/

rlcube

Sleeping

App Files Files Community

imwithye committed on Sep 20

Commit

5f0be19

1 Parent(s): bfe342c

close to solve!

Browse files

Files changed (4) hide show

rlcube/cube2.ipynb +50 -158
rlcube/rlcube/models/models.py +25 -0
rlcube/rlcube/models/search.py +6 -3
rlcube/rlcube/train/train.py +5 -1

rlcube/cube2.ipynb CHANGED Viewed

@@ -40,6 +40,7 @@
    "source": [
     "from rlcube.models.models import DNN\n",
     "from rlcube.envs.cube2 import Cube2Env\n",
     "\n",
     "net = DNN()\n",
     "net.load(\"models/model_best.pth\")\n",
@@ -48,190 +49,81 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
-   "id": "16736f3a",
    "metadata": {},
    "outputs": [
     {
-     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 300/300 [00:02<00:00, 132.06it/s]\n"
      ]
     }
    ],
    "source": [
-    "from rlcube.models.search import MonteCarloTree\n",
     "\n",
     "env = Cube2Env()\n",
     "actions = []\n",
-    "for _ in range(3):\n",
     "    action = env.action_space.sample()\n",
-    "    actions.append(action)\n",
     "    env.step(action)\n",
-    "tree = MonteCarloTree(env.obs())"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "aee2a911",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "node = tree.root"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "048f58c9",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[np.int64(8), np.int64(1), np.int64(4)]"
-      ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "actions"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "00994021",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "tensor([3.4725e+00, 3.3189e+00, 1.2619e-02, 3.1231e-01, 1.1286e-02, 2.5817e-02,\n",
-       "        1.6722e-02, 2.1334e-02, 3.4603e+00, 7.5021e-02, 2.5891e-02, 2.8712e-03])"
-      ]
-     },
-     "execution_count": 5,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "node.u()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "fb9ac54c",
    "metadata": {},
    "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "defaultdict(<function rlcube.models.search.Node.__init__.<locals>.<lambda>()>,\n",
-       "            {0: 276,\n",
-       "             1: 7,\n",
-       "             2: 0,\n",
-       "             3: 0,\n",
-       "             4: 0,\n",
-       "             5: 0,\n",
-       "             6: 0,\n",
-       "             7: 0,\n",
-       "             8: 16,\n",
-       "             9: 0,\n",
-       "             10: 0,\n",
-       "             11: 0})"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "node.N"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "2f8a09d1",
-   "metadata": {},
-   "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "defaultdict(<function rlcube.models.search.Node.__init__.<locals>.<lambda>()>,\n",
-       "            {0: tensor([3.4720]),\n",
-       "             1: tensor([1.8959]),\n",
-       "             2: 0,\n",
-       "             3: 0,\n",
-       "             4: 0,\n",
-       "             5: 0,\n",
-       "             6: 0,\n",
-       "             7: 0,\n",
-       "             8: tensor([2.7285]),\n",
-       "             9: 0,\n",
-       "             10: 0,\n",
-       "             11: 0})"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "node.W"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "3e341459",
-   "metadata": {},
-   "outputs": [
     {
-     "data": {
-      "text/plain": [
-       "defaultdict(<function rlcube.models.search.Node.__init__.<locals>.<lambda>()>,\n",
-       "            {0: 4,\n",
-       "             1: 0,\n",
-       "             2: 0,\n",
-       "             3: 0,\n",
-       "             4: 0,\n",
-       "             5: 2,\n",
-       "             6: 0,\n",
-       "             7: 0,\n",
-       "             8: 269,\n",
-       "             9: 0,\n",
-       "             10: 0,\n",
-       "             11: 0})"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
     }
    ],
    "source": [
-    "node.children[0].N"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "51dddf56",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "node.children[8].N"
    ]
   }
  ],

    "source": [
     "from rlcube.models.models import DNN\n",
     "from rlcube.envs.cube2 import Cube2Env\n",
+    "import torch\n",
     "\n",
     "net = DNN()\n",
     "net.load(\"models/model_best.pth\")\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 16,
+   "id": "defde44e",
    "metadata": {},
    "outputs": [
     {
+     "name": "stdout",
      "output_type": "stream",
      "text": [
+      "[2, 3, 7, 6, 8, 6, 3, 2, 2, 5]\n",
+      "tensor([[ 1.1924],\n",
+      "        [ 0.0826],\n",
+      "        [ 1.0202],\n",
+      "        [ 0.0826],\n",
+      "        [ 1.1121],\n",
+      "        [-0.0302],\n",
+      "        [-1.5963],\n",
+      "        [-0.0302],\n",
+      "        [-1.3707],\n",
+      "        [-2.4068]], grad_fn=<AddmmBackward0>)\n"
      ]
     }
    ],
    "source": [
+    "import numpy as np\n",
     "\n",
     "env = Cube2Env()\n",
+    "\n",
     "actions = []\n",
+    "obs = []\n",
+    "for _ in range(10):\n",
     "    action = env.action_space.sample()\n",
+    "    actions.append(action.item())\n",
     "    env.step(action)\n",
+    "    obs.append(env.obs())\n",
+    "\n",
+    "obs = torch.tensor(np.array(obs), dtype=torch.float32)\n",
+    "values, policies = net(obs)\n",
+    "print(actions)\n",
+    "print(values)"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 18,
+   "id": "cae20b12",
    "metadata": {},
    "outputs": [
     {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      " 14%|█▍        | 43/300 [00:00<00:02, 127.98it/s]"
+     ]
+    },
     {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[4, 3, 7, 11]\n"
+     ]
+    },
     {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n"
+     ]
     }
    ],
    "source": [
+    "from rlcube.models.search import MonteCarloTree\n",
+    "\n",
+    "tree = MonteCarloTree(env.obs(), max_simulations=300)\n",
+    "if tree.is_solved:\n",
+    "    print([action for _, action in tree.solved_path])"
    ]
   }
  ],

rlcube/rlcube/models/models.py CHANGED Viewed

@@ -79,6 +79,31 @@ class DNN(nn.Module):
         self.load_state_dict(torch.load(filepath))
 if __name__ == "__main__":
     print("Testing RewardNet")
     env = Cube2Env()

         self.load_state_dict(torch.load(filepath))
+class DNN2(nn.Module):
+    """Fully connected network with a shared body and two output heads.
+
+    ``forward`` returns a ``(value, policy)`` tuple: the value head emits one
+    number per sample and the policy head emits 12 numbers per sample.
+    """
+    def __init__(self):
+        """Build the shared body and the policy/value heads."""
+        super(DNN2, self).__init__()
+        # Shared body: 24 * 6 = 144 input features -> 4096 -> 2048, ELU after each layer.
+        # NOTE(review): presumably 24 stickers x 6 colours from Cube2Env — confirm.
+        self.body = nn.Sequential(
+            nn.Linear(24 * 6, 4096), nn.ELU(), nn.Linear(4096, 2048), nn.ELU()
+        )
+        # Policy head: 2048 -> 512 -> 12 raw outputs; no softmax is applied here.
+        self.policy = nn.Sequential(nn.Linear(2048, 512), nn.ELU(), nn.Linear(512, 12))
+        # Value head: 2048 -> 512 -> 1 output.
+        self.value = nn.Sequential(nn.Linear(2048, 512), nn.ELU(), nn.Linear(512, 1))
+    def forward(self, x):
+        """Run a batch through the network.
+
+        Args:
+            x: Batched input tensor; dimension 0 is used as the batch size and
+                all remaining dimensions are flattened together.
+
+        Returns:
+            Tuple ``(value, policy)`` — note the order: value first, then policy.
+        """
+        batch_size = x.size(0)
+        # Collapse everything after the batch dimension into a single feature axis.
+        x = x.view(batch_size, -1)
+        x = self.body(x)
+        value = self.value(x)
+        policy = self.policy(x)
+        return value, policy
+    def save(self, filepath: str):
+        """Write this module's ``state_dict`` to ``filepath`` with ``torch.save``."""
+        torch.save(self.state_dict(), filepath)
+    def load(self, filepath: str):
+        """Read a ``state_dict`` from ``filepath`` and load it into this module."""
+        # NOTE(review): torch.load is called without map_location; confirm the
+        # checkpoint's saved device is available wherever this is loaded.
+        self.load_state_dict(torch.load(filepath))
 if __name__ == "__main__":
     print("Testing RewardNet")
     env = Cube2Env()

rlcube/rlcube/models/search.py CHANGED Viewed

@@ -14,9 +14,9 @@ class Node:
         self.obs = torch.tensor(obs, dtype=torch.float32)
         self.parent = parent
-        out = net(self.obs.unsqueeze(0))
-        value = out["value"].detach()
-        policy = torch.softmax(out["policy"].detach(), dim=1)
         self.is_solved = Cube2Env.from_obs(obs).is_solved()
         self.value = torch.tensor(1) if self.is_solved else value.view(-1)
@@ -55,6 +55,7 @@ class MonteCarloTree:
         self.root = Node(obs)
         self.nodes = [self.root]
         self.is_solved = False
         self._build()
     def _build(self):
@@ -80,6 +81,8 @@ class MonteCarloTree:
                 node.children[i] = child
                 self.nodes.append(child)
                 self.is_solved = self.is_solved or child.is_solved
             # Backup
             for parent, action in reversed(path):

         self.obs = torch.tensor(obs, dtype=torch.float32)
         self.parent = parent
+        value, policy = net(self.obs.unsqueeze(0))
+        value = value.detach()
+        policy = torch.softmax(policy.detach(), dim=1)
         self.is_solved = Cube2Env.from_obs(obs).is_solved()
         self.value = torch.tensor(1) if self.is_solved else value.view(-1)
         self.root = Node(obs)
         self.nodes = [self.root]
         self.is_solved = False
+        self.solved_path = []
         self._build()
     def _build(self):
                 node.children[i] = child
                 self.nodes.append(child)
                 self.is_solved = self.is_solved or child.is_solved
+                if child.is_solved:
+                    self.solved_path = path + [(node, i)]
             # Backup
             for parent, action in reversed(path):

rlcube/rlcube/train/train.py CHANGED Viewed

@@ -29,7 +29,7 @@ def train(epochs: int = 100):
     if os.path.exists("models/model_best.pth"):
         net.load("models/model_best.pth")
     net = net.to(device)
-    optimizer = torch.optim.RMSprop(net.parameters(), lr=0.00001)
     value_loss_fn = torch.nn.MSELoss(reduction="none")
     policy_loss_fn = torch.nn.CrossEntropyLoss(reduction="none")
@@ -42,6 +42,8 @@ def train(epochs: int = 100):
             states, neighbors, D = states.to(device), neighbors.to(device), D.to(device)
             values, policies = net(states)
             with torch.no_grad():
                 batch_size = neighbors.shape[0]
@@ -53,7 +55,9 @@ def train(epochs: int = 100):
                 nrewards = rewards_out.view(batch_size, 12, -1)
                 target_values, indices = (nvalues + nrewards).max(dim=1)
                 target_values = target_values.detach()
                 indices = indices.reshape(-1)
                 weights = 1 / D.reshape(-1).detach()

     if os.path.exists("models/model_best.pth"):
         net.load("models/model_best.pth")
     net = net.to(device)
+    optimizer = torch.optim.RMSprop(net.parameters(), lr=0.000001)
     value_loss_fn = torch.nn.MSELoss(reduction="none")
     policy_loss_fn = torch.nn.CrossEntropyLoss(reduction="none")
             states, neighbors, D = states.to(device), neighbors.to(device), D.to(device)
             values, policies = net(states)
+            rewards = reward(states)
+            masks = torch.where(rewards > 0, 0, 1).unsqueeze(1)
             with torch.no_grad():
                 batch_size = neighbors.shape[0]
                 nrewards = rewards_out.view(batch_size, 12, -1)
                 target_values, indices = (nvalues + nrewards).max(dim=1)
+                target_values = target_values * masks
                 target_values = target_values.detach()
                 indices = indices.reshape(-1)
                 weights = 1 / D.reshape(-1).detach()