bayesian optimization added, code ported to pytorch
maurock committed Dec 8, 2020
1 parent 9e8335c commit 55bbb47
Showing 6 changed files with 167 additions and 129 deletions.
2 changes: 0 additions & 2 deletions .gitattributes

This file was deleted.

107 changes: 84 additions & 23 deletions DQN.py
@@ -1,14 +1,18 @@
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
import random
import numpy as np
import pandas as pd
from operator import add
import collections
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import copy
DEVICE = 'cpu' # 'cuda' if torch.cuda.is_available() else 'cpu'

class DQNAgent(object):
class DQNAgent(torch.nn.Module):
def __init__(self, params):
super().__init__()
self.reward = 0
self.gamma = 0.9
self.dataframe = pd.DataFrame()
@@ -24,22 +28,43 @@ def __init__(self, params):
self.memory = collections.deque(maxlen=params['memory_size'])
self.weights = params['weights_path']
self.load_weights = params['load_weights']
self.model = self.network()

self.optimizer = None
self.network()

def network(self):
model = Sequential()
model.add(Dense(output_dim=self.first_layer, activation='relu', input_dim=11))
model.add(Dense(output_dim=self.second_layer, activation='relu'))
model.add(Dense(output_dim=self.third_layer, activation='relu'))
model.add(Dense(output_dim=3, activation='softmax'))
opt = Adam(self.learning_rate)
model.compile(loss='mse', optimizer=opt)

# Layers
self.f1 = nn.Linear(11, self.first_layer)
self.f2 = nn.Linear(self.first_layer, self.second_layer)
self.f3 = nn.Linear(self.second_layer, self.third_layer)
self.f4 = nn.Linear(self.third_layer, 3)
# weights
if self.load_weights:
model.load_weights(self.weights)
return model
self.model = self.load_state_dict(torch.load(self.weights))
print("weights loaded")

def forward(self, x):
x = F.relu(self.f1(x))
x = F.relu(self.f2(x))
x = F.relu(self.f3(x))
x = F.softmax(self.f4(x), dim=-1)
return x

def get_state(self, game, player, food):
"""
Return the state.
The state is a numpy array of 11 values, representing:
- Danger 1 OR 2 steps ahead
- Danger 1 OR 2 steps on the right
- Danger 1 OR 2 steps on the left
- Snake is moving left
- Snake is moving right
- Snake is moving up
- Snake is moving down
- The food is on the left
- The food is on the right
- The food is on the upper side
- The food is on the lower side
"""
state = [
(player.x_change == 20 and player.y_change == 0 and ((list(map(add, player.position[-1], [20, 0])) in player.position) or
player.position[-1][0] + 20 >= (game.game_width - 20))) or (player.x_change == -20 and player.y_change == 0 and ((list(map(add, player.position[-1], [-20, 0])) in player.position) or
@@ -70,7 +95,7 @@ def get_state(self, game, player, food):
food.x_food > player.x, # food right
food.y_food < player.y, # food up
food.y_food > player.y # food down
]
]

for i in range(len(state)):
if state[i]:
@@ -81,6 +106,13 @@ def get_state(self, game, player, food):
return np.asarray(state)

def set_reward(self, player, crash):
"""
Return the reward.
The reward is:
-10 when Snake crashes.
+10 when Snake eats food
0 otherwise
"""
self.reward = 0
if crash:
self.reward = -10
@@ -90,25 +122,54 @@ def set_reward(self, player, crash):
return self.reward

def remember(self, state, action, reward, next_state, done):
"""
Store the <state, action, reward, next_state, is_done> tuple in a
memory buffer for replay memory.
"""
self.memory.append((state, action, reward, next_state, done))

def replay_new(self, memory, batch_size):
"""
Replay memory.
"""
if len(memory) > batch_size:
minibatch = random.sample(memory, batch_size)
else:
minibatch = memory
for state, action, reward, next_state, done in minibatch:
self.train()
torch.set_grad_enabled(True)
target = reward
next_state_tensor = torch.tensor(np.expand_dims(next_state, 0), dtype=torch.float32).to(DEVICE)
state_tensor = torch.tensor(np.expand_dims(state, 0), dtype=torch.float32, requires_grad=True).to(DEVICE)
if not done:
target = reward + self.gamma * np.amax(self.model.predict(np.array([next_state]))[0])
target_f = self.model.predict(np.array([state]))
target = reward + self.gamma * torch.max(self.forward(next_state_tensor)[0])
output = self.forward(state_tensor)
target_f = output.clone()
target_f[0][np.argmax(action)] = target
self.model.fit(np.array([state]), target_f, epochs=1, verbose=0)
target_f.detach()
self.optimizer.zero_grad()
loss = F.mse_loss(output, target_f)
loss.backward()
self.optimizer.step()

def train_short_memory(self, state, action, reward, next_state, done):
"""
Train the DQN agent on the <state, action, reward, next_state, is_done>
tuple at the current timestep.
"""
self.train()
torch.set_grad_enabled(True)
target = reward
next_state_tensor = torch.tensor(next_state.reshape((1, 11)), dtype=torch.float32).to(DEVICE)
state_tensor = torch.tensor(state.reshape((1, 11)), dtype=torch.float32, requires_grad=True).to(DEVICE)
if not done:
target = reward + self.gamma * np.amax(self.model.predict(next_state.reshape((1, 11)))[0])
target_f = self.model.predict(state.reshape((1, 11)))
target = reward + self.gamma * torch.max(self.forward(next_state_tensor[0]))
output = self.forward(state_tensor)
target_f = output.clone()
target_f[0][np.argmax(action)] = target
self.model.fit(state.reshape((1, 11)), target_f, epochs=1, verbose=0)
target_f.detach()
self.optimizer.zero_grad()
loss = F.mse_loss(output, target_f)
loss.backward()
self.optimizer.step()
28 changes: 22 additions & 6 deletions README.md
@@ -1,18 +1,25 @@
# Deep Reinforcement Learning
## Project: Train AI to play Snake
*UPDATE:*

This project has been recently updated:
- The code of Deep Reinforcement Learning was ported from Keras/TF to Pytorch. To see the original version of the code in Keras/TF, please refer to this repository: [snake-ga-tf](https://github.com/maurock/snake-ga-tf).
- I added Bayesian Optimization to optimize some parameters of Deep RL.

## Introduction
The goal of this project is to develop an AI Bot able to learn how to play the popular game Snake from scratch. In order to do it, I implemented a Deep Reinforcement Learning algorithm. This approach consists in giving the system parameters related to its state, and a positive or negative reward based on its actions. No rules about the game are given, and initially the Bot has no information on what it needs to do. The goal for the system is to figure it out and elaborate a strategy to maximize the score - or the reward.
We are going to see how a Deep Q-Learning algorithm learns how to play snake, scoring up to 50 points and showing a solid strategy after only 5 minutes of training.
The goal of this project is to develop an AI Bot able to learn how to play the popular game Snake from scratch. To do so, I implemented a Deep Reinforcement Learning algorithm. This approach consists of giving the system parameters describing its state, and a positive or negative reward based on its actions. No rules about the game are given, and initially the Bot has no information on what it needs to do. The goal for the system is to figure this out and develop a strategy to maximize the score - or the reward. \
We are going to see how a Deep Q-Learning algorithm learns how to play Snake, scoring up to 50 points and showing a solid strategy after only 5 minutes of training. \
Additionally, it is possible to run the Bayesian Optimization method to find the optimal parameters of the Deep neural network, as well as some parameters of the Deep RL approach.

## Install
This project requires Python 3.6 with the pygame library installed, as well as Keras with Tensorflow backend.
This project requires Python 3.6 with the pygame library installed, as well as PyTorch. \
The full list of requirements is in `requirements.txt`.
```bash
git clone [email protected]:maurock/snake-ga.git
```
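
The dependencies can then be installed, for example, with:

```bash
cd snake-ga
pip install -r requirements.txt
```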

## Run
To run the game, executes in the snake-ga folder:
To run and display the game, execute the following in the snake-ga folder:

```python
python snakeClass.py --display=True --speed=50
@@ -22,15 +29,24 @@ Arguments description:
- --display - Type bool, default True, display or not game view
- --speed - Type integer, default 50, game speed

This will run and show the agent. The default configuration loads the file *weights/weights.hdf5* and runs a test.
The Deep neural network can be customized in the file snakeClass.py modifying the dictionary *params* in the function *define_parameters()*
The default configuration loads the file *weights/weights.hdf5* and runs a test.
The parameters of the Deep neural network can be changed in *snakeClass.py* by modifying the dictionary `params` in the function `define_parameters()`.

To train the agent, set the following in *snakeClass.py*:
- params['load_weights'] = False
- params['train'] = True

In *snakeClass.py* you can set the arguments *--display*=False and *--speed*=0 if you do not want to see the game running. This speeds up the training phase.
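
As a reference, here is a hypothetical sketch of such a configuration (the key names come from `DQN.py` and this README; the values are placeholders, and keys used elsewhere in the code are omitted):

```python
def define_parameters():
    params = {
        # Network and training (placeholder values, not the repository defaults)
        'learning_rate': 0.001,
        'first_layer': 200,
        'second_layer': 20,
        'third_layer': 50,
        # Replay memory and weights handling
        'memory_size': 2500,
        'weights_path': 'weights/weights.hdf5',
        'load_weights': False,   # False: train from scratch
        'train': True,           # True: run a training session
    }
    return params
```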

## Optimize Deep RL with Bayesian Optimization
To optimize the Deep neural network and additional parameters, run:

```python
python snakeClass.py --bayesianopt=True
```

This method uses Bayesian Optimization to optimize some parameters of Deep RL. The parameters and their search space can be modified in *bayesOpt.py* by editing the `optim_params` dictionary in `optimize_RL`.
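
For reference, a minimal GPyOpt-style sketch of this kind of search follows (the parameter names, bounds, and the `run_training` helper are illustrative only; the actual `optim_params` in *bayesOpt.py* may be structured differently):

```python
from GPyOpt.methods import BayesianOptimization

# Illustrative search space; names, types and bounds are assumptions.
search_space = [
    {'name': 'learning_rate', 'type': 'continuous', 'domain': (0.00005, 0.001)},
    {'name': 'first_layer',   'type': 'discrete',   'domain': tuple(range(20, 201, 20))},
    {'name': 'second_layer',  'type': 'discrete',   'domain': tuple(range(20, 201, 20))},
]

def objective(x):
    # GPyOpt passes a 2D array of candidate parameters, one row per evaluation.
    learning_rate, first_layer, second_layer = x[0]
    # run_training is a hypothetical helper that trains the agent with these
    # parameters and returns its mean score; GPyOpt minimizes, hence the minus sign.
    return -run_training(learning_rate, int(first_layer), int(second_layer))

optimizer = BayesianOptimization(f=objective, domain=search_space)
optimizer.run_optimization(max_iter=20)
print('best parameters:', optimizer.x_opt, 'best value:', optimizer.fx_opt)
```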

## For Mac users
It seems there is an OSX-specific problem, since many users cannot see the game running.
To fix this problem, add this line in update_screen().
70 changes: 7 additions & 63 deletions requirements.txt
@@ -1,65 +1,9 @@
absl-py==0.8.0
astor==0.8.0
blinker==1.4
brotlipy==0.7.0
cachetools==4.1.0
certifi==2020.4.5.2
cffi==1.14.0
chardet==3.0.4
click==7.1.2
cmake-example==0.0.1
cryptography==2.9.2
cycler==0.10.0
gast==0.2.2
google-auth==1.14.1
google-auth-oauthlib==0.4.1
google-pasta==0.1.7
grpcio==1.27.2
h5py==2.10.0
idna==2.9
Keras==2.3.1
Keras-Applications==1.0.8
Keras-Preprocessing==1.1.0
kiwisolver==1.2.0
Markdown==3.1.1
matplotlib==3.2.0
mkl-fft==1.1.0
mkl-random==1.1.1
mkl-service==2.3.0
msgpack-numpy==0.4.4.3
numpy==1.18.1
oauthlib==3.1.0
opt-einsum==3.1.0
Keras==2.2.4
numpy==1.17.2
torch==1.4.0
seaborn==0.9.0
pygame==1.9.3
pandas==0.25.1
protobuf==3.12.3
pyasn1==0.4.8
pyasn1-modules==0.2.7
pycparser==2.20
pygame==1.9.6
PyJWT==1.7.1
pyOpenSSL==19.1.0
pyparsing==2.4.7
pyreadline==2.1
PySocks==1.7.1
python-dateutil==2.8.1
pytz==2020.1
PyYAML==5.3.1
requests==2.23.0
requests-oauthlib==1.3.0
rsa==4.0
scipy==1.4.1
seaborn==0.10.1
six==1.15.0
tabulate==0.8.3
tensorboard==2.2.1
tensorboard-plugin-wit==1.6.0
tensorflow==2.1.0
tensorflow-estimator==1.14.0
tensorpack==0.9.4
termcolor==1.1.0
tgan==0.1.0
urllib3==1.25.9
Werkzeug==0.16.1
win-inet-pton==1.1.0
wincertstore==0.2
wrapt==1.11.2
GPyOpt==1.2.6
numpy==1.19.4