From b73c367f86cbd254459aa060d8002a8748dba87f Mon Sep 17 00:00:00 2001 From: Niklas H Date: Tue, 23 Mar 2021 12:16:51 +0100 Subject: [PATCH] Changes to make it work --- components/agent.py | 64 ++++-- components/autoencoder.py | 58 +++-- components/state_builder.py | 63 ++++- cross_circle_gym/envs/cross_circle_base.py | 141 ++++++++---- .../envs/cross_circle_mixed_rand.py | 5 +- main.py | 216 ++++++++++-------- scripts/hyperparameter.sh | 52 +++++ scripts/training.sh | 43 ++++ utils.py | 116 ++++++++++ 9 files changed, 568 insertions(+), 190 deletions(-) create mode 100644 scripts/hyperparameter.sh create mode 100644 scripts/training.sh create mode 100644 utils.py diff --git a/components/agent.py b/components/agent.py index 8b1edeb..798792f 100644 --- a/components/agent.py +++ b/components/agent.py @@ -70,10 +70,13 @@ def update_target_model(self): def remember(self, state, action, reward, next_state, done): self.memory.append((state, action, reward, next_state, done)) - def act(self, state): - if np.random.rand() <= self.epsilon: - return random.randrange(self.action_size) - act_values = self.model.predict(state) + def act(self, state,random=True): + if random: + if np.random.rand() <= self.epsilon: + return random.randrange(self.action_size) + act_values = self.model.predict(state) + else: + act_values = self.model.predict(state) return np.argmax(act_values[0]) # returns action def replay(self, batch_size): @@ -98,24 +101,30 @@ def save(self, name): class TabularAgent: '''RL agent as described in the DSRL paper''' - def __init__(self, action_size, neighbor_radius=25): + def __init__(self, action_size,alpha,epsilon_decay,neighbor_radius=25): self.action_size = action_size + self.alpha = alpha self.epsilon = 1 - self.epsilon_decay = 0.999 + self.epsilon_decay = epsilon_decay self.epsilon_min = 0.1 self.gamma = 0.95 self.neighbor_radius=neighbor_radius + self.offset = neighbor_radius*2 self.tables = {} - def act(self, state): + def act(self, state,random_act=True): 
''' Determines action to take based on given state State: Array of interactions (entities in each interaction are presorted by type for consistency) Returns: action to take, chosen e-greedily ''' + if not random_act: + return np.argmax(self._total_rewards(state)) if np.random.rand() <= self.epsilon: - print('random action, e:', self.epsilon) + #print('random action, e:', self.epsilon) + if self.epsilon > self.epsilon_min: + self.epsilon *= self.epsilon_decay return random.randrange(self.action_size) if self.epsilon > self.epsilon_min: @@ -125,20 +134,37 @@ def act(self, state): def update(self, state, action, reward, next_state, done): '''Update tables based on reward and action taken''' - curr_tr = self._total_rewards(state) - next_tr = self._total_rewards(next_state) - print('Reward for action {}: {}. Current total rewards: {}'.format(action, reward, curr_tr)) - print('Next Total Reward:', next_tr) + + for interaction in state: type_1, type_2 = interaction['types_after'] # TODO resolve: should this too be types_before? 
table = self.tables.setdefault(type_1, {}).setdefault(type_2, self._make_table()) - - if done: - table[interaction['loc_difference']][action] = reward + id1,id2 = interaction['interaction'] + interaction_next_state = [inter for inter in next_state if inter['interaction']==(id1,id2)] + if len(interaction_next_state)==0: + continue + elif len(interaction_next_state)>1: + raise ValueError('This should not happen') else: - table[interaction['loc_difference']][action] = \ - reward + self.gamma * (np.max(next_tr) - curr_tr[action]) + #print('Now we should update the Q-values') + #print(f'The current reward is {reward}') + interaction_next_state = interaction_next_state[0] + interaction['loc_difference'] = (interaction['loc_difference'][0]+self.offset,interaction['loc_difference'][1]+self.offset) + interaction_next_state['loc_difference'] = (interaction_next_state['loc_difference'][0]+self.offset,interaction_next_state['loc_difference'][1]+self.offset) + #print(interaction_next_state['loc_difference']) + #print(interaction['loc_difference']) + next_action_value = table[interaction_next_state['loc_difference']] + #print(f'The next action value {next_action_value}') + if done: + table[interaction['loc_difference']][action] = reward + else: + #print(f'Q-value before update {table[interaction["loc_difference"]][action]}') + #print(f'Location {interaction["loc_difference"]}') + #print(f"The new value should be {table[interaction['loc_difference']][action] + self.alpha*(reward + self.gamma * np.max(next_action_value) - table[interaction['loc_difference']][action])}") + #print(interaction['loc_difference']) + table[interaction['loc_difference']][action] = table[interaction['loc_difference']][action] + self.alpha*(reward + self.gamma * np.max(next_action_value) - table[interaction['loc_difference']][action]) + #print(f'Q-value after update {table[interaction["loc_difference"]][action]}') def _total_rewards(self, interactions): action_rewards = np.zeros(self.action_size) @@ -154,8 
+180,8 @@ def _make_table(self): 3-D table: rows = loc_difference_x, cols = loc_difference_y, z = q-values for actions Rows and cols added to as needed ''' - return np.zeros((self.neighbor_radius * 2, self.neighbor_radius * 2, self.action_size), - dtype=int) + return np.zeros((self.neighbor_radius * 8, self.neighbor_radius * 8, self.action_size), + dtype=float) def save(self, filename): '''Save agent's tables''' diff --git a/components/autoencoder.py b/components/autoencoder.py index d316494..889c64b 100644 --- a/components/autoencoder.py +++ b/components/autoencoder.py @@ -15,9 +15,9 @@ class SymbolAutoencoder(): '''Implements the DSRL paper section 3.1. Extract entities from raw image''' - def __init__(self, input_shape, neighbor_radius=25): + def __init__(self, input_shape,filter_size,neighbor_radius=25): self.neighbor_radius = neighbor_radius - + self.filter_size = filter_size input_img = Input(shape=input_shape) encoded = Conv2D(16, (5, 5), activation='relu', padding='same')(input_img) encoded = MaxPooling2D((POOL_SIZE, POOL_SIZE), padding='same')(encoded) @@ -30,6 +30,8 @@ def __init__(self, input_shape, neighbor_radius=25): self.autoencoder = Model(input_img, decoded) self.autoencoder.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) + self.repr_entity_activations = [] + def train(self, train_data, epochs=50, batch_size=128, shuffle=True, validation=None, tensorboard=False): '''Train the autoencoder on provided images''' @@ -62,21 +64,22 @@ def _extract_positions(self, encoded_image): features -= background_value #apply the local maximum filter; all pixel of maximal value #in their neighborhood are set to 1 - filtered = maximum_filter(features, size=(4, 4)) #TODO: Abstract size - filtered = np.asarray(filtered == features, dtype=int) - np.asarray(filtered == 0, - dtype=int) + filtered = maximum_filter(features, size=(self.filter_size, self.filter_size)) #TODO: Abstract size + filtered = np.asarray(filtered == features, dtype=int) - 
np.asarray(filtered == 0,dtype=int) filtered.reshape(encoded_image.shape[:-1]) filtered *= POOL_SIZE # Pooling = downsampling = everything is scaled down by POOL_SIZE #2d image of the positions, and just the indices return filtered, np.transpose(np.nonzero(filtered)) - def visualize(self, images): + def visualize(self, images,show=False): '''Visualize autoencoder processing steps''' if len(images) > 20: raise Exception('Too many visualization images, please provide <= 20') logger.info('Visualizing...') + encoded_imgs = self.encode(images) + print(f'Encoded Image {encoded_imgs.shape}') position_maps = [self._extract_positions(x)[0] for x in encoded_imgs] decoded_imgs = self.predict(images) @@ -90,14 +93,14 @@ def flatten_to_img(array): plt_i = i+1 # display original axis = plt.subplot(4, n_plots, plt_i) - plt.imshow(flatten_to_img(images[i])) + plt.imshow(images[i]) plt.gray() axis.get_xaxis().set_visible(False) axis.get_yaxis().set_visible(False) # display reconstruction axis = plt.subplot(4, n_plots, plt_i + n_plots) - plt.imshow(flatten_to_img(decoded_imgs[i])) + plt.imshow(decoded_imgs[i]) plt.gray() axis.get_xaxis().set_visible(False) axis.get_yaxis().set_visible(False) @@ -117,8 +120,10 @@ def flatten_to_img(array): axis.get_xaxis().set_visible(False) axis.get_yaxis().set_visible(False) - print('\nPlot visible, close it to proceed') - plt.show() + if show: + plt.show() + #print('\nPlot visible, close it to proceed') + return plt.gcf() def get_entities(self, image): ''' @@ -128,31 +133,35 @@ def get_entities(self, image): etc. 
} ''' - + #print('Inside the get entities function') encoded = self.encode(image.reshape((1,) + image.shape))[0] pos_map, entities = self._extract_positions(encoded) - repr_entity_activations = [] # Representative depth slice for a certain type + #print(f'Number of identified entities: {len(entities)}') + #print(f'Number of identified entities: {entities.shape}') + #print(entities) + + typed_entities = [] # Actual Entity() array found_types = [] # TODO: Enhancements: knn classifier instead of this caveman shit for entity_coords in entities: activations = encoded[entity_coords[0], entity_coords[1], :] - if not repr_entity_activations: - repr_entity_activations.append(activations) + if not self.repr_entity_activations: + self.repr_entity_activations.append(activations) e_type = 'type0' else: - for i, e_activations in enumerate(repr_entity_activations): + for i, e_activations in enumerate(self.repr_entity_activations): dist = sqeuclidean(activations, e_activations) if dist < ENTITY_DIST_THRESHOLD: # Same type - repr_entity_activations[i] = (e_activations + activations) / 2 + self.repr_entity_activations[i] = (e_activations + activations) / 2 e_type = 'type' + str(i) break else: # No type match, make new type - repr_entity_activations.append(activations) - new_type_idx = len(repr_entity_activations) - 1 + self.repr_entity_activations.append(activations) + new_type_idx = len(self.repr_entity_activations) - 1 e_type = 'type' + str(new_type_idx) min_coords = entity_coords-self.neighbor_radius @@ -170,12 +179,12 @@ def get_entities(self, image): return typed_entities, found_types @staticmethod - def from_saved(filename, input_shape, neighbor_radius=None): + def from_saved(filename, input_shape, filter_size, neighbor_radius=None): '''Load autoencoder weights from filename, given input shape''' if neighbor_radius is not None: - ret = SymbolAutoencoder(input_shape, neighbor_radius=neighbor_radius) + ret = SymbolAutoencoder(input_shape,filter_size, 
neighbor_radius=neighbor_radius) else: - ret = SymbolAutoencoder(input_shape) + ret = SymbolAutoencoder(input_shape,filter_size) ret.autoencoder.load_weights(filename) return ret @@ -214,3 +223,10 @@ def disappeared(self): def _transition(self, from_type, to_type): self.last_transition = [from_type, to_type] self.entity_type = to_type + + def __repr__(self): + text = '' + text += f'Entity ID {self.id} \n' + text += f'Entitiy Type {self.entity_type} \n' + text += f'Position {self.position} \n' + return text diff --git a/components/state_builder.py b/components/state_builder.py index 748007c..0d2f99a 100644 --- a/components/state_builder.py +++ b/components/state_builder.py @@ -15,9 +15,12 @@ def __init__(self, neighbor_radius=25): self.do_not_exist = [] # entities to be removed as they no longer exist self.sim_weights = [2, 1, 1] self.neighbor_radius = neighbor_radius + self.offset = 0 def build_state(self, entities, found_types): '''Tag entities across time, build interactions''' + #print(f'Entities found:') + if not self.tracked_entities and self.next_free_entity_id == 0: # Init type transition matrix self.type_transition_matx = DataFrame(0, @@ -30,21 +33,28 @@ def build_state(self, entities, found_types): # init tracking for objects self._init_tracking(entities) + #print(f'Inititlaized Transition Matrix {self.type_transition_matx}') + #print(f'Inititliazed tracked entities {self.tracked_entities}') + #print(f'Next free entitiy ID {self.next_free_entity_id}') else: # Update type transition matrix if there are new types num_current_types = self.type_transition_matx.shape[0] for e_type in found_types: + # Wrong! 
if e_type not in self.type_transition_matx.index: # New, never before seen entity type, make new entry in trans matrix # make column self.type_transition_matx.insert(num_current_types, e_type, 0) + num_current_types += 1 # make row self.type_transition_matx.loc[e_type] = np.zeros(num_current_types, dtype=int) # set initial transition to 0 because assumption: objects tend to stay the same self.type_transition_matx.at[e_type, e_type] = 1 + #print(f'Updated transition matrix {self.type_transition_matx}') + # print(self.type_transition_matx) # Update tracking for objects @@ -59,6 +69,7 @@ def restart(self): self.tracked_entities = [] self.next_free_entity_id = 0 self.do_not_exist = [] + self.offset = 0 def _init_tracking(self, entities): '''Set up tags for all existing entities''' @@ -87,7 +98,7 @@ def _is_same_entity(self, old_e, new_e): similarity = self.sim_weights[0] * l_dist + \ self.sim_weights[1] * l_trans + \ self.sim_weights[2] * l_neighbors - similarity = similarity/3 + similarity = similarity/4 # print(l_dist, l_trans, l_neighbors, 'similarity:', similarity) @@ -95,7 +106,7 @@ def _is_same_entity(self, old_e, new_e): def _update_tracking(self, new_entities): '''Track entities across time, using their last state''' - + #print('We are inside the update tracking function') # if an entity is not matched with any in new entities, # place it in possibly_disappeared, and remove it if encountered # If there are any in possibly_disappeared by the time the @@ -103,16 +114,24 @@ def _update_tracking(self, new_entities): possibly_disappeared = [] newly_nonexistent = [] + + #print('We go through the tracked entities') + old_number_of_tracked_entities = len(self.tracked_entities) for i, tracked_e in enumerate(self.tracked_entities): # print(tracked_e.__dict__) if not tracked_e.exists: - print('Marked for deletion next loop', tracked_e.__dict__) - print('---') + #print('Marked for deletion next loop', tracked_e.__dict__) + #print('---') newly_nonexistent.append(i) continue 
+ for new_e_i, new_e in enumerate(new_entities): # print('comparing', new_e.__dict__) if self._is_same_entity(tracked_e, new_e): + #print(f'We found a match for entity {tracked_e.id}') + + #print(f'The position of the new entity {new_e.position} ') + #print(f'The postion of the old entity {tracked_e.position}') # print('same entity') # Update transition matrix # (even if not transitioned, how often the type stays the same is important) @@ -126,8 +145,8 @@ def _update_tracking(self, new_entities): break else: # new entity, and/or tracked_e disappeared - print('match not found', tracked_e.__dict__) - print('---') + #print('match not found', tracked_e.__dict__) + #print('---') possibly_disappeared.append(i) for disapp_idx in possibly_disappeared: # well, they definitely disappeared @@ -139,12 +158,21 @@ def _update_tracking(self, new_entities): # that the entity disappeared self.tracked_entities[disapp_idx].disappeared() + #print(f'Self-Do-Exist: {self.do_not_exist}') + #print(f'len {len(self.tracked_entities)}') + #print(f'These are the entities to be removed {self.do_not_exist}') + #print(f'These are the tracked entities {self.tracked_entities}') self.do_not_exist.reverse() for dne_idx in self.do_not_exist: - print('DNE', dne_idx) - del self.tracked_entities[dne_idx] + #print('DNE', dne_idx) + if self.offset<0: + del self.tracked_entities[dne_idx+self.offset] + else: + del self.tracked_entities[dne_idx] self.do_not_exist = newly_nonexistent # to be removed next time + #print(f'These entities must be removed in the next timestep {self.do_not_exist} ') + #print(f'These are the current tracked entities {self.tracked_entities}') for entity_to_add in new_entities: entity_to_add.id = self.next_free_entity_id @@ -157,6 +185,10 @@ def _update_tracking(self, new_entities): # increment id for next appearing object self.next_free_entity_id += 1 + new_number_of_tracked_entities = len(self.tracked_entities) + self.offset = 
new_number_of_tracked_entities-old_number_of_tracked_entities + + def _build_representation(self): '''Build time-abstracted representation + object interactions''' @@ -173,8 +205,12 @@ def interaction(el_1, el_2, loc_diff, types_before, types_after): interactions_built = [] # pairs of entities for which interaction has already been built # Build interactions for entity in self.tracked_entities: + + #print([(np.abs((x.position - entity.position)),(x.prev_state['position']-entity.prev_state['position'])-(x.position-entity.position)) for x in self.tracked_entities if np.all(np.abs((x.position - entity.position)) < self.neighbor_radius*2)]) + within_radius = [x for x in self.tracked_entities - if np.all((x.position - entity.position) < self.neighbor_radius*2)] + if np.all(np.abs((x.position - entity.position)) < self.neighbor_radius*2)] + for w_r in within_radius: sorted_e = (entity, w_r) if entity.entity_type < w_r.entity_type else (w_r, entity) interact_ids = (sorted_e[0].id, sorted_e[1].id) @@ -183,8 +219,8 @@ def interaction(el_1, el_2, loc_diff, types_before, types_after): continue # position change - loc_diff = (sorted_e[0].position - sorted_e[0].prev_state['position']) - \ - (sorted_e[1].position - sorted_e[1].prev_state['position']) + loc_diff = (sorted_e[0].prev_state['position'] - sorted_e[1].prev_state['position']) - (sorted_e[0].position - sorted_e[1].position) + #print(f'The loc_diff is {loc_diff}') types_before = (sorted_e[0].prev_state['entity_type'], sorted_e[1].prev_state['entity_type']) types_after = (sorted_e[0].entity_type, sorted_e[1].entity_type) if np.array_equal(loc_diff, (0, 0)) and np.array_equal(types_before, types_after): @@ -193,7 +229,10 @@ def interaction(el_1, el_2, loc_diff, types_before, types_after): interactions.append(interaction(sorted_e[0], sorted_e[1], loc_diff, types_before, types_after)) interactions_built.append(interact_ids) - + #print(f'We now print the calculated interactions') + #for interaction in interactions: + # 
print(interaction) + #print(f'Number of Interactions {len(interactions)}') return interactions def _mark_transition(self, from_type, to_type): diff --git a/cross_circle_gym/envs/cross_circle_base.py b/cross_circle_gym/envs/cross_circle_base.py index c252923..380af15 100644 --- a/cross_circle_gym/envs/cross_circle_base.py +++ b/cross_circle_gym/envs/cross_circle_base.py @@ -1,15 +1,53 @@ -'''Base class for the DSRL paper toy game''' +'''Base class for the DSRL paper toy game - adapted from the Lua environment here: https://github.com/Kaixhin/rlenvs/blob/master/rlenvs/XOWorld.lua''' + + import gym from gym import spaces from gym.utils import seeding import numpy as np from matplotlib import pyplot as plt from matplotlib.colors import to_rgb -import imageio -import os from skimage.transform import resize + + +AGENT_MASK = np.expand_dims(np.array([[0, 0, 0, 1, 1, 1, 1, 0, 0, 0], + [0, 0, 0, 1, 1, 1, 1, 0, 0, 0], + [0, 0, 0, 1, 1, 1, 1, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [0, 0, 0, 1, 1, 1, 1, 0, 0, 0], + [0, 0, 0, 1, 1, 1, 1, 0, 0, 0], + [0, 0, 0, 1, 1, 1, 1, 0, 0, 0]]),axis=2) + +CROSS_MASK = np.expand_dims(np.array([[1, 1, 1, 1, 0, 0, 0, 1, 1, 1], + [0, 1, 1, 1, 1, 0, 1, 1, 1, 1], + [0, 0, 1, 1, 1, 1, 1, 1, 1, 0], + [0, 0, 0, 1, 1, 1, 1, 1, 0, 0], + [0, 0, 0, 1, 1, 1, 1, 1, 0, 0], + [0, 0, 1, 1, 1, 1, 1, 1, 1, 0], + [0, 1, 1, 1, 1, 0, 1, 1, 1, 1], + [1, 1, 1, 1, 0, 0, 0, 1, 1, 1], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),axis=2) + +CIRCLE_MASK = np.expand_dims(np.array([[0, 0, 0, 1, 1, 1, 1, 1, 0, 0], + [0, 0, 1, 1, 1, 1, 1, 1, 1, 0], + [0, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 0, 0, 0, 1, 1, 1], + [1, 1, 1, 0, 0, 0, 0, 0, 1, 1], + [1, 1, 1, 0, 0, 0, 0, 0, 1, 1], + [1, 1, 1, 0, 0, 0, 0, 0, 1, 1], + [1, 1, 1, 1, 0, 0, 0, 1, 1, 1], + [0, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [0, 0, 1, 1, 1, 1, 1, 1, 1, 0]]),axis=2) + +MASKS = 
{'circle':CIRCLE_MASK,'cross':CROSS_MASK,'agent':AGENT_MASK} + + class Entity(object): def __init__(self, y, x, h, w, kind, center=False, z=None): if center: @@ -66,13 +104,14 @@ class CrossCircleBase(gym.Env): } def __init__( - self, field_dim=100, background_colour='white', shape_colours="white white white", - entity_size=10, min_entities=25, max_entities=50, max_overlap_factor=0.2, overlap_factor=0.25, step_size=10): + self, field_dim=84, background_colour='white', shape_colours="white white white", + entity_size=10, min_entities=16, max_entities=16, max_overlap_factor=0.0, overlap_factor=0.2, step_size=1, color_state=False): self.field_dim = field_dim self.background_colour = background_colour self.shape_colours = shape_colours self.entity_size = entity_size + self.color_state = color_state self.min_entities = min_entities self.max_entities = max_entities @@ -82,18 +121,19 @@ def __init__( self.step_size = step_size self.action_space = spaces.Discrete(4) - self.observation_space = spaces.Box(0, 1, shape=(self.field_dim, self.field_dim, 3)) + self.observation_space = spaces.Box(0, 1, shape=(self.field_dim, self.field_dim, 1)) self.reward_range = (-1, 1) self.entities = {'cross': [], 'circle': []} self.agent = None self.masks = {} + for entity_type in 'circle cross agent'.split(): - f = os.path.join(os.path.dirname(__file__), "images", "{}.png".format(entity_type)) - mask = imageio.imread(f) + mask = MASKS[entity_type] mask = resize(mask, (self.entity_size, self.entity_size), mode='edge', preserve_range=True) - self.masks[entity_type] = np.tile(mask[..., 3:], (1, 1, 3)) / 255. 
+ self.masks[entity_type] = mask + self.background_colour = None if background_colour: @@ -117,7 +157,10 @@ def __init__( @property def combined_state(self): '''Add state layers into one array''' - image = np.zeros((self.field_dim, self.field_dim, 3)) * self.background_colour + if self.color_state: + image = np.zeros((self.field_dim, self.field_dim, 3)) * self.background_colour + else: + image = np.zeros((self.field_dim, self.field_dim, 1)) all_entities = [] for entity_type, entities in self.entities.items(): @@ -131,10 +174,12 @@ def combined_state(self): continue _alpha = self.masks[entity.kind] - if self.shape_colours is None: - _image = np.random.rand(self.entity_size, self.entity_size, 3) - else: - _image = np.tile(self.shape_colours[entity.kind], (self.entity_size, self.entity_size, 1)) + + if self.color_state: + if self.shape_colours is None: + _image = np.random.rand(self.entity_size, self.entity_size, 3) + else: + _image = np.tile(self.shape_colours[entity.kind], (self.entity_size, self.entity_size, 1)) top = int(entity.top) bottom = top + int(entity.h) @@ -142,7 +187,10 @@ def combined_state(self): left = int(entity.left) right = left + int(entity.w) - image[top:bottom, left:right, ...] = _alpha * _image + (1 - _alpha) * image[top:bottom, left:right, ...] + if self.color_state: + image[top:bottom, left:right, ...] = _alpha * _image + (1 - _alpha) * image[top:bottom, left:right, ...] 
+ else: + image[top:bottom, left:right] = _alpha return image @@ -193,8 +241,13 @@ def layout(self, random=True, mixed=True, min_entities=None, max_entities=None, if random: sub_image_shapes = [(self.entity_size, self.entity_size) for i in range(n_entities)] + + entities = self._sample_entities(sub_image_shapes, self.max_overlap_factor) + if entities==0: + return 0 + for i, e in enumerate(entities): if mixed and i % 2 == 0: entity_type = 'cross' @@ -237,40 +290,44 @@ def _sample_entities(self, patch_shapes, max_overlap_factor=None, size_std=None) rects = [] for i in range(n_rects): - n_tries = 0 - while True: - if size_std is None: - shape_multipliers = 1. - else: - shape_multipliers = np.maximum(np.random.randn(2) * size_std + 1.0, 0.5) + try: + n_tries = 0 + while True: + if size_std is None: + shape_multipliers = 1. + else: + shape_multipliers = np.maximum(np.random.randn(2) * size_std + 1.0, 0.5) - m, n = np.ceil(shape_multipliers * patch_shapes[i, :2]).astype('i') + m, n = np.ceil(shape_multipliers * patch_shapes[i, :2]).astype('i') - rect = Entity( - np.random.randint(0, self.field_dim-m+1), - np.random.randint(0, self.field_dim-n+1), m, n, kind=None) + rect = Entity( + np.random.randint(0, self.field_dim-m+1), + np.random.randint(0, self.field_dim-n+1), m, n, kind=None) - if max_overlap_factor is None: - rects.append(rect) - break - else: - violation = False - for r in rects: - if rect.overlap_area(r) / (self.entity_size**2) > max_overlap_factor: - violation = True - break - - if not violation: + if max_overlap_factor is None: rects.append(rect) break + else: + violation = False + for r in rects: + if rect.overlap_area(r) / (self.entity_size**2) > max_overlap_factor: + violation = True + break + + if not violation: + rects.append(rect) + break + + n_tries += 1 - n_tries += 1 + if n_tries > 10000: + raise Exception( + "Could not fit rectangles. 
" + "(n_rects: {}, field_dim: {}, max_overlap_factor: {})".format( + n_rects, self.field_dim, max_overlap_factor)) + except: + return 0 - if n_tries > 10000: - raise Exception( - "Could not fit rectangles. " - "(n_rects: {}, field_dim: {}, max_overlap_factor: {})".format( - n_rects, self.field_dim, max_overlap_factor)) return rects diff --git a/cross_circle_gym/envs/cross_circle_mixed_rand.py b/cross_circle_gym/envs/cross_circle_mixed_rand.py index 1135939..5544573 100644 --- a/cross_circle_gym/envs/cross_circle_mixed_rand.py +++ b/cross_circle_gym/envs/cross_circle_mixed_rand.py @@ -16,9 +16,12 @@ def make_random_state(self, min_entities=1, max_entities=30): 'cross': np.zeros((self.field_dim, self.field_dim)), 'agent': np.zeros((self.field_dim, self.field_dim)) } - self.layout(random=True, + error = self.layout(random=True, mixed=True, min_entities=min_entities, max_entities=max_entities, random_agent=True) + if error==0: + return [] + return self.combined_state diff --git a/main.py b/main.py index 16d87be..032b55c 100644 --- a/main.py +++ b/main.py @@ -1,25 +1,29 @@ '''Main module for the paper's algorithm''' -#pylint:disable=C0103,R0913 + import argparse -import os.path -import pickle +import os + +from collections import deque +from datetime import datetime + + import numpy as np +import tensorflow as tf +import tqdm -import gym from gym import logger -from sklearn.model_selection import train_test_split -#pylint:disable=W0611 import cross_circle_gym -#pylint:enable=W0611 -from components.autoencoder import SymbolAutoencoder + from components.state_builder import StateRepresentationBuilder -from components.agent import TabularAgent #, DDQNAgent +from components.agent import TabularAgent +from utils import prepare_training +# Experiment Parameters parser = argparse.ArgumentParser(description=None) -parser.add_argument('env_id', nargs='?', default='CrossCircle-MixedRand-v0', - help='Select the environment to run') +parser.add_argument('--experiment_name', 
type=str, default='default', help='Name of the experiment') parser.add_argument('--load', type=str, help='load existing model from filename provided') +parser.add_argument('--image_dir', type=str, help='laod images from directory provided') parser.add_argument('--episodes', '-e', type=int, default=1000, help='number of DQN training episodes') parser.add_argument('--load-train', action='store_true', @@ -29,109 +33,131 @@ help='activate own improvements over original paper') parser.add_argument('--visualize', '--vis', action='store_true', help='plot autoencoder input & output') -parser.add_argument('--save', type=str, help='save model to filename provided') - -args = parser.parse_args() - -TRAIN_IMAGES_FILE = 'train_images.pkl' -NEIGHBOR_RADIUS = 25 # 1/2 side of square in which to search for neighbors - -# You can set the level to logger.DEBUG or logger.WARN if you -# want to change the amount of output. -logger.setLevel(logger.INFO) - - - -env = gym.make(args.env_id) -seed = env.seed(1)[0] +parser.add_argument('--save', type=str, help='save model to directory provided') +parser.add_argument('--logdir',type=str,default='./logs', help='Log directory') +parser.add_argument('--log_level',type=str,default='warn',help='Detail of logging output') +parser.add_argument('--evaluation_frequency', type=int, default=100, + help='How often to evaluate the agent') +parser.add_argument('--tensorboard', action='store_true', default=False, + help='Switch on tensorboard for the autoencoder training') +parser.add_argument('--play', action='store_true', default=False, + help='Choose the agents action for 20 timesteps to see what the autoencoder does') + +# Environment +parser.add_argument('--random', action='store_true', default=False, + help='Should the position of the entities be random') +parser.add_argument('--double', action='store_true', default=False, + help='Only negative objects (circles) or also positive ones (cross)') +parser.add_argument('--n_entities', type=int, 
default=16, + help='Number of entities in the environment') +parser.add_argument('--entity_size', type=int, default=10, help='Size of the entities') +parser.add_argument('--neighborhood_size', type=int, default=10, + help='Size of the neighborhood') +parser.add_argument('--step_size', type=float, default=1.0, help='Step-Size') +parser.add_argument('--overlap_factor', type=float, default=0.01, + help='How much must an gent overlap with an entitiy to collect it') +parser.add_argument('--colour_state', action='store_true', default=False, + help='Whether to use the colour image as a state or a one-channel black and white image') + +# Training parameters +parser.add_argument('--alpha', type=float, default=0.01, help='Learning Rate') +parser.add_argument('--epsilon_decay', type=float, default=0.99995, + help='Decay rate of epsilon') +parser.add_argument('--timesteps', type=int, default=100, help='Length of a training episode') + +# Autoencdoer +parser.add_argument('--filter_size', default=10, type=int, help='Size of the filter') +args = parser.parse_args() -def make_autoencoder_train_data(num, min_entities=1, max_entities=30): - '''Make training images for the autoencoder''' - temp_env = gym.make('CrossCircle-MixedRand-v0') - temp_env.seed(0) - states = [] - for _ in range(num): - states.append(temp_env.make_random_state(min_entities, max_entities)) - return np.asarray(states) +now = datetime.now().strftime("%d_%m_%Y_%H_%M_%S") +args.logdir = os.path.join(args.logdir,args.experiment_name,now) -if not os.path.exists(TRAIN_IMAGES_FILE) or args.new_images: - logger.info('Making test images...') - images = make_autoencoder_train_data(5000, max_entities=30) - with open(TRAIN_IMAGES_FILE, 'wb') as f: - pickle.dump(images, f) +# Choose environment +if args.random and args.double: + env_id = 'CrossCircle-MixedRand-v0' +elif args.random and not args.double: + env_id = 'CrossCircle-NegRand-v0' +elif not args.random and args.double: + env_id = 'CrossCircle-MixedGrid-v0' else: - 
logger.info('Loading test images...') - with open(TRAIN_IMAGES_FILE, 'rb') as f: - images = pickle.load(f) - -#input_shape = images[0].shape + (1,) -input_shape = images[0].shape -if args.load: - autoencoder = SymbolAutoencoder.from_saved(args.load, - images[0].shape, - neighbor_radius=NEIGHBOR_RADIUS) + env_id = 'CrossCircle-NegGrid-v0' +args.env_id = env_id + +# Set logger +if args.log_level=='warn': + logger.setLevel(logger.WARN) +elif args.log_level=='info': + logger.setLevel(logger.INFO) else: - autoencoder = SymbolAutoencoder(images[0].shape, neighbor_radius=NEIGHBOR_RADIUS) + raise NotImplementedError('Log-level not implemented') +args.logger = logger -if args.load_train or args.visualize or not args.load: - logger.info('Splitting sets...') - X_train, X_test = train_test_split(images, test_size=0.2, random_state=seed) - X_train, X_val = train_test_split(X_train, test_size=0.2, random_state=seed) - - if args.load_train or not args.load: - logger.info('Training...') - autoencoder.train(X_train, epochs=10, validation=X_val) - - if args.visualize: - #Visualize autoencoder - vis_imgs = X_test[:10] - autoencoder.visualize(vis_imgs) - -if args.save: - autoencoder.save_weights(args.save) +autoencoder,env = prepare_training(args) +state_builder = StateRepresentationBuilder(neighbor_radius=args.neighborhood_size) +action_size = env.action_space.n +agent = TabularAgent(action_size,args.alpha,args.epsilon_decay,args.neighborhood_size) -# entities, found_types = autoencoder.get_entities(X_test[0]) +done = False +time_steps = args.timesteps -state_builder = StateRepresentationBuilder() -# state = state_builder.build_state(entities, found_types) -# print(state) +number_of_evaluations = 0 +buffered_rewards = deque(maxlen=200) -# state_size = None # TODO -action_size = env.action_space.n -# if args.enhancements: -# agent = DDQNAgent(state_size, action_size) -# else: -agent = TabularAgent(action_size) -# # agent.load('./save/cartpole-ddqn.h5') -done = False -batch_size = 32 
-time_steps = 100 +summary_writer = tf.summary.create_file_writer(args.logdir) -for e in range(args.episodes): +for e in tqdm.tqdm(range(args.episodes)): state_builder.restart() state = env.reset() - state = np.reshape(state, input_shape) state = state_builder.build_state(*autoencoder.get_entities(state)) - for time in range(time_steps): - env.render(wait=1) + total_reward = 0 + + for t in range(time_steps): action = agent.act(state) next_state, reward, done, _ = env.step(action) - next_state = np.reshape(next_state, input_shape) + total_reward += reward next_state = state_builder.build_state(*autoencoder.get_entities(next_state)) - # next_state = np.reshape(next_state, [1, state_size]) agent.update(state, action, reward, next_state, done) state = next_state if done: break - # if args.enhancements: - # agent.update_target_model() - print('episode: {}/{}, e: {:.2}' - .format(e, args.episodes, agent.epsilon)) - - # if len(agent.memory) > batch_size: - # agent.replay(batch_size) - if e % 10 == 0: - agent.save('tab_agent.h5') + + buffered_rewards.append(total_reward) + + with summary_writer.as_default(): + tf.summary.scalar('Averaged Reward',np.mean(buffered_rewards),e) + tf.summary.scalar('Epsilon',agent.epsilon,e) + + + if e % args.evaluation_frequency == 0: + number_of_evaluations += 1 + agent.save(os.path.join(args.logdir,'tab_agent.h5')) + evaluation_reward = [] + with summary_writer.as_default(): + for i in range(10): + done = False + state_builder.restart() + image = env.reset() + state = state_builder.build_state(*autoencoder.get_entities(image)) + total_reward = 0 + for t in range(time_steps): + action = agent.act(state,random_act=False) + next_image, reward, done, _ = env.step(action) + if i==0: + tf.summary.image(f'Agent Behaviour {number_of_evaluations}',np.reshape(image,(1,)+image.shape),t) + total_reward += reward + next_state = state_builder.build_state(*autoencoder.get_entities(next_image)) + state = next_state + image = next_image + 
evaluation_reward.append(total_reward)
+
+                tf.summary.scalar('Evaluation Reward',np.mean(evaluation_reward),number_of_evaluations)
+
+
+
+
+
+
diff --git a/scripts/hyperparameter.sh b/scripts/hyperparameter.sh
new file mode 100644
index 0000000..7d7b33c
--- /dev/null
+++ b/scripts/hyperparameter.sh
@@ -0,0 +1,52 @@
+#! /bin/bash
+
+# Script to perform Hyperparameter Search
+
+load='../autoencoder_models/gray_10_model.h5' # If pretrained autoencoder exist here is the file-path of the model
+image_dir='../'
+logdir='../logs'
+log_level='info' # info, warn
+
+evaluation_frequency=50
+
+
+# Environment
+n_entities=16
+entity_size=10
+neighborhood_size=10
+step_size=1.0
+overlap_factor=0.01
+
+# Training parameters
+epsilon_decay=0.99999
+
+
+# Autoencoder
+filter_size=7
+
+for alpha in 0.1 0.01 0.001
+do
+    for neighborhood_size in 10 20 50
+    do
+        for step_size in 1 2 5 10; do
+        experiment_name="Alpha_{$alpha}_neighborhood_size_{$neighborhood_size}_Step_{$step_size}"
+        echo "Experiment {$experiment_name} starts"
+        python ../main.py --experiment_name $experiment_name \
+            --load $load \
+            --logdir $logdir \
+            --image_dir $image_dir \
+            --log_level $log_level \
+            --evaluation_frequency $evaluation_frequency \
+            --n_entities $n_entities \
+            --entity_size $entity_size \
+            --neighborhood_size $neighborhood_size \
+            --step_size $step_size \
+            --overlap_factor $overlap_factor \
+            --alpha $alpha \
+            --epsilon_decay $epsilon_decay \
+            --filter_size $filter_size
+        done
+    done
+done
+
+
diff --git a/scripts/training.sh b/scripts/training.sh
new file mode 100644
index 0000000..2cc3c81
--- /dev/null
+++ b/scripts/training.sh
@@ -0,0 +1,43 @@
+#!
/bin/bash
+
+experiment_name='default'
+
+load='../autoencoder_models/gray_10_model.h5' # If pretrained autoencoder exist here is the file-path of the model
+image_dir='../'
+logdir='../logs'
+log_level='info' # info, warn
+
+evaluation_frequency=50
+
+
+# Environment
+n_entities=16
+entity_size=10
+neighborhood_size=10
+step_size=2.0
+overlap_factor=0.01
+
+# Training parameters
+alpha=0.01
+epsilon_decay=0.99999
+
+
+# Autoencoder
+filter_size=7
+
+python ../main.py --experiment_name $experiment_name \
+    --load $load \
+    --logdir $logdir \
+    --image_dir $image_dir \
+    --log_level $log_level \
+    --evaluation_frequency $evaluation_frequency \
+    --n_entities $n_entities \
+    --entity_size $entity_size \
+    --neighborhood_size $neighborhood_size \
+    --step_size $step_size \
+    --overlap_factor $overlap_factor \
+    --alpha $alpha \
+    --epsilon_decay $epsilon_decay \
+    --filter_size $filter_size
+
+
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000..91a3fce
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,116 @@
+import os
+
+import gym
+import numpy as np
+import pickle
+from sklearn.model_selection import train_test_split
+
+from components.autoencoder import SymbolAutoencoder
+
+def make_autoencoder_train_data(env_parameters, num, args, min_entities=1, max_entities=30):
+    '''
+    Make training images for the autoencoder
+
+    env_parameters: (dict) dictionary that specifies the properties of the environment
+    num: (int) number of samples the data should consist of
+    min_entities, max_entities: (int) min/max number of entities that can appear \
+        in a single environment frame
+
+    return: (np.array) BxWxHxC dataset of environment images
+    '''
+
+    temp_env = gym.make('CrossCircle-MixedRand-v0',**env_parameters)
+    temp_env.seed(0)
+    states = []
+    for i in range(num):
+        state = temp_env.make_random_state(min_entities, max_entities)
+        if len(state)==0:
+            continue
+        states.append(state)
+    args.logger.info(f'Final number of states collected in the current configuration 
{len(states)}') + + if (len(states)/num)<0.8: + raise Exception('With the current environment configuration entities do /' + 'not fit onto the grid without overlapping too much') + return np.asarray(states) + +def prepare_training(args): + ''' + (1) Creates environment + (2) Checks whether training images for the autoencoder exist, if not creates them + (3) Creates the autoencoder + (4) Trains or loads the weights of the autoencoder + + return: trained autoencoder, environment + ''' + + # Create the environment + env_parameters = {'entity_size': args.entity_size, + 'min_entities': args.n_entities, + 'max_entities': args.n_entities, + 'step_size': args.step_size, + 'overlap_factor': args.overlap_factor} + env = gym.make(args.env_id, **env_parameters) + seed = env.seed(1)[0] + + # Load or create images + if args.colour_state: + GRAY = 'colour' + else: + GRAY = 'gray' + + TRAIN_IMAGES_FILE = f'train_images_{GRAY}.pkl' + print(os.path.join(args.image_dir,TRAIN_IMAGES_FILE)) + if not os.path.exists(os.path.join(args.image_dir,TRAIN_IMAGES_FILE)) or args.new_images: + args.logger.info('Making test images...') + images = make_autoencoder_train_data(env_parameters, 5000, args, max_entities=20) + with open(os.path.join(args.image_dir,TRAIN_IMAGES_FILE), 'wb') as f: + pickle.dump(images, f) + else: + args.logger.info('Loading test images...') + with open(os.path.join(args.image_dir,TRAIN_IMAGES_FILE), 'rb') as f: + images = pickle.load(f) + + # Create the autoencoder + input_shape = images[0].shape + if args.load: + autoencoder = SymbolAutoencoder.from_saved(args.load, + input_shape, + args.filter_size, + neighbor_radius=args.neighborhood_size) + else: + autoencoder = SymbolAutoencoder(input_shape, args.filter_size, neighbor_radius=args.neighborhood_size) + + + # Train or load autoencoder + if args.load_train or args.visualize or not args.load: + args.logger.info('Splitting sets...') + X_train, X_test = train_test_split(images, test_size=0.2, random_state=seed) + X_train, 
X_val = train_test_split(X_train, test_size=0.2, random_state=seed) + + if args.load_train or not args.load: + args.logger.info('Training...') + autoencoder.train(X_train, epochs=10, validation=X_val,tensorboard=args.tensorboard) + + if args.visualize: + # Visualize autoencoder + vis_imgs = X_test[:10] + autoencoder.visualize(vis_imgs) + + if args.save: + autoencoder.save_weights(os.path.join(args.save, f'{GRAY}_{args.entity_size}_model.h5')) + + + # Visualize the results of the autoencoder + if args.play: + # Visualize your own moves for 10 steps + state = env.reset() + for i in range(20): + state = np.reshape(state, (1,) + input_shape) + autoencoder.visualize(state,show=True) + action = int(input('Next action: ')) + state, reward, _, _ = env.step(action) + print(f'The overall reward is {reward}') + + return autoencoder, env +