diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d97f79b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+__pycache__
+Output_Graphics/
+Results Presentations/
diff --git a/DeepClean_2D/Custom_Normalisation_V1.py b/DeepClean_2D/Custom_Normalisation_V1.py
new file mode 100644
index 0000000..6b24806
--- /dev/null
+++ b/DeepClean_2D/Custom_Normalisation_V1.py
@@ -0,0 +1,123 @@
+# -*- coding: utf-8 -*-
+"""
+Custom Normalisation Transform V1
+@author: Adill Al-Ashgar
+Created on Tue Nov 15 19:02:32 2022
+
+USER NOTICE!
+Input data must be a tensor; the output is a tensor with values scaled between 0.5 and 1. The additional arguments specify the max and min possible values of the input tensor.
+"""
+import torch
+import numpy as np
+################################################
+
+def add_noise2(inputs, noise_points=0, dimensions=(88,128), time_dimension=100):
+    """Overwrite `noise_points` random pixels of a 2D array/tensor with random values in [0, time_dimension); mutates and returns `inputs`."""
+    for _ in range(noise_points):    # was range(0, noise_points+1): one point too many, and one even when 0 were requested
+        np_x = np.random.randint(0, dimensions[0])
+        np_y = np.random.randint(0, dimensions[1])
+        inputs[np_x][np_y] = np.random.randint(0, time_dimension)    # same draw order as before: x, y, then value
+    return inputs
+#testin = torch.tensor(np.array([(0,1,10,100),(100,10,1,0),(0,0,0,0)]))
+#print(testin)
+#print(add_noise2(testin, 2, (3,4), 100))
+
+def scale_Ndata(data, reconstruction_cutoff=0.2, min_val=0, max_val=100):
+    """Linearly rescale a numpy array from [min_val, max_val] onto [reconstruction_cutoff, 1]; entries equal to 0 are left at 0."""
+    offset_vals = data - min_val
+    rescaled = reconstruction_cutoff + offset_vals * (1 - reconstruction_cutoff) / (max_val - min_val)
+    return np.where(data == 0, data, rescaled)
+
+def scale_Tdata(data, reconstruction_cutoff, min_val=0, max_val=100):
+    """Tensor counterpart of scale_Ndata: rescale [min_val, max_val] onto [reconstruction_cutoff, 1], keeping exact zeros at 0."""
+    scaled_vals = torch.sub(data, min_val)    # was torch.min(data, min_val): a min-reduction, not the offset subtraction
+    scaled_vals = torch.mul(scaled_vals, (1 - reconstruction_cutoff) / (max_val - min_val))
+    scaled_vals = torch.add(scaled_vals, reconstruction_cutoff)
+    zeros = torch.zeros_like(scaled_vals)    # same dtype as the result, so integer inputs also work in torch.where
+    return torch.where(data == 0, zeros, scaled_vals)
+################################################
+
+def numpy_normalisatation(data):
+    """Legacy scaling: map 0-100 data onto 127.5-255 while keeping exact zeros at 0."""
+    shifted = ((data / 100) * 127.5) + 127.5
+    return np.where(data == 0, data, shifted)
+
+def custom_np_to_tensor_no_norm(data):
+    """Prepend a length-1 axis to a numpy array and return it as a torch tensor (values untouched)."""
+    with_leading_axis = np.expand_dims(data, axis=0)
+    return torch.tensor(with_leading_axis)
+
+def custom_normalisation(data, min=0, max=100):
+    """Scale numpy data to data/(2*max) + 0.5 (0.5-1 for inputs in 0..max), keeping exact zeros at 0; `min` is unused."""
+    halved = data / (max*2)
+    lifted = halved + 0.5
+    return np.where(data == 0, data, lifted)
+
+
+def custom_tensor_normalisation(data, min=0, max=100):
+    """Tensor version of custom_normalisation.
+
+    Computes data/(2*max) + 0.5 and restores 0 wherever the input was exactly 0; `min` is unused.
+    """
+    lifted = torch.add(torch.div(data, max*2), 0.5)
+    restored = torch.where(data == 0, data, lifted)
+    return restored
+
+
+#ti = (np.array([0,1,10,100]))
+def custom_normalisation_oldv2(data, min=0, max=100):
+    """Legacy normalisation: turn a numpy array into a torch tensor with a leading length-1 axis.
+
+    Every value v is mapped to
+        1.0 - ((max - v) / max) * 0.5
+    which gives 0.5 at v = 0 and 1.0 at v = max; entries that were exactly 0 in the
+    input are then reset to 0.
+
+    The arithmetic is a chain of tensor ops applied in this order:
+        negate -> add max -> divide by max -> multiply by 0.5 -> negate -> add 1.0
+
+    `min` is part of the signature but is not used.
+    The result is a new tensor (torch.tensor copies its input).
+    """
+    tensor_in = torch.tensor(np.expand_dims(data, axis=0))
+
+    mapped = tensor_in.neg()     # -v
+    mapped = mapped.add(max)     # max - v
+    mapped = mapped.div(max)     # (max - v) / max
+    mapped = mapped.mul(0.5)     # ((max - v) / max) * 0.5
+    mapped = mapped.neg()        # -((max - v) / max) * 0.5
+    mapped = mapped.add(1.0)     # 1.0 - ((max - v) / max) * 0.5
+
+    # zeros in the input stay zero in the output
+    result = torch.where(tensor_in == 0, tensor_in, mapped)
+    return result
+#custom_normalisation(ti)
+
+#Function
+def custom_normalisation_oldv1(data, min=0, max=100):
+    """Legacy tensor normalisation: non-zero values become value/(2*max) + 0.5, zeros stay 0.
+
+    Vectorised replacement for the former per-element loop, which was hard-wired to 3-D tensors; works for any rank.
+    `min` is accepted for signature compatibility but is not used.
+    """
+    output = torch.div(data, max*2)    # new tensor, so the caller's `data` is not modified
+    return torch.where(output != 0, output + 0.5, output)
+
+
+#output = custom_normalisation_V1(test_tensor)
+#print(output)
+
+def normalisation_reconstruction(data, reconstruction_threshold=0.5, time_dimension=100):
+    """Invert the 0.5-1 normalisation of a tensor, returning a float numpy array of the same shape.
+
+    Values >= reconstruction_threshold become (value - threshold) * 2 * time_dimension; the rest are 0.
+    """
+    values = data.numpy()
+    out = np.zeros(values.shape)    # sized to the input (was hard-coded to 3x3, so larger inputs raised IndexError)
+    hit = values >= reconstruction_threshold    # was a literal 0.5, which ignored the parameter
+    out[hit] = (values[hit] - reconstruction_threshold) * 2 * time_dimension
+    return out
+
+
+#reconstructed_data = normalisation_reconstruction(output)
+#print(reconstructed_data)
\ No newline at end of file
diff --git a/DeepClean_2D/DataLoader_Functions_V2.py b/DeepClean_2D/DataLoader_Functions_V2.py
new file mode 100644
index 0000000..9f0dd0d
--- /dev/null
+++ b/DeepClean_2D/DataLoader_Functions_V2.py
@@ -0,0 +1,179 @@
+# -*- coding: utf-8 -*-
+"""
+DataLoader Functions V2
+@author: Adill Al-Ashgar
+Created on Tue Nov 15 19:02:32 2022
+
+USER NOTICE!
+x Must be inside the root dir of the DataLoader or the Neural Net that calls it.
+"""
+#%% - Dependencies
+import torch
+import matplotlib.pyplot as plt
+from torchvision import datasets
+import os
+import numpy as np
+from torch.utils.data import random_split
+
+#%%
+def train_loader2d(path):   # TODO: remove the need for two separate loads, one on each loader
+    """Load the .npy file at `path` and return element [0] of the stored array."""
+    loaded = np.load(path)
+    return loaded[0]
+
+#%%
+def test_loader2d(path):
+    """Load the .npy file at `path` and return element [0] of the stored array.
+
+    The signal/noise masking selector that used to be set here was never applied; only sample[0] is returned.
+    """
+    return np.load(path)[0]
+
+#%%
+def train_loader3d(path):   # TODO: remove the need for two separate loads, one on each loader
+    """Load the .npy file at `path` and return the whole array as a torch tensor."""
+    volume = np.load(path)
+    return torch.tensor(volume)
+
+#%%
+def test_loader3d(path):
+    """Load the .npy file at `path` and return the whole array as a torch tensor.
+
+    No signal/noise masking is applied."""
+    return torch.tensor(np.load(path))
+
+#%%
+def batch_learning(training_dataset_size, batch_size):
+    """Name the gradient-descent regime implied by the batch size relative to the dataset size."""
+    if batch_size == 1:    # checked first, so a one-sample dataset is also reported as stochastic
+        return "Stochastic Gradient Descent"
+    if batch_size == training_dataset_size:
+        return "Batch Gradient Descent"
+    # any other batch size
+    return "Mini-Batch Gradient Descent"
+
+#%%
+def initialise_data_loader (dataset_title, data_path, batch_size, train_transforms, test_transforms, debug_loader_batch = 0, plot_every_other = 1, batch_size_protection = 1):
+    """Build train/validation/test DataLoaders over the .npy files under data_path + dataset_title.
+
+    Datasets whose title starts with 'S_' are loaded as 3D volumes, all others as 2D images. With
+    batch_size_protection == 1 the files in '<dataset>/Data/' are counted and the user is re-prompted until
+    batch_size does not exceed that count. The training dataset is split 80/20 into train and validation.
+    With debug_loader_batch == 1 every plot_every_other-th sample of the first training batch is plotted.
+
+    Returns (trainloader, testloader, validloader, train_data, test_data, val_data).
+    """
+    # Input type check, 2D or 3D, based on the dataset folder name: 3D if it starts with S_
+    if dataset_title.startswith('S_'):
+        print("Detected 3D Input")
+        circular_or_spherical = 1 #2d or 3d loader. 0 = 2d, 1 = 3d
+    else:
+        print("Detected 2D Input")
+        circular_or_spherical = 0 #2d or 3d loader. 0 = 2d, 1 = 3d
+
+    # - Check the requested batch size against the number of files available
+    if batch_size_protection == 1:
+        files_in_path = os.listdir(data_path + dataset_title + '/Data/')
+        num_of_files_in_path = len(files_in_path)
+        learning = batch_learning(num_of_files_in_path, batch_size)
+        print("%s files in path." %num_of_files_in_path ,"// Batch size =",batch_size, "\nLearning via: " + learning,"\n")
+        while num_of_files_in_path < batch_size:    # keep asking until the answer is usable (was a single unchecked prompt)
+            print("Error, the path selected has", num_of_files_in_path, "image files, which is", (batch_size - num_of_files_in_path) , "less than the chosen batch size. Please select a batch size less than the total number of images in the directory")
+            # Build the prompt as one string: the old code passed a tuple to input(), which printed its repr (brackets and quotes)
+            batch_err_message = "Choose new batch size, must be less than total amount of images in directory (%s): " % num_of_files_in_path
+            batch_size = int(input(batch_err_message))
+
+    # - Data Loading
+    if circular_or_spherical == 0:
+        train_data = datasets.DatasetFolder(data_path + dataset_title, loader=train_loader2d, extensions='.npy', transform=train_transforms)
+        test_data = datasets.DatasetFolder(data_path + dataset_title, loader=test_loader2d, extensions='.npy', transform=test_transforms)
+    else:
+        train_data = datasets.DatasetFolder(data_path + dataset_title, loader=train_loader3d, extensions='.npy', transform=train_transforms)
+        test_data = datasets.DatasetFolder(data_path + dataset_title, loader=test_loader3d, extensions='.npy', transform=test_transforms)
+
+    # Split the training dataset 80/20 into train_data and val_data (validation, used in eval)
+    m = len(train_data)
+    val_size = int(round(m*0.2))
+    train_data, val_data = random_split(train_data, [m - val_size, val_size])    # the two sizes always sum to m, as random_split requires
+
+    trainloader = torch.utils.data.DataLoader(train_data,batch_size=batch_size)
+    validloader = torch.utils.data.DataLoader(val_data, batch_size=batch_size)
+    testloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=True)
+
+    # - Debugging Outputs
+    if debug_loader_batch == 1:
+        train_features, train_labels = next(iter(trainloader))
+        print(f"Feature batch shape: {train_features.size()}")
+        print(f"Labels batch shape: {train_labels.size()}")
+
+        # Iterate over the samples actually in the batch: it can be shorter than batch_size, which used to raise IndexError
+        for i in range (0, len(train_features), plot_every_other):
+            ##Checks if data is 2d if so plots 2d image
+            if circular_or_spherical == 0:
+                img = train_features[i].squeeze()
+                plt.imshow(img, cmap="gray")  #!!! TODO: the image may need transposing (img.T), as it appears flipped
+                plt.show()
+            ##Checks if data is 3d if so plots 3d image
+            else:
+                hits_3d = np.nonzero(train_features[i].squeeze())
+                x3d = hits_3d.T[2]
+                y3d = hits_3d.T[1]
+                z3d = hits_3d.T[0]
+
+                plt.figure()               #Plots spherical data
+                ax = plt.axes(projection='3d')
+                ax.scatter(x3d, y3d, z3d)
+                ax.set_xlim(0, 100) #Time resolution of detector
+                ax.set_ylim(0, 88)  #width (px) of detector
+                ax.set_zlim(0, 128) #height (px) of detector
+                plt.show()
+
+    return(trainloader, testloader, validloader, train_data, test_data, val_data)
+
+
+
+
+
+"""
+#%% - Data Importer
+data_dir = 'dataset'
+train_dataset = torchvision.datasets.MNIST(data_dir, train=True, download=True)
+test_dataset  = torchvision.datasets.MNIST(data_dir, train=False, download=True)
+
+
+#%% - Data Preparation  #!!!Perhaps these should be passed ino the loader as user inputs, that allows for ease of changing between differnt tranforms in testing without having to flip to the data loader code
+
+####SECTION MODIFIED#####
+train_transform = transforms.Compose([                                         #train_transform variable holds the tensor tranformations to be performed on the training data.  transforms.Compose([ ,  , ]) allows multiple transforms to be chained together (in serial?) (#!!! does it do more than this??)
+                                       #transforms.RandomRotation(30),         #transforms.RandomRotation(angle (degrees?) ) rotates the tensor randomly up to max value of angle argument
+                                       #transforms.RandomResizedCrop(224),     #transforms.RandomResizedCrop(pixels) crops the data to 'pixels' in height and width (#!!! and (maybe) chooses a random centre point????)
+                                       #transforms.RandomHorizontalFlip(),     #transforms.RandomHorizontalFlip() flips the image data horizontally 
+                                       #transforms.Normalize((0.5), (0.5)),    #transforms.Normalize can be used to normalise the values in the array
+                                       transforms.ToTensor()])                 #other transforms can be dissabled but to tensor must be left enabled ! it creates a tensor from a numpy array #!!! ?
+
+test_transform = transforms.Compose([                                          #test_transform variable holds the tensor tranformations to be performed on the evaluation data.  transforms.Compose([ ,  , ]) allows multiple transforms to be chained together (in serial?) (#!!! does it do more than this??)
+                                      #transforms.Resize(255),                 #transforms.Resize(pixels? #!!!) ??
+                                      #transforms.CenterCrop(224),             #transforms.CenterCrop(pixels? #!!!) ?? Crops the given image at the center. If the image is torch Tensor, it is expected to have […, H, W] shape, where … means an arbitrary number of leading dimensions. If image size is smaller than output size along any edge, image is padded with 0 and then center cropp
+                                      #transforms.Normalize((0.5), (0.5)),     #transforms.Normalize can be used to normalise the values in the array
+                                      transforms.ToTensor()])                  #other transforms can be dissabled but to tensor must be left enabled ! it creates a tensor from a numpy array #!!! ?
+
+
+train_dataset.transform = train_transform       #!!! train_dataset is the class? object 'dataset' it has a subclass called transforms which is the list of transofrms to perform on the dataset when loading it. train_tranforms is the set of chained transofrms we created, this is set to the dataset transforms subclass 
+test_dataset.transform = test_transform         #!!! similar to the above but for the test(eval) dataset, check into this for the exact reason for using it, have seen it deone in other ways i.e as in the dataloader.py it is performed differntly. this way seems to be easier to follow
+
+####SECTION MODIFIED END#####
+
+
+
+###Following section splits the training dataset into two, train_data (to be noised) and valid data (to use in eval)
+m=len(train_dataset) #Just calculates length of train dataset, m is only used in the next line to decide the values of the split, (4/5 m) and (1/5 m)
+train_data, val_data = random_split(train_dataset, [int(round((m-m*0.2)), int(round((m*0.2))])    #random_split(data_to_split, [size of output1, size of output2]) just splits the train_dataset into two parts, 4/5 goes to train_data and 1/5 goes to val_data , validation?
+
+
+###Following section for Dataloaders, they just pull a random sample of images from each of the datasets we now have, train_data, valid_data, and test_data. the batch size defines how many are taken from each set, shuffle argument shuffles them each time?? #!!!
+batch_size=256                                                                                #User controll to set batch size for the dataloaders (Hyperparameter)?? #!!!
+
+train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size)                 #Training data loader, can be run to pull training data as configured
+valid_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size)                   #Validation data loader, can be run to pull training data as configured
+test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size,shuffle=True)   #Testing data loader, can be run to pull training data as configured. Also is shuffled using parameter shuffle #!!! why is it shuffled?
+"""
\ No newline at end of file
diff --git a/DeepClean_2D/DeepClean_2D_Main.py b/DeepClean_2D/DeepClean_2D_Main.py
new file mode 100644
index 0000000..fe1bd04
--- /dev/null
+++ b/DeepClean_2D/DeepClean_2D_Main.py
@@ -0,0 +1,500 @@
+# -*- coding: utf-8 -*-
+"""
+Created on Sat Oct 8 2022
+DeepClean 2D v0.0.1
+@author: Adill Al-Ashgar
+"""
+#%% - Dependencies
+import numpy as np 
+import matplotlib.pyplot as plt
+import torch
+from torchvision import transforms
+import random 
+
+from Custom_Normalisation_V1 import scale_Ndata as custom_normalisation
+#from Custom_Normalisation_V1 import custom_normalisation
+from DataLoader_Functions_V2 import initialise_data_loader
+from autoencoders.autoencoder_2D_V1 import Encoder, Decoder
+
+
+
+def find_np_minmax(data):
+    print("NP SHAPE", np.shape(data))    # debug pass-through: prints the shape only (no min/max is computed, despite the name)
+    return data
+
+
+def tensor_datascan(data):
+    """Debug pass-through for a tensor: report shape and non-zero values, show a 20-bin histogram, return `data`."""
+    as_numpy = data.numpy()
+    print("SHAPE tensor: ", np.shape(as_numpy))
+    flat = as_numpy.flatten()
+
+    hits = flat[flat != 0]    # the non-zero entries
+    print("TENSOR SCAN HITS",len(hits))
+    print(hits)
+
+    plt.hist(flat,20)    # histogram covers every element, zeros included
+    plt.show()
+    return data
+
+def numpy_datascan(data):
+    """Debug pass-through for a numpy array: report shape and non-zero values, show a 20-bin histogram, return `data`.
+
+    The histogram covers every element, zeros included.
+    """
+    print("SHAPE numpy: ", np.shape(data))
+    flat = data.flatten()
+
+    hits = flat[flat != 0]
+    print("NUMPY SCAN HITS",len(hits))
+    print(hits)
+    plt.hist(flat,20)
+    plt.show()
+    return data
+
+#%% - User Inputs
+learning_rate = 0.001                       #User control to set optimiser learning rate (Hyperparameter)
+optim_w_decay = 1e-05                       #User control to set optimiser weight decay (Hyperparameter)
+loss_function = torch.nn.MSELoss()          #User control to set loss function (Hyperparameter)
+latent_space_nodes = 5                      #User control to set number of nodes in the latent space, the bottleneck layer (Hyperparameter)
+noise_factor = 0                            #User control to set the noise factor, a multiplier for the magnitude of noise added. 0 means no noise added, 1 is the default level of noise added, 10 is 10x the default level (Hyperparameter)
+num_epochs = 4                              #User control to set number of epochs (Hyperparameter)
+batch_size = 10                             #Data Loader, number of images to pull per batch (add a check to make sure the batch size is smaller than the total number of images in the path selected)
+reconstruction_threshold = 0.2              #Threshold for 3D reconstruction, values below this confidence level are discounted
+seed = 0                                    #0 is the default, which gives no seeding of the RNG; if the value is not zero it is used for the RNG seeding of the numpy, random and torch libraries
+
+#%% - Program Settings
+print_partial_training_losses = 1           #[default = 1] 1 prints the loss of every single training batch (default argument of train_epoch_den)
+print_encoder_debug = 1                     #[default = 1]
+print_decoder_debug = 1                     #[default = 1]
+debug_noise_function = 0                    #[default = 1] 1 makes add_noise print min/max values and plot its intermediate tensors  # NOTE(review): stated default (1) differs from the value set here (0) - confirm
+print_epochs = 2                            #[default = 1] prints every 'print_epochs'-th epoch, i.e. if set to two then a test on the results is printed at the end of every other epoch
+plot_or_save = 0                            #[default = 0] 0 is normal behaviour (plots are shown). If set to 1 all end-of-epoch printouts are saved to disk; if set to 2 outputs are saved whilst also being shown to the user
+outputfig_title = "Test"                    #Must be a string; used in the file names of the saved output plots when plot_or_save is not 0
+telemetry_on = 1                            #[default = 1]
+
+#%% Dataloading
+# - Data Loader User Inputs
+dataset_title = "Dataset 1_Realistic"       # folder name of the dataset; a title starting with 'S_' is treated as 3D by the data loader
+data_path = "C:/Users/Student/Documents/UNI/Onedrive - University of Bristol/Yr 3 Project/Circular and Spherical Dummy Datasets/" #"C:/Users/Student/Desktop/fake im data/"  #"/local/path/to/the/images/"
+time_dimension = 100                        # number of time steps; used as the upper z-limit of the 3D plots and in the TOF de-normalisation
+
+# - Advanced Data Loader Settings
+debug_loader_batch = 0     #(Default = 0 = [OFF]) //INPUT 0 or 1//   #Setting debug loader batch will show the user the images taken in by the dataloader in the current batch and print the corresponding labels
+plot_every_other = 1       #(Default = 1) //MUST BE INTEGER INPUT//  #If debug loader batch is enabled this sets the interval for plotting: 1 is every single img in the batch, 2 is every other img, 5 is every 5th image etc
+batch_size_protection = 1  #(Default = 1 = [ON]) //INPUT 0 or 1//    #WARNING if turned off, the debugging plots can raise an exception due to the index growing too large in the plotting loop (img = train_features[i])
+
+# - Data Loader Preparation Transforms 
+#####For info on all transforms check out: https://pytorch.org/vision/0.9/transforms.html
+train_transforms = transforms.Compose([#transforms.Lambda(numpy_datascan),
+                                       transforms.Lambda(custom_normalisation),   # custom_normalisation is scale_Ndata imported under that name; it runs on the numpy array, before ToTensor
+                                       #transforms.Lambda(numpy_datascan),
+                                       #transforms.Lambda(find_np_minmax),
+                                       transforms.ToTensor(),
+                                       #transforms.Lambda(custom_normalisation), #transforms.Lambda(function) allows creation of a custom transform from a function
+                                       #transforms.Lambda(tensor_datascan),
+                                       #transforms.Normalize(),
+                                       #transforms.RandomRotation(30),         #Compose is required to chain together multiple transforms in serial
+                                       #transforms.RandomResizedCrop(224),
+                                       #transforms.RandomHorizontalFlip(),
+                                       #transforms.ToTensor()               #other transforms can be disabled but ToTensor must be left enabled
+                                       ]) 
+
+test_transforms = transforms.Compose([#transforms.Lambda(numpy_datascan),
+                                      transforms.Lambda(custom_normalisation),    # same scaling as the training pipeline (scale_Ndata under its import alias)
+                                      #transforms.Lambda(numpy_datascan),
+                                      #transforms.Lambda(find_np_minmax),
+                                      #transforms.Resize(255),
+                                      #transforms.CenterCrop(224),
+                                      transforms.ToTensor(),                      # must stay enabled: converts the numpy array to a tensor
+                                      #transforms.Lambda(custom_normalisation),
+                                      #transforms.Lambda(tensor_datascan),
+
+                                      #transforms.Normalize()
+                                      ])
+
+# - Initialise Data Loader
+train_loader, test_loader, val_loader, train_dataset, test_dataset, val_datset = initialise_data_loader(dataset_title, data_path, batch_size, train_transforms, test_transforms, debug_loader_batch, plot_every_other, batch_size_protection)
+
+#%% - Classes
+### Gaussian Noise Generator Class
+class AddGaussianNoise(object):
+    """Callable transform that adds Gaussian noise: tensor + N(0, 1) sample * std + mean."""
+    def __init__(self, mean=0., std=1.):
+        self.mean, self.std = mean, std
+
+    def __call__(self, tensor):
+        return tensor + torch.randn(tensor.size()) * self.std + self.mean
+
+    def __repr__(self):
+        return '{0}(mean={1}, std={2})'.format(type(self).__name__, self.mean, self.std)
+
+#%% - Functions
+### Random Noise Generator Function
+def add_noise(inputs,noise_factor=0.3, time_dimension=100):    # NOTE(review): noise is currently disabled - `inputs` is returned unchanged and noise_factor is never used; confirm this is intended
+     cNOISE = torch.randn_like(inputs) #* time_dimension    (standard-normal sample shaped like inputs)
+     noise_init = torch.randn_like(inputs)**2 * time_dimension    # never used afterwards, but the call still advances the torch RNG
+     noise = torch.clip(cNOISE,0.,100.)    # cNOISE limited to [0, 100]; only used by the debug output below
+     noisy = inputs# + noise    (the addition is commented out, so noisy is the same object as inputs)
+     if debug_noise_function == 1:    # module-level flag; prints min/max and plots element [0][0] of each tensor
+        print("INPUT", torch.min(inputs), torch.max(inputs))
+        plt.imshow(inputs[0][0])
+        plt.show()
+        print("cNOISE", torch.min(cNOISE), torch.max(cNOISE))
+        plt.imshow(cNOISE[0][0])
+        plt.show()
+        print("noise", torch.min(noise), torch.max(noise))
+        plt.imshow(noise[0][0])
+        plt.show()
+        print("noisy", torch.min(noisy), torch.max(noisy))
+        plt.imshow(noisy[0][0])
+        plt.show()
+     return noisy
+
+### Random Noise Generator Function
+def add_noise2(inputs, noise_points=0, dimensions=(88,128), time_dimension=100, debug_noise_function=0):
+    """Overwrite `noise_points` random pixels of a 2D tensor with random values in [0, time_dimension), in place, and return it.
+
+    Coordinates are drawn from [0, dimensions[0]) x [0, dimensions[1]); debug_noise_function=1 additionally shows inputs[0][0]."""
+    print("NOISETEST2",np.shape(inputs.numpy()))
+    for _ in range(noise_points):    # was range(0, noise_points+1): one point too many, and one even when 0 were requested
+        np_x, np_y = np.random.randint(0, dimensions[0]), np.random.randint(0, dimensions[1])
+        inputs[np_x][np_y] = np.random.randint(0, time_dimension)    # same draw order as before: x, y, then value
+    if debug_noise_function == 1:
+        plt.imshow(inputs[0][0])
+        plt.show()
+    return inputs
+
+
+
+
+def redimensionalise_time(data, reconstruction_threshold):
+    """Turn a normalised 2D image back into lists of hit coordinates and de-normalised TOF values.
+
+    A pixel is a hit when it is non-zero and >= reconstruction_threshold. Its TOF is recovered by inverting
+    the linear map [0, time_dimension] -> [reconstruction_threshold, 1]:
+        TOF = (value - reconstruction_threshold) * time_dimension / (1 - reconstruction_threshold)
+    The former hard-coded factor of 2 was only correct for a threshold of 0.5 (where this formula gives the
+    same result); at e.g. 0.2 it stretched the output range to 1.6 * time_dimension.
+
+    Reads the module-level `time_dimension`. Returns (x_list, y_list, TOF_list) in row-major order, with
+    x the row index and y the column index.
+    """
+    span = 1 - reconstruction_threshold
+    denorm_scale = time_dimension / span if span > 0 else 0.0    # a threshold >= 1 leaves no range to invert
+    x_list, y_list, TOF_list = [], [], []
+
+    for row, row_data in enumerate(data):
+        for column, TOF in enumerate(row_data):
+            if TOF != 0 and TOF >= reconstruction_threshold:    # the TOF != 0 test matters when the threshold is 0
+                x_list.append(row)
+                y_list.append(column)
+                TOF_list.append((TOF - reconstruction_threshold) * denorm_scale)
+    return (x_list, y_list, TOF_list)
+
+#3D Reconstruction
+def reconstruction_3D(image, time_dimension, reconstruction_threshold, settings, epoch, plot_or_save=0):    # plots a 2D image and its 3D (x, y, time) reconstruction for one epoch
+    data_hit_numpy = image[image != 0]    # non-zero pixels, printed below for debugging
+    print("IMAGE IN SCAN HITS",len(data_hit_numpy))
+    print(data_hit_numpy)
+#Remember the image comes in in the form y,x not x,y, so column and row are flipped in indexing
+    #2D Plot Check: shown when plot_or_save == 0, otherwise closed without being saved
+    plt.imshow(image)
+    plt.title("2D Reconstruction Epoch: %s" %epoch)
+    if plot_or_save == 0:
+        plt.show()
+    else:
+        plt.close()
+    
+    shape = np.shape(image)
+    x_list, y_list, z_list = redimensionalise_time(image, reconstruction_threshold)    # note: that helper reads the module-level time_dimension, not this function's parameter
+
+    #3D Plot
+    fig = plt.figure()               #New figure for the 3D scatter (fig itself is not used afterwards)
+    ax = plt.axes(projection='3d')
+    ax.scatter(x_list, y_list, z_list)#, s = signal_hit_size, c = "b") #Plots the reconstructed hits
+    ax.set_xlim(0, shape[0])
+    ax.set_ylim(0, shape[1])
+    ax.set_zlim(0, time_dimension)
+    ax.set_title("3D Reconstruction Epoch: %s" %epoch)
+    ax.set_xlabel("x")
+    ax.set_ylabel("y")
+    ax.set_zlabel("Time")
+    if plot_or_save == 0:
+        plt.show()
+    else:    # NOTE(review): any non-zero value saves and closes without showing, so plot_or_save == 2 does not also display - confirm
+        Out_Label = 'Output_Graphics/{}, Reconstruction, Epoch {}, {} .png'.format(outputfig_title, epoch, settings) #!!! uses the module-level outputfig_title
+        plt.savefig(Out_Label, format='png')        
+        plt.close()
+
+###Ploting confidence of each pixel as histogram per epoch with line showing the detection threshold
+def belief_telemetry(data, reconstruction_threshold, epoch, settings, plot_or_save=0):    # histogram of pixel values for one epoch; returns the counts above/below the threshold
+    data2 = data.flatten()
+
+    #Plots a 10-bin histogram showing the confidence level of each pixel being a signal point, with a red dashed line at the threshold
+    values, bins, bars = plt.hist(data2, 10, histtype='bar')
+    plt.axvline(x= reconstruction_threshold, color='red', marker='|', linestyle='dashed', linewidth=2, markersize=12)
+    plt.title("Epoch %s" %epoch)
+    plt.bar_label(bars, fontsize=10, color='navy')    # writes each bin's count above its bar
+    if plot_or_save == 0:
+        plt.show()
+    else:    # NOTE(review): any non-zero value saves and closes without showing - confirm the intended behaviour for plot_or_save == 2
+        Out_Label = 'Output_Graphics/{}, Confidence Histogram, Epoch {}, {} .png'.format(outputfig_title, epoch, settings) #!!! uses the module-level outputfig_title
+        plt.savefig(Out_Label, format='png')        
+        plt.close()
+
+    above_threshold = (data2 >= reconstruction_threshold).sum()    # number of elements at or above the threshold
+    below_threshold = (data2 < reconstruction_threshold).sum()     # number of elements strictly below it
+    return (above_threshold, below_threshold)
+
+def plot_telemetry(telemetry):    # line plot of the above/below-threshold counts against epoch number
+    tele = np.array(telemetry)    # indexed below as column 0 = epoch, column 1 = above threshold, column 2 = below threshold
+    # One line per count; the labels feed the legend
+    plt.plot(tele[:,0],tele[:,1], color='r', label="Points above threshold") #red = num of points above threshold
+    plt.plot(tele[:,0],tele[:,2], color='b', label="Points below threshold") #blue = num of points below threshold
+    plt.title("Telemetry over epochs")
+    plt.xlabel("Epoch number")
+    plt.ylabel("Number of Signal Points")
+    plt.legend()
+    plt.show()    
+
+###RNG Seeding for Determinism Function
+def Determinism_Seeding(seed):
+    """Seed the torch, `random` and numpy global RNGs with the same value so runs are repeatable."""
+    for seed_rng in (torch.manual_seed, random.seed, np.random.seed):
+        seed_rng(seed)
+
+### Training Function
+def train_epoch_den(encoder, decoder, device, dataloader, loss_fn, optimizer,noise_factor=0.3, print_partial_training_losses=print_partial_training_losses):    # runs one training epoch and returns the mean batch loss; the last default is the module-level setting captured at definition time
+    # Set train mode for both the encoder and the decoder
+    encoder.train()
+    decoder.train()
+    train_loss = []    # one loss value per batch
+    # Iterate the dataloader (we do not need the label values, this is unsupervised learning)
+    for image_batch, _ in dataloader: # with "_" we just ignore the labels (the second element of the dataloader tuple)
+        # Build the network input with add_noise, then move both the clean target and the input to the device
+        image_noisy = add_noise(image_batch,noise_factor)
+        image_batch = image_batch.to(device)
+        image_noisy = image_noisy.to(device)    
+        # Encode data
+        encoded_data = encoder(image_noisy)
+        # Decode data
+        decoded_data = decoder(encoded_data)
+        # Evaluate loss: the reconstruction is compared against the clean batch, not the noisy input
+        loss = loss_fn(decoded_data, image_batch)
+        # Backward pass
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        if print_partial_training_losses == 1:
+            # Print batch loss
+            print('\t partial train loss (single batch): %f' % (loss.data))
+        train_loss.append(loss.detach().cpu().numpy())
+    return np.mean(train_loss)
+
+### Testing Function
+def test_epoch_den(encoder, decoder, device, dataloader, loss_fn,noise_factor=0.3):
+    """Evaluate the denoising autoencoder on `dataloader` and return one loss computed over all batches together.
+
+    Every batch is noised with add_noise(batch, noise_factor) and reconstructed without gradient
+    tracking; the reconstructions and the clean batches are each concatenated on the CPU and loss_fn
+    is called once on the two concatenated tensors. The returned value is that loss's `.data`.
+    """
+    # Switch both networks to evaluation mode
+    for network in (encoder, decoder):
+        network.eval()
+    reconstructions = []
+    clean_batches = []
+    with torch.no_grad(): # No need to track the gradients
+        for clean_batch, _ in dataloader:
+            # Noise the batch, move only the noisy copy to the proper device, then encode and decode it
+            noisy_batch = add_noise(clean_batch,noise_factor).to(device)
+            reconstruction = decoder(encoder(noisy_batch))
+            # Keep the network output and the original image, both on the CPU
+            reconstructions.append(reconstruction.cpu())
+            clean_batches.append(clean_batch.cpu())
+        # Create a single tensor from each list, then evaluate the global loss
+        all_reconstructions = torch.cat(reconstructions)
+        all_clean = torch.cat(clean_batches)
+        global_loss = loss_fn(all_reconstructions, all_clean)
+    return global_loss.data
+
+###Plotting Function
+def plot_ae_outputs_den(encoder, decoder, epoch, outputfig_title, time_dimension, reconstruction_threshold, plot_or_save=0, n=10,noise_factor=0.5):
+    """Plot the clean, noised and reconstructed versions of the first `n` items of `test_dataset` as a 3 x n grid and count their signal points.
+
+    plot_or_save == 0 shows the figure only when (epoch+1) % print_epochs == 0 (otherwise it is closed); plot_or_save == 1 saves it under Output_Graphics/ on every call. On those print epochs the last image and its reconstruction are also passed to reconstruction_3D and, when telemetry_on == 1, a [epoch+1, above, below] row is appended to the module-level `telemetry`.
+    Reads the module-level names test_dataset, device, print_epochs, telemetry_on, telemetry and the run settings used in the figure label. Returns (number_of_true_signal_points, number_of_recovered_signal_points): per-image counts of values >= reconstruction_threshold in each clean image and in its reconstruction.
+    """
+    number_of_true_signal_points = []                                 #Per-image signal point counts for the clean images
+    number_of_recovered_signal_points = []                            #Per-image signal point counts for the reconstructions
+    plt.figure(figsize=(16,4.5))                                      #Sets the figure size
+    for i in range(n):                                                #'i' runs from 0 to n-1, one figure column per test image
+
+        ax = plt.subplot(3,n,i+1)                                      #First row of the grid (position i+1): the original image
+        img = test_dataset[i][0].unsqueeze(0)                          #Element 0 of dataset item i with a new leading dimension of length 1
+
+        #Count the signal points on the input image (original author's TODO: take this from the value embedded in the dataset instead, as this method will break when noise is added)
+        int_sig_points = (img >= reconstruction_threshold).sum()
+        number_of_true_signal_points.append(int_sig_points.item())
+
+        #Create the noised image from the original clean image
+        image_noisy = add_noise(img,noise_factor)
+        image_noisy = image_noisy.to(device)
+
+        #Set the autoencoder to evaluation mode rather than training mode
+        encoder.eval()                                   #.eval() switches layers that behave differently in training and inference (e.g. Dropout, BatchNorm) to their inference behaviour
+        decoder.eval()                                   #Similarly as above
+
+        with torch.no_grad():                                               #Common practice alongside .eval(): turns off gradient computation while evaluating the model
+            #Run the autoencoder on the noised data
+            rec_img  = decoder(encoder(image_noisy))                        #Recovered (denoised) image: the noisy image passed through the encoder and then the decoder
+
+        #Count the signal points on the recovered image
+        int_rec_sig_points = (rec_img >= reconstruction_threshold).sum()
+        number_of_recovered_signal_points.append(int_rec_sig_points.item())   #rec_img lives on 'device'; .item() works on any device, whereas .numpy() is only valid for CPU tensors
+
+        #Plot the original, the noised and the denoised image in rows 1, 2 and 3 of column i
+        plt.imshow(img.cpu().squeeze().numpy(), cmap='gist_gray')           #.cpu() is needed because .numpy() only accepts CPU tensors; .squeeze() removes all dimensions of length 1
+        ax.get_xaxis().set_visible(False)                                   #Hides the x axis as an image is being plotted, not a graph
+        ax.get_yaxis().set_visible(False)                                   #Hides the y axis as an image is being plotted, not a graph
+        if i == n//2:                                                       #True once per loop, for the middle column, so each row gets a single title
+            ax.set_title('EPOCH %s \nOriginal images' %(epoch+1))
+
+        ax = plt.subplot(3, n, i + 1 + n)                                   #Second row of the grid (position i+1+n): the noised image
+        plt.imshow(image_noisy.cpu().squeeze().numpy(), cmap='gist_gray')
+        ax.get_xaxis().set_visible(False)
+        ax.get_yaxis().set_visible(False)
+        if i == n//2:
+            ax.set_title('Corrupted images')
+
+        ax = plt.subplot(3, n, i + 1 + n + n)                               #Third row of the grid (position i+1+n+n): the reconstructed image
+        plt.imshow(rec_img.cpu().squeeze().numpy(), cmap='gist_gray')
+        ax.get_xaxis().set_visible(False)
+        ax.get_yaxis().set_visible(False)
+        if i == n//2:
+            ax.set_title('Reconstructed images')
+
+    plt.subplots_adjust(left=0.1,              #Adjusts the exact layout of the plots including white space round the edges
+                    bottom=0.1, 
+                    right=0.9, 
+                    top=0.9, 
+                    wspace=0.1, 
+                    hspace=0.3)
+
+    settings = "Settings = [ep {}][bs {}][lr {}][od {}][ls {}][nf {}][ds {}][sd {}]".format(num_epochs, batch_size, learning_rate, optim_w_decay, latent_space_nodes, noise_factor, dataset_title, seed)
+
+    if plot_or_save == 0:
+        if (epoch+1) % print_epochs == 0:
+            plt.show()                                 #The finished figure is shown only on print epochs
+        else:
+            plt.close()
+
+    elif plot_or_save == 1:
+        Out_Label = 'Output_Graphics/{}, Epoch {}, {} .png'.format(outputfig_title, epoch+1, settings) #!!!
+        plt.savefig(Out_Label, format='png')
+        plt.close()
+        print("\n# SAVED OUTPUT TEST IMAGE TO DISK #\n")
+
+    if (epoch+1) % print_epochs == 0:
+        ###3D Reconstruction of the last image of the loop and of its reconstruction (img and rec_img keep their final loop values, so n must be at least 1 here)
+        in_data = img.cpu().squeeze().numpy()
+        rec_data = rec_img.cpu().squeeze().numpy()
+        reconstruction_3D(in_data, time_dimension, reconstruction_threshold, settings, epoch+1, plot_or_save)
+        reconstruction_3D(rec_data, time_dimension, reconstruction_threshold, settings, epoch+1, plot_or_save)
+
+        #Telemetry plots
+        if telemetry_on == 1:       #needs titles and labels etc added
+            above_threshold, below_threshold = belief_telemetry(rec_data, reconstruction_threshold, epoch+1, settings, plot_or_save)
+            telemetry.append([epoch+1, above_threshold, below_threshold])   #1-based epoch, matching the value given to belief_telemetry and keeping clear of the initial epoch 0 row
+
+    return(number_of_true_signal_points, number_of_recovered_signal_points)
+
+#%% - Program Internal Setup
+
+#Seeds the RNGs for reproducibility and debugging when the user-set variable 'seed' is anything other than 0
+if seed != 0: 
+    Determinism_Seeding(seed)
+
+#%% - Setup model, loss criteria and optimiser    
+
+### Define the loss function
+loss_fn = loss_function
+
+### Define a learning rate for the optimiser
+lr = learning_rate                                     #Learning rate taken from the user inputs panel at the top
+
+### NOTE: seeding is done by Determinism_Seeding above when seed != 0, so the direct torch seeding call below is left disabled
+#torch.manual_seed(seed)              
+
+### Initialize the two networks
+d = latent_space_nodes #d is passed to the encoder & decoder in the lines below as encoded_space_dim, i.e. the dimension of the encoded (latent) space
+
+#model = Autoencoder(encoded_space_dim=encoded_space_dim)
+encoder = Encoder(encoded_space_dim=d,fc2_input_dim=128, encoder_debug=print_encoder_debug)
+decoder = Decoder(encoded_space_dim=d,fc2_input_dim=128, decoder_debug=print_decoder_debug)
+params_to_optimize = [{'params': encoder.parameters()} ,{'params': decoder.parameters()}] #Parameter groups for the optimiser: the encoder's parameters and the decoder's parameters
+
+### Define an optimizer (both for the encoder and the decoder!)
+wd = optim_w_decay                                                           #Weight decay taken from the user inputs panel at the top
+optim = torch.optim.Adam(params_to_optimize, lr=lr, weight_decay=wd)
+
+#%% - Compute device check
+#Checks if a CUDA enabled GPU is available. If found it is selected as the 'device' to perform the tensor operations, otherwise the 'device' is set to CPU (much slower)
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+print(f'Selected device: {device}')  #Informs user if running on CPU or GPU - (NVIDIA CUDA)
+
+#Moves both the encoder and the decoder to the selected device, i.e. the detected CUDA enabled GPU or the CPU
+encoder.to(device)   #Moves encoder to selected device, CPU/GPU
+decoder.to(device)   #Moves decoder to selected device, CPU/GPU
+
+#%% - Compute
+
+history_da={'train_loss':[],'val_loss':[]}                   #Dictionary holding two initially empty lists, 'train_loss' and 'val_loss'; one value per epoch is appended to each in the loop below, e.g. history_da['val_loss'].append(x)
+
+telemetry = [[0,0.5,0.5]]  #Initialises the telemetry memory; the starting row 0, 0.5, 0.5 corresponds to epoch(0), above_threshold(0.5), below_threshold(0.5)
+
+for epoch in range(num_epochs):                              #'epoch' takes the values (0) to (num_epochs - 1)
+    print('EPOCH %d/%d' % (epoch + 1, num_epochs))
+    ### Training (use the training function)
+    train_loss=train_epoch_den(
+                               encoder=encoder, 
+                               decoder=decoder, 
+                               device=device, 
+                               dataloader=train_loader, 
+                               loss_fn=loss_fn, 
+                               optimizer=optim,
+                               noise_factor=noise_factor)
+
+    ### Validation (use the testing function)
+    val_loss = test_epoch_den(
+                              encoder=encoder, 
+                              decoder=decoder, 
+                              device=device, 
+                              dataloader=val_loader, 
+                              loss_fn=loss_fn,
+                              noise_factor=noise_factor)
+
+    # Record both losses, then print them and produce the plots at the end of each epoch
+    history_da['train_loss'].append(train_loss)
+    history_da['val_loss'].append(val_loss)
+    print('\nEND OF EPOCH {}/{} \t train loss {:.3f} \t val loss {:.3f}\n'.format(epoch + 1, num_epochs,train_loss,val_loss))     #epoch + 1 because the range spans 0 to num_epochs-1 but epochs are numbered from 1 upwards in the output
+    number_of_true_signal_points, number_of_recovered_signal_points = plot_ae_outputs_den(encoder, decoder, epoch, outputfig_title,time_dimension, reconstruction_threshold, plot_or_save, n=10, noise_factor=noise_factor) #Reassigned every epoch, so after the loop these hold the final epoch's counts
+
+
+###Loss function plots
+epochs_range = range(1,num_epochs+1)                     #1-based epoch numbers for the x axis
+plt.plot(epochs_range, history_da['train_loss']) 
+plt.title("Training loss")   
+plt.show()
+
+plt.plot(epochs_range, [float(loss) for loss in history_da['val_loss']])   #Validation history; each stored entry is converted to a plain float for plotting
+plt.title("Validation loss") 
+plt.show()
+
+if telemetry_on == 1:
+    plot_telemetry(telemetry)
+
+#Comparison of true signal points to recovered signal points (the values returned by the final epoch's plot_ae_outputs_den call)
+print("True signal points",number_of_true_signal_points)
+print("Recovered signal points: ",number_of_recovered_signal_points)
diff --git a/notebook9f0211ab59.ipynb b/notebook9f0211ab59.ipynb
new file mode 100644
index 0000000..f53b7ad
--- /dev/null
+++ b/notebook9f0211ab59.ipynb
@@ -0,0 +1 @@
+{"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.7.3"},"toc":{"base_numbering":1,"nav_menu":{"height":"85px","width":"179px"},"number_sections":true,"sideBar":true,"skip_h1_title":false,"title_cell":"Table of Contents","title_sidebar":"Contents","toc_cell":true,"toc_position":{},"toc_section_display":true,"toc_window_display":true}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Training neural networks can be very confusing!\n\nWhat’s a good learning rate? How many hidden layers should your network have? Is dropout actually useful? Why are your gradients vanishing?\n\nIn this post we'll peel the curtain behind some of the more confusing aspects of neural nets, and help you make smart decisions about your neural network architecture.\n\n**I highly recommend forking this kernel and playing with the different building blocks to hone your intuition.** I made a quick **demo** to walk you through this kernel: https://www.loom.com/share/fb64035e4576467489cf0f2ad9cff92a. If you have any more questions or feedback, please don't hesitate to [message me](https://twitter.com/lavanyaai)!\n\n## If you like this kernel, please give it an upvote. Thank you! :)","metadata":{}},{"cell_type":"markdown","source":"Please turn the **'Internet' toggle On** in the Settings panel to your left, in order to make changes to this kernel. 
You'll be prompted for your Weights and Biases API key, which you can get for free @ [wandb.com](https://www.wandb.com/).","metadata":{}},{"cell_type":"markdown","source":"# The Goal","metadata":{}},{"cell_type":"code","source":"# Please turn the 'Internet' toggle On in the Settings panel to your left, in order to make changes to this kernel.\n!pip install wandb -q","metadata":{"_kg_hide-input":false,"_kg_hide-output":true,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from __future__ import print_function\nimport keras\nfrom keras.datasets import cifar10\nfrom keras.preprocessing.image import ImageDataGenerator\nfrom keras.models import Sequential\nfrom keras.layers import Dense, Dropout, Activation, Flatten\nfrom keras.layers import Conv2D, MaxPooling2D\nimport os\n\n# Essentials\nimport numpy as np\nimport pandas as pd\n\n# Models\nimport tensorflow as tf\n\n# Ignore useless warnings\nimport warnings\nwarnings.filterwarnings(action=\"ignore\")\n\n# Set random state for numpy\nnp.random.seed(42)\n\n# WandB\nimport wandb\nfrom wandb.keras import WandbCallback\n# You can change your project name here. 
For more config options, see https://docs.wandb.com/docs/init.html\nwandb.init(anonymous='allow', project=\"neural-nets-kaggle\", name=\"basic_neural_network\")\n\n# Go to https://app.wandb.ai/authorize to get your WandB key","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"batch_size = 32\nnum_classes = 10\nepochs = 100\ndata_augmentation = True\nnum_predictions = 20\n\n# The data, split between train and test sets:\n(x_train, y_train), (x_test, y_test) = cifar10.load_data()\nprint('x_train shape:', x_train.shape)\nprint(x_train.shape[0], 'train samples')\nprint(x_test.shape[0], 'test samples')\n\n# Convert class vectors to binary class matrices.\ny_train = keras.utils.to_categorical(y_train, num_classes)\ny_test = keras.utils.to_categorical(y_test, num_classes)\nx_train = x_train.astype('float32') / 255.0\nx_test = x_test.astype('float32') / 255.0\n\nlabels = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"wandb.init(anonymous='allow', project=\"building-neural-nets\", name=\"cifar-10\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"%%wandb\nmodel = Sequential()\nmodel.add(Conv2D(32, (3, 3), padding='same', input_shape=x_train.shape[1:], activation='relu'))\nmodel.add(Conv2D(32, (3, 3), activation='relu'))\nmodel.add(MaxPooling2D(pool_size=(2, 2)))\nmodel.add(Dropout(0.25))\nmodel.add(Conv2D(64, (3, 3), padding='same', activation='relu'))\nmodel.add(Conv2D(64, (3, 3), activation='relu'))\nmodel.add(MaxPooling2D(pool_size=(2, 2)))\nmodel.add(Dropout(0.25))\nmodel.add(Flatten())\nmodel.add(Dense(512, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(num_classes, activation='softmax'))\n\n# Let's train the model using RMSprop\nmodel.compile(loss='categorical_crossentropy',\n              
optimizer=keras.optimizers.Nadam(lr=0.0001, beta_1=0.9, beta_2=0.999, clipnorm=1.0),\n              metrics=['accuracy'])\n\nmodel.fit(x_train, y_train,\n              batch_size=batch_size,\n              epochs=epochs,\n              validation_data=(x_test, y_test),\n              callbacks=[WandbCallback(data_type=\"image\", labels=labels, validation_data=(x_test, y_test)), keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)],\n              shuffle=True)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"<h1>Table of Contents<span class=\"tocSkip\"></span></h1>\n<div class=\"toc\"><ul class=\"toc-item\"><li><span><a href=\"#Training-neural-networks-can-be-very-confusing!\" data-toc-modified-id=\"Training-neural-networks-can-be-very-confusing!-1\"><span class=\"toc-item-num\">1&nbsp;&nbsp;</span>Training neural networks can be very confusing!</a></span></li><li><span><a href=\"#Setup\" data-toc-modified-id=\"Setup-2\"><span class=\"toc-item-num\">2&nbsp;&nbsp;</span>Setup</a></span><ul class=\"toc-item\"><li><span><a href=\"#Initialize-Hyperparameters\" data-toc-modified-id=\"Initialize-Hyperparameters-2.1\"><span class=\"toc-item-num\">2.1&nbsp;&nbsp;</span>Initialize Hyperparameters</a></span></li><li><span><a href=\"#Load-dataset\" data-toc-modified-id=\"Load-dataset-2.2\"><span class=\"toc-item-num\">2.2&nbsp;&nbsp;</span>Load dataset</a></span></li><li><span><a href=\"#Explore-Dataset\" data-toc-modified-id=\"Explore-Dataset-2.3\"><span class=\"toc-item-num\">2.3&nbsp;&nbsp;</span>Explore Dataset</a></span></li></ul></li><li><span><a href=\"#Model-Training\" data-toc-modified-id=\"Model-Training-3\"><span class=\"toc-item-num\">3&nbsp;&nbsp;</span>Model Training</a></span></li><li><span><a href=\"#Basic-Neural-Network\" data-toc-modified-id=\"Basic-Neural-Network-4\"><span class=\"toc-item-num\">4&nbsp;&nbsp;</span>Basic Neural Network</a></span></li><li><span><a href=\"#Learning-Rate\" 
data-toc-modified-id=\"Learning-Rate-5\"><span class=\"toc-item-num\">5&nbsp;&nbsp;</span>Learning Rate</a></span></li><li><span><a href=\"#Momentum\" data-toc-modified-id=\"Momentum-6\"><span class=\"toc-item-num\">6&nbsp;&nbsp;</span>Momentum</a></span></li><li><span><a href=\"#Vanishing-+-Exploding-Gradients\" data-toc-modified-id=\"Vanishing-+-Exploding-Gradients-7\"><span class=\"toc-item-num\">7&nbsp;&nbsp;</span>Vanishing + Exploding Gradients</a></span></li><li><span><a href=\"#Dropout\" data-toc-modified-id=\"Dropout-8\"><span class=\"toc-item-num\">8&nbsp;&nbsp;</span>Dropout</a></span></li><li><span><a href=\"#Optimizers\" data-toc-modified-id=\"Optimizers-9\"><span class=\"toc-item-num\">9&nbsp;&nbsp;</span>Optimizers</a></span></li><li><span><a href=\"#Learning-Rate-Scheduling\" data-toc-modified-id=\"Learning-Rate-Scheduling-10\"><span class=\"toc-item-num\">10&nbsp;&nbsp;</span>Learning Rate Scheduling</a></span></li><li><span><a href=\"#Make-Predictions\" data-toc-modified-id=\"Make-Predictions-11\"><span class=\"toc-item-num\">11&nbsp;&nbsp;</span>Make Predictions</a></span></li><li><span><a href=\"#A-Few-More-Things\" data-toc-modified-id=\"A-Few-More-Things-12\"><span class=\"toc-item-num\">12&nbsp;&nbsp;</span>A Few More Things</a></span></li><li><span><a href=\"#That-was-fun,-yeah?\" data-toc-modified-id=\"That-was-fun,-yeah?-13\"><span class=\"toc-item-num\">13&nbsp;&nbsp;</span>That was fun, yeah?</a></span><ul class=\"toc-item\"><li><span><a href=\"#Weights-&amp;-Biases\" data-toc-modified-id=\"Weights-&amp;-Biases-13.1\"><span class=\"toc-item-num\">13.1&nbsp;&nbsp;</span>Weights &amp; Biases</a></span></li></ul></li></ul></div>","metadata":{"toc":true}},{"cell_type":"markdown","source":"## Load dataset","metadata":{}},{"cell_type":"code","source":"(X_train_full, y_train_full), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()\nlabels=[\"T-shirt/top\",\"Trouser\",\"Pullover\",\"Dress\",\"Coat\",\n        
\"Sandal\",\"Shirt\",\"Sneaker\",\"Bag\",\"Ankle boot\"]\n\n# Normalize pixel values\nX_train_full, X_test = X_train_full / 255.0, X_test / 255.0\n\n#reshape input data\nX_train_full = X_train_full.reshape(X_train_full.shape[0], config.img_width, config.img_height, 1)\nX_test = X_test.reshape(X_test.shape[0], config.img_width, config.img_height, 1)\n\n# one hot encode outputs\ny_train_full = tf.keras.utils.to_categorical(y_train_full)\ny_test = tf.keras.utils.to_categorical(y_test)\nnum_classes = y_test.shape[1]\n\n# Split into validation, and training sets\nX_valid, X_train = X_train_full[:config.validation_size], X_train_full[config.validation_size:]\ny_valid, y_train = y_train_full[:config.validation_size], y_train_full[config.validation_size:]\nX_train_full.shape, X_train.shape, X_valid.shape, X_test.shape","metadata":{"_kg_hide-input":false,"_kg_hide-output":false,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"## Explore Dataset","metadata":{}},{"cell_type":"markdown","source":"![](https://paper-attachments.dropbox.com/s_70ED39A7F97AC42BAB6E155E4809EB5A8988617EE839B00C0D8E924932439DDF_1565465694924_embedding.gif)\n\n\nFashion-MNIST is a dataset of that shares the same structure as MNIST (60,000 images in training set, 10,000 in test set, with each image being a 28x28 grayscale image, and 10 label classes to predict). Fashion MNIST is slightly more complicated than MNIST and hence serves as a good dataset for benchmarking machine learning algorithms.\n\nA selection of the images in the dataset are shown below.","metadata":{}},{"cell_type":"markdown","source":"# Model Training","metadata":{}},{"cell_type":"markdown","source":"# Basic Neural Network","metadata":{}},{"cell_type":"markdown","source":"**1. Input neurons**\nThe input vector needs one input neuron per feature. For tabular data, this is the number of relevant features in your dataset. 
You want to carefully select these features and remove any that may contain patterns that won't generalize beyond the training set (and cause overfitting). For images, this is the dimensions of your image (28*28=784 in case of MNIST).\n\n        \n**2. Output neurons**\nThis is the number of predictions you want to make.\n- **Regression:** For regression tasks, this can be one value (e.g. housing price). For multi-variate regression, it is one neuron per predicted value (e.g. for bounding boxes it can be 4 neurons – one each for bounding box height, width, x-coordinate, y-coordinate).\n- **Classification:** For binary classification (spam-not spam), we use one output neuron per positive class, wherein the output represents the probability of the positive class. For multi-class classification (e.g. in object detection where an instance can be classified as a car, a dog, a house etc.), we have one output neuron per class, and use the softmax activation function on the output layer to ensure the final probabilities sum to 1.\n        \n        \n**3. Hidden Layers and Neurons per Hidden Layers**\n    The number of hidden layers is highly dependent on the problem and the architecture of your neural network. You're essentially trying to Goldilocks your way into the perfect neural network architecture - not too big, not too small, just right.\n        \nGenerally, 1-5 hidden layers will serve you well for most problems. When working with image or speech data, you'd want your network to have dozens-hundreds of layers, not all of which might be fully connected. For these use cases, there are pre-trained models (YOLO, ResNet, VGG) that allow you to use large parts of their networks, and train your model on top of these networks to learn only the higher order features. In this case, your model will still have only a few layers to train.\n\nIn general using the same number of neurons for all hidden layers will suffice. 
For some datasets, having a large first layer and following it up with smaller layers will lead to better performance as the first layer can learn a lot of lower-level features that can feed into a few higher order features in the subsequent layers.\n\nUsually you will get more of a performance boost from adding more layers than adding more neurons in each layer.\n\nI'd recommend starting with 1-5 layers and 1-100 neurons and slowly adding more layers and neurons until you start overfitting. You can track your loss and accuracy within your Weights and Biases dashboard to see which hidden layers + hidden neurons combo leads to the best loss.\n\nSomething to keep in mind with choosing a smaller number of layers/neurons is that if the this number is too small, your network will not be able to learn the underlying patterns in your data and thus be useless. An approach to counteract this is to start with a huge number of hidden layers + hidden neurons and then use dropout and early stopping to let the neural network size itself down for you. Again, I'd recommend trying a few combinations and track the performance in your Weights and Biases dashboard to determine the perfect network size for your problem.\n\nAndrej Karpathy also recommends the [overfit then regularize](http://karpathy.github.io/2019/04/25/recipe/) approach – \"first get a model large enough that it can overfit (i.e. focus on training loss) and then regularize it appropriately (give up some training loss to improve the validation loss).\"\n\n        \n**4. Loss function**\n![](https://paper-attachments.dropbox.com/s_39292DB9CE2A9400103E176C2ABC438C6A626910E9DBB0D6FBE28EE673C7492C_1565307636338_image.png)\n   - **Regression:** Mean squared error is the most common loss function to optimize for, unless there are a significant number of outliers. In this case, use mean absolute error or Huber loss.     \n   - **Classification:** Cross-entropy will serve you well in most cases.\n\n\n**5. 
Batch Size**\nLarge batch sizes can be great because they can harness the power of GPUs to process more training instances per time. [OpenAI has found](https://openai.com/blog/science-of-ai/) larger batch size (of tens of thousands for image-classification  and  language modeling, and of millions in the case of RL agents) serve well for scaling and parallelizability.\n\nThere's a case to be made for smaller batch sizes too, however. According to [this paper](https://arxiv.org/abs/1804.07612) by Masters and Luschi, the advantage gained from increased parallelism from running large batches is offset by the increased performance generalization and smaller memory footprint achieved by smaller batches. They show that increased batch sizes reduce the acceptable range of learning rates that provide stable convergence. Their takeaway is that smaller is, in-fact, better; and that the best performance is obtained by mini-batch sizes between 2 and 32.\n\nIf you're not operating at massive scales, I would recommend starting with lower batch sizes and slowly increasing the size and monitoring performance in your [Weights and Biases](https://www.wandb.com/) dashboard to determine the best fit.\n        \n**6. Number of epochs**\n    I'd recommend starting with a large number of epochs and use early stopping to halt training when performance stops improving.\n        \n        \n**7. Scaling your features**\n    A quick note: Make sure all your features have similar scale before using them as inputs to your neural network. This ensures faster convergence. When your features have different scales (e.g. salaries in thousands and years of experience in tens), the cost function will look like the elongated bowl on the left. 
This means your optimization algorithm will take a long time to traverse the valley compared to using normalized features (on the right).\n    ![](https://paper-attachments.dropbox.com/s_39292DB9CE2A9400103E176C2ABC438C6A626910E9DBB0D6FBE28EE673C7492C_1565307687940_image.png)\n        ","metadata":{}},{"cell_type":"code","source":"config = wandb.config # Config is a variable that holds and saves hyperparameters and inputs\nconfig.dropout = 0.2\nconfig.conv_layer_1_size  = 32\nconfig.conv_layer_2_size = 64\nconfig.conv_layer_3_size = 128\nconfig.hidden_layer_size = 512\nconfig.learn_rate = 0.01\nconfig.learn_rate_low = 0.001\nconfig.kernel_size = 3\nconfig.pool_size = 2\nconfig.decay = 1e-6\nconfig.momentum = 0.9\nconfig.n_epochs = 25\n\nconfig.img_width=28\nconfig.img_height=28\nconfig.num_classes = 10\nconfig.batch_size = 128\nconfig.validation_size = 5000","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"%%wandb \n# build model\nmodel = tf.keras.Sequential()\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_1_size, kernel_size=(config.kernel_size, config.kernel_size), activation='relu', input_shape=(config.img_width, config.img_height, 1)))\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_1_size, kernel_size=(config.kernel_size, config.kernel_size), activation='relu'))\nmodel.add(tf.keras.layers.MaxPooling2D((config.pool_size, config.pool_size)))\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_2_size, kernel_size=(config.kernel_size, config.kernel_size), activation='relu'))\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_2_size, kernel_size=(config.kernel_size, config.kernel_size), activation='relu'))\nmodel.add(tf.keras.layers.MaxPooling2D(pool_size=(config.pool_size, config.pool_size)))\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_3_size, (config.kernel_size, config.kernel_size), activation='relu'))\nmodel.add(tf.keras.layers.Flatten())\nmodel.add(tf.keras.layers.Dense(config.hidden_layer_size, 
activation='relu'))\nmodel.add(tf.keras.layers.Dense(config.num_classes, activation='softmax'))\n\nmodel.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.SGD(lr=config.learn_rate), metrics=['accuracy'])\nmodel.fit(X_train, y_train, verbose=0, validation_data=(X_valid, y_valid), epochs=config.n_epochs,\n        callbacks=[WandbCallback(data_type=\"image\", validation_data=(X_valid, y_valid), labels=labels)])","metadata":{"_kg_hide-output":false,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Learning Rate\n\n\n\n![](https://paper-attachments.dropbox.com/s_39292DB9CE2A9400103E176C2ABC438C6A626910E9DBB0D6FBE28EE673C7492C_1565307718429_image.png)\n\nPicking the learning rate is very important, and you want to make sure you get this right! Ideally you want to re-tweak the learning rate when you tweak the other hyper-parameters of your network.\n    \nTo find the best learning rate, start with a very low values (10^-6) and slowly multiply it by a constant until it reaches a very high value (e.g. 10). Measure your model performance (vs the log of your learning rate) in your Weights and Biases dashboard to determine which rate served you well for your problem. You can then retrain your model using this optimal learning rate.\n    \nThe best learning rate is usually half of the learning rate that causes the model to diverge. Feel free to set different values for learn_rate in the code and seeing how it affects model performance to develop your intuition around learning rates.\n\nI'd also recommend using the [Learning Rate finder](https://arxiv.org/abs/1506.01186) method proposed by Leslie Smith. 
It is an excellent way to find a good learning rate for most gradient optimizers (most variants of SGD) and works with most network architectures.\n    \nAlso, see the section on learning rate scheduling below.","metadata":{}},{"cell_type":"code","source":"wandb.init(anonymous='allow', project=\"building-neural-nets\", name=\"lower_learning_rate\")","metadata":{"_kg_hide-output":false,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"%%wandb\n# build model\nmodel = tf.keras.Sequential()\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_1_size, kernel_size=(config.kernel_size, config.kernel_size), activation='relu', input_shape=(config.img_width, config.img_height, 1)))\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_1_size, kernel_size=(config.kernel_size, config.kernel_size), activation='relu'))\nmodel.add(tf.keras.layers.MaxPooling2D((config.pool_size, config.pool_size)))\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_2_size, kernel_size=(config.kernel_size, config.kernel_size), activation='relu'))\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_2_size, kernel_size=(config.kernel_size, config.kernel_size), activation='relu'))\nmodel.add(tf.keras.layers.MaxPooling2D(pool_size=(config.pool_size, config.pool_size)))\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_3_size, (config.kernel_size, config.kernel_size), activation='relu'))\nmodel.add(tf.keras.layers.Flatten())\nmodel.add(tf.keras.layers.Dense(config.hidden_layer_size, activation='relu'))\nmodel.add(tf.keras.layers.Dense(config.num_classes, activation='softmax'))\n\nmodel.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.SGD(lr=config.learn_rate_low), metrics=['accuracy'])\nmodel.fit(X_train, y_train, verbose=0, validation_data=(X_valid, y_valid), epochs=config.n_epochs,\n        callbacks=[WandbCallback(data_type=\"image\", validation_data=(X_valid, y_valid), 
labels=labels)])","metadata":{"_kg_hide-output":false,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Momentum\n![](https://paper-attachments.dropbox.com/s_39292DB9CE2A9400103E176C2ABC438C6A626910E9DBB0D6FBE28EE673C7492C_1565307746069_image.png)\n\nGradient Descent takes tiny, consistent steps towards the local minima and when the gradients are tiny it can take a lot of time to converge. Momentum on the other hand takes into account the previous gradients, and accelerates convergence by pushing over valleys faster and avoiding local minima.\n    \nIn general you want your momentum value to be very close to one. 0.9 is a good place to start for smaller datasets, and you want to move progressively closer to one (0.999) the larger your dataset gets. (Setting nesterov=True lets momentum take into account the gradient of the cost function a few steps ahead of the current point, which makes it slightly more accurate and faster.)","metadata":{}},{"cell_type":"code","source":"wandb.init(anonymous='allow', project=\"building-neural-nets\", name=\"momentum\")","metadata":{"_kg_hide-output":false,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"%%wandb\n# build model\nmodel = tf.keras.Sequential()\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_1_size, kernel_size=(config.kernel_size, config.kernel_size), activation='relu', input_shape=(config.img_width, config.img_height, 1)))\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_1_size, kernel_size=(config.kernel_size, config.kernel_size), activation='relu'))\nmodel.add(tf.keras.layers.MaxPooling2D((config.pool_size, config.pool_size)))\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_2_size, kernel_size=(config.kernel_size, config.kernel_size), activation='relu'))\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_2_size, kernel_size=(config.kernel_size, config.kernel_size), 
activation='relu'))\nmodel.add(tf.keras.layers.MaxPooling2D(pool_size=(config.pool_size, config.pool_size)))\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_3_size, (config.kernel_size, config.kernel_size), activation='relu'))\nmodel.add(tf.keras.layers.Flatten())\nmodel.add(tf.keras.layers.Dense(config.hidden_layer_size, activation='relu'))\nmodel.add(tf.keras.layers.Dense(config.num_classes, activation='softmax'))\n\nmodel.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.SGD(lr=config.learn_rate_low, decay=config.decay, momentum=config.momentum,\n                            nesterov=True), metrics=['accuracy'])\nmodel.fit(X_train, y_train, verbose=0, validation_data=(X_valid, y_valid), epochs=config.n_epochs,\n        callbacks=[WandbCallback(data_type=\"image\", validation_data=(X_valid, y_valid), labels=labels)])","metadata":{"_kg_hide-output":false,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Vanishing + Exploding Gradients\n\n![](https://paper-attachments.dropbox.com/s_39292DB9CE2A9400103E176C2ABC438C6A626910E9DBB0D6FBE28EE673C7492C_1565308012033_vanishing_gradients.gif)\n\nJust like people, not all neural network layers learn at the same speed. So when the backprop algorithm propagates the error gradient from the output layer to the first layers, the gradients get smaller and smaller until they're almost negligible when they reach the first layers. This means the weights of the first layers aren't updated significantly at each step.\n    \nThis is the problem of vanishing gradients. (A similar problem of exploding gradients occurs when the gradients for certain layers get progressively larger, leading to massive weight updates for some layers as opposed to the others.)\n    \nThere are a few ways to counteract vanishing gradients. Let's take a look at them now!\n    \n**1. 
Activation functions (non-saturating functions)**\n- **Hidden Layer Activation**\n    In general, the performance from using different [activation functions](https://isaacchanghau.github.io/post/activation_functions/) improves in this order (from lowest→highest performing): logistic → tanh → ReLU → Leaky ReLU → ELU → SELU. ReLU is the most popular activation function and if you don't want to tweak your activation function, ReLU is a great place to start. But, keep in mind ReLU is becoming increasingly less effective than [ELU](https://arxiv.org/pdf/1511.07289.pdf) or [GELU](https://arxiv.org/pdf/1606.08415.pdf). If you’re feeling more adventurous, you can try the following:\n    - to combat neural network overfitting: RReLU\n    - reduce latency at runtime: leaky ReLU\n    - for massive training sets: PReLU\n    - for fast inference times: leaky ReLU\n    - if your network doesn't self-normalize: ELU\n    - for an overall robust activation function: SELU\n    \n    [This](https://arxiv.org/pdf/1811.03378.pdf) is an excellent paper that dives deeper into the comparison of various activation functions for neural networks.\n\n    As always, don't be afraid to experiment with a few different activation functions, and turn to your Weights and Biases dashboard to help you pick the one that works best for you!    \n\n- **Output Layer Activation**\n    - **Regression:** Regression problems don't require activation functions for their output neurons because we want the output to take on any value. In cases where we want our values to be bounded into a certain range, we can use tanh for -1→1 values and logistic function for 0→1 values. In cases where we're only looking for positive output, we can use softplus activation.\n\n    - **Classification:** Use the sigmoid activation function for binary classification to ensure the output is between 0 and 1. Use softmax for multi-class classification to ensure the output probabilities add up to 1.\n            \n            \n**2. 
Weight initialization method**\n    The right weight initialization method can speed up time-to-convergence considerably. The choice of your initialization method depends on your activation function. Some things to try:\n- When using ReLU or leaky ReLU, use [He initialization](https://arxiv.org/pdf/1502.01852.pdf)\n- When using SELU or ELU, use [LeCun initialization](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)\n- When using softmax, logistic, or tanh, use [Glorot initialization](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf)\n\nMost initialization methods come in uniform and normal distribution flavors.\n        \n        \n**3. BatchNorm**\nBatchNorm simply learns the optimal means and scales of each layer's inputs. It does so by zero-centering and normalizing its input vectors, then scaling and shifting them. It also acts like a regularizer which means we don't need dropout or L2 reg.\n\nUsing BatchNorm lets us use larger learning rates (which result in faster convergence) and leads to huge improvements in most neural networks by reducing the vanishing gradients problem. The only downside is that it slightly increases training times because of the extra computations required at each layer.\n        \n        \n**4. Gradient Clipping**\nA great way to keep gradients from exploding, especially when training RNNs, is to simply clip them when they exceed a certain value. I'd recommend trying clipnorm instead of clipvalue, which allows you to keep the direction of your gradient vector consistent. Clipnorm rescales any gradient whose L2 norm is greater than a certain threshold.\n\nTry a few different threshold values to find one that works best for you.\n        \n        \n**5. Early Stopping**\nEarly Stopping lets you live it up by training a model with more hidden layers, hidden neurons and for more epochs than you need, and just stopping training when performance stops improving consecutively for n epochs. 
It also restores the best performing model weights for you. You can enable Early Stopping by setting up an `EarlyStopping` callback when you fit your model and setting `restore_best_weights=True`.","metadata":{}},{"cell_type":"code","source":"wandb.init(anonymous='allow', project=\"building-neural-nets\", name=\"vanishing_gradients\")","metadata":{"_kg_hide-output":false,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"%%wandb\n# build model\nmodel = tf.keras.Sequential()\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_1_size, kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal',\n                 input_shape=(config.img_width, config.img_height, 1)))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_1_size, kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.MaxPooling2D((config.pool_size, config.pool_size)))\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_2_size, kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_2_size, kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.MaxPooling2D(pool_size=(config.pool_size, config.pool_size)))\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_3_size, (config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal'))\nmodel.add(tf.keras.layers.Flatten())\nmodel.add(tf.keras.layers.Dense(config.hidden_layer_size, activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.Dense(config.num_classes, 
activation='softmax'))\n\nmodel.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.SGD(lr=config.learn_rate, clipnorm=1.0),\n              metrics=['accuracy'])\nmodel.fit(X_train, y_train, verbose=0, validation_data=(X_test, y_test), epochs=config.n_epochs,\n          callbacks=[WandbCallback(data_type=\"image\", validation_data=(X_valid, y_valid), labels=labels), tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)])","metadata":{"_kg_hide-output":false,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Dropout\n\nDropout is a fantastic regularization technique that gives you a massive performance boost (~2% for state-of-the-art models) for how simple the technique actually is. All dropout does is randomly turn off a percentage of neurons at each layer, at each training step. This makes the network more robust because it can't rely on any particular set of input neurons for making predictions. The knowledge is distributed amongst the whole network. Around 2^n (where n is the number of neurons in the architecture) slightly-unique neural networks are generated during the training process, and ensembled together to make predictions.\n\nA good dropout rate is between 0.1 to 0.5; 0.3 for RNNs, and 0.5 for CNNs. Use larger rates for bigger layers. Increasing the dropout rate decreases overfitting, and decreasing the rate is helpful to combat under-fitting.\n\nYou want to experiment with different rates of dropout values, in earlier layers of your network, and check your Weights and Biases dashboard to pick the best performing one. 
You definitely don’t want to use dropout in the output layers.\n\nRead [this paper](https://arxiv.org/abs/1801.05134) before using Dropout in conjunction with BatchNorm.\n\nIn this kernel I used AlphaDropout, a flavor of the vanilla dropout that works well with SELU activation functions by preserving the input's mean and standard deviations.","metadata":{}},{"cell_type":"code","source":"wandb.init(anonymous='allow', project=\"building-neural-nets\", name=\"dropout\")","metadata":{"_kg_hide-output":false,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"%%wandb\n# build model\nmodel = tf.keras.Sequential()\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_1_size, kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal',\n                 input_shape=(config.img_width, config.img_height, 1)))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_1_size, kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.MaxPooling2D((config.pool_size, config.pool_size)))\ntf.keras.layers.AlphaDropout(rate=config.dropout),\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_2_size, kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_2_size, kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.MaxPooling2D(pool_size=(config.pool_size, config.pool_size)))\ntf.keras.layers.AlphaDropout(rate=config.dropout),\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_3_size, (config.kernel_size, config.kernel_size), activation='selu', 
kernel_initializer='lecun_normal'))\ntf.keras.layers.AlphaDropout(rate=config.dropout),\nmodel.add(tf.keras.layers.Flatten())\nmodel.add(tf.keras.layers.Dense(config.hidden_layer_size, activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\ntf.keras.layers.AlphaDropout(rate=config.dropout),\nmodel.add(tf.keras.layers.Dense(config.num_classes, activation='softmax'))\n\nmodel.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.SGD(lr=config.learn_rate, clipnorm=1.0),\n              metrics=['accuracy'])\nmodel.fit(X_train, y_train, verbose=0, validation_data=(X_test, y_test), epochs=config.n_epochs,\n          callbacks=[WandbCallback(data_type=\"image\", validation_data=(X_valid, y_valid), labels=labels), tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)])","metadata":{"_kg_hide-output":false,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Optimizers\n\n\nGradient Descent isn't the only optimizer game in town! There's a few different ones to choose from. This post does a good job of describing some of the optimizers you can choose from.\n![](https://paper-attachments.dropbox.com/s_39292DB9CE2A9400103E176C2ABC438C6A626910E9DBB0D6FBE28EE673C7492C_1565307886324_optimization_algorithms.gif)\n\nMy general advice is to use Stochastic Gradient Descent if you care deeply about quality of convergence and if time is not of the essence.\n\nIf you care about time-to-convergence and a point close to optimal convergence will suffice, experiment with Adam, Nadam, RMSProp, and Adamax optimizers. 
Your Weights and Biases dashboard will guide you to the optimizer that works best for you!\n\nAdam/Nadam are usually good starting points, and tend to be quite forgiving of a bad learning rate and other non-optimal hyperparameters.\n\n[According to Andrej Karpathy](http://karpathy.github.io/2019/04/25/recipe/), \"a well-tuned SGD will almost always slightly outperform Adam\" in the case of ConvNets.\n\nIn this kernel, I got the best performance from Nadam, which is just your regular Adam optimizer with the Nesterov trick, and thus converges faster than Adam.","metadata":{}},{"cell_type":"code","source":"wandb.init(anonymous='allow', project=\"building-neural-nets\", name=\"nadam_optimizer\")","metadata":{"_kg_hide-output":false,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"%%wandb\n# build model\nmodel = tf.keras.Sequential()\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_1_size, kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal',\n         input_shape=(config.img_width, config.img_height, 1)))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_1_size, kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.MaxPooling2D((config.pool_size, config.pool_size)))\ntf.keras.layers.AlphaDropout(rate=config.dropout),\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_2_size, kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_2_size, kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.MaxPooling2D(pool_size=(config.pool_size, 
config.pool_size)))\ntf.keras.layers.AlphaDropout(rate=config.dropout),\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_3_size, (config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.AlphaDropout(rate=config.dropout),\nmodel.add(tf.keras.layers.Flatten())\nmodel.add(tf.keras.layers.Dense(config.hidden_layer_size, activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\ntf.keras.layers.AlphaDropout(rate=config.dropout),\nmodel.add(tf.keras.layers.Dense(config.num_classes, activation='softmax'))\n\nmodel.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Nadam(lr=config.learn_rate, beta_1=0.9, beta_2=0.999, clipnorm=1.0), metrics=['accuracy'])\nmodel.fit(X_train, y_train, verbose=0, validation_data=(X_test, y_test), epochs=config.n_epochs,\n    callbacks=[WandbCallback(data_type=\"image\", labels=labels, validation_data=(X_valid, y_valid)), tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)])","metadata":{"_kg_hide-output":false,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Learning Rate Scheduling\n\n\n![](https://paper-attachments.dropbox.com/s_39292DB9CE2A9400103E176C2ABC438C6A626910E9DBB0D6FBE28EE673C7492C_1565307910210_image.png)\n\nWe talked about the importance of a good learning rate already – we don't want it to be too high, lest the cost function dance around the optimum value and diverge. We also don't want it to be too low because that means convergence will take a very long time.\n\nBabysitting the learning rate can be tough because both higher and lower learning rates have their advantages. The great news is that we don't have to commit to one learning rate! 
With learning rate scheduling we can start with higher rates to move faster through gradient slopes, and slow it down when we reach a gradient valley in the parameter space which requires taking smaller steps.\n\nThere are many ways to schedule learning rates including decreasing the learning rate exponentially, or by using a step function, or tweaking it when the performance starts dropping, or using 1cycle scheduling. In this kernel, I show you how to use the ReduceLROnPlateau callback to reduce the learning rate by a constant factor whenever the performance stops improving for n epochs.\n\nI would highly recommend also trying out 1cycle scheduling.\n\nUse a constant learning rate until you've tuned all other hyper-parameters. And implement learning rate decay scheduling at the end.\n\nAs with most things, I'd recommend running a few different experiments with different scheduling strategies and using your Weights and Biases dashboard to pick the one that leads to the best model.","metadata":{}},{"cell_type":"code","source":"wandb.init(anonymous='allow', project=\"building-neural-nets\", name=\"learningrate\")","metadata":{"_kg_hide-output":false,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"%%wandb\n# build model\nmodel = tf.keras.Sequential()\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_1_size, kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal',\n         input_shape=(config.img_width, config.img_height, 1)))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_1_size, kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.MaxPooling2D((config.pool_size, config.pool_size)))\ntf.keras.layers.AlphaDropout(rate=config.dropout),\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_2_size, 
kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_2_size, kernel_size=(config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\nmodel.add(tf.keras.layers.MaxPooling2D(pool_size=(config.pool_size, config.pool_size)))\ntf.keras.layers.AlphaDropout(rate=config.dropout),\nmodel.add(tf.keras.layers.Conv2D(config.conv_layer_3_size, (config.kernel_size, config.kernel_size), activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.AlphaDropout(rate=config.dropout),\nmodel.add(tf.keras.layers.Flatten())\nmodel.add(tf.keras.layers.Dense(config.hidden_layer_size, activation='selu', kernel_initializer='lecun_normal'))\ntf.keras.layers.BatchNormalization(),\ntf.keras.layers.AlphaDropout(rate=config.dropout),\nmodel.add(tf.keras.layers.Dense(config.num_classes, activation='softmax'))\n\nlr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)\nmodel.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Nadam(lr=config.learn_rate, beta_1=0.9, beta_2=0.999, clipnorm=1.0), metrics=['accuracy'])\nmodel.fit(X_train, y_train, verbose=0, validation_data=(X_test, y_test), epochs=config.n_epochs,\n    callbacks=[WandbCallback(data_type=\"image\", labels=labels, validation_data=(X_valid, y_valid)), tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True), lr_scheduler])","metadata":{"_kg_hide-output":false,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Make Predictions","metadata":{}},{"cell_type":"code","source":"model.evaluate(X_test, 
y_test)","metadata":{"_kg_hide-output":true,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"model.save(\"fashion_mnist_model.h5\")","metadata":{"_kg_hide-output":true,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Let's make things more interesting - CIFAR-10","metadata":{}},{"cell_type":"code","source":"from __future__ import print_function\nimport keras\nfrom keras.datasets import cifar10\nfrom keras.preprocessing.image import ImageDataGenerator\nfrom keras.models import Sequential\nfrom keras.layers import Dense, Dropout, Activation, Flatten\nfrom keras.layers import Conv2D, MaxPooling2D\nimport os","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"batch_size = 32\nnum_classes = 10\nepochs = 100\ndata_augmentation = True\nnum_predictions = 20\n\n# The data, split between train and test sets:\n(x_train, y_train), (x_test, y_test) = cifar10.load_data()\nprint('x_train shape:', x_train.shape)\nprint(x_train.shape[0], 'train samples')\nprint(x_test.shape[0], 'test samples')\n\n# Convert class vectors to binary class matrices.\ny_train = keras.utils.to_categorical(y_train, num_classes)\ny_test = keras.utils.to_categorical(y_test, num_classes)\nx_train = x_train.astype('float32') / 255.0\nx_test = x_test.astype('float32') / 255.0\n\nlabels = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"wandb.init(anonymous='allow', project=\"building-neural-nets\", name=\"cifar-10\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"%%wandb\nmodel = Sequential()\nmodel.add(Conv2D(32, (3, 3), padding='same', input_shape=x_train.shape[1:], activation='relu'))\nmodel.add(Conv2D(32, (3, 3), activation='relu'))\nmodel.add(MaxPooling2D(pool_size=(2, 
2)))\nmodel.add(Dropout(0.25))\nmodel.add(Conv2D(64, (3, 3), padding='same', activation='relu'))\nmodel.add(Conv2D(64, (3, 3), activation='relu'))\nmodel.add(MaxPooling2D(pool_size=(2, 2)))\nmodel.add(Dropout(0.25))\nmodel.add(Flatten())\nmodel.add(Dense(512, activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(num_classes, activation='softmax'))\n\n# Let's train the model using Nadam\nmodel.compile(loss='categorical_crossentropy',\n              optimizer=keras.optimizers.Nadam(lr=0.0001, beta_1=0.9, beta_2=0.999, clipnorm=1.0),\n              metrics=['accuracy'])\n\nmodel.fit(x_train, y_train,\n              batch_size=batch_size,\n              epochs=epochs,\n              validation_data=(x_test, y_test),\n              callbacks=[WandbCallback(data_type=\"image\", labels=labels, validation_data=(x_test, y_test)), keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)],\n              shuffle=True)","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# A Few More Things\n- Try [EfficientNets](https://arxiv.org/pdf/1905.11946.pdf) to scale your network in an optimal way.\n- Read [this paper](https://arxiv.org/pdf/1803.09820.pdf) for an overview of some additional learning rate, batch sizes, momentum and weight decay techniques.\n- And [this one](https://arxiv.org/abs/1803.05407) on Stochastic Weight Averaging (SWA). It shows that better generalization can be achieved by averaging multiple points along the SGD's trajectory, with a cyclical or constant learning rate.\n- Read Andrej Karpathy's [excellent guide](http://karpathy.github.io/2019/04/25/recipe/) on getting the most juice out of your neural networks.\n\n## If you like this kernel, please give it an upvote. Thank you! 
:)","metadata":{}},{"cell_type":"markdown","source":"# That was fun, yeah?\n\nWe've explored a lot of different facets of neural networks in this post!\n\nWe've looked at how to set up a basic neural network (including choosing the number of hidden layers, hidden neurons, batch sizes etc.)\n\nWe've learnt about the role momentum and learning rates play in influencing model performance.\n\nAnd finally we've explored the problem of vanishing gradients and how to tackle it using non-saturating activation functions, BatchNorm, better weight initialization techniques and early stopping.\n\nYou can compare the accuracy and loss performances for the various techniques we tried in one single chart, by visiting your [Weights and Biases](https://app.wandb.ai/home) dashboard.\n\nNeural networks are powerful beasts that give you a lot of levers to tweak to get the best performance for the problems you're trying to solve! The sheer number of customizations that they offer can be overwhelming to even seasoned practitioners. Tools like Weights and Biases are your best friends in navigating the land of the hyper-parameters, trying different experiments and picking the most powerful models.\n\nI hope this guide will serve as a good starting point in your adventures.\n\nIf you have any questions or feedback, please don't hesitate to [message me](https://twitter.com/lavanyaai)!\n\nGood luck!\n\n\n----------\n## Weights & Biases\n\nWe're building lightweight, flexible experiment tracking tools for deep learning. Add a couple of lines to your python script, and we'll keep track of your hyperparameters and output metrics, making it easy to compare runs and see the whole history of your progress. Think of us like GitHub for deep learning. Find out more at [wandb.com](http://wandb.com).\n","metadata":{}}]}
\ No newline at end of file