diff --git a/README.md b/README.md index d63a6a1..694bc88 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,37 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Ishan Ranade +* Tested on personal computer: Gigabyte Aero 14, Windows 10, i7-7700HQ, GTX 1060 -### (TODO: Your README) +# Boid Simulation -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +![](demo2.gif) + +## Features + +- Naive implementation +- Uniform grid without coherence +- Uniform grid with coherence +- Performance analysis with various blocksizes and boid counts + +## Performance Analysis + +Below are graphs of performance for the three algorithms implemented, for various blocksizes including 128, 256, and 512. + +![](blocksize128.JPG) + +![](blocksize256.JPG) + +![](blocksize512.JPG) + + +To perform my analysis I recorded the framerates of the simulation as both the blocksize and number of boids changed for each of the three types of algorithm implemented. I felt that FPS was the best way to determine which combination produced the best results. + +The baseline was the Naive method, and the goal was for each of the other two methods to be able to beat the naive performance. This would be a good test to determine if using the GPU actually produced better results. + +For the Naive method changing the number of boids definitely changed the performance because this method was completely dependent on boid count and did not make use of the massive parallelism. For the uniform method, increasing the number of boids made the performance worse in all three cases of blocksizes. This may be because of the lack of caching when looping through the particles in the surrounding cells. 
The coherent method did the best with increasing boid count and this may be due to caching. + +In general it seemed that the naive method produced the worst results especially as the number of boids increased. The block size did not affect this method as it did not make use of the massive parallelism. + +The uniform method seemed to do better than the coherent method except for when the blocksize was 512. The coherent grid showed the best performance especially as the boid size increased. I was surprised though that at low blocksizes and boid counts it seemed to do worse than the uniform method, but seemed to excel as both the blocksize and boid count increased. \ No newline at end of file diff --git a/blocksize128.JPG b/blocksize128.JPG new file mode 100644 index 0000000..5b08a9b Binary files /dev/null and b/blocksize128.JPG differ diff --git a/blocksize256.JPG b/blocksize256.JPG new file mode 100644 index 0000000..88f031d Binary files /dev/null and b/blocksize256.JPG differ diff --git a/blocksize512.JPG b/blocksize512.JPG new file mode 100644 index 0000000..a86384e Binary files /dev/null and b/blocksize512.JPG differ diff --git a/data.txt b/data.txt new file mode 100644 index 0000000..07d961e --- /dev/null +++ b/data.txt @@ -0,0 +1,41 @@ +Coherent + Boids 1000 + Blocksize 128 - 450 + Blocksize 256 - 469 + Blocksize 512 - 564 + Boids 5000 + Blocksize 128 - 398 + Blocksize 256 - 395 + Blocksize 512 - 396 + Boids 10000 + Blocksize 128 - 485 + Blocksize 256 - 378 + Blocksize 512 - 320 + +Uniform + Boids 1000 + Blocksize 128 - 555 + Blocksize 256 - 553 + Blocksize 512 - 496 + Boids 5000 + Blocksize 128 - 382 + Blocksize 256 - 404 + Blocksize 512 - 392 + Boids 10000 + Blocksize 128 - 444 + Blocksize 256 - 277 + Blocksize 512 - 242 + +Naive + Boids 1000 + Blocksize 128 - 460 + Blocksize 256 - 509 + Blocksize 512 - 464 + Boids 5000 + Blocksize 128 - 253 + Blocksize 256 - 240 + Blocksize 512 - 242 + Boids 10000 + Blocksize 128 - 121 + Blocksize 256 - 117 + Blocksize 512 - 
114 \ No newline at end of file diff --git a/data.xlsx b/data.xlsx new file mode 100644 index 0000000..aca25ec Binary files /dev/null and b/data.xlsx differ diff --git a/demo.gif b/demo.gif new file mode 100644 index 0000000..c617b1e Binary files /dev/null and b/demo.gif differ diff --git a/demo2.gif b/demo2.gif new file mode 100644 index 0000000..04efc7b Binary files /dev/null and b/demo2.gif differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fdd636d..030ca66 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,5 +10,5 @@ set(SOURCE_FILES cuda_add_library(src ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_60 ) diff --git a/src/kernel.cu b/src/kernel.cu index 74dffcb..89b452a 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -5,6 +5,7 @@ #include #include "utilityCore.hpp" #include "kernel.h" +#include // LOOK-2.1 potentially useful for doing grid-based neighbor search #ifndef imax @@ -70,6 +71,10 @@ glm::vec3 *dev_pos; glm::vec3 *dev_vel1; glm::vec3 *dev_vel2; +glm::vec3 *sorted_Vel1; +glm::vec3 *sorted_Vel2; +glm::vec3 *sorted_Pos; + // LOOK-2.1 - these are NOT allocated for you. You'll have to set up the thrust // pointers on your own too. @@ -94,6 +99,8 @@ float gridCellWidth; float gridInverseCellWidth; glm::vec3 gridMinimum; +bool useVelBuf1; + /****************** * initSimulation * ******************/ @@ -137,6 +144,7 @@ __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, flo * Initialize memory, update some globals */ void Boids::initSimulation(int N) { + useVelBuf1 = true; numObjects = N; dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); @@ -169,6 +177,25 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. 
+ + cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + dev_thrust_particleArrayIndices = thrust::device_ptr(dev_particleArrayIndices); + checkCUDAErrorWithLine("dev_particleArrayIndices failed"); + + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + dev_thrust_particleGridIndices = thrust::device_ptr(dev_particleGridIndices); + checkCUDAErrorWithLine("dev_particleGridIndices failed"); + + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("dev_gridCellStartIndices failed"); + + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("dev_gridCellEndIndices failed"); + + cudaMalloc((void**)&sorted_Vel1, N * sizeof(glm::vec3)); + cudaMalloc((void**)&sorted_Vel2, N * sizeof(glm::vec3)); + cudaMalloc((void**)&sorted_Pos, N * sizeof(glm::vec3)); + cudaDeviceSynchronize(); } @@ -233,7 +260,58 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves // Rule 2: boids try to stay a distance d away from each other // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + + int count1 = 0; + int count3 = 0; + glm::vec3 perceivedCenter(0.f, 0.f, 0.f); + glm::vec3 perceivedVelocity(0.f, 0.f, 0.f); + glm::vec3 c(0.f, 0.f, 0.f); + + for (int i = 0; i < N; i++) { + if (i != iSelf) { + float distance = glm::length(pos[i] - pos[iSelf]); + + if (distance < rule1Distance) { + perceivedCenter += pos[i]; + count1++; + } + + if (distance < rule2Distance) { + c -= (pos[i] - pos[iSelf]); + } + + if (distance < rule3Distance) { + perceivedVelocity += vel[i]; + count3++; + } + } + } + + if (count1 > 0) { + perceivedCenter /= float(count1); + perceivedCenter = (perceivedCenter - pos[iSelf]); + } + + if (count3 > 0) { + perceivedVelocity /= float(count3); + } + + glm::vec3 finalVel; + + finalVel = 
(perceivedCenter * rule1Scale) + (c * rule2Scale) + (perceivedVelocity * rule3Scale); + + + return finalVel; +} + +// LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index. +// LOOK-2.3 Looking at this method, what would be the most memory efficient +// order for iterating over neighboring grid cells? +// for(x) +// for(y) +// for(z)? Or some other order? +__device__ int gridIndex3Dto1D(int x, int y, int z, int cellWidth) { + return x + y * cellWidth + z * cellWidth * cellWidth; } /** @@ -242,9 +320,22 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po */ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // Compute a new velocity based on pos and vel1 - // Clamp the speed - // Record the new velocity into vel2. Question: why NOT vel1? + // Compute a new velocity based on pos and vel1 + // Clamp the speed + // Record the new velocity into vel2. Question: why NOT vel1? + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < N) { + glm::vec3 delta = computeVelocityChange(N, index, pos, vel1); + delta += vel1[index]; + + if (glm::length(delta) > maxSpeed) { + delta = glm::normalize(delta) * maxSpeed; + } + + vel2[index] = delta; + } } /** @@ -272,23 +363,23 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) { pos[index] = thisPos; } -// LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index. -// LOOK-2.3 Looking at this method, what would be the most memory efficient -// order for iterating over neighboring grid cells? -// for(x) -// for(y) -// for(z)? Or some other order? 
-__device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { - return x + y * gridResolution + z * gridResolution * gridResolution; -} - -__global__ void kernComputeIndices(int N, int gridResolution, - glm::vec3 gridMin, float inverseCellWidth, +__global__ void kernComputeIndices(int N, int gridSideCount, + glm::vec3 gridMin, float cellWidth, glm::vec3 *pos, int *indices, int *gridIndices) { // TODO-2.1 // - Label each boid with the index of its grid cell. // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + + if (index < N) { + int x = floor((pos[index].x + abs(gridMin.x)) / cellWidth); + int y = floor((pos[index].y + abs(gridMin.y)) / cellWidth); + int z = floor((pos[index].z + abs(gridMin.z)) / cellWidth); + + indices[index] = index; + gridIndices[index] = gridIndex3Dto1D(x, y, z, gridSideCount); + } } // LOOK-2.1 Consider how this could be useful for indicating that a cell @@ -306,6 +397,16 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, // Identify the start point of each cell in the gridIndices array. // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + if (index == 0 || particleGridIndices[index] != particleGridIndices[index-1]) { + gridCellStartIndices[particleGridIndices[index]] = index; + } + + if (index == N - 1 || particleGridIndices[index] != particleGridIndices[index + 1]) { + gridCellEndIndices[particleGridIndices[index]] = index; + } + } } __global__ void kernUpdateVelNeighborSearchScattered( @@ -322,6 +423,84 @@ __global__ void kernUpdateVelNeighborSearchScattered( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. 
// - Clamp the speed change before putting the new speed in vel2 + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < N) { + float maxSearchDistance = glm::max(glm::max(rule1Distance, rule2Distance), rule3Distance); + + glm::vec3 min = glm::vec3(floor(pos[index].x + abs(gridMin.x) - maxSearchDistance) * inverseCellWidth, + floor(pos[index].y + abs(gridMin.y) - maxSearchDistance) * inverseCellWidth, + floor(pos[index].z + abs(gridMin.z) - maxSearchDistance) * inverseCellWidth); + + glm::vec3 max = glm::vec3(floor(pos[index].x + abs(gridMin.x) + maxSearchDistance) * inverseCellWidth, + floor(pos[index].y + abs(gridMin.y) + maxSearchDistance) * inverseCellWidth, + floor(pos[index].z + abs(gridMin.z) + maxSearchDistance) * inverseCellWidth); + + glm::clamp(min, glm::vec3(0, 0, 0), glm::vec3(gridResolution, gridResolution, gridResolution)); + glm::clamp(max, glm::vec3(0, 0, 0), glm::vec3(gridResolution, gridResolution, gridResolution)); + + int count1 = 0; + int count3 = 0; + glm::vec3 perceivedCenter; + glm::vec3 perceivedVelocity; + glm::vec3 c; + + for (int i = min.x; i < max.x; i++) { + for (int j = min.y; j < max.y; j++) { + for (int k = min.z; k < max.z; k++) { + int currentGridIndex = gridIndex3Dto1D(i, j, k, gridResolution); + + int startIndex = gridCellStartIndices[currentGridIndex]; + int endIndex = gridCellEndIndices[currentGridIndex]; + + for (int gridIndex = startIndex; gridIndex < endIndex; gridIndex++) { + int particleIndex = particleArrayIndices[gridIndex]; + + if (particleIndex != index) { + + float distance = glm::distance(pos[particleIndex], pos[index]); + + if (distance < rule1Distance) { + perceivedCenter += pos[particleIndex]; + count1++; + } + + if (distance < rule2Distance) { + c -= (pos[particleIndex] - pos[index]); + } + + if (distance < rule3Distance) { + perceivedVelocity += vel1[particleIndex]; + count3++; + } + } + } + } + } + } + + if (count1 > 0) { + perceivedCenter /= count1; + perceivedCenter = (perceivedCenter - 
pos[index]); + } + + if (count3 > 0) { + perceivedVelocity /= count3; + } + + glm::vec3 finalVel = vel1[index]; + + finalVel += perceivedCenter * rule1Scale; + finalVel += c * rule2Scale; + finalVel += perceivedVelocity * rule3Scale; + + if (glm::length(finalVel) > maxSpeed) { + finalVel = glm::normalize(finalVel) * maxSpeed; + } + + vel2[index] = finalVel; + } } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -341,6 +520,84 @@ __global__ void kernUpdateVelNeighborSearchCoherent( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. // - Clamp the speed change before putting the new speed in vel2 + + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < N) { + float maxSearchDistance = glm::max(glm::max(rule1Distance, rule2Distance), rule3Distance); + + glm::vec3 min = glm::vec3(floor(pos[index].x + abs(gridMin.x) - maxSearchDistance) * inverseCellWidth, + floor(pos[index].y + abs(gridMin.y) - maxSearchDistance) * inverseCellWidth, + floor(pos[index].z + abs(gridMin.z) - maxSearchDistance) * inverseCellWidth); + + glm::vec3 max = glm::vec3(floor(pos[index].x + abs(gridMin.x) + maxSearchDistance) * inverseCellWidth, + floor(pos[index].y + abs(gridMin.y) + maxSearchDistance) * inverseCellWidth, + floor(pos[index].z + abs(gridMin.z) + maxSearchDistance) * inverseCellWidth); + + glm::clamp(min, glm::vec3(0, 0, 0), glm::vec3(gridResolution, gridResolution, gridResolution)); + glm::clamp(max, glm::vec3(0, 0, 0), glm::vec3(gridResolution, gridResolution, gridResolution)); + + int count1 = 0; + int count3 = 0; + glm::vec3 perceivedCenter; + glm::vec3 perceivedVelocity; + glm::vec3 c; + + for (int i = min.x; i < max.x; i++) { + for (int j = min.y; j < max.y; j++) { + for (int k = min.z; k < max.z; k++) { + int currentGridIndex = gridIndex3Dto1D(i, j, k, gridResolution); + + int startIndex = gridCellStartIndices[currentGridIndex]; + int endIndex = 
gridCellEndIndices[currentGridIndex]; + + for (int gridIndex = startIndex; gridIndex < endIndex; gridIndex++) { + int particleIndex = gridIndex;// particleArrayIndices[gridIndex]; + + if (particleIndex != index) { + + float distance = glm::distance(pos[particleIndex], pos[index]); + + if (distance < rule1Distance) { + perceivedCenter += pos[particleIndex]; + count1++; + } + + if (distance < rule2Distance) { + c -= (pos[particleIndex] - pos[index]); + } + + if (distance < rule3Distance) { + perceivedVelocity += vel1[particleIndex]; + count3++; + } + } + } + } + } + } + + if (count1 > 0) { + perceivedCenter /= count1; + perceivedCenter = (perceivedCenter - pos[index]); + } + + if (count3 > 0) { + perceivedVelocity /= count3; + } + + glm::vec3 finalVel = vel1[index]; + + finalVel += perceivedCenter * rule1Scale; + finalVel += c * rule2Scale; + finalVel += perceivedVelocity * rule3Scale; + + if (glm::length(finalVel) > maxSpeed) { + finalVel = glm::normalize(finalVel) * maxSpeed; + } + + vel2[index] = finalVel; + } } /** @@ -349,6 +606,15 @@ __global__ void kernUpdateVelNeighborSearchCoherent( void Boids::stepSimulationNaive(float dt) { // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. 
// TODO-1.2 ping-pong the velocity buffers + dim3 boidSizeBlocks((numObjects + blockSize - 1) / blockSize); + + kernUpdateVelocityBruteForce << > > (numObjects, dev_pos, dev_vel1, dev_vel2); + + glm::vec3 *temp = dev_vel1; + dev_vel1 = dev_vel2; + dev_vel2 = temp; + + kernUpdatePos << < boidSizeBlocks, blockSize >> > (numObjects, dt, dev_pos, dev_vel1); } void Boids::stepSimulationScatteredGrid(float dt) { @@ -364,6 +630,37 @@ void Boids::stepSimulationScatteredGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed + + int boidSizeBlocks = (numObjects + blockSize - 1) / blockSize; + int gridSizeBlocks = (gridCellCount + blockSize - 1) / blockSize; + + kernComputeIndices << < boidSizeBlocks, blockSize >> >(numObjects, gridSideCount, gridMinimum, gridCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + + kernResetIntBuffer << < gridSizeBlocks, blockSize >> > (gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer << < gridSizeBlocks, blockSize >> > (gridCellCount, dev_gridCellEndIndices, -1); + + kernIdentifyCellStartEnd << < boidSizeBlocks, blockSize >> >(gridCellCount, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + + kernUpdateVelNeighborSearchScattered << < boidSizeBlocks, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2); + + glm::vec3 *temp = dev_vel1; + dev_vel1 = dev_vel2; + dev_vel2 = temp; + + kernUpdatePos << < boidSizeBlocks, blockSize >> > (numObjects, dt, dev_pos, dev_vel1); +} + +__global__ void kernSortVelocities(int N, int *particleArrayIndices, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2, glm::vec3 *sortedPos, glm::vec3 *sortedVel1, 
glm::vec3 *sortedVel2) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + + if (index < N) { + int grid = particleArrayIndices[index]; + + sortedPos[index] = pos[grid]; + sortedVel1[index] = vel1[grid]; + sortedVel2[index] = vel2[grid]; + } } void Boids::stepSimulationCoherentGrid(float dt) { @@ -382,6 +679,42 @@ void Boids::stepSimulationCoherentGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + + int boidSizeBlocks = (numObjects + blockSize - 1) / blockSize; + int gridSizeBlocks = (gridCellCount + blockSize - 1) / blockSize; + + kernComputeIndices << < boidSizeBlocks, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + + kernResetIntBuffer << < gridSizeBlocks, blockSize >> > (gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer << < gridSizeBlocks, blockSize >> > (gridCellCount, dev_gridCellEndIndices, -1); + + kernIdentifyCellStartEnd << < boidSizeBlocks, blockSize >> > (gridCellCount, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + + kernSortVelocities << > > (numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2, sorted_Pos, sorted_Vel1, sorted_Vel2); + + glm::vec3 *temp = dev_pos; + dev_pos = sorted_Pos; + sorted_Pos = temp; + + temp = dev_vel1; + dev_vel1 = sorted_Vel1; + sorted_Vel1 = temp; + + temp = dev_vel2; + dev_vel2 = sorted_Vel2; + sorted_Vel2 = temp; + + kernUpdateVelNeighborSearchScattered << < boidSizeBlocks, blockSize >> > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2); + + temp = dev_vel1; + dev_vel1 = dev_vel2; + dev_vel2 = 
temp; + + kernUpdatePos << < boidSizeBlocks, blockSize >> > (numObjects, dt, dev_pos, dev_vel1); + + + } void Boids::endSimulation() { diff --git a/src/main.cpp b/src/main.cpp index b82c8c6..46422e0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,11 +14,11 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation -const int N_FOR_VIS = 5000; +const int N_FOR_VIS = 10000; const float DT = 0.2f; /**