diff --git a/README.md b/README.md
index d63a6a1..738295f 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,58 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Siyu Zheng
+  * [LinkedIn](https://www.linkedin.com/in/siyu-zheng-b3b38aa8/)
+* Tested on: Windows 10, i7-8750 @ 2.20GHz 16GB, GTX 1060 6GB (Personal Laptop)
 
-### (TODO: Your README)
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+## Result
+
+![](images/coh20000b.gif)
+
+Simulation using the coherent uniform grid with 20,000 boids and a block size of 128.
+
+## Performance Analysis
+
+Framerate with increasing number of boids for the naive, scattered uniform grid, and coherent uniform grid implementations (without and with visualization):
+![](images/boidsnumberwithoutvisulize.png)
+![](images/boidsnumberwithvisulize.png)
+Framerate with increasing block size:
+![](images/blocksize.png)
+
+* For each implementation, how does changing the number of boids affect
+performance? Why do you think this is?
+
+As the number of boids increases, the framerates of the naive, scattered, and coherent implementations all decrease, because each boid has more neighbors to evaluate. The naive approach has the lowest FPS, since every boid must check every other boid. The uniform grid improves performance significantly, and the coherent uniform grid performs best because its semi-coherent memory access is cheaper than the scattered grid's indirect lookups.
+
+I also observed sharp framerate changes at certain boid counts: from 1280 to 1281 boids the framerate drops from roughly 1800 to 1100 FPS, and from 5461 to 5462 boids it climbs back from roughly 1100 to 1800 FPS.
+
+* For each implementation, how does changing the block count and block size
+affect performance? Why do you think this is?
+
+As the block size varies from 16 to 512 (multiples of 32), performance changes very little. With a block size of 16, which is smaller than the warp size of 32, the brute-force implementation has its lowest framerate, most likely because each warp is only partially occupied.
+
+* For the coherent uniform grid: did you experience any performance improvements
+with the more coherent uniform grid? Was this the outcome you expected?
+Why or why not?
+
+Yes, the coherent uniform grid improved performance considerably, which is what I expected. The scattered grid dereferences particleArrayIndices for every neighbor, so position and velocity reads land at effectively random addresses and each one is an expensive global-memory access. After reshuffling, the boids in each cell are contiguous in memory, so the neighbor loop reads contiguous addresses and the accesses coalesce much better.
+
+* Did changing cell width and checking 27 vs 8 neighboring cells affect performance?
+Why or why not? Be careful: it is insufficient (and possibly incorrect) to say
+that 27-cell is slower simply because there are more cells to check!
+
+Yes. In my implementation the cell width is the same in both cases, so the 27-cell search scans more than three times the cell volume of the 8-cell search: it reads more start/end indices and more candidate boids per boid, which means more global-memory traffic and more cache misses.
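+
+A rough sketch of the difference (the names below are illustrative only, not the project's kernel code): with the cell width left at twice the largest rule distance, the 8-cell path only looks at the half of the 2x2x2 neighborhood the boid actually sits in, while the 27-cell path scans the full 3x3x3 block around its cell.
+
+```cpp
+// Illustrative sketch only: pick the range of neighboring cells to scan,
+// given the boid's fractional offset inside its own cell (0..1 per axis).
+#include <cstdio>
+
+void neighborCellRange(float offX, float offY, float offZ, bool use8Cells,
+                       int lo[3], int hi[3]) {
+  float off[3] = { offX, offY, offZ };
+  for (int axis = 0; axis < 3; axis++) {
+    lo[axis] = -1;          // 27-cell default: one cell in every direction
+    hi[axis] = 1;
+    if (use8Cells) {        // 8-cell case: only the half the boid sits in
+      if (off[axis] < 0.5f) { hi[axis] = 0; } else { lo[axis] = 0; }
+    }
+  }
+}
+
+int main() {
+  int lo[3], hi[3];
+  neighborCellRange(0.3f, 0.7f, 0.1f, true, lo, hi);
+  int cells = (hi[0] - lo[0] + 1) * (hi[1] - lo[1] + 1) * (hi[2] - lo[2] + 1);
+  std::printf("cells scanned: %d\n", cells);  // 8 here; 27 with use8Cells = false
+  return 0;
+}
+```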
+
+Framerate (FPS) with 8 vs. 27 neighboring cells:
+
+| 10000 boids | 8 neighbors | 27 neighbors |
+| ----------- | :---------: | -----------: |
+| scattered   | 1346.6      | 1229.9       |
+| coherent    | 1639.7      | 1534.2       |
+
+| 20000 boids | 8 neighbors | 27 neighbors |
+| ----------- | :---------: | -----------: |
+| scattered   | 783.8       | 585.3        |
+| coherent    | 1486.5      | 1156.6       |
+
+In my experiments, checking 8 neighboring cells consistently outperformed checking 27.
\ No newline at end of file
diff --git a/images/blocksize.png b/images/blocksize.png
new file mode 100644
index 0000000..c420f06
Binary files /dev/null and b/images/blocksize.png differ
diff --git a/images/boidsnumberwithoutvisulize.png b/images/boidsnumberwithoutvisulize.png
new file mode 100644
index 0000000..632164d
Binary files /dev/null and b/images/boidsnumberwithoutvisulize.png differ
diff --git a/images/boidsnumberwithvisulize.png b/images/boidsnumberwithvisulize.png
new file mode 100644
index 0000000..3425e7f
Binary files /dev/null and b/images/boidsnumberwithvisulize.png differ
diff --git a/images/coh20000b.gif b/images/coh20000b.gif
new file mode 100644
index 0000000..04b19c5
Binary files /dev/null and b/images/coh20000b.gif differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fdd636d..750f0cb 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,5 +10,5 @@ set(SOURCE_FILES
 cuda_add_library(src
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_30
     )
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..ac366f1 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -17,6 +17,8 @@
 #define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
 
+#define neighbors8 1
+
 /**
 * Check for CUDA errors; print and exit if there was a problem.
 */
@@ -86,6 +88,9 @@ int *dev_gridCellEndIndices;   // to this cell?
 
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_shuffledPos;
+glm::vec3 *dev_shuffledVel;
+
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
 int gridCellCount;
@@ -169,6 +174,24 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
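+  // The scattered grid needs a per-boid grid-cell index, a per-boid pointer
+  // into pos/vel, and per-cell start/end indices; the coherent grid also
+  // needs reshuffled copies of pos and vel so each cell's boids end up
+  // contiguous in memory.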
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  cudaMalloc((void**)&dev_shuffledPos, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_shuffledPos failed!");
+
+  cudaMalloc((void**)&dev_shuffledVel, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_shuffledVel failed!");
+
   cudaDeviceSynchronize();
 }
 
/**
@@ -233,7 +256,47 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 thisBoidPos = pos[iSelf];
+  glm::vec3 thisBoidVel = vel[iSelf];
+  glm::vec3 center = glm::vec3(0.0f);     // rule 1: perceived center of mass
+  glm::vec3 separate = glm::vec3(0.0f);   // rule 2: separation
+  glm::vec3 alignment = glm::vec3(0.0f);  // rule 3: velocity matching
+  int neighborCountRule1 = 0;
+  int neighborCountRule3 = 0;
+
+  for (int j = 0; j < N; j++) {
+    if (j == iSelf) {
+      continue;
+    }
+    glm::vec3 thatBoidPos = pos[j];
+    glm::vec3 thatBoidVel = vel[j];
+    float dist = glm::distance(thisBoidPos, thatBoidPos);
+    if (dist < rule1Distance) {
+      center += thatBoidPos;
+      neighborCountRule1++;
+    }
+    if (dist < rule2Distance) {
+      separate -= thatBoidPos - thisBoidPos;
+    }
+    if (dist < rule3Distance) {
+      alignment += thatBoidVel;
+      neighborCountRule3++;
+    }
+  }
+
+  if (neighborCountRule1 > 0) {
+    center /= neighborCountRule1;
+    thisBoidVel += (center - thisBoidPos) * rule1Scale;
+  }
+
+  if (neighborCountRule3 > 0) {
+    alignment /= neighborCountRule3;
+    thisBoidVel += alignment * rule3Scale;
+  }
+
+  thisBoidVel += separate * rule2Scale;
+
+  return thisBoidVel;
 }
 
/**
@@ -243,8 +306,19 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
  glm::vec3 *vel1, glm::vec3 *vel2) {
   // Compute a new velocity based on pos and vel1
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+  // computeVelocityChange already returns the boid's full updated velocity
+  // (vel1[index] plus the three rule contributions), so don't add vel1 again.
+  glm::vec3 vel2New = computeVelocityChange(N, index, pos, vel1);
   // Clamp the speed
+  vel2New = glm::length(vel2New) > maxSpeed ? maxSpeed * glm::normalize(vel2New) : vel2New;
   // Record the new velocity into vel2. Question: why NOT vel1?
+  vel2[index] = vel2New;
 }
 
/**
@@ -289,6 +363,13 @@ __global__ void kernComputeIndices(int N, int gridResolution,
   // - Label each boid with the index of its grid cell.
   // - Set up a parallel array of integer indices as pointers to the actual
   //   boid data in pos and vel1/vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  indices[index] = index;
+  glm::vec3 gridIndex = glm::floor((pos[index] - gridMin) * inverseCellWidth);
+  gridIndices[index] = gridIndex3Dto1D(gridIndex.x, gridIndex.y, gridIndex.z, gridResolution);
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +387,23 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  int gridIndex = particleGridIndices[index];
+  if (index == 0) {
+    gridCellStartIndices[gridIndex] = 0;
+    return;
+  }
+  else if (index == N - 1) {
+    gridCellEndIndices[gridIndex] = N;
+  }
+  if (particleGridIndices[index] != particleGridIndices[index - 1]) {
+    gridCellStartIndices[gridIndex] = index;
+    // The end index is exclusive, so the previous cell ends at this boid.
+    gridCellEndIndices[particleGridIndices[index - 1]] = index;
+  }
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -316,12 +414,117 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
   // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
   // the number of boids that need to be checked.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  // - Identify the grid cell that this particle is in
+  glm::vec3 curPos = pos[index];
+  glm::vec3 cellPos = (curPos - gridMin) * inverseCellWidth;
+
+  // - Identify which cells may contain neighbors. This isn't always 8.
+  // - For each cell, read the start/end indices in the boid pointer array.
+  // - Access each boid in the cell and compute velocity change from
+  //   the boids rules, if this boid is within the neighborhood distance.
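+  // With cellWidth = 2 * max(rule distances), the neighborhood sphere can
+  // only reach into the adjacent half of each axis, so a 2x2x2 block of cells
+  // chosen by which half of its own cell the boid sits in is enough;
+  // otherwise the full 3x3x3 block is scanned.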
+  glm::vec3 thisBoidPos = pos[index];
+  glm::vec3 thisBoidVel = vel1[index];
+  glm::vec3 center = glm::vec3(0.0f);     // rule 1: perceived center of mass
+  glm::vec3 separate = glm::vec3(0.0f);   // rule 2: separation
+  glm::vec3 alignment = glm::vec3(0.0f);  // rule 3: velocity matching
+  int neighborCountRule1 = 0;
+  int neighborCountRule3 = 0;
+  int xStart = -1, xEnd = 1;
+  int yStart = -1, yEnd = 1;
+  int zStart = -1, zEnd = 1;
+
+#if neighbors8
+  // Only check the 8 cells on the side of the current cell the boid is in.
+  glm::vec3 cellPosFloor = cellPos - glm::floor(cellPos);
+  if (cellPosFloor.x < 0.5f) {
+    xEnd = 0;
+  } else {
+    xStart = 0;
+  }
+  if (cellPosFloor.y < 0.5f) {
+    yEnd = 0;
+  } else {
+    yStart = 0;
+  }
+  if (cellPosFloor.z < 0.5f) {
+    zEnd = 0;
+  } else {
+    zStart = 0;
+  }
+#endif
+  for (int gridz = cellPos.z + zStart; gridz <= cellPos.z + zEnd; gridz++) {
+    for (int gridy = cellPos.y + yStart; gridy <= cellPos.y + yEnd; gridy++) {
+      for (int gridx = cellPos.x + xStart; gridx <= cellPos.x + xEnd; gridx++) {
+        // Clamp the neighbor cell coordinates to the grid bounds.
+        int x = imin(imax(gridx, 0), gridResolution - 1);
+        int y = imin(imax(gridy, 0), gridResolution - 1);
+        int z = imin(imax(gridz, 0), gridResolution - 1);
+        int neighborGridInd = gridIndex3Dto1D(x, y, z, gridResolution);
+        int startInd = gridCellStartIndices[neighborGridInd];
+        int endInd = gridCellEndIndices[neighborGridInd];
+        for (int j = startInd; j < endInd; j++) {
+          int i = particleArrayIndices[j];
+          if (i == index) {
+            continue;
+          }
+          glm::vec3 thatBoidPos = pos[i];
+          glm::vec3 thatBoidVel = vel1[i];
+          float dist = glm::distance(thisBoidPos, thatBoidPos);
+          if (dist < rule1Distance) {
+            center += thatBoidPos;
+            neighborCountRule1++;
+          }
+          if (dist < rule2Distance) {
+            separate -= thatBoidPos - thisBoidPos;
+          }
+          if (dist < rule3Distance) {
+            alignment += thatBoidVel;
+            neighborCountRule3++;
+          }
+        }
+      }
+    }
+  }
+
+  if (neighborCountRule1 > 0) {
+    center /= neighborCountRule1;
+    thisBoidVel += (center - thisBoidPos) * rule1Scale;
+  }
+  if (neighborCountRule3 > 0) {
+    alignment /= neighborCountRule3;
+    thisBoidVel += alignment * rule3Scale;
+  }
+  thisBoidVel += separate * rule2Scale;
+  // - Clamp the speed change before putting the new speed in vel2
+  // thisBoidVel already includes vel1[index], so clamp and store it directly.
+  glm::vec3 vel2New = glm::length(thisBoidVel) > maxSpeed ? maxSpeed * glm::normalize(thisBoidVel) : thisBoidVel;
+  vel2[index] = vel2New;
+}
+
+__global__ void kernReshufflePosVel(int N, glm::vec3 *pos, glm::vec3 *vel,
+  glm::vec3 *shuffledPos, glm::vec3 *shuffledVel, int *particleArrayIndices) {
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  // Gather pos/vel into cell-sorted order so each cell's boids are contiguous.
+  int sortedInd = particleArrayIndices[index];
+  shuffledPos[index] = pos[sortedInd];
+  shuffledVel[index] = vel[sortedInd];
 }
 
__global__ void kernUpdateVelNeighborSearchCoherent(
@@ -333,14 +536,107 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
   // except with one less level of indirection.
   // This should expect gridCellStartIndices and gridCellEndIndices to refer
   // directly to pos and vel1.
+
   // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // DIFFERENCE: For best results, consider what order the cells should be
-  //   checked in to maximize the memory benefits of reordering the boids data.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N) {
+    return;
+  }
+  glm::vec3 curPos = pos[index];
+  glm::vec3 cellPos = (curPos - gridMin) * inverseCellWidth;
+
+  // - Identify which cells may contain neighbors. This isn't always 8.
+  // - For each cell, read the start/end indices in the boid pointer array.
+  // DIFFERENCE: For best results, consider what order the cells should be
+  //   checked in to maximize the memory benefits of reordering the boids data.
+  // - Access each boid in the cell and compute velocity change from
+  //   the boids rules, if this boid is within the neighborhood distance.
+  glm::vec3 thisBoidPos = pos[index];
+  glm::vec3 thisBoidVel = vel1[index];
+  glm::vec3 center = glm::vec3(0.0f);     // rule 1: perceived center of mass
+  glm::vec3 separate = glm::vec3(0.0f);   // rule 2: separation
+  glm::vec3 alignment = glm::vec3(0.0f);  // rule 3: velocity matching
+  int neighborCountRule1 = 0;
+  int neighborCountRule3 = 0;
+
+  int xStart = -1, xEnd = 1;
+  int yStart = -1, yEnd = 1;
+  int zStart = -1, zEnd = 1;
+
+#if neighbors8
+  // Only check the 8 cells on the side of the current cell the boid is in.
+  glm::vec3 cellPosFloor = cellPos - glm::floor(cellPos);
+  if (cellPosFloor.x < 0.5f) {
+    xEnd = 0;
+  } else {
+    xStart = 0;
+  }
+  if (cellPosFloor.y < 0.5f) {
+    yEnd = 0;
+  } else {
+    yStart = 0;
+  }
+  if (cellPosFloor.z < 0.5f) {
+    zEnd = 0;
+  } else {
+    zStart = 0;
+  }
+#endif
+  for (int gridz = cellPos.z + zStart; gridz <= cellPos.z + zEnd; gridz++) {
+    for (int gridy = cellPos.y + yStart; gridy <= cellPos.y + yEnd; gridy++) {
+      for (int gridx = cellPos.x + xStart; gridx <= cellPos.x + xEnd; gridx++) {
+        // Clamp the neighbor cell coordinates to the grid bounds.
+        int x = imin(imax(gridx, 0), gridResolution - 1);
+        int y = imin(imax(gridy, 0), gridResolution - 1);
+        int z = imin(imax(gridz, 0), gridResolution - 1);
+        int neighborGridInd = gridIndex3Dto1D(x, y, z, gridResolution);
+        int startInd = gridCellStartIndices[neighborGridInd];
+        int endInd = gridCellEndIndices[neighborGridInd];
+        // pos and vel1 are already reshuffled, so this cell's boids are contiguous.
+        for (int j = startInd; j < endInd; j++) {
+          if (j == index) {
+            continue;
+          }
+          glm::vec3 thatBoidPos = pos[j];
+          glm::vec3 thatBoidVel = vel1[j];
+          float dist = glm::distance(thisBoidPos, thatBoidPos);
+          if (dist < rule1Distance) {
+            center += thatBoidPos;
+            neighborCountRule1++;
+          }
+          if (dist < rule2Distance) {
+            separate -= thatBoidPos - thisBoidPos;
+          }
+          if (dist < rule3Distance) {
+            alignment += thatBoidVel;
+            neighborCountRule3++;
+          }
+        }
+      }
+    }
+  }
+  if (neighborCountRule1 > 0) {
+    center /= neighborCountRule1;
+    thisBoidVel += (center - thisBoidPos) * rule1Scale;
+  }
+  if (neighborCountRule3 > 0) {
+    alignment /= neighborCountRule3;
+    thisBoidVel += alignment * rule3Scale;
+  }
+  thisBoidVel += separate * rule2Scale;
+  // - Clamp the speed change before putting the new speed in vel2
+  // thisBoidVel already includes vel1[index], so clamp and store it directly.
+  glm::vec3 vel2New = glm::length(thisBoidVel) > maxSpeed ? maxSpeed * glm::normalize(thisBoidVel) : thisBoidVel;
+  vel2[index] = vel2New;
 }
 
 /**
@@ -348,7 +644,14 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 */
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
-  // TODO-1.2 ping-pong the velocity buffers
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!");
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+  // TODO-1.2 ping-pong the velocity buffers
+  std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +667,32 @@ void Boids::stepSimulationScatteredGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  dim3 fullBlocksPerCell((gridCellCount + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+  checkCUDAErrorWithLine("kernComputeIndices failed!");
+
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  // Mark every cell as empty (-1) before rebuilding start/end indices so that
+  // cells with no boids this frame don't keep stale ranges from the last frame.
+  kernResetIntBuffer<<<fullBlocksPerCell, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer start array failed!");
+  kernResetIntBuffer<<<fullBlocksPerCell, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer end array failed!");
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+  checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!");
+
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+  std::swap(dev_vel1, dev_vel2);
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -382,6 +711,34 @@ void Boids::stepSimulationCoherentGrid(float dt) {
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
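+  // Ping-pong note: the velocity kernel reads the reshuffled dev_shuffledPos /
+  // dev_shuffledVel and writes new velocities into dev_vel1, positions are then
+  // integrated on dev_shuffledPos, and finally dev_pos and dev_shuffledPos are
+  // swapped so dev_pos and dev_vel1 stay in the same cell-sorted order for the
+  // next frame.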
+
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  dim3 fullBlocksPerCell((gridCellCount + blockSize - 1) / blockSize);
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+  checkCUDAErrorWithLine("kernComputeIndices failed!");
+
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  kernResetIntBuffer<<<fullBlocksPerCell, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer start array failed!");
+  kernResetIntBuffer<<<fullBlocksPerCell, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+  checkCUDAErrorWithLine("kernResetIntBuffer end array failed!");
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+  checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!");
+
+  kernReshufflePosVel<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_shuffledPos, dev_shuffledVel, dev_particleArrayIndices);
+  checkCUDAErrorWithLine("kernReshufflePosVel failed!");
+
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_shuffledPos, dev_shuffledVel, dev_vel1);
+  checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent failed!");
+
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_shuffledPos, dev_vel1);
+  checkCUDAErrorWithLine("kernUpdatePos failed!");
+
+  std::swap(dev_pos, dev_shuffledPos);
 }
 
 void Boids::endSimulation() {
@@ -390,6 +747,12 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+  cudaFree(dev_shuffledPos);
+  cudaFree(dev_shuffledVel);
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..698b942 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,12 +13,12 @@
 // ================
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
-#define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define VISUALIZE 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 10000;
 const float DT = 0.2f;
 
 /**