diff --git a/CMakeLists.txt b/CMakeLists.txt
index f654c9e..cea8daf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,7 +18,21 @@ set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON)
 set(CUDA_SEPARABLE_COMPILATION ON)
 
 if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-    set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+    set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/osx")
+elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+    set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/linux" "/usr/lib64")
+elseif(WIN32)
+    if(${MSVC_VERSION} MATCHES "1915")
+        set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win/vc2017")
+    elseif(${MSVC_VERSION} MATCHES "1900")
+        set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win/vc2015")
+    elseif(${MSVC_VERSION} MATCHES "1800")
+        set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win/vc2013")
+    elseif(${MSVC_VERSION} MATCHES "1700")
+        set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win/vc2012")
+    else()
+        MESSAGE("Error: unsupported MSVC_VERSION: " ${MSVC_VERSION})
+    endif()
 endif()
 
 include_directories(.)
diff --git a/Capture1.JPG b/Capture1.JPG
new file mode 100644
index 0000000..1613567
Binary files /dev/null and b/Capture1.JPG differ
diff --git a/Capture2.JPG b/Capture2.JPG
new file mode 100644
index 0000000..e76479a
Binary files /dev/null and b/Capture2.JPG differ
diff --git a/README.md b/README.md
index 0e38ddb..a895fda 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,193 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Ishan Ranade
+* Tested on personal computer: Gigabyte Aero 14, Windows 10, i7-7700HQ, GTX 1060
 
-### (TODO: Your README)
+# Scan and Compaction
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+## Performance Graphs
+![](Capture1.JPG)
+
+![](Capture2.JPG)
+
+## Test Results
+
+#### Array size = 64
+```
+****************
+** SCAN TESTS **
+****************
+    [ 26 35 7 4 10 9 23 19 37 15 16 37 41 ... 23 0 ]
+==== cpu scan, power-of-two ====
+    elapsed time: 0.006929ms (std::chrono Measured)
+    [ 0 26 61 68 72 82 91 114 133 170 185 201 238 ... 6108 6131 ]
+==== cpu scan, non-power-of-two ====
+    elapsed time: 0.00474ms (std::chrono Measured)
+    [ 0 26 61 68 72 82 91 114 133 170 185 201 238 ... 6069 6090 ]
+    passed
+==== naive scan, power-of-two ====
+    elapsed time: 0.676864ms (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+    elapsed time: 0.342016ms (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+    elapsed time: 1.35987ms (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+    elapsed time: 1.09571ms (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+    elapsed time: 16.9236ms (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+    elapsed time: 3.34643ms (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 3 1 1 1 2 0 0 2 3 3 0 1 0 ... 1 0 ]
+==== cpu compact without scan, power-of-two ====
+    elapsed time: 0.001458ms (std::chrono Measured)
+    [ 3 1 1 1 2 2 3 3 1 2 3 3 1 ... 2 1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    elapsed time: 0.018598ms (std::chrono Measured)
+    [ 3 1 1 1 2 2 3 3 1 2 3 3 1 ... 3 2 ]
+    passed
+==== cpu compact with scan ====
+    elapsed time: 0.006564ms (std::chrono Measured)
+    [ 3 1 1 1 2 2 3 3 1 2 3 3 1 ... 2 1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+    elapsed time: 1.24522ms (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+    elapsed time: 1.38035ms (CUDA Measured)
+    passed
+Press any key to continue . . .
+
+```
+
+#### Array size = 65536
+```
+****************
+** SCAN TESTS **
+****************
+    [ 4 12 16 28 0 38 0 11 2 40 25 0 28 ... 42 0 ]
+==== cpu scan, power-of-two ====
+    elapsed time: 0.264752ms (std::chrono Measured)
+    [ 0 4 16 32 60 60 98 98 109 111 151 176 176 ... 1606322 1606364 ]
+==== cpu scan, non-power-of-two ====
+    elapsed time: 0.175772ms (std::chrono Measured)
+    [ 0 4 16 32 60 60 98 98 109 111 151 176 176 ... 1606246 1606277 ]
+    passed
+==== naive scan, power-of-two ====
+    elapsed time: 1.51245ms (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+    elapsed time: 1.47968ms (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+    elapsed time: 5.45789ms (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+    elapsed time: 5.55315ms (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+    elapsed time: 27.2742ms (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+    elapsed time: 11.7524ms (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 2 0 2 2 0 2 0 1 0 0 3 2 0 ... 0 0 ]
+==== cpu compact without scan, power-of-two ====
+    elapsed time: 0.396034ms (std::chrono Measured)
+    [ 2 2 2 2 1 3 2 1 3 1 3 3 3 ... 2 3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    elapsed time: 0.520386ms (std::chrono Measured)
+    [ 2 2 2 2 1 3 2 1 3 1 3 3 3 ... 3 2 ]
+    passed
+==== cpu compact with scan ====
+    elapsed time: 1.3048ms (std::chrono Measured)
+    [ 2 2 2 2 1 3 2 1 3 1 3 3 3 ... 2 3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+    elapsed time: 9.05011ms (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+    elapsed time: 10.4632ms (CUDA Measured)
+    passed
+Press any key to continue . . .
+```
+
+#### Array size = 33554432
+```
+****************
+** SCAN TESTS **
+****************
+    [ 34 21 39 39 17 7 27 41 25 44 5 38 27 ... 44 0 ]
+==== cpu scan, power-of-two ====
+    elapsed time: 91.0629ms (std::chrono Measured)
+    [ 0 34 55 94 133 150 157 184 225 250 294 299 337 ... 821704663 821704707 ]
+==== cpu scan, non-power-of-two ====
+    elapsed time: 96.8149ms (std::chrono Measured)
+    [ 0 34 55 94 133 150 157 184 225 250 294 299 337 ... 821704604 821704613 ]
+    passed
+==== naive scan, power-of-two ====
+    elapsed time: 142.816ms (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+    elapsed time: 130.053ms (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+    elapsed time: 606.905ms (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+    elapsed time: 598.48ms (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+    elapsed time: 1059.55ms (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+    elapsed time: 1052.12ms (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 0 2 2 1 0 1 3 1 1 2 3 2 1 ... 2 0 ]
+==== cpu compact without scan, power-of-two ====
+    elapsed time: 169.206ms (std::chrono Measured)
+    [ 2 2 1 1 3 1 1 2 3 2 1 1 3 ... 1 2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    elapsed time: 169.446ms (std::chrono Measured)
+    [ 2 2 1 1 3 1 1 2 3 2 1 1 3 ... 3 1 ]
+    passed
+==== cpu compact with scan ====
+    elapsed time: 481.232ms (std::chrono Measured)
+    [ 2 2 1 1 3 1 1 2 3 2 1 1 3 ... 1 2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+    elapsed time: 2345.71ms (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+    elapsed time: 4260.37ms (CUDA Measured)
+    passed
+Press any key to continue . . .
+```
+
+## Discussion
+
+I believe one of the biggest performance hits for my work-efficient implementation is the bank conflicts that occur, which drastically reduce its efficiency. Another cost is the arithmetic I perform in my kernels: I tend to repeat some calculations instead of saving intermediate values for later reuse. A sketch of the kind of conflict-free shared-memory indexing that would address the first problem is shown below.
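+
+The sketch below is adapted from the classic single-block scan in GPU Gems 3 and is illustrative only, not code used in this project; `NUM_BANKS`, `LOG_NUM_BANKS`, `CONFLICT_FREE_OFFSET`, and the `blockScanShared` kernel are hypothetical names for this example. It scans one power-of-two-sized block entirely in shared memory, padding every index so that consecutive threads fall into different banks.
+
+```
+// Illustrative only: exclusive Blelloch scan of one power-of-two block of n
+// elements, staged in shared memory with padded indices to avoid bank conflicts.
+// Launch as: blockScanShared<<<1, n / 2, (n + (n >> LOG_NUM_BANKS)) * sizeof(int)>>>(n, out, in);
+#include <cuda_runtime.h>
+
+#define NUM_BANKS 32
+#define LOG_NUM_BANKS 5
+#define CONFLICT_FREE_OFFSET(i) ((i) >> LOG_NUM_BANKS)
+
+__global__ void blockScanShared(int n, int *odata, const int *idata) {
+    extern __shared__ int temp[];   // padded scratch space
+    int tid = threadIdx.x;
+    int ai = tid;                   // each thread loads two elements
+    int bi = tid + n / 2;
+
+    temp[ai + CONFLICT_FREE_OFFSET(ai)] = idata[ai];
+    temp[bi + CONFLICT_FREE_OFFSET(bi)] = idata[bi];
+
+    // Up-sweep (reduce) phase.
+    int offset = 1;
+    for (int d = n >> 1; d > 0; d >>= 1) {
+        __syncthreads();
+        if (tid < d) {
+            int a = offset * (2 * tid + 1) - 1;
+            int b = offset * (2 * tid + 2) - 1;
+            a += CONFLICT_FREE_OFFSET(a);
+            b += CONFLICT_FREE_OFFSET(b);
+            temp[b] += temp[a];
+        }
+        offset *= 2;
+    }
+
+    // Zero the root, then down-sweep to produce the exclusive scan.
+    if (tid == 0) {
+        temp[n - 1 + CONFLICT_FREE_OFFSET(n - 1)] = 0;
+    }
+    for (int d = 1; d < n; d *= 2) {
+        offset >>= 1;
+        __syncthreads();
+        if (tid < d) {
+            int a = offset * (2 * tid + 1) - 1;
+            int b = offset * (2 * tid + 2) - 1;
+            a += CONFLICT_FREE_OFFSET(a);
+            b += CONFLICT_FREE_OFFSET(b);
+            int t = temp[a];
+            temp[a] = temp[b];
+            temp[b] += t;
+        }
+    }
+    __syncthreads();
+
+    odata[ai] = temp[ai + CONFLICT_FREE_OFFSET(ai)];
+    odata[bi] = temp[bi + CONFLICT_FREE_OFFSET(bi)];
+}
+```
+
+This assumes the whole input fits in one block; a full solution would scan per-block sums recursively and add them back into each block's results.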
+
+The thrust implementation seemed to take an extremely long time to finish. This could be because thrust needs some time to warm up and may have incurred many cache misses the first time it was called; a small benchmarking harness that separates that warm-up call from the timed call is sketched below. In general, my CPU version performed the best of all of these implementations. I believe this is because I did not properly use shared memory, avoid bank conflicts, or keep my kernels lightweight enough to fully utilize the power of the GPU. This assignment was a big eye-opener in how to write better GPU code and what to look for when optimizing kernels.
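+
+A small harness along those lines (illustrative only; the array size and the CUDA-event timing here are assumptions of this sketch, not the timer used by the project's test program):
+
+```
+// Illustrative only: run one untimed warm-up call so thrust's lazy
+// initialization and temporary allocations are excluded, then time a
+// second call with CUDA events.
+#include <cstdio>
+#include <cuda_runtime.h>
+#include <thrust/device_vector.h>
+#include <thrust/scan.h>
+
+int main() {
+    const int n = 1 << 24;
+    thrust::device_vector<int> in(n, 1);
+    thrust::device_vector<int> out(n);
+
+    // Warm-up call: pays the one-time startup cost.
+    thrust::exclusive_scan(in.begin(), in.end(), out.begin());
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    cudaEventRecord(start);
+    thrust::exclusive_scan(in.begin(), in.end(), out.begin());
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+
+    float ms = 0.0f;
+    cudaEventElapsedTime(&ms, start, stop);
+    printf("thrust::exclusive_scan, steady state: %f ms\n", ms);
+
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+    return 0;
+}
+```
+
+Discarding the first call removes thrust's one-time setup cost from the measurement, which would make the comparison against the naive and work-efficient kernels fairer.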
diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
index 46337ab..716f6be 100644
--- a/src/testing_helpers.hpp
+++ b/src/testing_helpers.hpp
@@ -52,9 +52,17 @@ void onesArray(int n, int *a) {
 
 void genArray(int n, int *a, int maxval) {
     srand(time(nullptr));
+    /*if (n == 7) {
+        for (int i = 0; i < n; i++) {
+            a[i] = i;
+        }
+        return;
+    }*/
+
     for (int i = 0; i < n; i++) {
         a[i] = rand() % maxval;
     }
+
 }
 
 void printArray(int n, int *a, bool abridged = false) {
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..48e2f35 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_60
     )
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 05ce667..951d3f8 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -20,6 +20,10 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            odata[0] = 0;
+            for (int i = 1; i < n; ++i) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
             timer().endCpuTimer();
         }
 
@@ -30,9 +34,16 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
+            int index = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[index] = idata[i];
+                    index++;
+                }
+            }
             // TODO
             timer().endCpuTimer();
-            return -1;
+            return index;
         }
 
         /**
@@ -43,8 +54,35 @@ namespace StreamCompaction {
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int *temp = (int*) malloc(n * sizeof(int));
+
+            int finalSize = 0;
+            // First go through and put keep/discard flags (1s and 0s) in odata
+            for (int i = 0; i < n; ++i) {
+                if (idata[i] == 0) {
+                    odata[i] = 0;
+                }
+                else {
+                    odata[i] = 1;
+                    finalSize++;
+                }
+            }
+
+            // Now run a scan on odata and save results in temp
+            temp[0] = 0;
+            for (int i = 1; i < n; ++i) {
+                temp[i] = temp[i - 1] + odata[i - 1];
+            }
+
+            // Now go through temp and save final results in odata
+            for (int i = 0; i < n; ++i) {
+                if (odata[i] != 0) {
+                    odata[temp[i]] = idata[i];
+                }
+            }
+
             timer().endCpuTimer();
-            return -1;
+            return finalSize;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 36c5ef2..3426959 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,6 +2,11 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
+#include
+#include
+#include
+#include
+
 
 namespace StreamCompaction {
     namespace Efficient {
@@ -12,13 +17,146 @@ namespace StreamCompaction {
             return timer;
         }
 
+        __global__ void kernUpsweep(int n, int power, int *array) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index < n) {
+                if (index % (2 * power) == 0) {
+                    array[index + (2 * power) - 1] = array[index + power - 1] + array[index + (2 * power) - 1];
+                }
+            }
+        }
+
+        __global__ void kernDownsweep(int n, int power, int *array) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index < n) {
+                if (index % (2 * power) == 0) {
+                    int t = array[index + power - 1];
+                    array[index + power - 1] = array[index + (2 * power) - 1];
+                    array[index + (2 * power) - 1] = t + array[index + (2 * power) - 1];
+                }
+            }
+        }
+
+        __global__ void kernSetZero(int n, int *array) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index < n) {
+                if (index == n - 1) {
+                    array[index] = 0;
+                }
+            }
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int *temp;
+
+            // Round n up to the next power of two so the sweep tree is complete
+            int size = 1;
+            while (size < n) {
+                size *= 2;
+            }
+
+            cudaMalloc((void**)&temp, size * sizeof(int));
+            cudaDeviceSynchronize();
+
+            cudaMemcpy(temp, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            cudaDeviceSynchronize();
+
             timer().startGpuTimer();
+
+            int blockSize = 256;
+            int blocks = (size + blockSize - 1) / blockSize;
+
             // TODO
+            for (int d = 0; d < ilog2ceil(size); ++d) {
+                kernUpsweep<<<blocks, blockSize>>>(size, pow(2, d), temp);
+                cudaDeviceSynchronize();
+            }
+
+            kernSetZero<<<blocks, blockSize>>>(size, temp);
+
+            for (int d = ilog2ceil(size) - 1; d >= 0; --d) {
+                kernDownsweep<<<blocks, blockSize>>>(size, pow(2, d), temp);
+                cudaDeviceSynchronize();
+            }
+
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, temp, n * sizeof(int), cudaMemcpyDeviceToHost);
         }
 
+        // Same as scan(), but without the GPU timer so it can be called from
+        // inside compact() while compact()'s own timer is running.
+        void scanEfficient(int n, int *odata, const int *idata) {
+            int *temp;
+
+            int size = 1;
+            while (size < n) {
+                size *= 2;
+            }
+
+            cudaMalloc((void**)&temp, size * sizeof(int));
+            cudaDeviceSynchronize();
+
+            cudaMemcpy(temp, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            cudaDeviceSynchronize();
+
+            int blockSize = 256;
+            int blocks = (size + blockSize - 1) / blockSize;
+
+            // TODO
+            for (int d = 0; d < ilog2ceil(size); ++d) {
+                kernUpsweep<<<blocks, blockSize>>>(size, pow(2, d), temp);
+                cudaDeviceSynchronize();
+            }
+
+            kernSetZero<<<blocks, blockSize>>>(size, temp);
+
+            for (int d = ilog2ceil(size) - 1; d >= 0; --d) {
+                kernDownsweep<<<blocks, blockSize>>>(size, pow(2, d), temp);
+                cudaDeviceSynchronize();
+            }
+
+            cudaMemcpy(odata, temp, n * sizeof(int), cudaMemcpyDeviceToHost);
+        }
+
+        __global__ void kernMapToBoolean(int n, int *read, int *write) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index < n) {
+                if (read[index] == 0) {
+                    write[index] = 0;
+                }
+                else {
+                    write[index] = 1;
+                }
+            }
+        }
+
+        __global__ void kernScatter(int n, int *idata, int *booleans, int *scan, int *odata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index < n) {
+                if (booleans[index] == 1) {
+                    odata[scan[index]] = idata[index];
+                }
+            }
+        }
 
         /**
@@ -32,9 +170,54 @@ namespace StreamCompaction {
          */
         int compact(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
-            // TODO
+
+            int blockSize = 256;
+            int blocks = (n + blockSize - 1) / blockSize;
+
+            int *dev_idata;
+            int *booleans;
+            int *scanArray;
+            int *result;
+
+            cudaMallocManaged(&dev_idata, n * sizeof(int));
+
+            cudaDeviceSynchronize();
+
+            for (int i = 0; i < n; ++i) {
+                dev_idata[i] = idata[i];
+            }
+
+            cudaMallocManaged(&booleans, n * sizeof(int));
+            cudaMallocManaged(&scanArray, n * sizeof(int));
+            cudaMallocManaged(&result, n * sizeof(int));
+
+            cudaDeviceSynchronize();
+
+            // First map the initial array to booleans
+            kernMapToBoolean<<<blocks, blockSize>>>(n, dev_idata, booleans);
+
+            cudaDeviceSynchronize();
+
+            // Now do a scan
+            scanEfficient(n, scanArray, booleans);
+
+            // Now do a scatter
+            int *dev_odata;
+            cudaMallocManaged(&dev_odata, n * sizeof(int));
+            kernScatter<<<blocks, blockSize>>>(n, dev_idata, booleans, scanArray, dev_odata);
+
+            cudaDeviceSynchronize();
+
+            int finalCount = 0;
+            for (int i = 0; i < n; ++i) {
+                finalCount += booleans[i];
+            }
+
+            cudaMemcpy(odata, dev_odata, finalCount * sizeof(int), cudaMemcpyDeviceToHost);
+
             timer().endGpuTimer();
-            return -1;
+
+            return finalCount;
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 9218f8e..0736d57 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,6 +2,7 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
+#include
 
 namespace StreamCompaction {
     namespace Naive {
@@ -11,15 +12,64 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
+
         // TODO: __global__
+        __global__ void kernScan(int n, int power, int *read, int *write) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index < n) {
+                if (index >= power) {
+                    write[index] = read[index - power] + read[index];
+                }
+                else {
+                    write[index] = read[index];
+                }
+            }
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
             // TODO
+
+            int *dev_read;
+            cudaMalloc((void**)&dev_read, n * sizeof(int));
+
+            int *dev_write;
+            cudaMalloc((void**)&dev_write, n * sizeof(int));
+
+            cudaMemcpy(dev_read, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            cudaDeviceSynchronize();
+
+            timer().startGpuTimer();
+
+            int D = ilog2ceil(n);
+            for (int d = 1; d < D + 1; ++d) {
+                int blockSize = 256;
+                int blocks = (n + blockSize - 1) / blockSize;
+
+                int power = pow(2, d - 1);
+                kernScan<<<blocks, blockSize>>>(n, power, dev_read, dev_write);
+
+                cudaDeviceSynchronize();
+
+                // Ping-pong the read/write buffers for the next pass
+                int *temp = dev_read;
+                dev_read = dev_write;
+                dev_write = temp;
+            }
+
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_read, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            // Shift right to convert the inclusive scan into an exclusive scan
+            for (int i = n - 1; i >= 1; --i) {
+                odata[i] = odata[i - 1];
+            }
+            odata[0] = 0;
         }
+
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 36b732d..898b8d5 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -22,6 +22,21 @@ namespace StreamCompaction {
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            int *dev_odata;
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+
+            int *dev_idata;
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            thrust::device_ptr<int> thrust_odata(dev_odata);
+            thrust::device_ptr<int> thrust_idata(dev_idata);
+
+            thrust::exclusive_scan(thrust_idata, thrust_idata + n, thrust_odata);
+
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
             timer().endGpuTimer();
         }
     }