CIS565-Fall-2018 · IshanRanade · Sep 18, 2018 · Sep 18, 2018 · Sep 18, 2018 · Sep 18, 2018
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -18,7 +18,21 @@ set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON)
 set(CUDA_SEPARABLE_COMPILATION ON)
 
 if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-    set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+    set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/osx")  # EXTERNAL_LIB_PATH: per-platform library directory under ${EXTERNAL}
+elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+    set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/linux" "/usr/lib64")  # two-entry list: bundled dir plus system dir
+elseif(WIN32)
+    if(MSVC_VERSION MATCHES "^191[0-9]$")  # every VS2017 toolset (1910-1916), not only 1915
+        set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win/vc2017")
+    elseif(MSVC_VERSION MATCHES "1900")  # bare variable name: if() dereferences it, and an unset value no longer breaks parsing
+        set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win/vc2015")
+    elseif(MSVC_VERSION MATCHES "1800")
+        set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win/vc2013")
+    elseif(MSVC_VERSION MATCHES "1700")
+        set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win/vc2012")
+    else()  # non-fatal: configuration continues with EXTERNAL_LIB_PATH left unset
+        MESSAGE("Error: unsupported MSVC_VERSION: " ${MSVC_VERSION})
+    endif()
 endif()
 
 include_directories(.)

diff --git a/Capture1.JPG b/Capture1.JPG
diff --git a/Capture2.JPG b/Capture2.JPG
diff --git a/README.md b/README.md
@@ -3,12 +3,193 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Ishan Ranade
+* Tested on personal computer: Gigabyte Aero 14, Windows 10, i7-7700HQ, GTX 1060
 
-### (TODO: Your README)
+# Scan and Compaction
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+## Performance Graphs
 
+![](Capture1.JPG)
+
+![](Capture2.JPG)
+
+## Test Results
+
+#### Array size = 64
+```
+****************
+** SCAN TESTS **
+****************
+    [  26  35   7   4  10   9  23  19  37  15  16  37  41 ...  23   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.006929ms    (std::chrono Measured)
+    [   0  26  61  68  72  82  91 114 133 170 185 201 238 ... 6108 6131 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.00474ms    (std::chrono Measured)
+    [   0  26  61  68  72  82  91 114 133 170 185 201 238 ... 6069 6090 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 0.676864ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.342016ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 1.35987ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 1.09571ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 16.9236ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 3.34643ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   3   1   1   1   2   0   0   2   3   3   0   1   0 ...   1   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.001458ms    (std::chrono Measured)
+    [   3   1   1   1   2   2   3   3   1   2   3   3   1 ...   2   1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.018598ms    (std::chrono Measured)
+    [   3   1   1   1   2   2   3   3   1   2   3   3   1 ...   3   2 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 0.006564ms    (std::chrono Measured)
+    [   3   1   1   1   2   2   3   3   1   2   3   3   1 ...   2   1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 1.24522ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 1.38035ms    (CUDA Measured)
+    passed
+Press any key to continue . . .
+
+```
+
+#### Array size = 65536
+```
+****************
+** SCAN TESTS **
+****************
+    [   4  12  16  28   0  38   0  11   2  40  25   0  28 ...  42   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 0.264752ms    (std::chrono Measured)
+    [   0   4  16  32  60  60  98  98 109 111 151 176 176 ... 1606322 1606364 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 0.175772ms    (std::chrono Measured)
+    [   0   4  16  32  60  60  98  98 109 111 151 176 176 ... 1606246 1606277 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 1.51245ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 1.47968ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 5.45789ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 5.55315ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 27.2742ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 11.7524ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   2   0   2   2   0   2   0   1   0   0   3   2   0 ...   0   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.396034ms    (std::chrono Measured)
+    [   2   2   2   2   1   3   2   1   3   1   3   3   3 ...   2   3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.520386ms    (std::chrono Measured)
+    [   2   2   2   2   1   3   2   1   3   1   3   3   3 ...   3   2 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 1.3048ms    (std::chrono Measured)
+    [   2   2   2   2   1   3   2   1   3   1   3   3   3 ...   2   3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 9.05011ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 10.4632ms    (CUDA Measured)
+    passed
+Press any key to continue . . .
+```
+
+#### Array size = 33554432
+```
+****************
+** SCAN TESTS **
+****************
+    [  34  21  39  39  17   7  27  41  25  44   5  38  27 ...  44   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 91.0629ms    (std::chrono Measured)
+    [   0  34  55  94 133 150 157 184 225 250 294 299 337 ... 821704663 821704707 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 96.8149ms    (std::chrono Measured)
+    [   0  34  55  94 133 150 157 184 225 250 294 299 337 ... 821704604 821704613 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 142.816ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 130.053ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 606.905ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 598.48ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 1059.55ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 1052.12ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   0   2   2   1   0   1   3   1   1   2   3   2   1 ...   2   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 169.206ms    (std::chrono Measured)
+    [   2   2   1   1   3   1   1   2   3   2   1   1   3 ...   1   2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 169.446ms    (std::chrono Measured)
+    [   2   2   1   1   3   1   1   2   3   2   1   1   3 ...   3   1 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 481.232ms    (std::chrono Measured)
+    [   2   2   1   1   3   1   1   2   3   2   1   1   3 ...   1   2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 2345.71ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 4260.37ms    (CUDA Measured)
+    passed
+Press any key to continue . . .
+```
+
+## Discussion
+
+I believe one of the biggest performance hits for my work-efficient implementation is the bank conflicts that are occurring, which drastically reduce its efficiency.  Another hit could be the arithmetic in my kernels: I tended to repeat some calculations instead of saving values for later reuse.
+
+It seems that the thrust implementation took an extremely long time to finish.  This could be because thrust takes some time to warm up and may have incurred a lot of cache misses the first time I used it.  In general, my CPU version seemed to perform the best of all the implementations.  I believe this is because I did not properly use shared memory, avoid bank conflicts, or keep my kernels lightweight enough to fully utilize the power of the GPU.  This assignment was a big eye-opener on how to write better GPU code and what to look for when optimizing kernels.
diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
@@ -52,9 +52,17 @@ void onesArray(int n, int *a) {
 void genArray(int n, int *a, int maxval) {
     srand(time(nullptr));
 
+    /* Fills a[0..n) with rand() % maxval.
+     * The generator is re-seeded from time(nullptr) on every call (see
+     * the srand above), so calls made within the same tick of that
+     * clock (typically one second) produce the same sequence.
+     * NOTE(review): maxval == 0 would make the modulo below undefined.
+     */
+
     for (int i = 0; i < n; i++) {
         a[i] = rand() % maxval;
     }
+
 }
 
 void printArray(int n, int *a, bool abridged = false) {

diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_60  # nvcc target architecture; NOTE(review): README lists a GTX 1060 as the test GPU - confirm sm_60 is the intended target
     )
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
@@ -20,6 +20,10 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
 	        timer().startCpuTimer();
             // TODO
+            for (int i = 0, sum = 0; i < n; ++i) {  // running total; writes nothing when n <= 0
+                odata[i] = sum;                     // exclusive scan: sum of idata[0..i-1]
+                sum += idata[i];
+            }
 	        timer().endCpuTimer();
         }
 
@@ -30,9 +34,16 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
 	        timer().startCpuTimer();
+            int index = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[index] = idata[i];
+                    index++;
+                }
+            }
             // TODO
 	        timer().endCpuTimer();
-            return -1;
+            return index;
         }
 
         /**
@@ -43,8 +54,35 @@ namespace StreamCompaction {
         int compactWithScan(int n, int *odata, const int *idata) {
 	        timer().startCpuTimer();
 	        // TODO
+            // Compacts idata into odata via map -> exclusive scan -> scatter.
+            int *temp = (int*) malloc(n * sizeof(int));
+            int finalSize = 0;
+
+            // Map: flag each kept (non-zero) element with a 1 in odata.
+            for (int i = 0; i < n; ++i) {
+                odata[i] = (idata[i] != 0) ? 1 : 0;
+                finalSize += odata[i];
+            }
+
+            // Scan: the exclusive prefix sum of the flags gives each kept
+            // element's output index. Touches nothing when n == 0.
+            for (int i = 0, sum = 0; i < n; ++i) {
+                temp[i] = sum;
+                sum += odata[i];
+            }
+
+            // Scatter: temp[i] <= i, so each write lands on a flag that has
+            // already been read and cannot corrupt a later one.
+            for (int i = 0; i < n; ++i) {
+                if (odata[i] != 0) {
+                    odata[temp[i]] = idata[i];
+                }
+            }
+
+            // Release the scratch buffer; it used to leak on every call.
+            free(temp);
 	        timer().endCpuTimer();
-            return -1;
+            return finalSize;
         }
     }
 }