diff --git a/CMakeLists.txt b/CMakeLists.txt
index f654c9e..cea8daf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,7 +18,21 @@ set(CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE ON)
 set(CUDA_SEPARABLE_COMPILATION ON)
 
 if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-    set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+    set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/osx")
+elseif(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
+    set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/linux" "/usr/lib64")
+elseif(WIN32)
+    if(${MSVC_VERSION} MATCHES "1915")
+        set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win/vc2017")
+    elseif(${MSVC_VERSION} MATCHES "1900")
+        set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win/vc2015")
+    elseif(${MSVC_VERSION} MATCHES "1800")
+        set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win/vc2013")
+    elseif(${MSVC_VERSION} MATCHES "1700")
+        set(EXTERNAL_LIB_PATH "${EXTERNAL}/lib/win/vc2012")
+    else()
+        MESSAGE("Error: unsupported MSVC_VERSION: " ${MSVC_VERSION})
+    endif()
 endif()
 
 include_directories(.)
diff --git a/Capture1.JPG b/Capture1.JPG
new file mode 100644
index 0000000..1613567
Binary files /dev/null and b/Capture1.JPG differ
diff --git a/Capture2.JPG b/Capture2.JPG
new file mode 100644
index 0000000..e76479a
Binary files /dev/null and b/Capture2.JPG differ
diff --git a/README.md b/README.md
index 0e38ddb..a895fda 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,193 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Ishan Ranade
+* Tested on personal computer: Gigabyte Aero 14, Windows 10, i7-7700HQ, GTX 1060
 
-### (TODO: Your README)
+# Scan and Compaction
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+## Performance Graphs
+![](Capture1.JPG)
+
+![](Capture2.JPG)
+
+## Test Results
+
+#### Array size = 64
+```
+****************
+** SCAN TESTS **
+****************
+    [ 26 35 7 4 10 9 23 19 37 15 16 37 41 ... 23 0 ]
+==== cpu scan, power-of-two ====
+    elapsed time: 0.006929ms (std::chrono Measured)
+    [ 0 26 61 68 72 82 91 114 133 170 185 201 238 ... 6108 6131 ]
+==== cpu scan, non-power-of-two ====
+    elapsed time: 0.00474ms (std::chrono Measured)
+    [ 0 26 61 68 72 82 91 114 133 170 185 201 238 ... 6069 6090 ]
+    passed
+==== naive scan, power-of-two ====
+    elapsed time: 0.676864ms (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+    elapsed time: 0.342016ms (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+    elapsed time: 1.35987ms (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+    elapsed time: 1.09571ms (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+    elapsed time: 16.9236ms (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+    elapsed time: 3.34643ms (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 3 1 1 1 2 0 0 2 3 3 0 1 0 ... 1 0 ]
+==== cpu compact without scan, power-of-two ====
+    elapsed time: 0.001458ms (std::chrono Measured)
+    [ 3 1 1 1 2 2 3 3 1 2 3 3 1 ... 2 1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    elapsed time: 0.018598ms (std::chrono Measured)
+    [ 3 1 1 1 2 2 3 3 1 2 3 3 1 ... 3 2 ]
+    passed
+==== cpu compact with scan ====
+    elapsed time: 0.006564ms (std::chrono Measured)
+    [ 3 1 1 1 2 2 3 3 1 2 3 3 1 ... 2 1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+    elapsed time: 1.24522ms (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+    elapsed time: 1.38035ms (CUDA Measured)
+    passed
+Press any key to continue . . .
+
+```
+
+#### Array size = 65536
+```
+****************
+** SCAN TESTS **
+****************
+    [ 4 12 16 28 0 38 0 11 2 40 25 0 28 ... 42 0 ]
+==== cpu scan, power-of-two ====
+    elapsed time: 0.264752ms (std::chrono Measured)
+    [ 0 4 16 32 60 60 98 98 109 111 151 176 176 ... 1606322 1606364 ]
+==== cpu scan, non-power-of-two ====
+    elapsed time: 0.175772ms (std::chrono Measured)
+    [ 0 4 16 32 60 60 98 98 109 111 151 176 176 ... 1606246 1606277 ]
+    passed
+==== naive scan, power-of-two ====
+    elapsed time: 1.51245ms (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+    elapsed time: 1.47968ms (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+    elapsed time: 5.45789ms (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+    elapsed time: 5.55315ms (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+    elapsed time: 27.2742ms (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+    elapsed time: 11.7524ms (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 2 0 2 2 0 2 0 1 0 0 3 2 0 ... 0 0 ]
+==== cpu compact without scan, power-of-two ====
+    elapsed time: 0.396034ms (std::chrono Measured)
+    [ 2 2 2 2 1 3 2 1 3 1 3 3 3 ... 2 3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    elapsed time: 0.520386ms (std::chrono Measured)
+    [ 2 2 2 2 1 3 2 1 3 1 3 3 3 ... 3 2 ]
+    passed
+==== cpu compact with scan ====
+    elapsed time: 1.3048ms (std::chrono Measured)
+    [ 2 2 2 2 1 3 2 1 3 1 3 3 3 ... 2 3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+    elapsed time: 9.05011ms (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+    elapsed time: 10.4632ms (CUDA Measured)
+    passed
+Press any key to continue . . .
+```
+
+#### Array size = 33554432
+```
+****************
+** SCAN TESTS **
+****************
+    [ 34 21 39 39 17 7 27 41 25 44 5 38 27 ... 44 0 ]
+==== cpu scan, power-of-two ====
+    elapsed time: 91.0629ms (std::chrono Measured)
+    [ 0 34 55 94 133 150 157 184 225 250 294 299 337 ... 821704663 821704707 ]
+==== cpu scan, non-power-of-two ====
+    elapsed time: 96.8149ms (std::chrono Measured)
+    [ 0 34 55 94 133 150 157 184 225 250 294 299 337 ... 821704604 821704613 ]
+    passed
+==== naive scan, power-of-two ====
+    elapsed time: 142.816ms (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+    elapsed time: 130.053ms (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+    elapsed time: 606.905ms (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+    elapsed time: 598.48ms (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+    elapsed time: 1059.55ms (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+    elapsed time: 1052.12ms (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 0 2 2 1 0 1 3 1 1 2 3 2 1 ... 2 0 ]
+==== cpu compact without scan, power-of-two ====
+    elapsed time: 169.206ms (std::chrono Measured)
+    [ 2 2 1 1 3 1 1 2 3 2 1 1 3 ... 1 2 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    elapsed time: 169.446ms (std::chrono Measured)
+    [ 2 2 1 1 3 1 1 2 3 2 1 1 3 ... 3 1 ]
+    passed
+==== cpu compact with scan ====
+    elapsed time: 481.232ms (std::chrono Measured)
+    [ 2 2 1 1 3 1 1 2 3 2 1 1 3 ... 1 2 ]
+    passed
+==== work-efficient compact, power-of-two ====
+    elapsed time: 2345.71ms (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+    elapsed time: 4260.37ms (CUDA Measured)
+    passed
+Press any key to continue . . .
+```
+
+## Discussion
+
+I believe one of the biggest performance hits for my work-efficient implementation is the bank conflicts that occur, which drastically reduce its efficiency. Another cost is the arithmetic I perform in my kernels: I tend to repeat some calculations instead of saving intermediate values for later reuse. A sketch of the kind of conflict-free shared-memory indexing that would address the first problem is shown below.
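+
+The sketch below is adapted from the classic single-block scan in GPU Gems 3 and is illustrative only, not code used in this project; `NUM_BANKS`, `LOG_NUM_BANKS`, `CONFLICT_FREE_OFFSET`, and the `blockScanShared` kernel are hypothetical names for this example. It scans one power-of-two-sized block entirely in shared memory, padding every index so that consecutive threads fall into different banks.
+
+```
+// Illustrative only: exclusive Blelloch scan of one power-of-two block of n
+// elements, staged in shared memory with padded indices to avoid bank conflicts.
+// Launch as: blockScanShared<<<1, n / 2, (n + (n >> LOG_NUM_BANKS)) * sizeof(int)>>>(n, out, in);
+#include <cuda_runtime.h>
+
+#define NUM_BANKS 32
+#define LOG_NUM_BANKS 5
+#define CONFLICT_FREE_OFFSET(i) ((i) >> LOG_NUM_BANKS)
+
+__global__ void blockScanShared(int n, int *odata, const int *idata) {
+    extern __shared__ int temp[];   // padded scratch space
+    int tid = threadIdx.x;
+    int ai = tid;                   // each thread loads two elements
+    int bi = tid + n / 2;
+
+    temp[ai + CONFLICT_FREE_OFFSET(ai)] = idata[ai];
+    temp[bi + CONFLICT_FREE_OFFSET(bi)] = idata[bi];
+
+    // Up-sweep (reduce) phase.
+    int offset = 1;
+    for (int d = n >> 1; d > 0; d >>= 1) {
+        __syncthreads();
+        if (tid < d) {
+            int a = offset * (2 * tid + 1) - 1;
+            int b = offset * (2 * tid + 2) - 1;
+            a += CONFLICT_FREE_OFFSET(a);
+            b += CONFLICT_FREE_OFFSET(b);
+            temp[b] += temp[a];
+        }
+        offset *= 2;
+    }
+
+    // Zero the root, then down-sweep to produce the exclusive scan.
+    if (tid == 0) {
+        temp[n - 1 + CONFLICT_FREE_OFFSET(n - 1)] = 0;
+    }
+    for (int d = 1; d < n; d *= 2) {
+        offset >>= 1;
+        __syncthreads();
+        if (tid < d) {
+            int a = offset * (2 * tid + 1) - 1;
+            int b = offset * (2 * tid + 2) - 1;
+            a += CONFLICT_FREE_OFFSET(a);
+            b += CONFLICT_FREE_OFFSET(b);
+            int t = temp[a];
+            temp[a] = temp[b];
+            temp[b] += t;
+        }
+    }
+    __syncthreads();
+
+    odata[ai] = temp[ai + CONFLICT_FREE_OFFSET(ai)];
+    odata[bi] = temp[bi + CONFLICT_FREE_OFFSET(bi)];
+}
+```
+
+This assumes the whole input fits in one block; a full solution would scan per-block sums recursively and add them back into each block's results.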
+
+The thrust implementation seemed to take an extremely long time to finish. This could be because thrust needs some time to warm up and may have incurred many cache misses the first time it was called; a small benchmarking harness that separates that warm-up call from the timed call is sketched below. In general, my CPU version performed the best of all of these implementations. I believe this is because I did not properly use shared memory, avoid bank conflicts, or keep my kernels lightweight enough to fully utilize the power of the GPU. This assignment was a big eye-opener in how to write better GPU code and what to look for when optimizing kernels.
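+
+A small harness along those lines (illustrative only; the array size and the CUDA-event timing here are assumptions of this sketch, not the timer used by the project's test program):
+
+```
+// Illustrative only: run one untimed warm-up call so thrust's lazy
+// initialization and temporary allocations are excluded, then time a
+// second call with CUDA events.
+#include <cstdio>
+#include <cuda_runtime.h>
+#include <thrust/device_vector.h>
+#include <thrust/scan.h>
+
+int main() {
+    const int n = 1 << 24;
+    thrust::device_vector<int> in(n, 1);
+    thrust::device_vector<int> out(n);
+
+    // Warm-up call: pays the one-time startup cost.
+    thrust::exclusive_scan(in.begin(), in.end(), out.begin());
+
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    cudaEventRecord(start);
+    thrust::exclusive_scan(in.begin(), in.end(), out.begin());
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+
+    float ms = 0.0f;
+    cudaEventElapsedTime(&ms, start, stop);
+    printf("thrust::exclusive_scan, steady state: %f ms\n", ms);
+
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+    return 0;
+}
+```
+
+Discarding the first call removes thrust's one-time setup cost from the measurement, which would make the comparison against the naive and work-efficient kernels fairer.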
diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
index 46337ab..716f6be 100644
--- a/src/testing_helpers.hpp
+++ b/src/testing_helpers.hpp
@@ -52,9 +52,17 @@ void onesArray(int n, int *a) {
 
 void genArray(int n, int *a, int maxval) {
     srand(time(nullptr));
+    /*if (n == 7) {
+        for (int i = 0; i < n; i++) {
+            a[i] = i;
+        }
+        return;
+    }*/
+
     for (int i = 0; i < n; i++) {
         a[i] = rand() % maxval;
     }
+
 }
 
 void printArray(int n, int *a, bool abridged = false) {
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..48e2f35 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -13,5 +13,5 @@ set(SOURCE_FILES
 
 cuda_add_library(stream_compaction
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_60
     )
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 05ce667..951d3f8 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -20,6 +20,10 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            odata[0] = 0;
+            for (int i = 1; i < n; ++i) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
             timer().endCpuTimer();
         }
 
@@ -30,9 +34,16 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
+            int index = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i] != 0) {
+                    odata[index] = idata[i];
+                    index++;
+                }
+            }
             // TODO
             timer().endCpuTimer();
-            return -1;
+            return index;
         }
 
         /**
@@ -43,8 +54,35 @@ namespace StreamCompaction {
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            int *temp = (int*) malloc(n * sizeof(int));
+
+            int finalSize = 0;
+            // First go through and put keep/discard flags (1s and 0s) in odata
+            for (int i = 0; i < n; ++i) {
+                if (idata[i] == 0) {
+                    odata[i] = 0;
+                }
+                else {
+                    odata[i] = 1;
+                    finalSize++;
+                }
+            }
+
+            // Now run a scan on odata and save results in temp
+            temp[0] = 0;
+            for (int i = 1; i < n; ++i) {
+                temp[i] = temp[i - 1] + odata[i - 1];
+            }
+
+            // Now go through temp and save final results in odata
+            for (int i = 0; i < n; ++i) {
+                if (odata[i] != 0) {
+                    odata[temp[i]] = idata[i];
+                }
+            }
+
             timer().endCpuTimer();
-            return -1;
+            return finalSize;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 36c5ef2..3426959 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,6 +2,11 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
+#include
+#include
+#include
+#include
+
 
 namespace StreamCompaction {
     namespace Efficient {
@@ -12,13 +17,146 @@ namespace StreamCompaction {
             return timer;
         }
 
+        __global__ void kernUpsweep(int n, int power, int *array) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index < n) {
+                if (index % (2 * power) == 0) {
+                    array[index + (2 * power) - 1] = array[index + power - 1] + array[index + (2 * power) - 1];
+                }
+            }
+        }
+
+        __global__ void kernDownsweep(int n, int power, int *array) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index < n) {
+                if (index % (2 * power) == 0) {
+                    int t = array[index + power - 1];
+                    array[index + power - 1] = array[index + (2 * power) - 1];
+                    array[index + (2 * power) - 1] = t + array[index + (2 * power) - 1];
+                }
+            }
+        }
+
+        __global__ void kernSetZero(int n, int *array) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index < n) {
+                if (index == n - 1) {
+                    array[index] = 0;
+                }
+            }
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int *temp;
+
+            // Round n up to the next power of two so the sweep tree is complete
+            int size = 1;
+            while (size < n) {
+                size *= 2;
+            }
+
+            cudaMalloc((void**)&temp, size * sizeof(int));
+            cudaDeviceSynchronize();
+
+            cudaMemcpy(temp, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            cudaDeviceSynchronize();
+
             timer().startGpuTimer();
+
+            int blockSize = 256;
+            int blocks = (size + blockSize - 1) / blockSize;
+
             // TODO
+            for (int d = 0; d < ilog2ceil(size); ++d) {
+                kernUpsweep<<<blocks, blockSize>>>(size, pow(2, d), temp);
+                cudaDeviceSynchronize();
+            }
+
+            kernSetZero<<<blocks, blockSize>>>(size, temp);
+
+            for (int d = ilog2ceil(size) - 1; d >= 0; --d) {
+                kernDownsweep<<<blocks, blockSize>>>(size, pow(2, d), temp);
+                cudaDeviceSynchronize();
+            }
+
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, temp, n * sizeof(int), cudaMemcpyDeviceToHost);
         }
 
+        // Same as scan(), but without the GPU timer so it can be called from
+        // inside compact() while compact()'s own timer is running.
+        void scanEfficient(int n, int *odata, const int *idata) {
+            int *temp;
+
+            int size = 1;
+            while (size < n) {
+                size *= 2;
+            }
+
+            cudaMalloc((void**)&temp, size * sizeof(int));
+            cudaDeviceSynchronize();
+
+            cudaMemcpy(temp, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            cudaDeviceSynchronize();
+
+            int blockSize = 256;
+            int blocks = (size + blockSize - 1) / blockSize;
+
+            // TODO
+            for (int d = 0; d < ilog2ceil(size); ++d) {
+                kernUpsweep<<<blocks, blockSize>>>(size, pow(2, d), temp);
+                cudaDeviceSynchronize();
+            }
+
+            kernSetZero<<<blocks, blockSize>>>(size, temp);
+
+            for (int d = ilog2ceil(size) - 1; d >= 0; --d) {
+                kernDownsweep<<<blocks, blockSize>>>(size, pow(2, d), temp);
+                cudaDeviceSynchronize();
+            }
+
+            cudaMemcpy(odata, temp, n * sizeof(int), cudaMemcpyDeviceToHost);
+        }
+
+        __global__ void kernMapToBoolean(int n, int *read, int *write) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index < n) {
+                if (read[index] == 0) {
+                    write[index] = 0;
+                }
+                else {
+                    write[index] = 1;
+                }
+            }
+        }
+
+        __global__ void kernScatter(int n, int *idata, int *booleans, int *scan, int *odata) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index < n) {
+                if (booleans[index] == 1) {
+                    odata[scan[index]] = idata[index];
+                }
+            }
+        }
 
         /**
@@ -32,9 +170,54 @@ namespace StreamCompaction {
          */
         int compact(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
-            // TODO
+
+            int blockSize = 256;
+            int blocks = (n + blockSize - 1) / blockSize;
+
+            int *dev_idata;
+            int *booleans;
+            int *scanArray;
+            int *result;
+
+            cudaMallocManaged(&dev_idata, n * sizeof(int));
+
+            cudaDeviceSynchronize();
+
+            for (int i = 0; i < n; ++i) {
+                dev_idata[i] = idata[i];
+            }
+
+            cudaMallocManaged(&booleans, n * sizeof(int));
+            cudaMallocManaged(&scanArray, n * sizeof(int));
+            cudaMallocManaged(&result, n * sizeof(int));
+
+            cudaDeviceSynchronize();
+
+            // First map the initial array to booleans
+            kernMapToBoolean<<<blocks, blockSize>>>(n, dev_idata, booleans);
+
+            cudaDeviceSynchronize();
+
+            // Now do a scan
+            scanEfficient(n, scanArray, booleans);
+
+            // Now do a scatter
+            int *dev_odata;
+            cudaMallocManaged(&dev_odata, n * sizeof(int));
+            kernScatter<<<blocks, blockSize>>>(n, dev_idata, booleans, scanArray, dev_odata);
+
+            cudaDeviceSynchronize();
+
+            int finalCount = 0;
+            for (int i = 0; i < n; ++i) {
+                finalCount += booleans[i];
+            }
+
+            cudaMemcpy(odata, dev_odata, finalCount * sizeof(int), cudaMemcpyDeviceToHost);
+
             timer().endGpuTimer();
-            return -1;
+
+            return finalCount;
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 9218f8e..0736d57 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,6 +2,7 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
+#include
 
 namespace StreamCompaction {
     namespace Naive {
@@ -11,15 +12,64 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
+
         // TODO: __global__
+        __global__ void kernScan(int n, int power, int *read, int *write) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index < n) {
+                if (index >= power) {
+                    write[index] = read[index - power] + read[index];
+                }
+                else {
+                    write[index] = read[index];
+                }
+            }
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
             // TODO
+
+            int *dev_read;
+            cudaMalloc((void**)&dev_read, n * sizeof(int));
+
+            int *dev_write;
+            cudaMalloc((void**)&dev_write, n * sizeof(int));
+
+            cudaMemcpy(dev_read, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            cudaDeviceSynchronize();
+
+            timer().startGpuTimer();
+
+            int D = ilog2ceil(n);
+            for (int d = 1; d < D + 1; ++d) {
+                int blockSize = 256;
+                int blocks = (n + blockSize - 1) / blockSize;
+
+                int power = pow(2, d - 1);
+                kernScan<<<blocks, blockSize>>>(n, power, dev_read, dev_write);
+
+                cudaDeviceSynchronize();
+
+                // Ping-pong the read/write buffers for the next pass
+                int *temp = dev_read;
+                dev_read = dev_write;
+                dev_write = temp;
+            }
+
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_read, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            // Shift right to convert the inclusive scan into an exclusive scan
+            for (int i = n - 1; i >= 1; --i) {
+                odata[i] = odata[i - 1];
+            }
+            odata[0] = 0;
         }
+
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 36b732d..898b8d5 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -22,6 +22,21 @@ namespace StreamCompaction {
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            int *dev_odata;
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+
+            int *dev_idata;
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            thrust::device_ptr<int> thrust_odata(dev_odata);
+            thrust::device_ptr<int> thrust_idata(dev_idata);
+
+            thrust::exclusive_scan(thrust_idata, thrust_idata + n, thrust_odata);
+
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
             timer().endGpuTimer();
         }
     }