diff --git a/README.md b/README.md index 0e38ddb..a4ffd49 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,106 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Liam Dugan + * [LinkedIn](https://www.linkedin.com/in/liam-dugan-95a961135/), [personal website](http://liamdugan.com/) +* Tested on: Windows 10, Ryzen 5 1600 @ 3.20GHz 16GB, GTX 1070 16GB (Personal Computer) -### (TODO: Your README) -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +# Stream Compaction +This homework is an introduction to implementing and optimizing parallel algorithms on the GPU. To do this we were tasked with writing a CPU implementation and various GPU implementations and comparing their timing results. +## Features Completed +* CPU Scan and stream compaction implementation +* GPU Naive scan implementation +* GPU Work efficient scan and stream compaction implementation +* Wrapper for Thrust compaction + +# Questions +### Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU +I tested block sizes from 32 up to 1024 in power of two increments and determined that a block size of 64 was ideal for the naive implementation and a block size of 128 was ideal for the work-efficient implementations. + +For the timing results, an array of size 2^20 (~1 million) was tested while changing the block size. Time is in milliseconds. + +![](images/BlockSize.png) + +### Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis). 
+ +![](images/Scan.png) + +![](images/StreamCompaction.png) + +### Write a brief explanation of the phenomena you see here. +I believe the main performance bottleneck in my scan code is undoubtedly the memory accesses. In both my naive and work efficient scan implementations I write to global memory after every level of the tree is traversed (i.e. log(n) times). This is significantly slower than using shared memory, which would only write to global memory log(n / blocksize) times. + +Additionally, in my work efficient scan algorithm I use ping-pong buffers,but memcpy the data between buffers on every cycle of both the upsweep and the downsweep of the algorithm. Since I do not have to do this in my naive implementation, I believe that is why my work efficient scan is slower. + +![](images/Code.PNG) + +For the thrust scan, the reason why I believe the power of two array test runs so much slower than the non power of two code is due to some sort of internal thrust library bookkeeping. I believe once a thrust function is called, there is some sort of one-time-only process to initialize thrust specific state. Thus when we call the power-of-two length array thrust scan it has to take that extra set up time, but once we call the non power of two code, all the setup has already been completed and it can run quickly. + + +### Test Program Output + +``` + +**************** +** SCAN TESTS ** +**************** + [ 12 6 12 29 23 44 41 42 46 40 9 23 22 ... 6 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 1.47776ms (std::chrono Measured) + [ 0 12 18 30 59 82 126 167 209 255 295 304 327 ... 51356118 51356124 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 1.57856ms (std::chrono Measured) + [ 0 12 18 30 59 82 126 167 209 255 295 304 327 ... 51356052 51356083 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 5.05549ms (CUDA Measured) + [ 0 12 18 30 59 82 126 167 209 255 295 304 327 ... 
51356118 51356124 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 5.13843ms (CUDA Measured) + [ 0 12 18 30 59 82 126 167 209 255 295 304 327 ... 51356118 51356124 ] + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 9.00813ms (CUDA Measured) + [ 0 12 18 30 59 82 126 167 209 255 295 304 327 ... 51356118 51356124 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 8.82483ms (CUDA Measured) + [ 0 12 18 30 59 82 126 167 209 255 295 304 327 ... 51356052 51356083 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 4.64384ms (CUDA Measured) + [ 0 12 18 30 59 82 126 167 209 255 295 304 327 ... 51356118 51356124 ] + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.334848ms (CUDA Measured) + [ 0 12 18 30 59 82 126 167 209 255 295 304 327 ... 51356052 51356083 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 3 1 3 3 0 1 1 1 0 2 1 1 1 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 5.27103ms (std::chrono Measured) + [ 3 1 3 3 1 1 1 2 1 1 1 2 3 ... 3 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 5.21855ms (std::chrono Measured) + [ 3 1 3 3 1 1 1 2 1 1 1 2 3 ... 3 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 12.4281ms (std::chrono Measured) + [ 3 1 3 3 1 1 1 2 1 1 1 2 3 ... 3 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 7.15469ms (CUDA Measured) + [ 3 1 3 3 1 1 1 2 1 1 1 2 3 ... 3 3 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 7.18848ms (CUDA Measured) + [ 3 1 3 3 1 1 1 2 1 1 1 2 3 ... 
3 3 ] + passed +``` \ No newline at end of file diff --git a/images/BlockSize.png b/images/BlockSize.png new file mode 100644 index 0000000..4539034 Binary files /dev/null and b/images/BlockSize.png differ diff --git a/images/Code.PNG b/images/Code.PNG new file mode 100644 index 0000000..4fa35d8 Binary files /dev/null and b/images/Code.PNG differ diff --git a/images/Scan.png b/images/Scan.png new file mode 100644 index 0000000..7df0bd5 Binary files /dev/null and b/images/Scan.png differ diff --git a/images/StreamCompaction.png b/images/StreamCompaction.png new file mode 100644 index 0000000..3a88b81 Binary files /dev/null and b/images/StreamCompaction.png differ diff --git a/src/main.cpp b/src/main.cpp index 1850161..6f7675d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,11 +13,12 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 21; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; int *c = new int[SIZE]; +int *d = new int[SIZE]; int main(int argc, char* argv[]) { // Scan tests @@ -46,53 +47,53 @@ int main(int argc, char* argv[]) { printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); printArray(NPOT, b, true); printCmpResult(NPOT, b, c); - + zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - + /* For bug-finding only: Array of 1s to help find bugs in stream 
compaction or scan */ +/* onesArray(SIZE, d); + printDesc("1s array for finding bugs"); + StreamCompaction::Efficient::scan(SIZE, c, d); + printArray(SIZE, c, true); + */ zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); printf("\n"); @@ -137,14 +138,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); 
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 46337ab..d6cc4e3 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -1,8 +1,8 @@ #pragma once -#include -#include -#include +#include +#include +#include #include #include @@ -69,8 +69,8 @@ void printArray(int n, int *a, bool abridged = false) { printf("]\n"); } -template -void printElapsedTime(T time, std::string note = "") -{ - std::cout << " elapsed time: " << time << "ms " << note << std::endl; +template +void printElapsedTime(T time, std::string note = "") +{ + std::cout << " elapsed time: " << time << "ms " << note << std::endl; } \ No newline at end of file diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..48e2f35 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_60 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..25f9fc3 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -22,8 +22,16 @@ namespace StreamCompaction { * Maps an array to an array of 0s and 1s for stream compaction. Elements * which map to 0 will be removed, and elements which map to 1 will be kept. 
*/ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + __global__ void kernMapToBoolean(int n, int paddedN, int *bools, const int *idata) { + // get index first and reject if greater than paddedN + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= paddedN) + { + return; + } + + // determine if you're a boolean (if you're in the part that's just padded on, give yourself a 0) + bools[index] = (idata[index] && index < n) ? 1 : 0; } /** @@ -32,8 +40,18 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO - } + + // get index first + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + { + return; + } + if (bools[index]) + { + odata[indices[index]] = idata[index]; + } + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 99a1b04..13a4059 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -32,7 +32,7 @@ inline int ilog2ceil(int x) { namespace StreamCompaction { namespace Common { - __global__ void kernMapToBoolean(int n, int *bools, const int *idata); + __global__ void kernMapToBoolean(int n, int paddedN, int *bools, const int *idata); __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..b4df5f7 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,15 +1,15 @@ #include #include "cpu.h" -#include "common.h" +#include "common.h" namespace StreamCompaction { namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** @@ -19,7 +19,14 @@ namespace StreamCompaction { */ void scan(int 
n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int sum = 0; + for (int i = 0; i < n; ++i) + { + odata[i] = sum; + sum += idata[i]; + } + timer().endCpuTimer(); } @@ -30,9 +37,20 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int index = 0; + for (int i = 0; i < n; ++i) + { + // if the data meets the condition put it in + if (idata[i]) + { + odata[index] = idata[i]; + ++index; + } + } + timer().endCpuTimer(); - return -1; + return index; } /** @@ -41,10 +59,33 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO + int* scanned = (int*) malloc(sizeof(int) * n); + + timer().startCpuTimer(); + + int sum = 0; + for (int i = 0; i < n; ++i) + { + scanned[i] = sum; + if (idata[i]) + { + ++sum; + } + } + + // now scatter + for (int j = 0; j < n; j++) + { + if (idata[j]) + { + odata[scanned[j]] = idata[j]; + } + } + timer().endCpuTimer(); - return -1; + + free(scanned); + return sum; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..ff22e13 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -1,24 +1,125 @@ + #include #include #include "common.h" #include "efficient.h" +#define blockSize 128 + +int* dev_efficientScanBuf; +int* dev_efficientIdata; +int* dev_efficientBools; +int* dev_efficientIndices; + +__global__ void kernEfficientScanUpSweep(int n, int d, int* odata, int* idata) +{ + // get index first + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int twoToPowDPlusOne = 1 << (d + 1); + if (index >= n || index % twoToPowDPlusOne != 0) + { + return; + } + + int twoToPowD = 1 << d; + + // then add the two numbers and put them into the global output buffer + odata[index + twoToPowDPlusOne - 1] = idata[index + twoToPowDPlusOne - 1] + 
idata[index + twoToPowD - 1]; +} + +__global__ void kernSetFirstElementZero(int n, int* odata) +{ + odata[n - 1] = 0; +} + +__global__ void kernEfficientScanDownSweep(int n, int d, int* odata, int* idata) +{ + // get index first + int index = threadIdx.x + (blockIdx.x * blockDim.x); + int twoToPowDPlusOne = 1 << (d + 1); + if (index >= n || (index % twoToPowDPlusOne != 0)) + { + return; + } + + int twoToPowD = 1 << d; + + // then sweep down + odata[index + twoToPowD - 1] = idata[index + twoToPowDPlusOne - 1]; + odata[index + twoToPowDPlusOne - 1] = idata[index + twoToPowDPlusOne - 1] + idata[index + twoToPowD - 1]; +} + namespace StreamCompaction { namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + int nNextHighestPowTwo = 1 << ilog2ceil(n); + + cudaMalloc((void**)&dev_efficientScanBuf, nNextHighestPowTwo * sizeof(int)); + checkCUDAError("cudaMalloc buf failed"); + + cudaMalloc((void**)&dev_efficientIdata, nNextHighestPowTwo * sizeof(int)); + checkCUDAError("cudaMalloc idata failed"); + + timer().startGpuTimer(); + + cudaMemcpy((void*)dev_efficientIdata, (const void*)idata, nNextHighestPowTwo * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy idata failed"); + + // call the upsweep kernel log2n number of times + for (int d = 0; d < ilog2ceil(nNextHighestPowTwo); ++d) + { + + // copy all the data to make sure everythings in place + cudaMemcpy((void*)dev_efficientScanBuf, (const void*)dev_efficientIdata, nNextHighestPowTwo * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy idata failed"); + + // call the kernel + kernEfficientScanUpSweep<<<((nNextHighestPowTwo + blockSize - 1) / blockSize) , blockSize>>>(nNextHighestPowTwo, d, dev_efficientScanBuf, dev_efficientIdata); + + // flip flop the buffers so that idata is always the most recent data + int* temp = dev_efficientScanBuf; + dev_efficientScanBuf = dev_efficientIdata; + dev_efficientIdata = temp; + } + + // set first element to be zero in a new kernel (unsure how to do this otherwise) + kernSetFirstElementZero << <1, 1 >> > (nNextHighestPowTwo, dev_efficientIdata); + + // now call the downsweep kernel log2n times + for (int d = (ilog2ceil(nNextHighestPowTwo) - 1); d >= 0; --d) + { + // copy all the data to make sure everything is in place + cudaMemcpy((void*)dev_efficientScanBuf, (const void*)dev_efficientIdata, nNextHighestPowTwo * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy idata failed"); + + // call the kernel + kernEfficientScanDownSweep<<<((nNextHighestPowTwo + blockSize - 1) / blockSize) , blockSize>>>(nNextHighestPowTwo, d, 
dev_efficientScanBuf, dev_efficientIdata); + + // flip flop the buffers + int* temp = dev_efficientScanBuf; + dev_efficientScanBuf = dev_efficientIdata; + dev_efficientIdata = temp; + } + + // shift it and memcpy to out + cudaMemcpy(odata, dev_efficientIdata, nNextHighestPowTwo * sizeof(int), cudaMemcpyDeviceToHost); + + timer().endGpuTimer(); + + cudaFree(dev_efficientScanBuf); + cudaFree(dev_efficientIdata); + + } /** @@ -31,10 +132,92 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + int nNextHighestPowTwo = 1 << ilog2ceil(n); + + cudaMalloc((void**)&dev_efficientBools, nNextHighestPowTwo * sizeof(int)); + checkCUDAError("cudaMalloc bool buf failed"); + + cudaMalloc((void**)&dev_efficientScanBuf, nNextHighestPowTwo * sizeof(int)); + checkCUDAError("cudaMalloc buf failed"); + + cudaMalloc((void**)&dev_efficientIdata, nNextHighestPowTwo * sizeof(int)); + checkCUDAError("cudaMalloc idata failed"); + + cudaMalloc((void**)&dev_efficientIndices, nNextHighestPowTwo * sizeof(int)); + checkCUDAError("cudaMalloc indices failed"); + + // memcpy all the stuff over to gpu before calling kernel functions + cudaMemcpy((void*)dev_efficientIdata, (const void*)idata, nNextHighestPowTwo * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy idata failed"); + timer().startGpuTimer(); - // TODO + + // map all of the values to booleans (and pad with zeroes for those values higher than original array limit) + StreamCompaction::Common::kernMapToBoolean<< <((nNextHighestPowTwo + blockSize - 1) / blockSize), blockSize >> > (n, nNextHighestPowTwo, dev_efficientBools, dev_efficientIdata); + + // Start the scan --------------- (copy pasted from the scan function because you can't nest calls to timer. 
Plus it saves a copy from device to host) + + // make a copy of the bools so we can do the scan and put it into indices + cudaMemcpy((void*)dev_efficientIndices, (const void*)dev_efficientBools, nNextHighestPowTwo * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy idata failed"); + + // call the upsweep kernel log2n number of times + for (int d = 0; d < ilog2ceil(nNextHighestPowTwo); ++d) + { + // copy all the data to make sure everythings in place + cudaMemcpy((void*)dev_efficientScanBuf, (const void*)dev_efficientIndices, nNextHighestPowTwo * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy idata failed"); + + // call the kernel + kernEfficientScanUpSweep << <((nNextHighestPowTwo + blockSize - 1) / blockSize), blockSize >> > (nNextHighestPowTwo, d, dev_efficientScanBuf, dev_efficientIndices); + + // flip flop the buffers so that idata is always the most recent data + int* temp = dev_efficientScanBuf; + dev_efficientScanBuf = dev_efficientIndices; + dev_efficientIndices = temp; + } + + // set first element to be zero in a new kernel (unsure how to do this otherwise) + kernSetFirstElementZero << <1, 1 >> > (nNextHighestPowTwo, dev_efficientIndices); + + // now call the downsweep kernel log2n times + for (int d = (ilog2ceil(nNextHighestPowTwo) - 1); d >= 0; --d) + { + // copy all the data to make sure everythings in place + cudaMemcpy((void*)dev_efficientScanBuf, (const void*)dev_efficientIndices, nNextHighestPowTwo * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("cudaMemcpy idata failed"); + + // call the kernel + kernEfficientScanDownSweep << <((nNextHighestPowTwo + blockSize - 1) / blockSize), blockSize >> > (nNextHighestPowTwo, d, dev_efficientScanBuf, dev_efficientIndices); + + // flip flop the buffers + int* temp = dev_efficientScanBuf; + dev_efficientScanBuf = dev_efficientIndices; + dev_efficientIndices = temp; + } + + // ------- end of scan + + int sizeOfCompactedStream = 0; + // memcpy the final value of 
indices to out so that we can get the total size of compacted stream + cudaMemcpy(&sizeOfCompactedStream, dev_efficientIndices + (nNextHighestPowTwo - 1), 1 * sizeof(int), cudaMemcpyDeviceToHost); + + // run the stream compaction + StreamCompaction::Common::kernScatter << <((nNextHighestPowTwo + blockSize - 1) / blockSize), blockSize >> > (nNextHighestPowTwo, dev_efficientScanBuf, dev_efficientIdata, dev_efficientBools, dev_efficientIndices); + + // memcpy to out + cudaMemcpy(odata, dev_efficientScanBuf, sizeOfCompactedStream * sizeof(int), cudaMemcpyDeviceToHost); + timer().endGpuTimer(); - return -1; + + // free all our stuff + cudaFree(dev_efficientScanBuf); + cudaFree(dev_efficientBools); + cudaFree(dev_efficientIdata); + cudaFree(dev_efficientIndices); + + // return the total size of the compacted stream + return sizeOfCompactedStream; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..b7eeb25 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,23 +3,106 @@ #include "common.h" #include "naive.h" +#define blockSize 64 + +int* dev_gpuScanBuf; +int* dev_idata; + +__global__ void kernNaiveScan(int n, int twoToPowerDMinusOne, int* odata, int* idata) +{ + // get index first + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + { + return; + } + + // then add the two numbers and put them into the global output buffer + if (index >= twoToPowerDMinusOne) + { + int one = idata[index - twoToPowerDMinusOne]; + int two = idata[index]; + int onePlusTwo = one + two; + odata[index] = onePlusTwo; + } + else + { + odata[index] = idata[index]; + } +} + +__global__ void kernShiftScan(int n, int* odata, int* idata) +{ + + // if your thread index is 0, insert a 0, otherwise everyone else do their own index - 1 in the data array + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) + { + return; + } + + if (index == 0) + { + odata[index] = 0; + } + else + { + odata[index] = 
idata[index - 1]; + } +} + namespace StreamCompaction { namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } - // TODO: __global__ /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int nNextHighestPowTwo = 1 << ilog2ceil(n); + + cudaMalloc((void**)&dev_gpuScanBuf, nNextHighestPowTwo * sizeof(int)); + checkCUDAError("cudaMalloc buf failed"); + + cudaMalloc((void**)&dev_idata, nNextHighestPowTwo * sizeof(int)); + checkCUDAError("cudaMalloc idata failed"); + + timer().startGpuTimer(); + + cudaMemcpy((void*)dev_idata, (const void*)idata, nNextHighestPowTwo * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy idata failed"); + + // call the kernel log2n number of times + for (int i = 1; i <= ilog2ceil(nNextHighestPowTwo); ++i) + { + // call the kernel + int twoToPowerIMinusOne = 1 << (i - 1); + kernNaiveScan<<<((n + blockSize - 1) / blockSize) , blockSize>>>(nNextHighestPowTwo, twoToPowerIMinusOne, dev_gpuScanBuf, dev_idata); + + // flip flop the buffers + int* temp = dev_gpuScanBuf; + dev_gpuScanBuf = dev_idata; + dev_idata = temp; + } + + // shift it and memcpy to out + kernShiftScan << <((n + blockSize - 1) / blockSize), blockSize >> > (nNextHighestPowTwo, dev_gpuScanBuf, dev_idata); + + cudaMemcpy(odata, dev_gpuScanBuf, nNextHighestPowTwo * sizeof(float), cudaMemcpyDeviceToHost); + + timer().endGpuTimer(); + + cudaFree(dev_gpuScanBuf); + cudaFree(dev_idata); + } } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb06..bf2d3b5 100644 --- a/stream_compaction/naive.h +++ 
b/stream_compaction/naive.h @@ -1,6 +1,7 @@ #pragma once #include "common.h" +#include namespace StreamCompaction { namespace Naive { diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..a342b17 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -6,23 +6,29 @@ #include "common.h" #include "thrust.h" + namespace StreamCompaction { namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + thrust::device_vector dev_idata(idata, idata + n); + thrust::device_vector dev_odata(n); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); + timer().endGpuTimer(); + + thrust::copy(dev_odata.begin(), dev_odata.end(), odata); } } }