diff --git a/README.md b/README.md index 0e38ddb..50192a2 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,388 @@ +# IT SEEMS MY GPU HAS SOME ISSUE WITH THRUST. BECAUSE I CHANGED CUDA9.2 TO CUDA8.0 BUT THE THRUST SCAN IS STILL SLOW. MY GPU IS GTX 1080. DRIVER VERSION IS 411.63(NEWEST). + CUDA Stream Compaction ====================== **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Xiao Zhang + * [LinkedIn](https://www.linkedin.com/in/xiao-zhang-674bb8148/) +* Tested on: Windows 10, i7-7700K @ 4.20GHz 16.0GB, GTX 1080 15.96GB (my own PC) + +### Analysis (the pictures shown are the time in milliseconds to execute the scan only) + +#### Power-of-two + +![](img/1.JPG) + +#### Non-power-of-two + +![](img/2.JPG) + +### Q&A + +#### Write a brief explanation of the phenomena you see here. Can you find the performance bottlenecks? Is it memory I/O? Computation? Is it different for each implementation? + +* The first phenomenon is that all GPU algorithms are slower than the CPU algorithm. This may be because there is context switching between GPU and CPU for the GPU algorithms, whereas there is not for the CPU algorithm. The msvc compiler may also have applied some optimizations to the C++ code. + +* The second phenomenon is that the efficient GPU algorithm is not faster than the naive GPU algorithm. This may be because the naive GPU algorithm contains only one pass and the efficient GPU algorithm contains two passes, and during those two passes, the scheduling is not reliable, which means early termination somehow does not alleviate the idling of some GPU threads. Cache may be another reason why the naive algorithm is faster. In the naive algorithm, we are accessing the array sequentially, whereas in the efficient algorithm, we are accessing the array with a changing step. 
Also, we are doing more global reading and writing in the efficient algorithm. + +* The third phenomenon is that the thrust scan algorithm is slower than any other algorithm. One possible reason is that the blocksize and gridsize are not set properly and thrust just uses some default values, which are not optimal for my hardware condition. + +* The last phenomenon is that for arrays whose size is a non-power-of-two number, the efficient algorithm suffers a lot in terms of performance. This may be because my implementation is not optimal. I use cudaMemset to set all the extra elements to zero, but if the code just submits the command to the GPU and returns immediately, the clock starts to tick, the next kernel function is also submitted but the last cudaMemset is still running, so there is still some time before the actual scan algorithm starts to execute. The way that I set the last element of the array to zero after the up sweep kernel is also through cudaMemset. If there is some delay in doing this, it will also affect the performance of the efficient algorithm in general, with a non-power-of-two-sized array or not. + +### Output + +#### Arraysize 256 + +``` +**************** +** SCAN TESTS ** +**************** + [ 0 24 44 11 9 38 36 25 14 28 41 25 49 ... 14 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.000487ms (std::chrono Measured) + [ 0 0 24 68 79 88 126 162 187 201 229 270 295 ... 6301 6315 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.000244ms (std::chrono Measured) + [ 0 0 24 68 79 88 126 162 187 201 229 270 295 ... 
6179 6221 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.018368ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.018432ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.053824ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.047712ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.05184ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.053024ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 0 0 3 3 0 2 3 2 0 1 3 1 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.000975ms (std::chrono Measured) + [ 3 3 2 3 2 1 3 1 2 2 3 1 3 ... 3 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.000731ms (std::chrono Measured) + [ 3 3 2 3 2 1 3 1 2 2 3 1 3 ... 2 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.000975ms (std::chrono Measured) + [ 3 3 2 3 2 1 3 1 2 2 3 1 3 ... 3 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.01984ms (CUDA Measured) + [ 3 3 2 3 2 1 3 1 2 2 3 1 3 ... 3 1 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.048096ms (CUDA Measured) + [ 3 3 2 3 2 1 3 1 2 2 3 1 3 ... 2 3 ] + passed +``` + +#### Arraysize 512 + +``` +**************** +** SCAN TESTS ** +**************** + [ 18 21 27 21 8 35 44 5 29 41 31 26 43 ... 2 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.001219ms (std::chrono Measured) + [ 0 18 39 66 87 95 130 174 179 208 249 280 306 ... 12340 12342 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.000732ms (std::chrono Measured) + [ 0 18 39 66 87 95 130 174 179 208 249 280 306 ... 
12322 12334 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.024256ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.022368ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.027712ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.036288ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.130304ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.127424ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 3 1 0 3 0 3 3 3 3 2 1 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.001463ms (std::chrono Measured) + [ 2 3 3 1 3 3 3 3 3 2 1 2 1 ... 1 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.001707ms (std::chrono Measured) + [ 2 3 3 1 3 3 3 3 3 2 1 2 1 ... 1 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.001951ms (std::chrono Measured) + [ 2 3 3 1 3 3 3 3 3 2 1 2 1 ... 1 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.021376ms (CUDA Measured) + [ 2 3 3 1 3 3 3 3 3 2 1 2 1 ... 1 2 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.08528ms (CUDA Measured) + [ 2 3 3 1 3 3 3 3 3 2 1 2 1 ... 1 1 ] + passed +``` + +#### Arraysize 1024 + +``` +**************** +** SCAN TESTS ** +**************** + [ 14 15 34 28 40 12 1 9 34 5 7 19 38 ... 20 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.001463ms (std::chrono Measured) + [ 0 14 29 63 91 131 143 144 153 187 192 199 218 ... 24828 24848 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.001462ms (std::chrono Measured) + [ 0 14 29 63 91 131 143 144 153 187 192 199 218 ... 
24782 24798 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.026752ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.024864ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.031584ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.047456ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.143456ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.118656ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 3 2 0 2 2 3 3 2 3 1 1 2 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.002195ms (std::chrono Measured) + [ 2 3 2 2 2 3 3 2 3 1 1 2 1 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.002438ms (std::chrono Measured) + [ 2 3 2 2 2 3 3 2 3 1 1 2 1 ... 2 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.003901ms (std::chrono Measured) + [ 2 3 2 2 2 3 3 2 3 1 1 2 1 ... 1 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.021216ms (CUDA Measured) + [ 2 3 2 2 2 3 3 2 3 1 1 2 1 ... 1 1 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.058272ms (CUDA Measured) + [ 2 3 2 2 2 3 3 2 3 1 1 2 1 ... 2 1 ] + passed +``` + +#### Arraysize 2048 + +``` +**************** +** SCAN TESTS ** +**************** + [ 32 17 14 2 39 38 27 46 8 8 40 41 48 ... 48 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.002926ms (std::chrono Measured) + [ 0 32 49 63 65 104 142 169 215 223 231 271 312 ... 49750 49798 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.002682ms (std::chrono Measured) + [ 0 32 49 63 65 104 142 169 215 223 231 271 312 ... 
49659 49671 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.03088ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.029824ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.022848ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.044128ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.105312ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.10768ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 2 1 0 2 1 0 1 2 0 0 2 1 0 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.004632ms (std::chrono Measured) + [ 2 1 2 1 1 2 2 1 1 2 2 3 3 ... 3 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.004876ms (std::chrono Measured) + [ 2 1 2 1 1 2 2 1 1 2 2 3 3 ... 2 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.007071ms (std::chrono Measured) + [ 2 1 2 1 1 2 2 1 1 2 2 3 3 ... 3 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.022496ms (CUDA Measured) + [ 2 1 2 1 1 2 2 1 1 2 2 3 3 ... 3 2 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.044096ms (CUDA Measured) + [ 2 1 2 1 1 2 2 1 1 2 2 3 3 ... 2 3 ] + passed +``` + +#### Arraysize 4096 + +``` +**************** +** SCAN TESTS ** +**************** + [ 47 2 37 17 33 33 40 22 29 11 42 36 31 ... 17 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.005607ms (std::chrono Measured) + [ 0 47 49 86 103 136 169 209 231 260 271 313 349 ... 100624 100641 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.005364ms (std::chrono Measured) + [ 0 47 49 86 103 136 169 209 231 260 271 313 349 ... 
100522 100558 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.028512ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.02624ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.0424ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.056768ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.123552ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.127488ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 2 3 1 3 3 0 0 3 3 2 0 3 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.009753ms (std::chrono Measured) + [ 1 2 3 1 3 3 3 3 2 3 3 1 1 ... 2 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.009508ms (std::chrono Measured) + [ 1 2 3 1 3 3 3 3 2 3 3 1 1 ... 1 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.014873ms (std::chrono Measured) + [ 1 2 3 1 3 3 3 3 2 3 3 1 1 ... 2 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.049856ms (CUDA Measured) + [ 1 2 3 1 3 3 3 3 2 3 3 1 1 ... 2 3 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.09104ms (CUDA Measured) + [ 1 2 3 1 3 3 3 3 2 3 3 1 1 ... 1 2 ] + passed +``` -### (TODO: Your README) +#### Arraysize 8192 -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +``` +**************** +** SCAN TESTS ** +**************** + [ 49 14 14 29 11 26 29 9 23 49 26 45 40 ... 15 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.010972ms (std::chrono Measured) + [ 0 49 63 77 106 117 143 172 181 204 253 279 324 ... 
200832 200847 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.010971ms (std::chrono Measured) + [ 0 49 63 77 106 117 143 172 181 204 253 279 324 ... 200773 200788 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.032864ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.03248ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.051232ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.093952ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.117088ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.11424ms (CUDA Measured) + passed +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 2 0 1 3 2 1 1 1 3 0 3 2 ... 1 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.016335ms (std::chrono Measured) + [ 1 2 1 3 2 1 1 1 3 3 2 1 3 ... 1 1 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.017066ms (std::chrono Measured) + [ 1 2 1 3 2 1 1 1 3 3 2 1 3 ... 3 3 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.026332ms (std::chrono Measured) + [ 1 2 1 3 2 1 1 1 3 3 2 1 3 ... 1 1 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.106496ms (CUDA Measured) + [ 1 2 1 3 2 1 1 1 3 3 2 1 3 ... 1 1 ] + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.059904ms (CUDA Measured) + [ 1 2 1 3 2 1 1 1 3 3 2 1 3 ... 
3 3 ] + passed +``` diff --git a/img/1.JPG b/img/1.JPG new file mode 100644 index 0000000..c44168a Binary files /dev/null and b/img/1.JPG differ diff --git a/img/2.JPG b/img/2.JPG new file mode 100644 index 0000000..589004f Binary files /dev/null and b/img/2.JPG differ diff --git a/src/main.cpp b/src/main.cpp index 1850161..2dc8a5b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -137,14 +137,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); printDesc("work-efficient compact, non-power-of-two"); count = StreamCompaction::Efficient::compact(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); system("pause"); // stop Win32 console from closing on exit diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..48e2f35 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_60 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..3ae0084 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -14,7 +14,6 @@ void checkCUDAErrorFn(const char *msg, const char *file, int line) { exit(EXIT_FAILURE); } - namespace StreamCompaction { namespace Common { @@ -24,6 +23,10 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = blockIdx.x * 
blockDim.x + threadIdx.x; + if (index >= n) return; + + bools[index] = (int)(idata[index] != 0); } /** @@ -33,6 +36,13 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + + if (bools[index]) + { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 99a1b04..a4ff249 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -13,6 +13,10 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define blocksize 512 + +//#define SYNC_GRID + /** * Check for CUDA errors; print and exit if there was a problem. */ diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..53e6b0c 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,13 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + + odata[0] = 0; + for (int i = 1; i < n; i++) + { + odata[i] = odata[i - 1] + idata[i - 1]; + } + timer().endCpuTimer(); } @@ -31,8 +38,18 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + + int count = 0; + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[count++] = idata[i]; + } + } + timer().endCpuTimer(); - return -1; + return count; } /** @@ -43,8 +60,27 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int count = 0; + + //scan + odata[0] = 0; + for (int i = 1; i < n; i++) + { + odata[i] = odata[i - 1] + (idata[i - 1]!=0);//map to boolean + } + + //scatter + for (int i = 0; i < n; i++) + { + if (idata[i] != 0) + { + odata[odata[i]] = idata[i]; + 
count++; + } + } + timer().endCpuTimer(); - return -1; + return count; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..3c05105 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -4,37 +4,176 @@ #include "efficient.h" namespace StreamCompaction { - namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - - /** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. - */ - int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; - } - } -} + namespace Efficient { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __global__ void kernUpSweep(int n, int POT, int POT_EX, int *data) + { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + if (index % POT_EX != 0) return; + + data[index + POT_EX - 1] += data[index + POT - 1]; + } + + __global__ void kernDownSweep(int n, int POT, int POT_EX, int *data) + { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + if (index % POT_EX != 0) return; + + int temp = data[index + POT - 1]; + data[index + POT - 1] = data[index + POT_EX - 1]; + data[index + POT_EX - 1] += temp; + } + + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+ */ + void scan(int n, int *odata, const int *idata) { + int count = ilog2ceil(n); + int number = 1 << count; + int *dev_data; + dim3 gridsize((number - 1) / blocksize + 1); + + cudaMalloc((void**)&dev_data, number * sizeof(int)); + checkCUDAErrorFn("malloc dev_data"); + + cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice); + if (number > n) + { + cudaMemset(dev_data + n, 0, (number - n) * sizeof(int)); + checkCUDAErrorFn("set dev_data"); + } + + //start ticking + timer().startGpuTimer(); + for (int i = 0; i < count; i++) + { + kernUpSweep << > > (number, 1 << i, 1 << i + 1, dev_data); +#ifdef SYNC_GRID + cudaThreadSynchronize(); +#endif + } + + //set data[number-1] to 0 + cudaMemset((void*)(dev_data + (number - 1)), 0, sizeof(int)); + checkCUDAErrorFn("set dev_data[number-1]"); + + for (int i = count - 1; i >= 0; i--) + { + kernDownSweep << > > (number, 1 << i, 1 << i + 1, dev_data); +#ifdef SYNC_GRID + cudaThreadSynchronize(); +#endif + } + + //stop ticking + timer().endGpuTimer(); + + cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_data); + checkCUDAErrorFn("free dev_data"); + } + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. 
+ */ + int compact(int n, int *odata, const int *idata) { + int result = 0; + int count = ilog2ceil(n); + int number = 1 << count; + int *dev_idata; + int *dev_odata; + int *dev_indices; + int *dev_bools; + dim3 gridsize((number - 1) / blocksize + 1); + dim3 gridsize_EXACT((n - 1) / blocksize + 1); + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAErrorFn("malloc dev_idata"); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAErrorFn("malloc dev_odata"); + + cudaMalloc((void**)&dev_indices, number * sizeof(int)); + checkCUDAErrorFn("malloc dev_indices"); + + cudaMalloc((void**)&dev_bools, n * sizeof(int)); + checkCUDAErrorFn("malloc dev_bools"); + + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAErrorFn("memcpy dev_idata"); + + Common::kernMapToBoolean << > > (n, dev_bools, dev_idata); + + cudaMemcpy(dev_indices, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice); + + if (number > n) + { + cudaMemset(dev_indices + n, 0, (number - n) * sizeof(int)); + checkCUDAErrorFn("set dev_indices"); + } + + //start ticking + timer().startGpuTimer(); + + for (int i = 0; i < count; i++) + { + kernUpSweep << > > (number, 1 << i, 1 << i + 1, dev_indices); +#ifdef SYNC_GRID + cudaThreadSynchronize(); +#endif + } + + //set data[number-1] to 0 + cudaMemset((void*)(dev_indices + (number - 1)), 0, sizeof(int)); + checkCUDAErrorFn("set dev_indices[number-1]"); + + + for (int i = count - 1; i >= 0; i--) + { + kernDownSweep << > > (number, 1 << i, 1 << i + 1, dev_indices); +#ifdef SYNC_GRID + cudaThreadSynchronize(); +#endif + } + + Common::kernScatter << > > (n, dev_odata, dev_idata, dev_bools, dev_indices); + + //stop ticking + timer().endGpuTimer(); + + cudaMemcpy(&result, dev_indices + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); + result += (int)(idata[n - 1] != 0); + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_idata); + checkCUDAErrorFn("free dev_idata"); + + 
cudaFree(dev_odata); + checkCUDAErrorFn("free dev_odata"); + + cudaFree(dev_indices); + checkCUDAErrorFn("free dev_indices"); + + cudaFree(dev_bools); + checkCUDAErrorFn("free dev_bools"); + return result; + } + } +} \ No newline at end of file diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..0d97c8d 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -12,14 +12,63 @@ namespace StreamCompaction { return timer; } // TODO: __global__ + __global__ void kernScan(int n, int POT, int *odata, int *idata) + { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index >= n) return; + if (index >= POT) + { + odata[index] = idata[index] + idata[index - POT]; + } + else + { + odata[index] = idata[index]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + int count = ilog2ceil(n); + int *dev_odata; + int *dev_idata; + dim3 gridsize((n - 1) / blocksize + 1); + + cudaMalloc((void**)&dev_odata, n * sizeof(int)); + checkCUDAErrorFn("malloc dev_odata"); + + cudaMalloc((void**)&dev_idata, n * sizeof(int)); + checkCUDAErrorFn("malloc dev_idata"); + + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + for (int i = 0; i < count; i++) + { + kernScan << > > (n, 1 << i, dev_odata, dev_idata); +#ifdef SYNC_GRID + cudaThreadSynchronize(); +#endif + if (i != count - 1)//if not last time, exchange buffer for next kern + { + int *temp = dev_odata; + dev_odata = dev_idata; + dev_idata = temp; + } + } timer().endGpuTimer(); + + //shift right and insert identity + cudaMemcpy(odata + 1, dev_odata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost); + odata[0] = 0; + + cudaFree(dev_odata); + checkCUDAErrorFn("free dev_odata"); + + cudaFree(dev_idata); + checkCUDAErrorFn("free dev_idata"); + } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..0a31d9f 
100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -17,12 +17,64 @@ namespace StreamCompaction { /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); - } + void scan(int n, int *odata, const int *idata) { + // TODO use `thrust::exclusive_scan` + // example: for device_vectors dv_in and dv_out: + // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + ////////////////////////////////////////////////////////////// + + // NOT WORKING, TOO SLOW + + //thrust::device_vector dv_in(thrust::host_vector(idata, idata + n)); + //thrust::device_vector dv_out(thrust::host_vector(odata, odata + n)); + + //timer().startGpuTimer(); + //thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + //timer().endGpuTimer(); + + //cudaMemcpy(odata, dv_out.data().get(), sizeof(int) * n, cudaMemcpyDeviceToHost); + //checkCUDAErrorFn("memcpy back failed!"); + + ////////////////////////////////////////////////////////////// + + // NOT WORKING, TOO SLOW + + //int *dev_in, *dev_out; + + //cudaMalloc((void**)&dev_in, n * sizeof(int)); + //checkCUDAError("cudaMalloc dev_in failed!"); + + //cudaMalloc((void**)&dev_out, n * sizeof(int)); + //checkCUDAError("cudaMalloc dev_out failed!"); + + //cudaMemcpy(dev_in, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + + //thrust::device_ptr dv_in_ptr(dev_in); + //thrust::device_ptr dv_out_ptr(dev_out); + + //thrust::device_vector dv_in(dev_in, dev_in + n); + //thrust::device_vector dv_out(dev_out, dev_out + n); + + //timer().startGpuTimer(); + //thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + //timer().endGpuTimer(); + + //cudaMemcpy(odata, dv_out.data().get(), sizeof(int) * n, 
cudaMemcpyDeviceToHost); + + //cudaFree(dev_in); + //cudaFree(dev_out); + + ////////////////////////////////////////////////////////////////////////// + + // NOT WORKING, TOO SLOW, MUST BE GPU ISSUE + + thrust::device_vector d_data_in(idata, idata + n); + thrust::device_vector d_data_out(odata, odata + n); + timer().startGpuTimer(); + thrust::exclusive_scan(d_data_in.begin(), d_data_in.end(), d_data_out.begin()); + timer().endGpuTimer(); + thrust::copy(d_data_out.begin(), d_data_out.end(), odata); + } } }