From 60eeba1ce9e804fce603707c458a437299143d6b Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Tue, 11 Sep 2018 18:46:56 -0400 Subject: [PATCH 01/37] CPU, Naive GPU, and Efficient GPU scan/compact --- src/main.cpp | 6 +- stream_compaction/CMakeLists.txt | 2 +- stream_compaction/common.cu | 9 +- stream_compaction/cpu.cu | 72 +++++++++++--- stream_compaction/efficient.cu | 155 +++++++++++++++++++++++++++++-- stream_compaction/naive.cu | 61 ++++++++++-- stream_compaction/thrust.cu | 10 +- 7 files changed, 277 insertions(+), 38 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 1850161..7758045 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan @@ -71,14 +71,14 @@ int main(int argc, char* argv[]) { printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..c8709e7 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -13,5 +13,5 @@ set(SOURCE_FILES 
cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_50 ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 8fc0211..c99513b 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,10 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) return; + if (idata[index] != 0) bools[index] = 1; + else bools[index] = 0; } /** @@ -32,7 +35,9 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) return; + if (bools[index] == 1) odata[indices[index]] = idata[index]; } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 05ce667..dbf4b59 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,15 +1,15 @@ #include #include "cpu.h" -#include "common.h" +#include "common.h" namespace StreamCompaction { namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** @@ -19,7 +19,13 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + if (n < 1) return; // no data + odata[0] = 0; // Exclusive scan + for (int i = 1; i < n; i++) { + odata[i] = idata[i-1] + odata[i-1]; + } + timer().endCpuTimer(); } @@ -30,9 +36,19 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + if (n < 1) return -1; // no data + int 
nElem = 0; // remaining elements after compact + + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[nElem] = idata[i]; + nElem++; + } + } + timer().endCpuTimer(); - return -1; + return nElem; } /** @@ -41,10 +57,44 @@ namespace StreamCompaction { * @returns the number of elements remaining after compaction. */ int compactWithScan(int n, int *odata, const int *idata) { + + int *map_data = (int*)(malloc(n * sizeof(int))); + int *scan_data = (int*)(malloc(n * sizeof(int))); + timer().startCpuTimer(); - // TODO + + //map + int i; + for (i = 0; i < n; i++) { + if (idata[i] != 0) { + map_data[i] = 1; + } + else map_data[i] = 0; + } + + // scan + scan_data[0] = 0; // Exclusive scan + for (int i = 1; i < n; i++) { + scan_data[i] = map_data[i - 1] + scan_data[i - 1]; + } + + int r_val; + + // scatter + for (i = 0; i < n; i++) { + if (map_data[i] == 1) { + r_val = scan_data[i]; + odata[r_val] = idata[i]; + } + } + r_val++; + timer().endCpuTimer(); - return -1; + + free(map_data); + free(scan_data); + + return r_val; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 36c5ef2..8e0e188 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,22 +3,90 @@ #include "common.h" #include "efficient.h" +#define blockSize 256 + namespace StreamCompaction { namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } + __global__ void kernScanDataUpSweep(int n, int offset1, int offset2, int* buff) { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + + int access = index * offset2 - 1; + if (access >= n || n < 1 || access < 0) return; + + buff[access] += buff[access - offset1]; + + + } + __global__ void kernScanDataDownSweep(int n, int offset1, int offset2, int* buff) { + int 
index = (blockDim.x * blockIdx.x) + threadIdx.x; + + int access = index * offset2 - 1; + if (access >= n || n < 1 || access < 0) return; + + int temp = buff[access - offset1]; + buff[access - offset1] = buff[access]; + buff[access] += temp; + + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int limit = ilog2ceil(n); + int size = pow(2, limit); + + // allocate memory + int* dev_buf; + cudaMalloc((void**)&dev_buf, size * sizeof(int)); + + // copy input data to device + cudaMemset(dev_buf + n, 0, (size - n) * sizeof(int)); + cudaMemcpy(dev_buf, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + int d; + int offset1; + int offset2; + // UpSweep + + for (d = 1; d <= limit; d++) { + offset1 = pow(2, d - 1); + offset2 = pow(2, d); + kernScanDataUpSweep << > >(size, offset1, offset2, dev_buf); + cudaDeviceSynchronize(); + } + + // DownSweep + cudaMemset(dev_buf + n - 1, 0, (size - n + 1)* sizeof(int)); + for (d = limit; d >= 1; d--) { + offset1 = pow(2, d - 1); + offset2 = pow(2, d); + kernScanDataDownSweep << > >(size, offset1, offset2, dev_buf); + cudaDeviceSynchronize(); + } + + + timer().endGpuTimer(); + + // for debugging + //printf("Limit: %i, Size: %i, N: %i\n", limit, size, n); + + // copy output data to host + cudaMemcpy(odata, dev_buf, n * sizeof(int), cudaMemcpyDeviceToHost); + + // cleanup + cudaFree(dev_buf); } /** @@ -31,10 +99,77 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. 
*/ int compact(int n, int *odata, const int *idata) { + + int* dev_map; // bool mapping + int* dev_scan; // scanned data + int* dev_out; // compacted data to output + int* dev_in; // input data + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int limit = ilog2ceil(n); + int size = pow(2, limit); + + // allocate memory + cudaMalloc((void**)&dev_in, n * sizeof(int)); + cudaMalloc((void**)&dev_map, n * sizeof(int)); + cudaMalloc((void**)&dev_out, n * sizeof(int)); + cudaMalloc((void**)&dev_scan, size * sizeof(int)); + + cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); - // TODO + // map + StreamCompaction::Common::kernMapToBoolean << > >(n, dev_map, dev_in); + + cudaMemcpy(dev_scan, dev_map, n * sizeof(int), cudaMemcpyDeviceToDevice); // copy bool data to scan + cudaMemset(dev_scan + n, 0, (size - n) * sizeof(int)); // zero extra mem + + // scan + + int d; + int offset1; + int offset2; + // UpSweep + + for (d = 1; d <= limit; d++) { + offset1 = pow(2, d - 1); + offset2 = pow(2, d); + kernScanDataUpSweep << > >(size, offset1, offset2, dev_scan); + cudaDeviceSynchronize(); + } + + // DownSweep + cudaMemset(dev_scan + n - 1, 0, (size - n + 1) * sizeof(int)); // zero extra + for (d = limit; d >= 1; d--) { + offset1 = pow(2, d - 1); + offset2 = pow(2, d); + kernScanDataDownSweep << > >(size, offset1, offset2, dev_scan); + cudaDeviceSynchronize(); + } + + // scatter + StreamCompaction::Common::kernScatter << > >(n, dev_out, dev_in, dev_map, dev_scan); + timer().endGpuTimer(); - return -1; + + // copy output to host + cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); + int map_val; + int r_val; + cudaMemcpy(&r_val, dev_scan + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&map_val, dev_map + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + + if (map_val != 0) r_val++; + + // cleanup + cudaFree(dev_in); + cudaFree(dev_map); + cudaFree(dev_out); + cudaFree(dev_scan); + + return r_val; } 
} } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 9218f8e..965bd74 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,23 +3,72 @@ #include "common.h" #include "naive.h" +#define blockSize 256 + namespace StreamCompaction { namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } // TODO: __global__ + __global__ void kernScanDataNaive(int n, int offset, int* out, const int *in) { + + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index > n || n < 1) return; + + if (index >= offset) { + out[index] = in[index] + in[index - offset]; + } + else { + out[index] = in[index]; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + // allocate memory + int* dev_out; + int* dev_in; + int* swap; + + cudaMalloc((void**)&dev_out, n * sizeof(int)); + cudaMalloc((void**)&dev_in, n * sizeof(int)); + + // copy input data to device + cudaMemset(dev_in, 0, sizeof(int)); + cudaMemcpy(dev_in + 1, idata, (n - 1) * sizeof(int), cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + int d; + int offset; + for (d = 1; d <= ilog2ceil(n); d++) { + offset = pow(2, d - 1); + kernScanDataNaive<<>>(n, offset, dev_out, dev_in); + + // swap buffers + swap = dev_in; + dev_in = dev_out; + dev_out = swap; + + } + timer().endGpuTimer(); + + // copy output data to host + cudaMemcpy(odata, dev_in, n * sizeof(int), cudaMemcpyDeviceToHost); + + // cleanup + cudaFree(dev_in); + cudaFree(dev_out); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 36b732d..e3b3268 100644 --- a/stream_compaction/thrust.cu +++ 
b/stream_compaction/thrust.cu @@ -8,11 +8,11 @@ namespace StreamCompaction { namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. From 00e94b3810c858de48b410568dbf3f5d6fee9456 Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Tue, 11 Sep 2018 21:41:53 -0400 Subject: [PATCH 02/37] Added Thrust implementation and improved Work-Efficient Performance --- stream_compaction/efficient.cu | 33 +++++++++++++++++++-------------- stream_compaction/thrust.cu | 15 ++++++++++++--- 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 8e0e188..9aa91ff 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -40,11 +40,12 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); int limit = ilog2ceil(n); int size = pow(2, limit); + dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); + // allocate memory int* dev_buf; cudaMalloc((void**)&dev_buf, size * sizeof(int)); @@ -58,13 +59,13 @@ namespace StreamCompaction { int d; int offset1; int offset2; + // UpSweep - for (d = 1; d <= limit; d++) { offset1 = pow(2, d - 1); offset2 = pow(2, d); + fullBlocksPerGrid.x = ((size/offset2) + blockSize) / blockSize; kernScanDataUpSweep << > >(size, offset1, offset2, dev_buf); - cudaDeviceSynchronize(); } // DownSweep @@ -72,8 +73,8 @@ namespace StreamCompaction { for (d = limit; d >= 1; d--) { offset1 = pow(2, d - 1); offset2 = pow(2, d); + fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; kernScanDataDownSweep << > >(size, offset1, offset2, dev_buf); - cudaDeviceSynchronize(); } @@ -105,57 +106,58 @@ namespace StreamCompaction { int* dev_out; // compacted data to output int* dev_in; // input data - dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); - int limit = ilog2ceil(n); int size = pow(2, limit); + dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); + // allocate memory cudaMalloc((void**)&dev_in, n * sizeof(int)); cudaMalloc((void**)&dev_map, n * sizeof(int)); cudaMalloc((void**)&dev_out, n * sizeof(int)); cudaMalloc((void**)&dev_scan, size * sizeof(int)); - cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); - + cudaMemset(dev_scan + n, 0, (size - n) * sizeof(int)); // zero extra mem + cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); // copy input data timer().startGpuTimer(); // map StreamCompaction::Common::kernMapToBoolean << > >(n, dev_map, dev_in); - cudaMemcpy(dev_scan, dev_map, n * sizeof(int), cudaMemcpyDeviceToDevice); // copy bool data to scan - cudaMemset(dev_scan + n, 0, (size - n) * sizeof(int)); // zero extra mem // scan int d; int offset1; int 
offset2; - // UpSweep + // UpSweep for (d = 1; d <= limit; d++) { offset1 = pow(2, d - 1); offset2 = pow(2, d); + fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; kernScanDataUpSweep << > >(size, offset1, offset2, dev_scan); - cudaDeviceSynchronize(); } // DownSweep - cudaMemset(dev_scan + n - 1, 0, (size - n + 1) * sizeof(int)); // zero extra + cudaMemset(dev_scan + n - 1, 0, (size - n + 1) * sizeof(int)); for (d = limit; d >= 1; d--) { offset1 = pow(2, d - 1); offset2 = pow(2, d); + fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; kernScanDataDownSweep << > >(size, offset1, offset2, dev_scan); - cudaDeviceSynchronize(); } // scatter + fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize); StreamCompaction::Common::kernScatter << > >(n, dev_out, dev_in, dev_map, dev_scan); timer().endGpuTimer(); // copy output to host cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); + + // calc # of elements for return int map_val; int r_val; cudaMemcpy(&r_val, dev_scan + n - 1, sizeof(int), cudaMemcpyDeviceToHost); @@ -163,6 +165,9 @@ namespace StreamCompaction { if (map_val != 0) r_val++; + // for debugging + //printf("Limit: %i, Size: %i, N: %i\n", limit, size, n); + // cleanup cudaFree(dev_in); cudaFree(dev_map); diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index e3b3268..5fa537f 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,20 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { + thrust::host_vector host_thrust_in(idata, idata + n); + thrust::host_vector host_thrust_out(odata, odata + n); + + thrust::device_vector dev_thrust_in = host_thrust_in; + thrust::device_vector dev_thrust_out = host_thrust_out; + + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::exclusive_scan(dev_thrust_in.begin(), dev_thrust_in.end(), dev_thrust_out.begin()); + timer().endGpuTimer(); + + thrust::copy(dev_thrust_out.begin(), dev_thrust_out.end(), odata); } } } From 2930becabf203a2156eab55bb7dbf7c185ce6444 Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Wed, 12 Sep 2018 23:31:08 -0400 Subject: [PATCH 03/37] Backing up Radix Sort, not yet tested but compiles --- stream_compaction/CMakeLists.txt | 2 + stream_compaction/efficient.cu | 4 +- stream_compaction/efficient.h | 3 + stream_compaction/radix.cu | 162 +++++++++++++++++++++++++++++++ stream_compaction/radix.h | 12 +++ 5 files changed, 181 insertions(+), 2 deletions(-) create mode 100644 stream_compaction/radix.cu create mode 100644 stream_compaction/radix.h diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index c8709e7..2769e97 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -9,6 +9,8 @@ set(SOURCE_FILES "efficient.cu" "thrust.h" "thrust.cu" + "radix.h" + "radix.cu" ) cuda_add_library(stream_compaction diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 9aa91ff..e6a9a1d 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -109,8 +109,6 @@ namespace StreamCompaction { int limit = ilog2ceil(n); int size = pow(2, limit); - dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); - // allocate memory cudaMalloc((void**)&dev_in, n * sizeof(int)); cudaMalloc((void**)&dev_map, n * 
sizeof(int)); @@ -120,6 +118,8 @@ namespace StreamCompaction { cudaMemset(dev_scan + n, 0, (size - n) * sizeof(int)); // zero extra mem cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); // copy input data + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + timer().startGpuTimer(); // map StreamCompaction::Common::kernMapToBoolean << > >(n, dev_map, dev_in); diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4f..109455d 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -6,6 +6,9 @@ namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); + __global__ void kernScanDataUpSweep(int n, int offset1, int offset2, int* buff); + __global__ void kernScanDataDownSweep(int n, int offset1, int offset2, int* buff); + void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 0000000..b896967 --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,162 @@ +#include "radix.h" +#include +#include + +#define blockSize 256 + +// macros for bit checks and toggles +// define macro to get nth bit of int +#define bitK(num, k) ((num >> k) & 1) +// flip bit +#define flipBit(bit) (bit ^ 1) + +namespace StreamCompaction { + namespace Radix { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + + __global__ void kernFindMax(int n, int offset1, int offset2, int* buff) { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + + int access = index * offset2 - 1; + if (access >= n || n < 1 || access < 0) return; + + // modify in place + if (buff[access] < buff[access - offset1]) { + buff[access] = buff[access - offset1]; + } + + } + + __global__ void kernBoolMaps(int n, int k, int* input, int* b_arr, int* f_arr) { + int index = (blockDim.x 
* blockIdx.x) + threadIdx.x; + if (index >= n) return; + + int bit = bitK(input[index], k); + int fBit = flipBit(bit); + + b_arr[index] = bit; // maps bit k in b_arr + f_arr[index] = fBit; // copy same value here for scan + } + + __global__ void kernComputeT(int n, int totFalse, int *t_arr, int *f_arr) { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) return; + + t_arr[index] = index - f_arr[index] + totFalse; + } + + __global__ void kernRadixScatter(int n, int *out, int *in, int *b_arr, int *f_arr, int *t_arr) { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) return; + + int access = b_arr[index] ? t_arr[index] : f_arr[index]; + out[access] = in[index]; + } + + /* + Performs Radix Sort on input data using Work-Efficient Scan + */ + + void sort(int n, int *odata, const int *idata) { + + int limit = ilog2ceil(n); + int size = pow(2, limit); + + dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); + + int d; + int offset1; + int offset2; + + int max; + int totFalse; + + // alloc. memory + int *b_arr; + //int *e_arr; // e_arr sorted in f_arr, do not need + int *f_arr; + int *t_arr; + + int *dev_in; + int *dev_out; + + cudaMalloc((void**)&b_arr, n * sizeof(int)); + //cudaMalloc((void**)&e_arr, n * sizeof(int)); + cudaMalloc((void**)&f_arr, size * sizeof(int)); // sized to power of 2 for scan + cudaMalloc((void**)&t_arr, n * sizeof(int)); + + cudaMalloc((void**)&dev_in, n * sizeof(int)); + cudaMalloc((void**)&dev_out, n * sizeof(int)); + + // copy input + cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + cudaMemset(f_arr + n, 0, (size - n) * sizeof(int)); + + // find max of data + for (d = 1; d <= limit; d++) { + offset1 = pow(2, d - 1); + offset2 = pow(2, d); + fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; + kernFindMax << > >(size, offset1, offset2, dev_out); + } + + max = dev_out[size - 1]; // save max to calc. 
number of passes + + for (int k = 0; k < ilog2ceil(max); k++) { + // map arrays b & e + fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize); + kernBoolMaps << > > (n, k, dev_in, b_arr, f_arr); + totFalse = f_arr[n - 1]; + + // exclusive scan e_arr into f_arr + + // UpSweep + for (d = 1; d <= limit; d++) { + offset1 = pow(2, d - 1); + offset2 = pow(2, d); + fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; + StreamCompaction::Efficient::kernScanDataUpSweep << > >(size, offset1, offset2, f_arr); + } + + // DownSweep + cudaMemset(f_arr + n - 1, 0, (size - n + 1) * sizeof(int)); + for (d = limit; d >= 1; d--) { + offset1 = pow(2, d - 1); + offset2 = pow(2, d); + fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; + StreamCompaction::Efficient::kernScanDataDownSweep << > >(size, offset1, offset2, f_arr); + } + + // total Falses + totFalse += f_arr[n - 1]; + + // Compute t_arr + fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize); + kernComputeT << > >(n, totFalse, t_arr, f_arr); + + // scatter + kernRadixScatter << > >(n, dev_out, dev_in, b_arr, f_arr, t_arr); + + } + // copy output data to host + cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); + + // cleanup + cudaFree(dev_in); + cudaFree(dev_out); + cudaFree(b_arr); + cudaFree(f_arr); + cudaFree(t_arr); + + + } + + } + +} \ No newline at end of file diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h new file mode 100644 index 0000000..3063a1f --- /dev/null +++ b/stream_compaction/radix.h @@ -0,0 +1,12 @@ +#pragma once + +#include "common.h" +#include "efficient.h" + +namespace StreamCompaction { + namespace Radix { + StreamCompaction::Common::PerformanceTimer& timer(); + + void sort(int n, int *odata, const int *idata); + } +} From e300543271c215ace562db7cca129e007980aa89 Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Thu, 13 Sep 2018 00:59:20 -0400 Subject: [PATCH 04/37] Radix debugged and working on small data set, added cuda error checks 
to all GPU implementations --- src/main.cpp | 26 ++++++++ src/testing_helpers.hpp | 14 ++--- stream_compaction/efficient.cu | 15 ++++- stream_compaction/naive.cu | 5 +- stream_compaction/radix.cu | 110 +++++++++++++++++++++++++++++++-- stream_compaction/radix.h | 2 + 6 files changed, 157 insertions(+), 15 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 7758045..a6f269b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include "testing_helpers.hpp" const int SIZE = 1 << 8; // feel free to change the size of array @@ -95,6 +96,31 @@ int main(int argc, char* argv[]) { //printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + zeroArray(SIZE, c); + printDesc("Find max, power-of-two"); + int max = StreamCompaction::Radix::max(SIZE, a); + printf("max = %i\n", max); + + zeroArray(SIZE, c); + printDesc("Find max, non-power-of-two"); + max = StreamCompaction::Radix::max(NPOT, a); + printf("max = %i\n", max); + + zeroArray(SIZE, c); + int radix_tst[8] = { 4, 7, 2, 6, 3, 5, 1, 0 }; + printDesc("Radix sort, power-of-two"); + StreamCompaction::Radix::sort(8, c, radix_tst); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(8, c, true); + + zeroArray(SIZE, c); + printDesc("Radix sort, non-power-of-two"); + StreamCompaction::Radix::sort(7, c, radix_tst); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(7, c, true); + + + printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp index 46337ab..d6cc4e3 100644 --- a/src/testing_helpers.hpp +++ b/src/testing_helpers.hpp @@ -1,8 +1,8 @@ #pragma once -#include -#include -#include +#include +#include +#include #include #include @@ -69,8 +69,8 @@ void printArray(int n, int *a, bool abridged = false) { printf("]\n"); } 
-template -void printElapsedTime(T time, std::string note = "") -{ - std::cout << " elapsed time: " << time << "ms " << note << std::endl; +template +void printElapsedTime(T time, std::string note = "") +{ + std::cout << " elapsed time: " << time << "ms " << note << std::endl; } \ No newline at end of file diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index e6a9a1d..20ff9b4 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -49,10 +49,12 @@ namespace StreamCompaction { // allocate memory int* dev_buf; cudaMalloc((void**)&dev_buf, size * sizeof(int)); + checkCUDAError("w-e scan malloc fail!"); // copy input data to device cudaMemset(dev_buf + n, 0, (size - n) * sizeof(int)); cudaMemcpy(dev_buf, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("initializing w-e scan data buff fail!"); timer().startGpuTimer(); @@ -66,6 +68,7 @@ namespace StreamCompaction { offset2 = pow(2, d); fullBlocksPerGrid.x = ((size/offset2) + blockSize) / blockSize; kernScanDataUpSweep << > >(size, offset1, offset2, dev_buf); + checkCUDAError("upsweep fail!"); } // DownSweep @@ -75,6 +78,7 @@ namespace StreamCompaction { offset2 = pow(2, d); fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; kernScanDataDownSweep << > >(size, offset1, offset2, dev_buf); + checkCUDAError("downsweep fail!"); } @@ -85,6 +89,7 @@ namespace StreamCompaction { // copy output data to host cudaMemcpy(odata, dev_buf, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("copying output data fail!"); // cleanup cudaFree(dev_buf); @@ -114,9 +119,11 @@ namespace StreamCompaction { cudaMalloc((void**)&dev_map, n * sizeof(int)); cudaMalloc((void**)&dev_out, n * sizeof(int)); cudaMalloc((void**)&dev_scan, size * sizeof(int)); + checkCUDAError("w-e compact malloc fail!"); cudaMemset(dev_scan + n, 0, (size - n) * sizeof(int)); // zero extra mem cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); // copy input 
data + checkCUDAError("initializing w-e compact data buffs fail!"); dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); @@ -124,6 +131,7 @@ namespace StreamCompaction { // map StreamCompaction::Common::kernMapToBoolean << > >(n, dev_map, dev_in); cudaMemcpy(dev_scan, dev_map, n * sizeof(int), cudaMemcpyDeviceToDevice); // copy bool data to scan + checkCUDAError("w-e compact bool mapping fail!"); // scan @@ -137,6 +145,7 @@ namespace StreamCompaction { offset2 = pow(2, d); fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; kernScanDataUpSweep << > >(size, offset1, offset2, dev_scan); + checkCUDAError("w-e compact upsweep fail!"); } // DownSweep @@ -146,24 +155,28 @@ namespace StreamCompaction { offset2 = pow(2, d); fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; kernScanDataDownSweep << > >(size, offset1, offset2, dev_scan); + checkCUDAError("w-e compact downsweep fail!"); } // scatter fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize); StreamCompaction::Common::kernScatter << > >(n, dev_out, dev_in, dev_map, dev_scan); + checkCUDAError("w-e compact scatter fail!"); timer().endGpuTimer(); // copy output to host cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("w-e compact output copy fail!"); // calc # of elements for return int map_val; int r_val; cudaMemcpy(&r_val, dev_scan + n - 1, sizeof(int), cudaMemcpyDeviceToHost); cudaMemcpy(&map_val, dev_map + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("w-e compact calc # elem fail!"); - if (map_val != 0) r_val++; + r_val += map_val; // for debugging //printf("Limit: %i, Size: %i, N: %i\n", limit, size, n); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 965bd74..09cb5ff 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -41,10 +41,12 @@ namespace StreamCompaction { cudaMalloc((void**)&dev_out, n * sizeof(int)); cudaMalloc((void**)&dev_in, n * sizeof(int)); + 
checkCUDAError("naive scan malloc fail!"); // copy input data to device cudaMemset(dev_in, 0, sizeof(int)); cudaMemcpy(dev_in + 1, idata, (n - 1) * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("naive input copy fail!"); timer().startGpuTimer(); @@ -53,7 +55,7 @@ namespace StreamCompaction { for (d = 1; d <= ilog2ceil(n); d++) { offset = pow(2, d - 1); kernScanDataNaive<<>>(n, offset, dev_out, dev_in); - + checkCUDAError("naive scan iteration fail!"); // swap buffers swap = dev_in; dev_in = dev_out; @@ -65,6 +67,7 @@ namespace StreamCompaction { // copy output data to host cudaMemcpy(odata, dev_in, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("naive copy output fail!"); // cleanup cudaFree(dev_in); diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu index b896967..a3598ea 100644 --- a/stream_compaction/radix.cu +++ b/stream_compaction/radix.cu @@ -62,6 +62,43 @@ namespace StreamCompaction { Performs Radix Sort on input data using Work-Efficient Scan */ + int max(int n, int *idata) { + int limit = ilog2ceil(n); + int size = pow(2, limit); + + dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); + + int d; + int offset1; + int offset2; + + int max; + int *max_arr; + cudaMalloc((void**)&max_arr, size * sizeof(int)); + cudaMemcpy(max_arr, idata, n * sizeof(int), cudaMemcpyHostToDevice); + cudaMemset(max_arr + n, 0, (size - n) * sizeof(int)); + + timer().startGpuTimer(); + + // find max of data + for (d = 1; d <= limit; d++) { + offset1 = pow(2, d - 1); + offset2 = pow(2, d); + fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; + kernFindMax << > >(size, offset1, offset2, max_arr); + checkCUDAError("Radix find max fail!"); + } + + timer().endGpuTimer(); + + cudaMemcpy(&max, max_arr + size - 1, sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(max_arr); + return max; + + + } + void sort(int n, int *odata, const int *idata) { int limit = ilog2ceil(n); @@ -75,6 +112,8 @@ namespace StreamCompaction { int max; 
int totFalse; + + int temp; // alloc. memory int *b_arr; @@ -85,6 +124,8 @@ namespace StreamCompaction { int *dev_in; int *dev_out; + int *swap; + cudaMalloc((void**)&b_arr, n * sizeof(int)); //cudaMalloc((void**)&e_arr, n * sizeof(int)); cudaMalloc((void**)&f_arr, size * sizeof(int)); // sized to power of 2 for scan @@ -93,26 +134,51 @@ namespace StreamCompaction { cudaMalloc((void**)&dev_in, n * sizeof(int)); cudaMalloc((void**)&dev_out, n * sizeof(int)); + //debug + //printf("Malloced\n"); + // copy input cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); - + cudaMemcpy(f_arr, dev_in, n * sizeof(int), cudaMemcpyDeviceToDevice); cudaMemset(f_arr + n, 0, (size - n) * sizeof(int)); + checkCUDAError("Radix mem init fail!"); + + //debug + //printf("Input copied\n"); + + timer().startGpuTimer(); // find max of data for (d = 1; d <= limit; d++) { offset1 = pow(2, d - 1); offset2 = pow(2, d); fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; - kernFindMax << > >(size, offset1, offset2, dev_out); + kernFindMax << > >(size, offset1, offset2, f_arr); + checkCUDAError("Radix find max fail!"); } - max = dev_out[size - 1]; // save max to calc. 
number of passes + // copy max value + cudaMemcpy(&max, f_arr + size - 1, sizeof(int), cudaMemcpyDeviceToHost); + // zero extra mem + cudaMemset(f_arr + n, 0, (size - n) * sizeof(int)); + + //debug + //printf("Max: %i\n", max); for (int k = 0; k < ilog2ceil(max); k++) { + //debug + //printf("Pass %i\n", k+1); + // map arrays b & e fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize); kernBoolMaps << > > (n, k, dev_in, b_arr, f_arr); - totFalse = f_arr[n - 1]; + checkCUDAError("Radix bool maps fail!"); + + // get whether last number's bit is false + cudaMemcpy(&totFalse, f_arr + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + + //debug + //printf("bools mapped\n"); // exclusive scan e_arr into f_arr @@ -122,6 +188,7 @@ namespace StreamCompaction { offset2 = pow(2, d); fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; StreamCompaction::Efficient::kernScanDataUpSweep << > >(size, offset1, offset2, f_arr); + checkCUDAError("Radix upsweep fail!"); } // DownSweep @@ -131,21 +198,49 @@ namespace StreamCompaction { offset2 = pow(2, d); fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; StreamCompaction::Efficient::kernScanDataDownSweep << > >(size, offset1, offset2, f_arr); + checkCUDAError("Radix downsweep fail!"); } + //debug + //printf("scanned\n"); + // total Falses - totFalse += f_arr[n - 1]; + cudaMemcpy(&temp, f_arr + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + totFalse += temp; + + //debug + //printf("totFalse = %i\n", totFalse); + // Compute t_arr fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize); kernComputeT << > >(n, totFalse, t_arr, f_arr); + checkCUDAError("Radix t_arr compute fail!"); + + //debug + //printf("t_arr computed\n"); // scatter kernRadixScatter << > >(n, dev_out, dev_in, b_arr, f_arr, t_arr); + checkCUDAError("Radix scatter fail!"); + + swap = dev_out; + dev_out = dev_in; + dev_in = swap; + + //debug + //printf("scattered\n"); } + + timer().endGpuTimer(); + // copy output data to host - cudaMemcpy(odata, 
dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(odata, dev_in, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("Radix output copy fail!"); + + //debug + //printf("output copied\n"); // cleanup cudaFree(dev_in); @@ -154,6 +249,9 @@ namespace StreamCompaction { cudaFree(f_arr); cudaFree(t_arr); + //debug + //printf("complete\n"); + } diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h index 3063a1f..19b46c6 100644 --- a/stream_compaction/radix.h +++ b/stream_compaction/radix.h @@ -8,5 +8,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); void sort(int n, int *odata, const int *idata); + + int max(int n, int* idata); } } From b2cbce13d3251419306dbfd2328d4f803f799b75 Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Thu, 13 Sep 2018 01:01:38 -0400 Subject: [PATCH 05/37] Radix tested on large dataset, appears functional --- src/main.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index a6f269b..4195306 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -107,17 +107,17 @@ int main(int argc, char* argv[]) { printf("max = %i\n", max); zeroArray(SIZE, c); - int radix_tst[8] = { 4, 7, 2, 6, 3, 5, 1, 0 }; + //int radix_tst[8] = { 4, 7, 2, 6, 3, 5, 1, 0 }; printDesc("Radix sort, power-of-two"); - StreamCompaction::Radix::sort(8, c, radix_tst); + StreamCompaction::Radix::sort(SIZE, c, a); printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - printArray(8, c, true); + printArray(SIZE, c, true); zeroArray(SIZE, c); printDesc("Radix sort, non-power-of-two"); - StreamCompaction::Radix::sort(7, c, radix_tst); + StreamCompaction::Radix::sort(NPOT, c, a); printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - printArray(7, c, true); + printArray(NPOT, c, true); From 6b81341fc04b07707b26a2e329f6f9d9877501b5 Mon Sep 17 00:00:00 2001 
From: Angelina Risi Date: Thu, 13 Sep 2018 01:14:21 -0400 Subject: [PATCH 06/37] Added timing info for finding max val --- src/main.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main.cpp b/src/main.cpp index 4195306..8bf8a35 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -99,11 +99,13 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("Find max, power-of-two"); int max = StreamCompaction::Radix::max(SIZE, a); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printf("max = %i\n", max); zeroArray(SIZE, c); printDesc("Find max, non-power-of-two"); max = StreamCompaction::Radix::max(NPOT, a); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printf("max = %i\n", max); zeroArray(SIZE, c); From 4eead15d5d4316cf234ada90604aa568e1047774 Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Thu, 13 Sep 2018 01:34:36 -0400 Subject: [PATCH 07/37] Removed debug code --- stream_compaction/efficient.cu | 6 ------ stream_compaction/radix.cu | 37 ---------------------------------- 2 files changed, 43 deletions(-) diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 20ff9b4..04f1843 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -84,9 +84,6 @@ namespace StreamCompaction { timer().endGpuTimer(); - // for debugging - //printf("Limit: %i, Size: %i, N: %i\n", limit, size, n); - // copy output data to host cudaMemcpy(odata, dev_buf, n * sizeof(int), cudaMemcpyDeviceToHost); checkCUDAError("copying output data fail!"); @@ -178,9 +175,6 @@ namespace StreamCompaction { r_val += map_val; - // for debugging - //printf("Limit: %i, Size: %i, N: %i\n", limit, size, n); - // cleanup cudaFree(dev_in); cudaFree(dev_map); diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu index a3598ea..0ad6b84 100644 --- a/stream_compaction/radix.cu +++ 
b/stream_compaction/radix.cu @@ -112,7 +112,6 @@ namespace StreamCompaction { int max; int totFalse; - int temp; // alloc. memory @@ -134,18 +133,12 @@ namespace StreamCompaction { cudaMalloc((void**)&dev_in, n * sizeof(int)); cudaMalloc((void**)&dev_out, n * sizeof(int)); - //debug - //printf("Malloced\n"); - // copy input cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); cudaMemcpy(f_arr, dev_in, n * sizeof(int), cudaMemcpyDeviceToDevice); cudaMemset(f_arr + n, 0, (size - n) * sizeof(int)); checkCUDAError("Radix mem init fail!"); - //debug - //printf("Input copied\n"); - timer().startGpuTimer(); // find max of data @@ -162,12 +155,7 @@ namespace StreamCompaction { // zero extra mem cudaMemset(f_arr + n, 0, (size - n) * sizeof(int)); - //debug - //printf("Max: %i\n", max); - for (int k = 0; k < ilog2ceil(max); k++) { - //debug - //printf("Pass %i\n", k+1); // map arrays b & e fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize); @@ -176,9 +164,6 @@ namespace StreamCompaction { // get whether last number's bit is false cudaMemcpy(&totFalse, f_arr + n - 1, sizeof(int), cudaMemcpyDeviceToHost); - - //debug - //printf("bools mapped\n"); // exclusive scan e_arr into f_arr @@ -201,25 +186,15 @@ namespace StreamCompaction { checkCUDAError("Radix downsweep fail!"); } - //debug - //printf("scanned\n"); - // total Falses cudaMemcpy(&temp, f_arr + n - 1, sizeof(int), cudaMemcpyDeviceToHost); totFalse += temp; - //debug - //printf("totFalse = %i\n", totFalse); - - // Compute t_arr fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize); kernComputeT << > >(n, totFalse, t_arr, f_arr); checkCUDAError("Radix t_arr compute fail!"); - //debug - //printf("t_arr computed\n"); - // scatter kernRadixScatter << > >(n, dev_out, dev_in, b_arr, f_arr, t_arr); checkCUDAError("Radix scatter fail!"); @@ -227,10 +202,6 @@ namespace StreamCompaction { swap = dev_out; dev_out = dev_in; dev_in = swap; - - //debug - //printf("scattered\n"); - } timer().endGpuTimer(); @@ 
-239,20 +210,12 @@ namespace StreamCompaction { cudaMemcpy(odata, dev_in, n * sizeof(int), cudaMemcpyDeviceToHost); checkCUDAError("Radix output copy fail!"); - //debug - //printf("output copied\n"); - // cleanup cudaFree(dev_in); cudaFree(dev_out); cudaFree(b_arr); cudaFree(f_arr); cudaFree(t_arr); - - //debug - //printf("complete\n"); - - } } From 2f1b74ba1a40e048c81ad7c592c708dbdd083387 Mon Sep 17 00:00:00 2001 From: risia Date: Thu, 13 Sep 2018 01:38:23 -0400 Subject: [PATCH 08/37] Radix sort implementation description in readme --- README.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0e38ddb..5499e18 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,20 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Angelina Risi + * [LinkedIn](www.linkedin.com/in/angelina-risi) +* Tested on: Windows 10, i7-6700HQ @ 2.60GHz 8GB, GTX 960M 4096MB (Personal Laptop) ### (TODO: Your README) +**Radix Sort Implementation** + +Radix sort is a method of sorting data in an array from min to max using the values' binary data. This is done by sorting by the least-significant bit (LSB) first, iterating through bit sorts until the most-significant bit (MSB). +Before we can sort, we actually need to find the dataset's maximum value. By taking the ceiling of log2(max), we can get the max number of bits representing the data. This reduces the number of redundant iterations of sorting from the number of bits in the data type to only as many relevant ones there are. This is done on the GPU using a parallel reduction algorithm comparing pairs of values. 
+To perform the sort itself efficiently, we generate a a pair of boolean buffers indicating whether the currently tested bit at that index is 0 or 1. One buffer is the true buffer, called b_arr, and the other the false buffer, called f_arr. If the bit value is 1, b_arr[index] is set to 1 and f_arr to 0, and vice versa. We save the last value of f_arr for later to compute the number of "falses" for indexing. +The f_arr is scanned using the work-efficient exclusive scan to generate the "false" indices, the locations to store the data values if b_arr[index] == 0 in the output array. The "true" indices, t_arr, are generated as "index - f_arr[index] + totFalse". The total false values is the last value in the scanned f_arr plus the value we stored earlier from f_arr before scanning. By using a GPU-implemented scatter function, we save the input values sorted into the output buffer. To remove the need for more intermediate buffers for each sort step, the input and output arrays are ping-ponged (switch their pointers) each sort step. + + Include analysis, etc. (Remember, this is public, so don't put anything here that you don't want to share with the world.) From 3d4f2f2cdd0f3feaa4dac87a3b274ac4b6f4314e Mon Sep 17 00:00:00 2001 From: risia Date: Thu, 13 Sep 2018 01:56:17 -0400 Subject: [PATCH 09/37] Added findMax code with extra comments for illustration --- README.md | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5499e18..c98e927 100644 --- a/README.md +++ b/README.md @@ -13,10 +13,35 @@ CUDA Stream Compaction Radix sort is a method of sorting data in an array from min to max using the values' binary data. This is done by sorting by the least-significant bit (LSB) first, iterating through bit sorts until the most-significant bit (MSB). Before we can sort, we actually need to find the dataset's maximum value. 
By taking the ceiling of log2(max), we can get the max number of bits representing the data. This reduces the number of redundant iterations of sorting from the number of bits in the data type to only as many relevant ones there are. This is done on the GPU using a parallel reduction algorithm comparing pairs of values. + +```cpp +// each thread compares a pair of integers from the input buffer +// and selects the greater of the two +__global__ void kernFindMax(int n, int offset1, int offset2, int* buff) { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + + // compute which index to compare + int access = index * offset2 - 1; + if (access >= n || n < 1 || access < 0) return; + + // modify in place + if (buff[access] < buff[access - offset1]) { + buff[access] = buff[access - offset1]; + } +} +``` +```cpp +// The loop iterates deeper into the reduction until the final max value is sorted to the end +// This essentially sweeps the max value up to the root of a balanced binary tree +for (d = 1; d <= limit; d++) { + offset1 = pow(2, d - 1); + offset2 = pow(2, d); + fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; + kernFindMax << > >(size, offset1, offset2, max_arr); + checkCUDAError("Radix find max fail!"); // error checking +} +``` + To perform the sort itself efficiently, we generate a a pair of boolean buffers indicating whether the currently tested bit at that index is 0 or 1. One buffer is the true buffer, called b_arr, and the other the false buffer, called f_arr. If the bit value is 1, b_arr[index] is set to 1 and f_arr to 0, and vice versa. We save the last value of f_arr for later to compute the number of "falses" for indexing. The f_arr is scanned using the work-efficient exclusive scan to generate the "false" indices, the locations to store the data values if b_arr[index] == 0 in the output array. The "true" indices, t_arr, are generated as "index - f_arr[index] + totFalse". 
The total false values is the last value in the scanned f_arr plus the value we stored earlier from f_arr before scanning. By using a GPU-implemented scatter function, we save the input values sorted into the output buffer. To remove the need for more intermediate buffers for each sort step, the input and output arrays are ping-ponged (switch their pointers) each sort step. - -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) - From 3292d5b6406d5fedd25100d89bed5850e26e5822 Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Thu, 13 Sep 2018 02:31:54 -0400 Subject: [PATCH 10/37] Small Radix sort example --- img/radix_example.PNG | Bin 0 -> 12095 bytes src/main.cpp | 10 +++++++++- stream_compaction/radix.cu | 4 ++-- 3 files changed, 11 insertions(+), 3 deletions(-) create mode 100644 img/radix_example.PNG diff --git a/img/radix_example.PNG b/img/radix_example.PNG new file mode 100644 index 0000000000000000000000000000000000000000..bd18f27f735f0360d74c5af6e40e8acce0e86e9a GIT binary patch literal 12095 zcmbuFc|25a@l9N25Jmf3-i7z`$GAD1{DSSC#(uK3ZMXWP6c)X?!{%)XxCfVjaz{i3A zP=XiO%(1Zzc%vZ5F0CmEfA{Ksg;#gyq_TGa5ltKGQQ6{KrugeCt5Hsdh;{e|FV?Q4=+zyI-;ks9pJ{B4!Qa*hrg{mM`&c%mw48U-J;bm83b>b^G`lLTRH@Hht=O$V-lKx20ew_$&zYf zh`LzY5AEbxn!C?8?Nv`CrgRThpar!!)N9zFwG9MEB6lWv%7(hEn52i~a{ zYTL!nSRX9AUT)BJDNtPvvq+_*Q8Yo&0`I3=wOxhWoh`cMsW*z0HA0i=JPFo?xD0CP zgaxk0o8P=u_x?AE=Vtc z=lTLxoua#n*ARD@Z00h9G)alw^(~?~Pw%E}z(xft?cRfuVu-_Jf4`xg&koR2M|f4M8H;0inPK+;C)k2oAG8m7O{t8%ErT@yeKb%A!+%hh z?}c(GwH6NMeLvoFZH#m8C{hg?C`+GoHLfm|i{%g&KKbY%qxN!#Sn4cx_UWz3tPxGh zBs)vsDTH#Ax55I+s!|0~D`(Zeg*w&Qz1SW(Cbx@W=l<2JVo*HR*D#9Q z8OEmDl*3f(;e)!^rPx_6D+Dv*Br=~d8)SG)O36`6V@T(&KX$Vo%Ya`Gg4n!=@-E_? 
z`Nx@wkrM(S5cH5#Ue@qkatPOEkLVVVltS~KtZAGY1*gnCEVUH`Hi$n2O#GAor7Uw# zaOZ#Y3hG64M)wIyn^NzeVPat?;uH-}$xeP?X2cfEYAQHyck^u-J5kTWXY0#|PAT53 zQgRVaidZ4;$PNMo_(>$fFh)SjMD zrB5Txm$>fzrIFJu15(lJh(?3aU%N3nkVuM=|*$iBI?%H$;no(gPE3X** z{zBxlkU;9%uDqykjY*(kh317$p`=pJ)+*I*8@qwh2GzOm=$@+RT$&ECoE@8Kd-f~# zZfj1RXM+0=Ib4MW`Pt$?%xQ=M*P6l2pO0?PmrozxXv4OdokI41Tob`YjoUa&h)(%i z?>ARnMk0u71WXPDpjXTa7tHbn=!?naPM53R0_n-7>16KUN^H>Aw zi+gDFzIhhy!p4R-IhB;AF!N)x?+}I^2H*KTHX(flTF47zP<5e5`~*wpoew0}Bt7rg z#HWlQ6Y|i*>Z%ShR(S?!)V*S@+^X}u^5`yW#^w8u8=yf)4&ZJ`q+@)H(+#@u<;Y5J zst!>FS~{@4Xs2>4#YFHpFI&>~{fq;0)vu>DC0Fv>QuPjY6l>)vp8!3Cr9iN9H6;># zT4h@EJz+;H`@(CQevs&O)1@fVjyPVms#KN4PK?& z)ev;YXz7_t_VunJ_u59cA4lF(U2d)#)q2K~r9CUPt9ZfJM=?u!dm`PedqL z;@$lf`EPlhR0iI*`xz;O;e)S3<>?%f=FZfX@s7hLkjC(J3uZ?9(%M87<~>1wHR+U74=W{v1A^z? z4vNSVl3roj>=OCC2TV9A_bTYlZWu^zciNjm&s_0fUmZd%#=*=7t(W_XVuww;ca&gA z9`hW0XVYcRy*yt$8zS0E@Tez25Aj%^Ob2g^y|xCwo7|TOQDK~x^5hQ_eeinzV_^;R zgy8qbk95F0;dxBb=n3TPP(5YKaL3VR@MB@`a~AsZ%$E*Ax0+wRet0V2G~M&W-9N*x zveKs1f*rNUlW!(xqB&)hLgg{c`k=T;RaZw<^47DLPe~Y&#MNT+)Js&Dm8Jf@LZY=f zI-?=@eZ1CBGWR`KS9iYI{Uh*$kksHX;u9y|fXnuoXclOwhwWLvG**I~ZQ}{MlyjsU zu{ju>KZowiMZ{pDd9%VvZMQr1rb~y2QDPqy_RcUM)`Rf5;EH4GGRqb&EWB;OuN)vEjED(L+hah4LFd4+zB9 zDBw2vsHz@nsVmcqf**5>;`$!=e8~%?pu%U4&IN@-f-N{HI?W%9S?2~hF*XDvh~H#? 
zx`Al56W-d-Gw+|*FQzz<^q%PjB0oGce!kN=s{Z-i*_3siI{EmC=6bPb&2>Lia*GQ6 zDxxcTlx}U-`K$f%C%l~*hna|Ie9=A7!+13J|WvnHSRvmd|7`9p%3bZr4d4t8X- zi{w-975};-HG1`@+sV#k;IhwMyyAmO(AV>=ZQ2Z>q#OO^o_s~A@S<~*F=DbSi*o;d zY1K2UWxaWcXnsI~-5q6w_o=AFGVJO4XR#iT7@}dkyWnhh|M*3aPL09;sB@^D{J4pt zb}OJu1F1&ZVJX%vh$KH_f2}nG>zlEDz$praRaTZ{FSb{{+8@83VPCKIRmjn}l-|>x zkLvPI7(Qx~tIHpAt}O$)Qs!&&CEYMf<)&@yid0*q+l2S8j?9-I>&gAFBfE7y%{!#x zyX2yruDwcEZE>_M(7Lnn6JqQyS~dE&tSo!Y*q@t@yM(}OEM?dg)|O;5c3&WsSdMUU zn$r*pHLZ*i)OgV5IQOLn8fps>Fa`$Tt>2{rurr5KTfS@r0>{S1>J1gEVVfiOW0cV5 zVE+1_%DZb@-gMpfC+*ALe8tSO~_f> zVd_Nrbrt*qxB_83rIr|sJK>Fs9-T9mz|VAi;Ql+`Y>%zve_D;iKC*PPdzrIcCsGV=l4F9%Yqf9mG8^Ge3`a7OGje>2~7Fg99U|m`#li>sBbGv z7dr_uSM62u_Lz70q@5!s#cV+lqU=eu>%cT@nQ}O}fUZ)PBLEsHQT?H$k3Jx3VP=>1 zg%ib|D>SW8kh6-Ww$Te=GmZVEXX}Ut^mQq!EcW@RkMSJ+v?WmGzvEBJJak^FJ&!t~ z)lUER*}28+L(v71+s=EWlV@Y=8((>=N#?LWNvUOUoG#VTr5!|g)_L$={&0v3gx5x{ z7U9-2ybNoTN}{hRym*BZOV}6tyi!O;lss7(1e7IxE1^nMX%h~t_WM$)W78Z?1 zG|3xMe1aQABu4g@N8^j%6yD}9kHVQ1CFl}>Vl+YHu=x$!GT;f`eaVm}>S?9!IplXs zhM$3RGr4Q@EnxQ*o)sI?A+PBG_WLliOWITQqji&7Yu;piG^bZ_2nmnDVNs{Lj`!)W zXW@0}5vx%<^&CklGjhLru^uADVG+7A6)VvSyrtKC^k8f8UXTgbIzp6o3j(=~C?UUY zb5u);Hq`nfQ#)_IH%49K7BlalA+38wRHObmGegn>Yr**BK!{R2Xf_53F|cl$f(?~= zWp56cNZ#YPvO=_nAwu5WR9*#fgS%|+ryP)9wY6pR;2(qUqWZ&4o1z>@mH``cZx4VL z3>5Z@re8mkCBb85BxcrwXU`ry^$A7X|I zqP0S`J7nSemLD_P9*Z`9x{Z?6iO#fpJkY+2~W4D_LMJX+Wh#@60 z%6rJi5hIFb9n?dIUly&moh z-2XRv;!w#%N9fQ=jn$6Qwn~f1=xbT5a!K!c{H?p@Fq_rhljfyvwK3U)V>y$(2IkQ5VTa zr`B4Vu54k|90A&DY8ks2kb~Ltm{Xqdy7TZgoCOahAsB$@$;)T=ISr|{n6R=q&rz9A ziV)P#s4hJQjW7m`e0COp{(y-6xI=O*l9=KDaP@-571PR#hL`nf;O$J>*|#k}-dV=r z4;S4D9Sw6u)6NtYrH%#++>Rb@oTd?0qMEAA4E*IgSPmJj$2WXAet4Y>aTe`Mm-R~N z_%M~(O#8AcX?EG$wU|~@J`G_G*z!U1Q!Ft~jn65s z!Ww2ma0Fp8f*sVa;4ULxajC9mcf-M|qi8~?^XrX`mE#jvO1AMfiv1EA_rPC~QHVL( z(Va=P$WR2UPqy;Tjutl=5I6K3I2&#prjzUsKK_@!`yk)kFW--T-ibU#Cy9qzcJ_P< z^8;r~2vlmwnQ+p+SXZ@%cD$R%A6|bW!89hD!<3UzP612h5$bOkly%yHyvayVn)WZ$Xx~Ok; zsGc#Z^nQ|NT&a_UsH4a|VrFQ2BF+4h4cW`FVO&*yw-P{m=$BTGd3<{arHK26;K8ty 
zostu|Xlcms6YqaX{%S;Zx*UENc(#MuxZNc!)|K;5+E%LOl`fE?`vPd%-_f;W@7LkJ z+ns9+qFwb?9qjd|o$mKkpT-)qGaBaKM|483Dny9hB?JySw)Gr=UZ8tC@v(T`Q-!&w zT1sa3sj_F3J{9l8Wi{$KEj9%20bUUBN3rcuY zreyL{^MZFMNR<+FFUQhTYEV<+dsW2URFP_2TR>*Fi&j*-9JJ+;#$OjZV3FvZ_N%7OdSDktS|lv>VexomMZ zIo4U7TMq055#OM-+Kea=H~1ph{8qe!l{k_YTOhI3K2Yh-;ZCvz&))piUpap?K%vy7ex`tU~mFBN8nQqL#l=WKNxq&78XmtP8{yty0T zD&DHEhwtPsuBvUKvvUbBzC$GVoTZ?2l=;&}Hh!R;C)_$9-y);gKqt!Xr?;flHHbCv zSpqG$%BSU+c78N{!EuzjNRp@9_pM#l2@2Rs*Efmg^AcOpiEcvUdx9h8)FX5jrD4j$ zG4dZFdqua%7I5k2!&zMA;~;|;Q^8He%2EXEIncp>FJ*M4N(}n6mlqs>v%byU@9z*K z#`smr2($zlC>O{6fgf(ZfpC#w`xcKafxntcSpQQb4+JdLtmeZ^iYYsp(v|CGm}q+T zfT1T*@$`^TlHijqPp>qrw#2?I^fOo|?Lm87{2Rye0UWdK=|Xw}q^6?EK&d6z33oD~ zZxEwdkggI_7exs43V3c0%qm>IUSxE%09Ca#Q8!il&|6P@Q=J169T54R6}2Yx!iYa3 z#^v2r?yK&vU1FDuntQScgX}$`yQjm7L`V0#3SWTmGaixuhNIp5P&IqxcRjnQ`K+Bc z&vkumrdyoWE-Au07Agv~h;g+2>En#`G=qI?7Szc9H$tEcbcG@bXn@d0Et+Dc0IJfx z{y(V7;BTs$#eeDtU~`jJV8MGbrY|s7%2RC#dV^tJ`#=B8w2qj(l@%e-QcN8zmlqe+ z>lz^7w<`Un=uJUJ177UefF{ zie+VaY`t?XllE|vMFG4tc~_WZgF`VXU-D`8{{y8+-|T_LZ2^HE0pk@YVRqoB$mYHh%6v};)c-NW?(~n zYgZ^R!4VxV9x<+OM6C&KZ-JmU1VA5iMhvCi5wxGy*_x`3TZ7Nb0QJQ2R^PiV6G&rd z{8sq+V~3;+?r#b)82>M~>NO*8!;1IoCS4WeRhilqM~5|ymQfxtgccc-*00NrP3c1f zzh`aS9^1$IZXn6K$}jQT-hzS8Rmx*MT#qi|0xb*ZNU2L8@P8gWljb|Q zpV~IzSIQ4Qhn(qFW)CcI%)#FXln;-Mrqgqki}1TtEoFUz=54_3lX>xP5{te>zSK)} zfkdItZOrLl)tFH{^319vGK@Hzj_q@7J1U?ixw>#d*i=xk#lve*GV;lc{|ir=T^0p?4y=FD zNsWLgxwmodqNBU7yZnjk;wFG|r8P0rR9L4E&i_IS<+j ziX#|-&x>yk<*@{}2TGd>Oq1Y$2w?ve%nq6gw5;C6uXWf1Gp=yTO$y<2VFFhCxtMj@ zFOtAG#BR#(YDm-90RF3qnU{L*R=y@(9)i@(D4nH0BzB_(Po?Ks9=P^RFA%1h~P4H=anm+bcrD2T!>~Kh` zD*|y%PtGi6=m-Y-j6x(C{}Jcms#uQ!CNpPnzv$&XS;k$4`_5c@G}8bL|Cv*Hthyh6 zJ@sn2z>5o6$3=(uYysn-&rxsS#O8LQs^5{XZKaov*-Z8h7KEuX1}+*z>_=_?HOHx9 zN&C{~;u-fJ({Xv0yONYX&YYF3?4g8(gINR6i!;6WQYfjn6^isQewFwopRgAMa`G_+ ztGb_^KdUxd_rT^sI}*R;XP?`EQ0PR}pW`ljMVXC08&~>DFTp~ZVxotRWZ_Rz`s!Rk zpg@hf1owdP!JQ8TTI__qJH|bbVb{QOy3chsSo#6`w}BR-!keD?CUidsNA8jr#f@Hq zlcM_d%2P8#>bAwZGfJOCx!%d*1Z@F*gwR*<%1gUNIEnJY3LG(;KQ5tyLcvc13w^>9 
z@@zJWDc8A}^#To&4=2hTJf~-SM=yueTGOWWN$vs&H1Bx>kTnyoR_B^y40a0d`#4?2 zn?GMud5a&gMZ*Ynvjyw-qU*TAnD9j`>cz;Y9&v9ocvk26F}|Fu%)=DGxCNF*AUhpX z+KII5$R(`N{6K&YC6px4qCPbb<@p1{ton5Pq2>>7*E+0CN*2 zs0n#1q7cyOAF#Ofy{_AKf*8RGZf8dEub&cFm($l815W9*|Desv-cZwLnv$Iy?`tFb zE42o7OV*dsVUREVP^GbK%};pKL4#;EIrgWdbNr#W$9qL197eZPD@mEO8--gSX0Zp% zZ&^14?^Qh)_Wr~^h^xl(6_41~Ss7sqL}OFZvXgZTzl#k~5ED)U&BwV_+t_z2h)}&m zXl@F9urNsec7@CHt1kQZnNZQ+C8xp+K84BlKSK0GP7v$?TW|A<1l8%;lIFk&r(lR8 zyT4K-*5R`&jK;_*5fjeeK{*tktEtUI2k37|1EQ;X8?Z~RrZw9|I2C*eeTB%*H?H8W z>2(6C-~?K<&k#Ebi@sh1zdoj)vd1n-C~5k|d+C`*SCMeQXpA@up1Y;Y&W)dl6S=a& zYxHULZ>-Zq63*PsPyFzBeqLz_X;9MGq3Zzd3+XP#mhW{f1i9@M)r(B-H)~#?)>+yL z-y_{iSxK8HGn~tLrE<|sYKlH@J+Hfazi877k_ceV2LM%!Z~x0We!?rcQz!zJhbuJ& ztc*u@QEqmwHXJse$~FcI1PTH=DOgZ7-1)3(Woi(mOf?%Bt?ty9e$1&4VSQzf#BSfl#@!!vc_u40?!%v~DNem^xK zkn+J_oqGtB9S3M(G5U% zHda)6V|{$|G||PdXt{u&Fy@)K`#yeg&PjPp`*YK$9rGT`dP(`Q7l`GkT}ch-D5d7W zr`Txn&21ayVdQ}Mh1E8AR+3-&>Xf3_i;At0fHOs1+4ZxQ(+cDFs8dyTC6J6~dEfon zi?+8FY~^~D4R#dEclfG6*JSX-EZT2d1z1#L%OeC^^d5tbQu=lZt9dNuFwV>de(=g) z9g&ZLzlTx0OBPk?#Q;y%hESNUXXdZa9`dKx6lom@sG}h>rQD(OO=sHbmzX(K4_lz; z(<+6q!D3f(n$DdfJv%Wjb|O$xpyjlpltj;&>Gi=Nd}~b1JBsw;kni?dAa4pimDVFJ zim7uY5xll1Vrb)oZW)nyHNW0)4_VH7@Dn7jh|9Ca$!b89n$E_y)54KnB|dy3kemLC z?JXyB4V6UpnoB#16bc?+E;D zm+lwz5I>CD;(2i2sGO*)G&Tp{7@txZL9aml3af))4ZhBLs4tNg(B*&FE8WyE1sBhE zsX(^4TH%f%I=jZ)8^bc0F!nP2Nicj8m=x-OziHHy?yx2^MU1b{i0j;HvuUp=!2x?E zKJS;kV%$;3kTT)hDAPknqg)Z|>nqHnvAGC>-Pa%e4j&b(0;nrfxTPwB&3S%V2Ku{$O+b4PF^GXEc-hEzFUK|{7&&li}!?;HI`&+D|tN&f5*uA?Xm1C`0cSY6}Uvf2BV;WS4Ha`Bb5Ea zU+E0%(FV1KZ2Bv_P>gbi*C%n0yIKW6Tpa!cLto<1+Hi%_Ue?|g&((U?-=jkMc;C-K z8Fuw~M5fSeXQss<-T$K2H~Y&_$0fN-7q<(c!krq_U+c-K01M_W7?nrQ;)&_*nc%a4(p}L z`D?W%F>YONf0Uudj+P@sme(u^k!nMNBx+GTSm|f1BlM33`8>EI?(upb|2CVOFP6m7 zsga0rb*+Ix<$HhVv^N>h;g?*OCm*2^y-vz(cB9x9CrTOaNxtyszlJ&lh?zq9evZl4 zZsKTf?WsNDuGgSQucgXQ>$5`A5K7q7X}h^QJvbML-OCOdzv;BH$uCW1HgPaPQLG$!o zTrW_p(Tr>Y$cga;5qnjM*?94AW)XGLJGd+TZD#iH!=TJp4kB@<7ukLFc5%1zuCeDz 
zwR-;gfY427MU_6PYDg|T+H98A@1QzoXOg%06iT1v1)C#!=Ip`uaA-_)G~TB&Cj^t` zReVLqbHuQQP)ImoZb*h#@>nH>Wu3BSmkqJ_fhR&3WM`sWuPsKARbtY*|tC(4&FLKA0CI8T_q3N)P`o znbjUUjN!BTdi7jJ?HW$Xd;SSKa6jR~$FD@F58fEfTV*e~#iPVdEoB^o)fp5hGn6Bp z$0d6D{p9dQV!jWF$&?vVIUt&G9q}VTq0!pEjxkc_!I^rg)wY(U;cKy=VpZD84{o$wg&Z~hZJB5e~IO8zLa+HP-u?}Y+~IS|DAk^rLm$wDee%Lw!VG8i3&37nSYPR>O<6co zx{O!eck0N%?7>{dt2XFLka*V-DQdV!QwNA7{>t}J&mnbh<~8uOh`Gek4QqGIJYJTd zhgIAZmS>R{z<+3E>l@t{&|u6v!G{yYV+u9Ci&=Hxa$gjH_WN#J1MBnBaz{+qYMVR0 z6X0AQzQ0%aCb03@j_dp>Wx}RPZw9mDr@eNxd~wV^Csf)@&2XE!uV@jU1y`vVz2oca z$mtH?72 zd@as_;}os2UF@!rT>)LC4wR{+#?x|_(;NHnz%%_om225IQvf=-Xk;&Jttq1&o#1{r z*K2evSWozri0-x7Zq_4eC)rovC=&7<5)t!0bhM{5k6Fgs!#MNYUy<_bod~r~+>@a9 z(#t5J65y5$6R`f?%^#VP;ud*$vuCCLTWIwA_+WQ)AbX;-7ndFBx5e$$==~vFW2Z2v zBpi_6i0dQ$bGVk21&hOM8OI&VLb5!}nH%bHw1A}pi@H=^(yNGt#@S;vx)lSpB}JZ# z?94PzpC_)NfD$t?0nZ%~d2UWU+2*=mj53MXwK(`d=Z|;Nz6fWGzzhxhW=igLal%9x zp8M2Qwe{Ua6&Pnt>-^t}vPaN{a&ImkhwG3_{jQM@_u_bkwK=~kmYmvWr7%JH;il~X zmOWAAZR$k$TMds}%rMCoB%&#JPV-ZqD#^UIBQ|W0wAzC=u4N|6y(wRdBSgdjCeS-5 z75*XFC}+IMR7fU&M~jX?6ah_tS67!=;tm8@MW!`WT@+)wUBfZ`+Gwe zRd2?v$Vb{ePeDuSd6Q&a_}p`#Ar9XvgTufTKNU_m@BK-sV}u}5LY)=Zx(e=%pg1)^ z8*53y30qDWUIB+bvpPd`OgL#v5JeXEXHvjwQpUfrS zi%q=eQ{fgll~bs8qakO`Svh}u%g?fP;q^JU2PD#6cOVrU7ZXa7%UgJ6+`V1ot@c&0 zs>sTjL;baXE`OEl8(?oKZOn;j9?)`imE>9+sDpa{mc{PM!jO}1ktf@b`$SE5-wR(W zv85iMq)j~DGIQXK;WQ~aRC3XLN!sw;j^DjCZ`DHh*)D9*C=v!o^aC$Jgjg72c%IpN zo;%5jPN{8p$DIxEsa+-PO%v<*{VW9H!~q*`cndHJuK+7gpdj!@4%6}SjSb`7pn(>i zrpP9m?ULK{_yXns3ljdf*G2z-j?1rexf_CCPY)IsrQ9w9<_hTIg)8SvjNIb?7aMI@ Av;Y7A literal 0 HcmV?d00001 diff --git a/src/main.cpp b/src/main.cpp index 8bf8a35..9d8d210 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -121,7 +121,15 @@ int main(int argc, char* argv[]) { printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(NPOT, c, true); - + zeroArray(SIZE, c); + int radix_tst[8] = { 4, 7, 2, 6, 3, 5, 1, 0 }; + printDesc("Radix example sort"); + printf("Test input array:\n"); 
+ printArray(8, radix_tst, true); + StreamCompaction::Radix::sort(8, c, radix_tst); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printf("Sorted Output:\n"); + printArray(8, c, true); printf("\n"); printf("*****************************\n"); diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu index 0ad6b84..38d3df2 100644 --- a/stream_compaction/radix.cu +++ b/stream_compaction/radix.cu @@ -40,7 +40,7 @@ namespace StreamCompaction { int fBit = flipBit(bit); b_arr[index] = bit; // maps bit k in b_arr - f_arr[index] = fBit; // copy same value here for scan + f_arr[index] = fBit; // copy flipped value here for scan } __global__ void kernComputeT(int n, int totFalse, int *t_arr, int *f_arr) { @@ -165,7 +165,7 @@ namespace StreamCompaction { // get whether last number's bit is false cudaMemcpy(&totFalse, f_arr + n - 1, sizeof(int), cudaMemcpyDeviceToHost); - // exclusive scan e_arr into f_arr + // exclusive scan f_arr // UpSweep for (d = 1; d <= limit; d++) { From 829a49e739384a60951b37046e82518495154ef9 Mon Sep 17 00:00:00 2001 From: risia Date: Thu, 13 Sep 2018 02:33:23 -0400 Subject: [PATCH 11/37] More code and images to illustrate radix sort --- README.md | 40 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c98e927..76c188e 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ CUDA Stream Compaction **Radix Sort Implementation** Radix sort is a method of sorting data in an array from min to max using the values' binary data. This is done by sorting by the least-significant bit (LSB) first, iterating through bit sorts until the most-significant bit (MSB). -Before we can sort, we actually need to find the dataset's maximum value. By taking the ceiling of log2(max), we can get the max number of bits representing the data. 
This reduces the number of redundant iterations of sorting from the number of bits in the data type to only as many relevant ones there are. This is done on the GPU using a parallel reduction algorithm comparing pairs of values. +Before we can sort, we actually need to find the dataset's maximum value. By taking the ceiling of log2(max), we can get the max number of bits representing the data, which bit is the MSB. This reduces the number of redundant iterations of sorting from the number of bits in the data type to only as many relevant ones there are in the data range. This is done on the GPU using a parallel reduction algorithm comparing pairs of values. The code is reproduced below with extra commentary. ```cpp // each thread compares a pair of integers from the input buffer @@ -40,8 +40,40 @@ for (d = 1; d <= limit; d++) { kernFindMax << > >(size, offset1, offset2, max_arr); checkCUDAError("Radix find max fail!"); // error checking } -``` +``` + +To perform the sort itself efficiently, we generate a a pair of boolean buffers indicating whether the currently tested bit at that index is 0 or 1. One buffer is the true buffer, called b_arr, and the other the false buffer, called f_arr. If the bit value is 1, b_arr[index] is set to 1 and f_arr to 0, and vice versa. We save the last value of f_arr for later to compute the number of "falses" for indexing. + +```cpp +__global__ void kernBoolMaps(int n, int k, int* input, int* b_arr, int* f_arr) { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) return; + + // retrieve the kth bit from the input val + int bit = bitK(input[index], k); + // flip the bit + int fBit = flipBit(bit); -To perform the sort itself efficiently, we generate a a pair of boolean buffers indicating whether the currently tested bit at that index is 0 or 1. One buffer is the true buffer, called b_arr, and the other the false buffer, called f_arr. If the bit value is 1, b_arr[index] is set to 1 and f_arr to 0, and vice versa. 
We save the last value of f_arr for later to compute the number of "falses" for indexing. -The f_arr is scanned using the work-efficient exclusive scan to generate the "false" indices, the locations to store the data values if b_arr[index] == 0 in the output array. The "true" indices, t_arr, are generated as "index - f_arr[index] + totFalse". The total false values is the last value in the scanned f_arr plus the value we stored earlier from f_arr before scanning. By using a GPU-implemented scatter function, we save the input values sorted into the output buffer. To remove the need for more intermediate buffers for each sort step, the input and output arrays are ping-ponged (switch their pointers) each sort step. + b_arr[index] = bit; // maps bit k into b_arr + f_arr[index] = fBit; // copy flipped value here for scan +} +``` + +The f_arr is scanned using the work-efficient exclusive scan to generate the "false" indices, the locations to store the data values if b_arr[index] == 0 in the output array. The "true" indices, t_arr, are generated as "index - f_arr[index] + totFalse". The total false values is the last value in the scanned f_arr plus the value we stored earlier from f_arr before scanning. By using a GPU-implemented scatter function, we save the input values sorted into the output buffer. To remove the need for more intermediate buffers for each sort step, the input and output arrays are ping-ponged (switch their pointers) each sort step. + +```cpp +__global__ void kernRadixScatter(int n, int *out, int *in, int *b_arr, int *f_arr, int *t_arr) { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) return; + + // We compute the index to access by checking the boolean in b_arr + // If true, we use the index in t_arr (true indexing array) + // Else, we choose the index in f_arr (false indexing array) + // The index "access" is where in the output array the input goes to. + int access = b_arr[index] ? 
t_arr[index] : f_arr[index]; + out[access] = in[index]; +} +``` +Once the input array has been sorted for each bit, the output is correctly sorted in order of ascending value. This implementation is intended to work on integer values. An example of a small array radix sort is depicted: +![Radix Sort Example](/img/radix_example.PNG) From 0bb3468401077947373eb5b4960b7bf0008fdc6f Mon Sep 17 00:00:00 2001 From: risia Date: Thu, 13 Sep 2018 02:35:36 -0400 Subject: [PATCH 12/37] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 76c188e..8ed5d32 100644 --- a/README.md +++ b/README.md @@ -75,5 +75,5 @@ __global__ void kernRadixScatter(int n, int *out, int *in, int *b_arr, int *f_ar } ``` -Once the input array has been sorted for each bit, the output is correctly sorted in order of ascending value. This implementation is intended to work on integer values. An example of a small array radix sort is depicted: +Once the input array has been sorted for each bit, the output is correctly sorted in order of ascending value. This implementation is intended to work on integer values, and currently operates on global device memory, bottlenecking performance. 
An example of a small array radix sort is depicted: ![Radix Sort Example](/img/radix_example.PNG) From cafbb78b584694325344618f76ef78f28687df8b Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Thu, 13 Sep 2018 05:15:15 -0400 Subject: [PATCH 13/37] Shared Memory Work-Efficient Scan added --- src/main.cpp | 7 ++ stream_compaction/CMakeLists.txt | 2 + stream_compaction/efficient.cu | 5 +- stream_compaction/efficient.h | 1 + stream_compaction/shared_mem.cu | 208 +++++++++++++++++++++++++++++++ stream_compaction/shared_mem.h | 15 +++ 6 files changed, 235 insertions(+), 3 deletions(-) create mode 100644 stream_compaction/shared_mem.cu create mode 100644 stream_compaction/shared_mem.h diff --git a/src/main.cpp b/src/main.cpp index 9d8d210..c8a6071 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include "testing_helpers.hpp" const int SIZE = 1 << 8; // feel free to change the size of array @@ -131,6 +132,12 @@ int main(int argc, char* argv[]) { printf("Sorted Output:\n"); printArray(8, c, true); + zeroArray(SIZE, c); + printDesc("Shared Memory Efficient Sort, power-of-two"); + StreamCompaction::SharedMem::scan(SIZE, c, a); + printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(SIZE, c, true); + printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 2769e97..76a0f9d 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -11,6 +11,8 @@ set(SOURCE_FILES "thrust.cu" "radix.h" "radix.cu" + "shared_mem.h" + "shared_mem.cu" ) cuda_add_library(stream_compaction diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 04f1843..4c831bc 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -21,9 +21,9 @@ namespace StreamCompaction { if 
(access >= n || n < 1 || access < 0) return; buff[access] += buff[access - offset1]; - - } + + __global__ void kernScanDataDownSweep(int n, int offset1, int offset2, int* buff) { int index = (blockDim.x * blockIdx.x) + threadIdx.x; @@ -33,7 +33,6 @@ namespace StreamCompaction { int temp = buff[access - offset1]; buff[access - offset1] = buff[access]; buff[access] += temp; - } /** diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 109455d..3578061 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -7,6 +7,7 @@ namespace StreamCompaction { StreamCompaction::Common::PerformanceTimer& timer(); __global__ void kernScanDataUpSweep(int n, int offset1, int offset2, int* buff); + __global__ void kernScanDataDownSweep(int n, int offset1, int offset2, int* buff); void scan(int n, int *odata, const int *idata); diff --git a/stream_compaction/shared_mem.cu b/stream_compaction/shared_mem.cu new file mode 100644 index 0000000..16981b9 --- /dev/null +++ b/stream_compaction/shared_mem.cu @@ -0,0 +1,208 @@ +#include +#include +#include "common.h" +#include "efficient.h" +#include "shared_mem.h" + +#define blockSize 256 + +namespace StreamCompaction { + namespace SharedMem { + using StreamCompaction::Common::PerformanceTimer; + PerformanceTimer& timer() + { + static PerformanceTimer timer; + return timer; + } + __global__ void kernScanDataShared(int n, int* in, int* out) { + // init shared mem for block, could improve latency + __shared__ int sBuf[blockSize + 1]; + + int tx = threadIdx.x; + int index = (blockDim.x * blockIdx.x) + tx; + + // copy used vals to shared mem + sBuf[tx] = (index >= 0 && index < n) ? 
in[index] : 0; + + __syncthreads(); // avoid mem issues + + int offset = 1; // step size + int access; // shared buffer access index + int i; // iterator + + // Upsweep + for (i = blockSize >> 1; i > 0; i >>= 1) { + access = (2 * offset * (tx + 1)) - 1; + if (access < blockSize) sBuf[access] += sBuf[access - offset]; + offset *= 2; + __syncthreads(); // avoid mem issues + } + + // copy sBuf[blocksize - 1] to sBuf[blocksize] so keep value safe + if (tx == 0) { + sBuf[blockSize] = sBuf[blockSize - 1]; + sBuf[blockSize - 1] = 0; + } + __syncthreads(); // avoid mem issues + + // Downsweep (inclusive) + // do exclusive downsweep + int temp; + + for (i = blockSize >> 1; i > 0; i >>= 1) { + + offset >>= 1; // div by 2 + access = (2 * offset * (tx + 1)) - 1; + if (access < blockSize) { + temp = sBuf[access - offset]; // store left child + sBuf[access - offset] = sBuf[access]; + sBuf[access] += temp; + } + __syncthreads(); // avoid mem issues + + } + // Write to dev mem + if (index < n - 1 ) out[index + 1] += sBuf[tx + 1]; + __syncthreads(); + int add_val = 0; + for (i = index - tx; i > 0; i -= blockSize) { + if (index != i) add_val += out[i]; + } + __syncthreads(); + + if (index < n) out[index] += add_val; + } + + + /** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
+ */ + void scan(int n, int *odata, const int *idata) { + + int limit = ilog2ceil(n); + int size = pow(2, limit); + + dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); + + int* dev_out; // data to output + int* dev_in; // input data + + cudaMalloc((void**)&dev_in, n * sizeof(int)); + cudaMalloc((void**)&dev_out, n * sizeof(int)); + + // copy input data to device + cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + cudaMemset(dev_out, 0, n * sizeof(int)); + checkCUDAError("initializing shared mem scan data buff fail!"); + + timer().startGpuTimer(); + + kernScanDataShared<<>>(n, dev_in, dev_out); + checkCUDAError("shared mem scan fail!"); + + timer().endGpuTimer(); + + // copy out data + cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("shared mem scan output copy fail!"); + + cudaFree(dev_out); + cudaFree(dev_in); + + } + + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. 
+ */ + int compact(int n, int *odata, const int *idata) { + + //int* dev_map; // bool mapping + //int* dev_scan; // scanned data + //int* dev_out; // compacted data to output + //int* dev_in; // input data + + //int limit = ilog2ceil(n); + //int size = pow(2, limit); + + //// allocate memory + //cudaMalloc((void**)&dev_in, n * sizeof(int)); + //cudaMalloc((void**)&dev_map, n * sizeof(int)); + //cudaMalloc((void**)&dev_out, n * sizeof(int)); + //cudaMalloc((void**)&dev_scan, size * sizeof(int)); + //checkCUDAError("w-e compact malloc fail!"); + + //cudaMemset(dev_scan + n, 0, (size - n) * sizeof(int)); // zero extra mem + //cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); // copy input data + //checkCUDAError("initializing w-e compact data buffs fail!"); + + //dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + // timer().startGpuTimer(); + // // map + //StreamCompaction::Common::kernMapToBoolean << > >(n, dev_map, dev_in); + //cudaMemcpy(dev_scan, dev_map, n * sizeof(int), cudaMemcpyDeviceToDevice); // copy bool data to scan + //checkCUDAError("w-e compact bool mapping fail!"); + + //// scan + + //int d; + //int offset1; + //int offset2; + + //// UpSweep + //for (d = 1; d <= limit; d++) { + // offset1 = pow(2, d - 1); + // offset2 = pow(2, d); + // fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; + // kernScanDataUpSweep << > >(size, offset1, offset2, dev_scan); + // checkCUDAError("w-e compact upsweep fail!"); + //} + + //// DownSweep + //cudaMemset(dev_scan + n - 1, 0, (size - n + 1) * sizeof(int)); + //for (d = limit; d >= 1; d--) { + // offset1 = pow(2, d - 1); + // offset2 = pow(2, d); + // fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; + // kernScanDataDownSweep << > >(size, offset1, offset2, dev_scan); + // checkCUDAError("w-e compact downsweep fail!"); + //} + + //// scatter + //fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize); + //StreamCompaction::Common::kernScatter << > >(n, dev_out, 
dev_in, dev_map, dev_scan); + //checkCUDAError("w-e compact scatter fail!"); + + // timer().endGpuTimer(); + + //// copy output to host + //cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); + //checkCUDAError("w-e compact output copy fail!"); + + //// calc # of elements for return + //int map_val; + //int r_val; + //cudaMemcpy(&r_val, dev_scan + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + //cudaMemcpy(&map_val, dev_map + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + //checkCUDAError("w-e compact calc # elem fail!"); + + //r_val += map_val; + + //// cleanup + //cudaFree(dev_in); + //cudaFree(dev_map); + //cudaFree(dev_out); + //cudaFree(dev_scan); + + // return r_val; + return -1; + } + } +} diff --git a/stream_compaction/shared_mem.h b/stream_compaction/shared_mem.h new file mode 100644 index 0000000..c5fc61b --- /dev/null +++ b/stream_compaction/shared_mem.h @@ -0,0 +1,15 @@ +#pragma once + +#include "common.h" + +namespace StreamCompaction { + namespace SharedMem { + StreamCompaction::Common::PerformanceTimer& timer(); + + __global__ void kernScanDataShared(int n, int* in, int* out); + + void scan(int n, int *odata, const int *idata); + + int compact(int n, int *odata, const int *idata); + } +} From 016817c13fa260e32b093197d6885059e923a574 Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Thu, 13 Sep 2018 05:31:53 -0400 Subject: [PATCH 14/37] Shared Memory Compact added + tests --- src/main.cpp | 22 +++++ stream_compaction/shared_mem.cu | 142 ++++++++++++++------------------ 2 files changed, 84 insertions(+), 80 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index c8a6071..6ed91b1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -137,6 +137,14 @@ int main(int argc, char* argv[]) { StreamCompaction::SharedMem::scan(SIZE, c, a); printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(SIZE, c, true); + printCmpResult(SIZE, b, c); + + zeroArray(SIZE, c); + 
printDesc("Shared Memory Efficient Sort, non-power-of-two"); + StreamCompaction::SharedMem::scan(NPOT, c, a); + printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + printArray(NPOT, c, true); + printCmpResult(NPOT, b, c); printf("\n"); printf("*****************************\n"); @@ -190,6 +198,20 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + zeroArray(SIZE, c); + printDesc("Shared Memory work-efficient compact, power-of-two"); + count = StreamCompaction::SharedMem::compact(SIZE, c, a); + printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedCount, b, c); + + zeroArray(SIZE, c); + printDesc("Shared Memory work-efficient compact, non-power-of-two"); + count = StreamCompaction::SharedMem::compact(NPOT, c, a); + printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(count, c, true); + printCmpLenResult(count, expectedNPOT, b, c); + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; diff --git a/stream_compaction/shared_mem.cu b/stream_compaction/shared_mem.cu index 16981b9..e6ce620 100644 --- a/stream_compaction/shared_mem.cu +++ b/stream_compaction/shared_mem.cu @@ -123,86 +123,68 @@ namespace StreamCompaction { */ int compact(int n, int *odata, const int *idata) { - //int* dev_map; // bool mapping - //int* dev_scan; // scanned data - //int* dev_out; // compacted data to output - //int* dev_in; // input data - - //int limit = ilog2ceil(n); - //int size = pow(2, limit); - - //// allocate memory - //cudaMalloc((void**)&dev_in, n * sizeof(int)); - //cudaMalloc((void**)&dev_map, n * sizeof(int)); - //cudaMalloc((void**)&dev_out, n * sizeof(int)); - //cudaMalloc((void**)&dev_scan, size * sizeof(int)); - 
//checkCUDAError("w-e compact malloc fail!"); - - //cudaMemset(dev_scan + n, 0, (size - n) * sizeof(int)); // zero extra mem - //cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); // copy input data - //checkCUDAError("initializing w-e compact data buffs fail!"); - - //dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); - - // timer().startGpuTimer(); - // // map - //StreamCompaction::Common::kernMapToBoolean << > >(n, dev_map, dev_in); - //cudaMemcpy(dev_scan, dev_map, n * sizeof(int), cudaMemcpyDeviceToDevice); // copy bool data to scan - //checkCUDAError("w-e compact bool mapping fail!"); - - //// scan - - //int d; - //int offset1; - //int offset2; - - //// UpSweep - //for (d = 1; d <= limit; d++) { - // offset1 = pow(2, d - 1); - // offset2 = pow(2, d); - // fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; - // kernScanDataUpSweep << > >(size, offset1, offset2, dev_scan); - // checkCUDAError("w-e compact upsweep fail!"); - //} - - //// DownSweep - //cudaMemset(dev_scan + n - 1, 0, (size - n + 1) * sizeof(int)); - //for (d = limit; d >= 1; d--) { - // offset1 = pow(2, d - 1); - // offset2 = pow(2, d); - // fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; - // kernScanDataDownSweep << > >(size, offset1, offset2, dev_scan); - // checkCUDAError("w-e compact downsweep fail!"); - //} - - //// scatter - //fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize); - //StreamCompaction::Common::kernScatter << > >(n, dev_out, dev_in, dev_map, dev_scan); - //checkCUDAError("w-e compact scatter fail!"); - - // timer().endGpuTimer(); - - //// copy output to host - //cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); - //checkCUDAError("w-e compact output copy fail!"); - - //// calc # of elements for return - //int map_val; - //int r_val; - //cudaMemcpy(&r_val, dev_scan + n - 1, sizeof(int), cudaMemcpyDeviceToHost); - //cudaMemcpy(&map_val, dev_map + n - 1, sizeof(int), cudaMemcpyDeviceToHost); - 
//checkCUDAError("w-e compact calc # elem fail!"); - - //r_val += map_val; - - //// cleanup - //cudaFree(dev_in); - //cudaFree(dev_map); - //cudaFree(dev_out); - //cudaFree(dev_scan); - - // return r_val; - return -1; + int* dev_map; // bool mapping + int* dev_scan; // scanned data + int* dev_out; // compacted data to output + int* dev_in; // input data + + int limit = ilog2ceil(n); + int size = pow(2, limit); + + // allocate memory + cudaMalloc((void**)&dev_in, n * sizeof(int)); + cudaMalloc((void**)&dev_map, n * sizeof(int)); + cudaMalloc((void**)&dev_out, n * sizeof(int)); + cudaMalloc((void**)&dev_scan, n * sizeof(int)); + checkCUDAError("w-e compact malloc fail!"); + + cudaMemset(dev_scan, 0, n * sizeof(int)); + checkCUDAError("initializing shared mem scan data buff fail!"); + + cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); // copy input data + checkCUDAError("initializing w-e compact data buffs fail!"); + + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + timer().startGpuTimer(); + // map + StreamCompaction::Common::kernMapToBoolean << > >(n, dev_map, dev_in); + checkCUDAError("w-e compact bool mapping fail!"); + + // scan the map + fullBlocksPerGrid.x = ((size + blockSize - 1) / blockSize); + kernScanDataShared << > >(n, dev_map, dev_scan); + checkCUDAError("shared mem scan fail!"); + + // scatter + fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize); + StreamCompaction::Common::kernScatter << > >(n, dev_out, dev_in, dev_map, dev_scan); + checkCUDAError("shared mem compact scatter fail!"); + + timer().endGpuTimer(); + + // copy output to host + cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("shared mem compact output copy fail!"); + + // calc # of elements for return + int map_val; + int r_val; + cudaMemcpy(&r_val, dev_scan + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&map_val, dev_map + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("shared mem compact 
calc # elem fail!"); + + printf("map[n-1] = %i, scan[n-1] = %i\n", map_val, r_val); + + r_val += map_val; + + // cleanup + cudaFree(dev_in); + cudaFree(dev_map); + cudaFree(dev_out); + cudaFree(dev_scan); + + return r_val; } } } From 54e3f55c3c71578f0533d29056ac3839765c57e4 Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Thu, 13 Sep 2018 05:34:02 -0400 Subject: [PATCH 15/37] removed debug printf --- stream_compaction/shared_mem.cu | 2 -- 1 file changed, 2 deletions(-) diff --git a/stream_compaction/shared_mem.cu b/stream_compaction/shared_mem.cu index e6ce620..3571e98 100644 --- a/stream_compaction/shared_mem.cu +++ b/stream_compaction/shared_mem.cu @@ -174,8 +174,6 @@ namespace StreamCompaction { cudaMemcpy(&map_val, dev_map + n - 1, sizeof(int), cudaMemcpyDeviceToHost); checkCUDAError("shared mem compact calc # elem fail!"); - printf("map[n-1] = %i, scan[n-1] = %i\n", map_val, r_val); - r_val += map_val; // cleanup From 2ad8ef60e636a42784945b5b1933d360060feb76 Mon Sep 17 00:00:00 2001 From: risia Date: Thu, 13 Sep 2018 05:53:04 -0400 Subject: [PATCH 16/37] Short description of Shared Memory implementation --- README.md | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8ed5d32..a997acd 100644 --- a/README.md +++ b/README.md @@ -6,10 +6,11 @@ CUDA Stream Compaction * Angelina Risi * [LinkedIn](www.linkedin.com/in/angelina-risi) * Tested on: Windows 10, i7-6700HQ @ 2.60GHz 8GB, GTX 960M 4096MB (Personal Laptop) + + +## Extra Credit -### (TODO: Your README) - -**Radix Sort Implementation** +### Radix Sort Implementation Radix sort is a method of sorting data in an array from min to max using the values' binary data. This is done by sorting by the least-significant bit (LSB) first, iterating through bit sorts until the most-significant bit (MSB). Before we can sort, we actually need to find the dataset's maximum value. 
By taking the ceiling of log2(max), we can get the max number of bits representing the data, which bit is the MSB. This reduces the number of redundant iterations of sorting from the number of bits in the data type to only as many relevant ones there are in the data range. This is done on the GPU using a parallel reduction algorithm comparing pairs of values. The code is reproduced below with extra commentary. @@ -77,3 +78,10 @@ __global__ void kernRadixScatter(int n, int *out, int *in, int *b_arr, int *f_ar Once the input array has been sorted for each bit, the output is correctly sorted in order of ascending value. This implementation is intended to work on integer values, and currently operates on global device memory, bottlenecking performance. An example of a small array radix sort is depicted: ![Radix Sort Example](/img/radix_example.PNG) + + +### Shared Memory Work-Efficient Scan & Compact + +An alternative implementation of the work-efficient scan using shared memory to reduce latency is included. Each block stores an array shared among its threads to store the intermediate values before outputting. By reducing global memory accesses and instead using faster shared memory, we can potentially increase thoroughput. +Both the upsweep and downsweep are done in the same kernel as they need to both used the shared memory cache. This means we cannot dynamically change the block and threadcount as we traverse the tree as done in the global memory solution, and we must be careful to synchronize threads between write and read operations to prevent race conditions. +To allow the merging of the blocks' solutions, while we calculate an exclusive scan through the downsweep, we save the root value of the tree in the index blockSize of the shared memory array. The blocks must add the root value of all previous blocks to their total to calculate the correct prefix sum values of the array. 
From 38d401a916415d45feabd38854eef2947db4c4de Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Thu, 13 Sep 2018 10:17:05 -0400 Subject: [PATCH 17/37] Fixed shared memory scan --- src/main.cpp | 18 +++++- stream_compaction/shared_mem.cu | 104 ++++++++++++++++++++------------ stream_compaction/shared_mem.h | 2 +- 3 files changed, 83 insertions(+), 41 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 6ed91b1..ecb64b0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -15,7 +15,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 12; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -133,19 +133,31 @@ int main(int argc, char* argv[]) { printArray(8, c, true); zeroArray(SIZE, c); - printDesc("Shared Memory Efficient Sort, power-of-two"); + printDesc("Shared Memory Efficient Scan, power-of-two"); StreamCompaction::SharedMem::scan(SIZE, c, a); printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); - printDesc("Shared Memory Efficient Sort, non-power-of-two"); + printDesc("Shared Memory Efficient Scan, non-power-of-two"); StreamCompaction::SharedMem::scan(NPOT, c, a); printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); printArray(NPOT, c, true); printCmpResult(NPOT, b, c); + //zeroArray(SIZE, c); + //printDesc("Shared Memory Efficient Scan, power-of-two"); + //StreamCompaction::SharedMem::scan(8, c, radix_tst); + //printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(8, c, true); + + //zeroArray(SIZE, c); + //printDesc("Shared Memory Efficient Scan, non-power-of-two"); + //StreamCompaction::SharedMem::scan(7, c, radix_tst); + 
//printElapsedTime(StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); + //printArray(7, c, true); + printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); diff --git a/stream_compaction/shared_mem.cu b/stream_compaction/shared_mem.cu index 3571e98..7bbc377 100644 --- a/stream_compaction/shared_mem.cu +++ b/stream_compaction/shared_mem.cu @@ -6,6 +6,12 @@ #define blockSize 256 +// for reducing bank conflicts +#define NUM_BANKS 16 +#define LOG_NUM_BANKS 4 +#define CONFLICT_FREE_OFFSET(n) \ + ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS)) + namespace StreamCompaction { namespace SharedMem { using StreamCompaction::Common::PerformanceTimer; @@ -14,81 +20,89 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - __global__ void kernScanDataShared(int n, int* in, int* out) { + __global__ void kernScanDataShared(int n, int* in, int* out, int* sums) { // init shared mem for block, could improve latency - __shared__ int sBuf[blockSize + 1]; + __shared__ int sBuf[blockSize]; int tx = threadIdx.x; int index = (blockDim.x * blockIdx.x) + tx; // copy used vals to shared mem - sBuf[tx] = (index >= 0 && index < n) ? in[index] : 0; + sBuf[tx] = (index < n) ? 
in[index] : 0; __syncthreads(); // avoid mem issues - int offset = 1; // step size + int offset; // step size int access; // shared buffer access index - int i; // iterator // Upsweep - for (i = blockSize >> 1; i > 0; i >>= 1) { + for (offset = 1; offset < blockSize; offset *=2) { access = (2 * offset * (tx + 1)) - 1; if (access < blockSize) sBuf[access] += sBuf[access - offset]; - offset *= 2; __syncthreads(); // avoid mem issues } - - // copy sBuf[blocksize - 1] to sBuf[blocksize] so keep value safe - if (tx == 0) { - sBuf[blockSize] = sBuf[blockSize - 1]; - sBuf[blockSize - 1] = 0; + + // prepare array for downsweep + if (tx == blockSize - 1) { + sums[blockIdx.x] = sBuf[tx]; + sBuf[tx] = 0; } + __syncthreads(); + if (index >= n - 1) sBuf[tx] = 0; __syncthreads(); // avoid mem issues // Downsweep (inclusive) // do exclusive downsweep int temp; - for (i = blockSize >> 1; i > 0; i >>= 1) { - - offset >>= 1; // div by 2 + for (offset = blockSize; offset >= 1; offset /= 2) { access = (2 * offset * (tx + 1)) - 1; if (access < blockSize) { temp = sBuf[access - offset]; // store left child - sBuf[access - offset] = sBuf[access]; - sBuf[access] += temp; + sBuf[access - offset] = sBuf[access]; // swap + sBuf[access] += temp; // add } __syncthreads(); // avoid mem issues - } - // Write to dev mem - if (index < n - 1 ) out[index + 1] += sBuf[tx + 1]; - __syncthreads(); - int add_val = 0; - for (i = index - tx; i > 0; i -= blockSize) { - if (index != i) add_val += out[i]; + + // write to dev memory + if (index < n) { + out[index] = sBuf[tx]; } - __syncthreads(); - - if (index < n) out[index] += add_val; } + __global__ void kernStitch(int n, int* in, int* sums) { + int bx = blockIdx.x; + int index = (blockDim.x * bx) + threadIdx.x;; + + if (bx == 0) return; + if (index >= n) return; + for (int i = 0; i < bx; i++) { + in[index] += sums[i]; + } + + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. 
*/ void scan(int n, int *odata, const int *idata) { - int limit = ilog2ceil(n); - int size = pow(2, limit); + int mod = n % blockSize; + int size = n; - dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); + if (mod != 0) size+= blockSize - mod; + + dim3 fullBlocksPerGrid((size + (blockSize - 1))/ blockSize); int* dev_out; // data to output int* dev_in; // input data + int* dev_sums; + cudaMalloc((void**)&dev_in, n * sizeof(int)); cudaMalloc((void**)&dev_out, n * sizeof(int)); + cudaMalloc((void**)&dev_sums, fullBlocksPerGrid.x * sizeof(int)); // copy input data to device cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); @@ -98,9 +112,13 @@ namespace StreamCompaction { timer().startGpuTimer(); - kernScanDataShared<<>>(n, dev_in, dev_out); + kernScanDataShared<<>>(n, dev_in, dev_out, dev_sums); checkCUDAError("shared mem scan fail!"); + kernStitch << > >(n, dev_out, dev_sums); + checkCUDAError("shared mem scan stitch fail!"); + + timer().endGpuTimer(); // copy out data @@ -109,7 +127,7 @@ namespace StreamCompaction { cudaFree(dev_out); cudaFree(dev_in); - + cudaFree(dev_sums); } /** @@ -128,15 +146,22 @@ namespace StreamCompaction { int* dev_out; // compacted data to output int* dev_in; // input data - int limit = ilog2ceil(n); - int size = pow(2, limit); + int* dev_sums; + + int mod = n % blockSize; + int size = n; + if (mod != 0) size += blockSize - mod; + + dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); // allocate memory cudaMalloc((void**)&dev_in, n * sizeof(int)); cudaMalloc((void**)&dev_map, n * sizeof(int)); cudaMalloc((void**)&dev_out, n * sizeof(int)); cudaMalloc((void**)&dev_scan, n * sizeof(int)); - checkCUDAError("w-e compact malloc fail!"); + + cudaMalloc((void**)&dev_sums, fullBlocksPerGrid.x * sizeof(int)); + checkCUDAError("shared mem compact malloc fail!"); cudaMemset(dev_scan, 0, n * sizeof(int)); checkCUDAError("initializing shared mem scan data buff fail!"); @@ -144,18 +169,22 @@ namespace StreamCompaction { 
cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); // copy input data checkCUDAError("initializing w-e compact data buffs fail!"); - dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + timer().startGpuTimer(); // map + fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize); StreamCompaction::Common::kernMapToBoolean << > >(n, dev_map, dev_in); checkCUDAError("w-e compact bool mapping fail!"); // scan the map fullBlocksPerGrid.x = ((size + blockSize - 1) / blockSize); - kernScanDataShared << > >(n, dev_map, dev_scan); + kernScanDataShared << > >(n, dev_map, dev_scan, dev_sums); checkCUDAError("shared mem scan fail!"); + kernStitch << > >(n, dev_scan, dev_sums); + checkCUDAError("shared mem scan stitch fail!"); + // scatter fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize); StreamCompaction::Common::kernScatter << > >(n, dev_out, dev_in, dev_map, dev_scan); @@ -181,6 +210,7 @@ namespace StreamCompaction { cudaFree(dev_map); cudaFree(dev_out); cudaFree(dev_scan); + cudaFree(dev_sums); return r_val; } diff --git a/stream_compaction/shared_mem.h b/stream_compaction/shared_mem.h index c5fc61b..9fee8a2 100644 --- a/stream_compaction/shared_mem.h +++ b/stream_compaction/shared_mem.h @@ -6,7 +6,7 @@ namespace StreamCompaction { namespace SharedMem { StreamCompaction::Common::PerformanceTimer& timer(); - __global__ void kernScanDataShared(int n, int* in, int* out); + __global__ void kernScanDataShared(int n, int* in, int* out, int* sums); void scan(int n, int *odata, const int *idata); From 3605d5c6f9cddd5e904c20d61477d3736df57d17 Mon Sep 17 00:00:00 2001 From: risia Date: Thu, 13 Sep 2018 10:25:37 -0400 Subject: [PATCH 18/37] Update README.md --- README.md | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a997acd..4049530 100644 --- a/README.md +++ b/README.md @@ -83,5 +83,20 @@ Once the input array has been sorted for each bit, the output is correctly sorte ### Shared Memory 
Work-Efficient Scan & Compact An alternative implementation of the work-efficient scan using shared memory to reduce latency is included. Each block stores an array shared among its threads to store the intermediate values before outputting. By reducing global memory accesses and instead using faster shared memory, we can potentially increase thoroughput. -Both the upsweep and downsweep are done in the same kernel as they need to both used the shared memory cache. This means we cannot dynamically change the block and threadcount as we traverse the tree as done in the global memory solution, and we must be careful to synchronize threads between write and read operations to prevent race conditions. -To allow the merging of the blocks' solutions, while we calculate an exclusive scan through the downsweep, we save the root value of the tree in the index blockSize of the shared memory array. The blocks must add the root value of all previous blocks to their total to calculate the correct prefix sum values of the array. +Both the upsweep and downsweep are done in the same kernel, as they both need to use the shared memory cache. This means we cannot dynamically change the block and thread count as we traverse the tree as done in the global memory solution, and we must be careful to synchronize threads between write and read operations to prevent race conditions. Each block essentially performs a scan on a portion of the input data. +To allow the merging of the blocks' solutions, before the downsweep computes each block's exclusive scan, we save the root value of the block's tree (the block's total) in a separate `sums` array indexed by block. +The blocks must add the root values of all previous blocks to their results to calculate the correct prefix sum values of the array. A second kernel call stitches the blocks together into the full exclusive scan; using a separate launch ensures all blocks have written their data to the device output buffers before it is fetched. 
+ +```cpp +__global__ void kernStitch(int n, int* in, int* sums) { + int bx = blockIdx.x; + int index = (blockDim.x * bx) + threadIdx.x;; + + if (bx == 0) return; + if (index >= n) return; + for (int i = 0; i < bx; i++) { + in[index] += sums[i]; + } +} +``` + From 88e6511a24f4afdc68835337b1a49e6deaec4b0c Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Thu, 13 Sep 2018 10:49:37 -0400 Subject: [PATCH 19/37] Bank Conflict reduction added to shared mem implementation --- stream_compaction/shared_mem.cu | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/stream_compaction/shared_mem.cu b/stream_compaction/shared_mem.cu index 7bbc377..3f3de5c 100644 --- a/stream_compaction/shared_mem.cu +++ b/stream_compaction/shared_mem.cu @@ -28,27 +28,31 @@ namespace StreamCompaction { int index = (blockDim.x * blockIdx.x) + tx; // copy used vals to shared mem - sBuf[tx] = (index < n) ? in[index] : 0; + sBuf[tx + CONFLICT_FREE_OFFSET(tx)] = (index < n) ? in[index] : 0; __syncthreads(); // avoid mem issues int offset; // step size int access; // shared buffer access index + int a2; // Upsweep for (offset = 1; offset < blockSize; offset *=2) { access = (2 * offset * (tx + 1)) - 1; - if (access < blockSize) sBuf[access] += sBuf[access - offset]; + a2 = access - offset; + a2 += CONFLICT_FREE_OFFSET(a2); + access += CONFLICT_FREE_OFFSET(access); + if (access < blockSize) sBuf[access] += sBuf[a2]; __syncthreads(); // avoid mem issues } // prepare array for downsweep - if (tx == blockSize - 1) { + if (tx == blockSize - 1 + CONFLICT_FREE_OFFSET(blockSize - 1)) { sums[blockIdx.x] = sBuf[tx]; sBuf[tx] = 0; } __syncthreads(); - if (index >= n - 1) sBuf[tx] = 0; + if (index >= n - 1) sBuf[tx + CONFLICT_FREE_OFFSET(tx)] = 0; __syncthreads(); // avoid mem issues // Downsweep (inclusive) @@ -57,9 +61,12 @@ namespace StreamCompaction { for (offset = blockSize; offset >= 1; offset /= 2) { access = (2 * offset * (tx + 1)) - 1; + a2 = access - offset; + a2 += 
CONFLICT_FREE_OFFSET(a2); + access += CONFLICT_FREE_OFFSET(access); if (access < blockSize) { - temp = sBuf[access - offset]; // store left child - sBuf[access - offset] = sBuf[access]; // swap + temp = sBuf[a2]; // store left child + sBuf[a2] = sBuf[access]; // swap sBuf[access] += temp; // add } __syncthreads(); // avoid mem issues @@ -67,7 +74,7 @@ namespace StreamCompaction { // write to dev memory if (index < n) { - out[index] = sBuf[tx]; + out[index] = sBuf[tx + CONFLICT_FREE_OFFSET(tx)]; } } @@ -80,7 +87,6 @@ namespace StreamCompaction { for (int i = 0; i < bx; i++) { in[index] += sums[i]; } - } /** From b643ac3d912a2af0fbbf3150cbb9759693066a3f Mon Sep 17 00:00:00 2001 From: risia Date: Thu, 13 Sep 2018 10:54:53 -0400 Subject: [PATCH 20/37] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4049530..eb1b3f3 100644 --- a/README.md +++ b/README.md @@ -99,4 +99,6 @@ __global__ void kernStitch(int n, int* in, int* sums) { } } ``` - +#### Bank Conflict Avoidance + +This algorithm is further improved by using offsets on the shared memory access iterators to reduce bank conflicts, events where multiple threads attempt to access a region of shared memory at the same time and thus must wait for the bus to become free. This is done by applying macros to calculate the offset on the index based on the assumed number of memory banks. These are taken from the example code in GPU Gems 3 Ch. 39 linked in the instructions. 
From 776b074124414b62c7ff69f873a4cd3f5562b492 Mon Sep 17 00:00:00 2001 From: risia Date: Thu, 13 Sep 2018 11:05:37 -0400 Subject: [PATCH 21/37] project description --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index eb1b3f3..4b8e812 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,13 @@ CUDA Stream Compaction * Angelina Risi * [LinkedIn](www.linkedin.com/in/angelina-risi) * Tested on: Windows 10, i7-6700HQ @ 2.60GHz 8GB, GTX 960M 4096MB (Personal Laptop) + +## Project Description + +This project implements a variety of scan, compact and sort algorithms on the GPU with some comparison tests implemented on the CPU. The base requirements were to implement CPU Scan and Compact Functions, and to implement GPU Naive Scan and Compact and GPU Work-Efficient Scan and Compact. I also created a wrapper function for the Thrust scan implementation on the GPU. +In addition to these base requirements, I implemented all the defined extra credit assignments. These were Radix sort, using shared GPU memory in the scan implementation, implementing memory bank conflict avoidance, and improving the work-efficient implementation's efficiency over the CPU implementation. +### Features ## Extra Credit @@ -102,3 +108,7 @@ __global__ void kernStitch(int n, int* in, int* sums) { #### Bank Conflict Avoidance This algorithm is further improved by using offsets on the shared memory access iterators to reduce bank conflicts, events where multiple threads attempt to access a region of shared memory at the same time and thus must wait for the bus to become free. This is done by applying macros to calculate the offset on the index based on the assumed number of memory banks. These are taken from the example code in GPU Gems 3 Ch. 39 linked in the instructions. 
+ + +## Performance Analysis + From 67029aefd64b2f241c43d28c4dc6b6b6a3424285 Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Thu, 13 Sep 2018 13:38:57 -0400 Subject: [PATCH 22/37] Optimized stitching of shared mem scan blocks --- src/main.cpp | 2 +- stream_compaction/efficient.cu | 13 +++- stream_compaction/shared_mem.cu | 101 +++++++++++++++++++++++++------- stream_compaction/thrust.cu | 6 +- 4 files changed, 95 insertions(+), 27 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index ecb64b0..3637cd4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -15,7 +15,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 12; // feel free to change the size of array +const int SIZE = 1 << 9; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 4c831bc..507ba32 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -43,6 +43,7 @@ namespace StreamCompaction { int limit = ilog2ceil(n); int size = pow(2, limit); + dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); // allocate memory @@ -61,11 +62,16 @@ namespace StreamCompaction { int offset1; int offset2; + int threads; + // UpSweep for (d = 1; d <= limit; d++) { offset1 = pow(2, d - 1); offset2 = pow(2, d); - fullBlocksPerGrid.x = ((size/offset2) + blockSize) / blockSize; + + threads = (size / offset2); + fullBlocksPerGrid.x = (threads / blockSize) + 1; + kernScanDataUpSweep << > >(size, offset1, offset2, dev_buf); checkCUDAError("upsweep fail!"); } @@ -75,7 +81,10 @@ namespace StreamCompaction { for (d = limit; d >= 1; d--) { offset1 = pow(2, d - 1); offset2 = pow(2, d); - fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; + + threads = (size / offset2); + fullBlocksPerGrid.x = (threads / blockSize) + 1; + kernScanDataDownSweep << > >(size, offset1, offset2, dev_buf); 
checkCUDAError("downsweep fail!"); } diff --git a/stream_compaction/shared_mem.cu b/stream_compaction/shared_mem.cu index 3f3de5c..0f89691 100644 --- a/stream_compaction/shared_mem.cu +++ b/stream_compaction/shared_mem.cu @@ -4,7 +4,7 @@ #include "efficient.h" #include "shared_mem.h" -#define blockSize 256 +#define blockSize 128 // for reducing bank conflicts #define NUM_BANKS 16 @@ -20,6 +20,40 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } + __global__ void kernScanBlockSum(int n, int* sum_buf) { + + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + + int offset; + int access; + int a2; + + int temp; + + // Upsweep + for (offset = 1; offset < blockSize; offset *= 2) { + access = (2 * offset * (index + 1)) - 1; + a2 = access - offset; + if (access < blockSize) sum_buf[access] += sum_buf[a2]; + __syncthreads(); // avoid mem issues + } + if (index >= n - 1) sum_buf[index] = 0; + __syncthreads(); // avoid mem issues + + //downsweep + for (offset = blockSize; offset >= 1; offset /= 2) { + access = (2 * offset * (index + 1)) - 1; + a2 = access - offset; + if (access < blockSize) { + temp = sum_buf[a2]; // store left child + sum_buf[a2] = sum_buf[access]; // swap + sum_buf[access] += temp; // add + } + __syncthreads(); // avoid mem issues + } + + } + __global__ void kernScanDataShared(int n, int* in, int* out, int* sums) { // init shared mem for block, could improve latency __shared__ int sBuf[blockSize]; @@ -27,8 +61,10 @@ namespace StreamCompaction { int tx = threadIdx.x; int index = (blockDim.x * blockIdx.x) + tx; + int off_tx = tx + CONFLICT_FREE_OFFSET(tx); + // copy used vals to shared mem - sBuf[tx + CONFLICT_FREE_OFFSET(tx)] = (index < n) ? in[index] : 0; + sBuf[off_tx] = (index < n) ? 
in[index] : 0; __syncthreads(); // avoid mem issues @@ -48,11 +84,11 @@ namespace StreamCompaction { // prepare array for downsweep if (tx == blockSize - 1 + CONFLICT_FREE_OFFSET(blockSize - 1)) { - sums[blockIdx.x] = sBuf[tx]; - sBuf[tx] = 0; + sums[blockIdx.x] = sBuf[off_tx]; + sBuf[off_tx] = 0; } __syncthreads(); - if (index >= n - 1) sBuf[tx + CONFLICT_FREE_OFFSET(tx)] = 0; + if (index >= n - 1) sBuf[off_tx] = 0; __syncthreads(); // avoid mem issues // Downsweep (inclusive) @@ -74,7 +110,7 @@ namespace StreamCompaction { // write to dev memory if (index < n) { - out[index] = sBuf[tx + CONFLICT_FREE_OFFSET(tx)]; + out[index] = sBuf[off_tx]; } } @@ -84,9 +120,7 @@ namespace StreamCompaction { if (bx == 0) return; if (index >= n) return; - for (int i = 0; i < bx; i++) { - in[index] += sums[i]; - } + in[index] += sums[bx]; } /** @@ -99,16 +133,20 @@ namespace StreamCompaction { if (mod != 0) size+= blockSize - mod; - dim3 fullBlocksPerGrid((size + (blockSize - 1))/ blockSize); + int num_blocks = size / blockSize; + + dim3 fullBlocksPerGrid(num_blocks); int* dev_out; // data to output int* dev_in; // input data int* dev_sums; + int x; + cudaMalloc((void**)&dev_in, n * sizeof(int)); cudaMalloc((void**)&dev_out, n * sizeof(int)); - cudaMalloc((void**)&dev_sums, fullBlocksPerGrid.x * sizeof(int)); + cudaMalloc((void**)&dev_sums, num_blocks * sizeof(int)); // copy input data to device cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); @@ -118,9 +156,18 @@ namespace StreamCompaction { timer().startGpuTimer(); + // scan blocks of data kernScanDataShared<<>>(n, dev_in, dev_out, dev_sums); checkCUDAError("shared mem scan fail!"); + + + fullBlocksPerGrid.x = (num_blocks + blockSize - 1) / blockSize; + // scan sums from blocks + kernScanBlockSum << > >(num_blocks, dev_sums); + checkCUDAError("shared mem block scan fail!"); + + fullBlocksPerGrid.x = num_blocks; kernStitch << > >(n, dev_out, dev_sums); checkCUDAError("shared mem scan stitch fail!"); @@ -131,6 
+178,11 @@ namespace StreamCompaction { cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); checkCUDAError("shared mem scan output copy fail!"); + for (int i = 0; i < num_blocks; i++) { + cudaMemcpy(&x, dev_sums + i, sizeof(int), cudaMemcpyDeviceToHost); + printf("Sum %i: %i\n", i, x); + } + cudaFree(dev_out); cudaFree(dev_in); cudaFree(dev_sums); @@ -158,7 +210,9 @@ namespace StreamCompaction { int size = n; if (mod != 0) size += blockSize - mod; - dim3 fullBlocksPerGrid((size + blockSize - 1) / blockSize); + int num_blocks = size / blockSize; + + dim3 fullBlocksPerGrid(num_blocks); // allocate memory cudaMalloc((void**)&dev_in, n * sizeof(int)); @@ -166,7 +220,7 @@ namespace StreamCompaction { cudaMalloc((void**)&dev_out, n * sizeof(int)); cudaMalloc((void**)&dev_scan, n * sizeof(int)); - cudaMalloc((void**)&dev_sums, fullBlocksPerGrid.x * sizeof(int)); + cudaMalloc((void**)&dev_sums, num_blocks * sizeof(int)); checkCUDAError("shared mem compact malloc fail!"); cudaMemset(dev_scan, 0, n * sizeof(int)); @@ -184,10 +238,19 @@ namespace StreamCompaction { checkCUDAError("w-e compact bool mapping fail!"); // scan the map - fullBlocksPerGrid.x = ((size + blockSize - 1) / blockSize); + fullBlocksPerGrid.x = num_blocks; kernScanDataShared << > >(n, dev_map, dev_scan, dev_sums); checkCUDAError("shared mem scan fail!"); + int r_val; + cudaMemcpy(&r_val, dev_sums + num_blocks - 1, sizeof(int), cudaMemcpyDeviceToHost); + + fullBlocksPerGrid.x = (num_blocks + blockSize - 1) / blockSize; + // scan sums from blocks + kernScanBlockSum << > >(num_blocks, dev_sums); + checkCUDAError("shared mem block scan fail!"); + + fullBlocksPerGrid.x = num_blocks; kernStitch << > >(n, dev_scan, dev_sums); checkCUDAError("shared mem scan stitch fail!"); @@ -203,14 +266,10 @@ namespace StreamCompaction { checkCUDAError("shared mem compact output copy fail!"); // calc # of elements for return - int map_val; - int r_val; - cudaMemcpy(&r_val, dev_scan + n - 1, sizeof(int), 
cudaMemcpyDeviceToHost); - cudaMemcpy(&map_val, dev_map + n - 1, sizeof(int), cudaMemcpyDeviceToHost); + int r_val2; + cudaMemcpy(&r_val2, dev_sums + num_blocks -1, sizeof(int), cudaMemcpyDeviceToHost); checkCUDAError("shared mem compact calc # elem fail!"); - r_val += map_val; - // cleanup cudaFree(dev_in); cudaFree(dev_map); @@ -218,7 +277,7 @@ namespace StreamCompaction { cudaFree(dev_scan); cudaFree(dev_sums); - return r_val; + return r_val + r_val2; } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 5fa537f..998efcf 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -19,11 +19,11 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { thrust::host_vector host_thrust_in(idata, idata + n); - thrust::host_vector host_thrust_out(odata, odata + n); - thrust::device_vector dev_thrust_in = host_thrust_in; - thrust::device_vector dev_thrust_out = host_thrust_out; + thrust::device_vector dev_thrust_in(n); + thrust::device_vector dev_thrust_out(n); + thrust::copy(host_thrust_in.begin(), host_thrust_in.end(), dev_thrust_in.begin()); timer().startGpuTimer(); From 3882da2209ae9478adb6d5b717f6b72cd3bc12a7 Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Thu, 13 Sep 2018 14:59:25 -0400 Subject: [PATCH 23/37] Fixes so everything works for large arrays --- src/main.cpp | 2 +- stream_compaction/shared_mem.cu | 116 +++++++++++++++----------------- 2 files changed, 56 insertions(+), 62 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 3637cd4..fd52fbc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -15,7 +15,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 9; // feel free to change the size of array +const int SIZE = 1000000; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/shared_mem.cu b/stream_compaction/shared_mem.cu index 0f89691..ba688a9 
100644 --- a/stream_compaction/shared_mem.cu +++ b/stream_compaction/shared_mem.cu @@ -20,40 +20,6 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - __global__ void kernScanBlockSum(int n, int* sum_buf) { - - int index = (blockDim.x * blockIdx.x) + threadIdx.x; - - int offset; - int access; - int a2; - - int temp; - - // Upsweep - for (offset = 1; offset < blockSize; offset *= 2) { - access = (2 * offset * (index + 1)) - 1; - a2 = access - offset; - if (access < blockSize) sum_buf[access] += sum_buf[a2]; - __syncthreads(); // avoid mem issues - } - if (index >= n - 1) sum_buf[index] = 0; - __syncthreads(); // avoid mem issues - - //downsweep - for (offset = blockSize; offset >= 1; offset /= 2) { - access = (2 * offset * (index + 1)) - 1; - a2 = access - offset; - if (access < blockSize) { - temp = sum_buf[a2]; // store left child - sum_buf[a2] = sum_buf[access]; // swap - sum_buf[access] += temp; // add - } - __syncthreads(); // avoid mem issues - } - - } - __global__ void kernScanDataShared(int n, int* in, int* out, int* sums) { // init shared mem for block, could improve latency __shared__ int sBuf[blockSize]; @@ -128,25 +94,21 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { - int mod = n % blockSize; - int size = n; - - if (mod != 0) size+= blockSize - mod; - - int num_blocks = size / blockSize; + int num_blocks = 1 + (n - 1)/ blockSize; + int limit = ilog2ceil(num_blocks); + int sum_size = pow(2, limit); dim3 fullBlocksPerGrid(num_blocks); int* dev_out; // data to output int* dev_in; // input data - - int* dev_sums; + int* dev_sums; // sums, from first blockwise scan int x; cudaMalloc((void**)&dev_in, n * sizeof(int)); cudaMalloc((void**)&dev_out, n * sizeof(int)); - cudaMalloc((void**)&dev_sums, num_blocks * sizeof(int)); + cudaMalloc((void**)&dev_sums, sum_size * sizeof(int)); // copy input data to device cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); @@ -160,12 +122,29 @@ 
namespace StreamCompaction { kernScanDataShared<<>>(n, dev_in, dev_out, dev_sums); checkCUDAError("shared mem scan fail!"); - + // scan block sums + int d; + int offset1; + int offset2; + + // UpSweep + for (d = 1; d <= limit; d++) { + offset1 = pow(2, d - 1); + offset2 = pow(2, d); + fullBlocksPerGrid.x = ((sum_size / offset2) + blockSize) / blockSize; + StreamCompaction::Efficient::kernScanDataUpSweep << > >(sum_size, offset1, offset2, dev_sums); + checkCUDAError("w-e compact upsweep fail!"); + } - fullBlocksPerGrid.x = (num_blocks + blockSize - 1) / blockSize; - // scan sums from blocks - kernScanBlockSum << > >(num_blocks, dev_sums); - checkCUDAError("shared mem block scan fail!"); + // DownSweep + cudaMemset(dev_sums + num_blocks - 1, 0, (sum_size - num_blocks + 1) * sizeof(int)); + for (d = limit; d >= 1; d--) { + offset1 = pow(2, d - 1); + offset2 = pow(2, d); + fullBlocksPerGrid.x = ((sum_size / offset2) + blockSize) / blockSize; + StreamCompaction::Efficient::kernScanDataDownSweep << > >(sum_size, offset1, offset2, dev_sums); + checkCUDAError("w-e compact downsweep fail!"); + } fullBlocksPerGrid.x = num_blocks; kernStitch << > >(n, dev_out, dev_sums); @@ -178,7 +157,7 @@ namespace StreamCompaction { cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); checkCUDAError("shared mem scan output copy fail!"); - for (int i = 0; i < num_blocks; i++) { + for (int i = 0; i < num_blocks; i += 16) { cudaMemcpy(&x, dev_sums + i, sizeof(int), cudaMemcpyDeviceToHost); printf("Sum %i: %i\n", i, x); } @@ -206,11 +185,9 @@ namespace StreamCompaction { int* dev_sums; - int mod = n % blockSize; - int size = n; - if (mod != 0) size += blockSize - mod; - - int num_blocks = size / blockSize; + int num_blocks = 1 + (n - 1) / blockSize; + int limit = ilog2ceil(num_blocks); + int sum_size = pow(2, limit); dim3 fullBlocksPerGrid(num_blocks); @@ -220,7 +197,7 @@ namespace StreamCompaction { cudaMalloc((void**)&dev_out, n * sizeof(int)); cudaMalloc((void**)&dev_scan, n 
* sizeof(int)); - cudaMalloc((void**)&dev_sums, num_blocks * sizeof(int)); + cudaMalloc((void**)&dev_sums, sum_size * sizeof(int)); checkCUDAError("shared mem compact malloc fail!"); cudaMemset(dev_scan, 0, n * sizeof(int)); @@ -229,13 +206,11 @@ namespace StreamCompaction { cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice); // copy input data checkCUDAError("initializing w-e compact data buffs fail!"); - - timer().startGpuTimer(); // map fullBlocksPerGrid.x = ((n + blockSize - 1) / blockSize); StreamCompaction::Common::kernMapToBoolean << > >(n, dev_map, dev_in); - checkCUDAError("w-e compact bool mapping fail!"); + checkCUDAError("shared mem compact bool mapping fail!"); // scan the map fullBlocksPerGrid.x = num_blocks; @@ -245,10 +220,29 @@ namespace StreamCompaction { int r_val; cudaMemcpy(&r_val, dev_sums + num_blocks - 1, sizeof(int), cudaMemcpyDeviceToHost); - fullBlocksPerGrid.x = (num_blocks + blockSize - 1) / blockSize; // scan sums from blocks - kernScanBlockSum << > >(num_blocks, dev_sums); - checkCUDAError("shared mem block scan fail!"); + int d; + int offset1; + int offset2; + + // UpSweep + for (d = 1; d <= limit; d++) { + offset1 = pow(2, d - 1); + offset2 = pow(2, d); + fullBlocksPerGrid.x = ((sum_size / offset2) + blockSize) / blockSize; + StreamCompaction::Efficient::kernScanDataUpSweep << > >(sum_size, offset1, offset2, dev_sums); + checkCUDAError("w-e compact upsweep fail!"); + } + + // DownSweep + cudaMemset(dev_sums + num_blocks - 1, 0, (sum_size - num_blocks + 1) * sizeof(int)); + for (d = limit; d >= 1; d--) { + offset1 = pow(2, d - 1); + offset2 = pow(2, d); + fullBlocksPerGrid.x = ((sum_size / offset2) + blockSize) / blockSize; + StreamCompaction::Efficient::kernScanDataDownSweep << > >(sum_size, offset1, offset2, dev_sums); + checkCUDAError("w-e compact downsweep fail!"); + } fullBlocksPerGrid.x = num_blocks; kernStitch << > >(n, dev_scan, dev_sums); From 9831715ddf36b18b7bf92b64b591702f4e16c1d3 Mon Sep 17 00:00:00 2001 
From: Angelina Risi Date: Thu, 13 Sep 2018 15:00:38 -0400 Subject: [PATCH 24/37] Removed debug printf --- stream_compaction/shared_mem.cu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/stream_compaction/shared_mem.cu b/stream_compaction/shared_mem.cu index ba688a9..729d158 100644 --- a/stream_compaction/shared_mem.cu +++ b/stream_compaction/shared_mem.cu @@ -157,11 +157,6 @@ namespace StreamCompaction { cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost); checkCUDAError("shared mem scan output copy fail!"); - for (int i = 0; i < num_blocks; i += 16) { - cudaMemcpy(&x, dev_sums + i, sizeof(int), cudaMemcpyDeviceToHost); - printf("Sum %i: %i\n", i, x); - } - cudaFree(dev_out); cudaFree(dev_in); cudaFree(dev_sums); From df3dd11a085802e73feb1e59649893153789685b Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Fri, 14 Sep 2018 10:10:02 -0400 Subject: [PATCH 25/37] Added test for average time to run each scan/compact for perf. analysis and recording Excel file --- Project2 Performance Analysis.xlsx | Bin 0 -> 24706 bytes src/main.cpp | 143 ++++++++++++++++++++++++++++- 2 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 Project2 Performance Analysis.xlsx diff --git a/Project2 Performance Analysis.xlsx b/Project2 Performance Analysis.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..146eef79853400ea95f93141391fa6af77adc0e4 GIT binary patch literal 24706 zcmeFZRd6KPlCCReDls!NGcz+YQ;M0f#LQ5VN-8n4RAOdkE^&#O`Ko*O?3sIao6UVZ zW~cKZ&B9hZgizStVi!*YeV=041_Wl2;}qre?I>gE6|_1Dc?tkI(Q-H6Bep7$IQkwSaH)U zkY@dZAELi-U`D~T^SG4WEPgfiwrEIlEiadX}iIv=>d_A;Gw`93= z{qV}kn6K9eGF&fZUS(ibCGg7K>7(Uyom&938qT=Zm!P&CTr=R5?s0C$TT?qQtk>Hp0BYQzI0Fdn3vN<(nrH9b{$TOcbsO%*iY({@7b-Er;@7?s zgX9JxQ&p@NLnUrrBAn<8(s_k<2v*lP)EEXdQia|yNTme$&e_+w=lySuV>VsG2x)Ho z3}lovn>5mdA_qd0v_Qgz^~>fdloN{&_v z76VL%@Ws}KLoRc3f{U}6S_8kY((xx_d~XE*<%5;uO)}q`vd2UiH(@EOeWO%^!z1s9 z;TCda{@r%oL~m$Y%a-rna9C55p#ap07pFb`Tgyp;3C8_2t~aCCzQ=nUmzxpdL@i{# 
zcdYQ(x1iXp1cNfC@-};cX|Bh;%KW9sJBOcvX9)%x#j#(z!;m`>-}J%`I1=nof+gVY ze8mD+dwyT}vA%rz@5cuykmCQ#{TtO7Np3%FTK01l_S5|hoXl*T8R-A`{>SbAFZT8S zvGs~XIfXt(xX=s9*RbJ-g^hS5VHtNJ$u=TYzW}Lq#KxF>Qk;!03S1;r>|js{{|>*` z(X|cUnB!rh`yJMbC}dO~k|y`c(6nbqS4e7dr&MvritYX{ZVPt{_Zbq>o|JAKan$8M zOY&q!Hc7sRsk;vtDIGa)--eE9w$wG^Iu9W zI)WDn=aN00%|IJ=Hnv!)@)@=vx_iP_Q?cZ-sxi)S;wJVqFtzEv5&hPM^5#P)pEayZ z%!>TTJS8zimV4)~-N1S^mhIWk0o_+Vd^r*nMNsnV^CbLFCrN}%J_Y++5dZ}O!Ulo@ z_ON02m!G)XI{}RC?E!zBlKCgD%{LW0N@t` zV-qCd-Ig*o*<|3L;L8U~8f_ZZ3A@FHnf&1;#=#(c>-L3ktBPSnj(jVerCAs8CP#pk ze12m&r=%jp*(o+cfjQJL}RxEfRG zotmvF;k>9fmxsou>Y406dWP$l4+K8{`v0^cVs^rL^iTVFfB*tQ{Ji7ShW@AZRH&`l zt#Kmx8W?^EZSjyOEe;_6KMeacwvTv2Y5(KWE{7EjmaOyYI`iaOy6eCt+HmVRl)C4j`qgFM94!G z`oV^-o`LPk41~tYc!SKe7os126TdYVJz3V-;x4lf!)IBz19BwMHoD-WgyB}VyH~dI znhRQ!NQ97pSAg(G7D9>R2|QX*&oa!po8Dy8bRoM6?;Q^~eojFCr~Gaq)F* z8-*tOjkNp}n#LP~WQHOCOML2*G<464RKsnAPBi_b`;Yza3bXK*n;%jLr!hGgIoEU5atBTveZY@0KAjZ1`kU+~e*wZ0VFkwKX(YFCjp z`s5XWxhWDfy^y0XUjt>><94#tSk6)N?-0wZkzZqEos^2DxQ(DQtl@a_NiK&W~{Ctcp2Y zg*2^b3bEGA;`;XVDT>W*lI#@OnI87IUWZ4t%y@q$-`t@Y;U7*&cPwd;T{NuA@P3x! 
zYa9oSJRkzFevDJdPSrhEQSpu39?j7^<&Cu*p6Ik_MO>=UXA1k6Z*s zsE4y>c^qt4ShxZv4Q7f^RX4CV#KbK+J#qIPwVjagFu4+mmZ=hALHP@$10xOLSF9QR z9tpI&YMIrk@px$%?o!DgLrO<7dARljLv?wp`hZcp?aAk1cV}xsIf}3VhHytyyjTOh z*H1_J8y9N)nL_*-T1!5271V#Zkh7(clbNZCi<6a|h4UX#*e9_^x=#=(^djhj@l*=& ztf?n5nG`(umv)rZ6*=w5GOfAA?qrdw!D!g!fb5ZwWE1|ui~Dswd?wFs$6gR=G#~xAK z({;75n*iM#_}^*jnHOMF@TsTrXIO&&OH&t58?!&fEKsurtT7_J^Dp}&J)e)d?b1Rw zjm^$09uDT>ot7JqMU?1AlUhs;zQ4%3>Em)Y&G<#gAIv#VZ8$5>ev^l3Fd1~JWX9DC z!DLc9jx3i8?czu4fg?F7DJz+dH1n9E?Ht+clBrySz5i)8P86WX-lubr2~O0s&MbA7 zVwWDo(8!+1y_^YO8)MW^_a2W*iqJ=ukW9l%NNf5eK$w_$&;%__hjaXx344Cehb~^H zyKbM%?m`z}jUi_l(Q0XJJfC5b`0Ir@Zm)1ged!#=A)*SPUzg@`U@C9vDGsu?L?i|c zy&JX4b0b{<&15sw;lrr_sLT>8v`YbH3M{ej+0i<>D7*aL?i@a<@&bBR0;KR&DkoLj zG8k79GW&!tQkq?KUAbo|^ra?Kn*KhR=RQ{dOeAY>+WUeuXUM45Asp7x7|k1GYgi?I ziYXsvIXC#N!Je=qND9aB`KQ0AJ=}(bBQv+)^HZ z&@R9kZprszv|^mflFLFkLJH7Pstlw!zvr$6jAgwTRT|i%^K!U8zH2$oOvT|I+{@6I zA%cAPW&2PuJ>Lk#aFSJw0)?Z5C7!KiTKc^chM7{dwV6%5eA)cwk2e0sX!d44kALl#=@esX@v?Sl~#Sa~^f zeCix@nR^zL%JYRy7{bBUx=@8;rQ>hD3%%p~ojLblAa%+^0s##Y{?X0&^OSY5G_y5h z`170Tj|g$9H4%lwf!c-fECA>1{>ru+NA_!L$|h-p)HFL8x3TF|Nu7l`u^k5mgpBJ> znX;fTP1v3;x%7Z^m01sT&h2Slakd(K{7e|iy5@t6D;kk%-VN9r!Tf6P(#b0Pog$? 
zsRUA7Yt-liP@xGwyUjTRH_4-iBzsA|qYvTY1^-G%rzupf#3V_4)DJRN^!8;YA~}Um zk@oj4ZCBW4N)U#Rrrgvht0l{;-R-Lr4i7n0KF1?ky*A2W;=$kXo_(N1M)tFOB6~Z) zXl6j4Q70GOhtLwS%l%+(ku~(uoN>NB>am_BgbLR7E;Ilw(H*)wdDHG{nk@#lWDQCw zYFg$#NwJGJf>(TA$ZONMP#Z)4HEp0*--1^b7qXPA>R%H|0N1)D23sve#jo2JmTg{a zak7+)YM|4%By*;(*nA666qR!N8A>10Xzkvzie}n-7>n`?E?B4|k!ngQ9=`lkSw&Wp zm}6f%`ka{`C;&v6RBC5r08U{Vs_thLCD&H}8rd8avVABrx#3S$DtKwN$7=q-_ac(W zCPwCh6g|TTO&sNW599t3*``;T+yxFoh8#Zc_sgT3ur5CT+ry_mgSwVRY8g97G8L8c zg*bzcw>OMm23_x$$5$Lj^AuOT{avq5V`*IIkFo`H!Tru5z`5Pb9(6tpoasNT#X&& zm6#f{-|&&h5k;ImvnpM+BGGu|Z6e|OOMJ{bvNaL6=SbDT;d5jnPKIR4^bI1$;bXmK zgkM*$Cw|#h_muz^_Nmnah(UT2i)98pT@J^*GXoe@%?y?Ml%@WH1i&Rc& zP&8%_SIM?D7WR%o9_W-2Up5P#uthhd^*HylRK2oe5#+YmK=ujtqUNZUzz-P{+q$Gq zVxY7v(!9{mee@h?{Os$83=Mwqoppj?`B-BS-Z$S*ASnTYc#T4#VBjz|g2K_>bw#tw zwQbImzb$`bM+8L`SMNYc+KU;Oim0P_bsxod2X=-4kJGt&N#_1WX?7qQSP%U`Knw7) z%vtYtNUU&P^i*@!UwHg@DKnqb=dU)K1Z3Q358mD130Y^f`vi`XUtAyND74q^vk)bp zkAV-g3A_7(wa-LDGM20t+*khJ-P~JGDF-eiKR3HSkah)g)q)VWu?{8)7sUltV67TjwITpK8A6Y zF$iOumS-aMMQgA%BW#{op~bib$BE@Q2Js6(jF{qQAUV0pbU%;XQ=>IhGc^e-;0fgr zvv0`*qJ?B$2jhpzPd6;79J@z%qf{)6x-ijXZrvpLQ3$4lbnoPTIs-cY6nNGd+#j|y z8UaZB+Pj&8Gz8xqkz9C&J(zn)tU@<1m2%_>0=HGF9Nx9OuCG0=B?--6+F11I_awW%-a7QV+{YW>>>1B(k8lO?1 zA;45v$?VBHq32C61HJE1*x(>o7`##Prr`0as7g=AgwAR2v3k9bz&;DrZD$dDwRYf5%oIS4X zDoA@Go&EScy57EE4(T4q>*e6jS*(1hC~TibLis$4Gn~wB7`2|H6$7=|&P74@;YVcy zxgotwmyB)Hg`^s+ocE=vJj4xCeeA79_t=#hamz44Nf%-6JkijU*P7sBk9#?f3YTQ}C;8-5dBJ>3wW=M zni*W)%MKEFgw+K*#S;#gMyF;SC~Gn^zkIC{$P{Uwl4hK@+&4lR6T^KfU(-&Y2kW+` zkH2k{pjDUgc6=VYG@q*^|2lsEC~}=G&CFb!|LU3idGY5-{Ab7HYup4(FC)^|&x9T6 z-fq?lDWOPT^pg4xIMwte*cQ3&s@UG1J}79*)*|iAyl7RRSZST=dMi6r8J%z#%N}yc z>L=^a0y-Fl3%bOGWS=bDi&sm@B&+eL!a8E0SP|#qbTO)%bnTIqa4LWNSlZ$cv`T8N z3h)FPkMhZn<&wvdfafSGd36!q6=YGCu+!J?XKQYZE|kJe1qWUm3mGKbnMF=c8Em_L#gw@bF9py+mwvqqEz(Fcc)e)9Ley`u;-*Ku zD-BkbCHx6wicN;C_^9RzCd2h1abqWaD3qo2arDaKurx4Pp)YN_IN};Q zlBtTKY|`kkGL`#UfGb#X3Ouy&Gv+T88f(M#^pge~(a+Nlahd)shRaL~o(%HDQ#Uc_m1 
zq6o@=bvQnL)=~c{$CptrqPhP~4(5M%5i8Vnm{Ymv?F}rbCT42$N#8GCX0|OFp^Iz`rKjP+pN6aiO z+qhM6q<8(&kDluxX$!X{fOiZ6@m6fGus5sK?0x=JIuA`i#Mv&g+S^5TW6Z-qZD8&m zSdN8-+b@fb=8Mkrra>m@dd?L@SsS9c@Oz`W*oP_5o89koIOf<|_@_%f>?;zWH#*4~ z7oh?lE-$GC-|N#TN8z_>rXYUH=;) z)0EuZTh1nIZ>EaMN7S<6s!YDgI3A^D0>K)xo3*|uW4(WTh7QZ_Z|RrnD=))$o&ghH zV;3ipVhGu^A z_4WmxVZ1LD*v$oDgN95>*LOEYx?Z0Y>=R3rcDtL0ku}=L?*>lGFpZQYQ@h1o?`{}o zL}2t#&8OOFZG?!S%~sduf^ZHaVF4DX=WFY*n>t+W@LX>Fcev?Tluo!m)_#4tM^|sC zl_jA*)5!Q;+7Kdpqqd5Skr=*PHbt<{rnxpO5fZ+TLLvrUapvq&Tw!*%ijP^|go%0N z6b1aVDZMGJR(NilSt&g8-5q8|Nv&NYT$N1T>_^k3P>~ff2Hr_3-ibQi$@`(^gMN_} zw@F9E!3@iLUgM?5NhQtjMWR0XTafE^OC*vnAl2*m^XIMKFY zG7U@>BXG7|C-nxy5Gk1;&ZSEyY^thL*yF?f7bjT#*yKeRh6`+c{ZbNQI*8ri(E-<_ z6lvmBW2O&mbHAQw;x003MmRN%WS3v5u_B?qN^of9+b7&$3@bSN#&<@DMA6KQA!bTm z(c^hcy~B?KKwLQV$ZhNZCi>8=E@5!x;XT)TGRgc#PwVXhf!_5aCi~7x@7~!Qo1KU3oF+f*h* zOFd<|Lz=4;H&)4^5&|?0Y69nHqaO(?2iO~RLdL=3zhD4_ep&4+X@<2I z!WC4ds>h+BwT7RtNZg9kS!#etFgb}iqmC4C%U6va;Au>D%tGlQKo$5P#EQTj>3oNx zW?0iNA1DSnG^}s|G=lO&5t?BF$kU9O;3vK-!vI*F!1-%PoeLe+YZqb5=FL z*D~G-R%(0GJ!+s~bTjoT%B&GNx$|69U0V_ns8IW-fNk!Wgt7{-QDj3)?!|00*H~~s zwpxinOsRY)nzinQ1pd0UAjgTKiWsDhU#2=W1wMTj4`0oyKv1bgd+aMRIg(`REUW`l z64hEfG6W+{yNC1*>I&XMJ&YUHGdVH=v5}3=Gn4aT@j7|QgNilO-7cj%eu%WWbt0o9 z6d6(mD{hoLj}t`@F?#O2zXgmGZqoY<+G}=QNS2x7 zdGb&nAfI7eKz>^r6x7^eLszT8hGvqcY;@5i?Y%%LF*?`mlUS|XO!_ztxdpZ0x}2%X z>ygUR@{^4%TU}Ao1cR&7M}C&Zo~qMwrm7lVjHPGLB)U-}xCMw=SX555Bw?_zHJnnf z$&#0odTt(0BmDP{PHm3}kA@CjUIe>ZVQ zhrg(fA@j~1gyC>3**zmGbJ&h@Vsl@^^OTu$$tBqu*d>EsU;+%sp_U(doQ!(@YA{;c zS$z41tP2oo0K*P7DSr00RSS|D1~6?#3s3nPJpGDab!+N*qWB9DH|XH7nb0@(A)_|r z8@yi_kW^Oc65N3M&l}1dVI4o|Yu=S?zmDz!9WV@t*TtS?QYpV@;GsQN?Dihl?+N!$ zZ?fD5X%BdJ;G38Jf17gPq869DeXiJjq6M^n9kG99q$c(@_D;@!W*Y@bLUy0Uh*&l9 z6Obb^VI^^I(McpY1*V*h@yI>lTA_n?mZGHy5poaD%SqRhKPkhEhGgbCIc}pQg816w z7GN>7rkwL(83vS}4F=CMkw}a=t3Z{~V|Pnug^r=FeQP-gmM`SZaElM&+YuHiA`|!yHH4FAewI$7QQW>@9lDy=o-y`mJz3;0r%chDBpCppbgAj?a$%@}Ak2)taaIl9=>ddK5@%{sO;3C<=KgrJ(9KEOb8m^C;r1hkSn-JszeVbW)ZXZPJ7F 
z>ir|yaBLdxtx-q*#fj{+wJC-EgAn)zgIWNN+mBfLMKtmg*RYg!3sr?-XWuMaiyJ+* zug<;8(WEruM19%I04YnUkW+!r0y;X|-Yv61Jm@A2fQAb$5t0)iSay zek5WPBx;BTYrY}i*l);k$g`l9}V=C3Hu*QCuiZgVv!ILziU}c(kYh}i)*bs;z#?rmMKUcbbwT_ z&d_k_gc$4s%T`bWSc{|&edJL*m84u`90V3n%m6BU+z31Our67ka8lJw*vg=p1*xwP z21UtAQt+bbC2Gv{6hX*%J?EY=!TbEesgIB0`K0vCej1p+RJUR^vQ+Kswf>?9O7DV5*RR0rTp6L{N{52 zDe@ZQ@LGnPIRMtJ!KTWkIgJMExRYc8o%L(K{EOF z%pp25UDgmk+J{Ujrr(DqS{sG_-b053+B7Bh>RsP-7+mht4S%ucA#&@R5U0nZcZEzn z@^aytJCW7nsYYf>54{rV6Qoa}7FCur>RIL_h58sRik``74|p&|IeI+k8#biS-k{^s zk=%=g{1RUOAqA_S~gLfb*3=2rl8W9ISQ;rhn@p4w9*P&0eT`%Q+DAiqPeJ(xaYj^|Wg{jKpJb)$g6bsKDJ!0q*>i^Uu; zn*fSolD4b_xEWJI8tLX_aLsFGWXHaSsv-S-Le`uYyU8v0%QQMTO&!*1P;!@PsqS%3 zCkLZc{wiTg>-?37p7bL8&I=!u9=_&$XQTsC2sh38qzlnXddb;RzEj!P?oOXT?p=`P zj7X!|{D6*KoH}-)f(1r3tB1w$k>^MpcFQ8j#~n%pQp_A-$JdC`7?r`Kl#r&*0YqG` z0Nd+ROD)4MnQ}Hs<_fDXeW6FZX{o=D9lvwcO^Blmi|d1u8<-A67KQ$j4dr?wz~IoT z^8y8{AEXu2rB(+*I=_;J&L+&U38)NseF_D*tjXwU2@WH8`Asz;?-?8j^7NYM_ZpTN z1FzHQ05L_fo~lqP+6#bV}-`pcLKq2Gf?7gecf=P$sBq}2hZSB1^ zP7+J!wkGDjw*AQKP3P~lozb%yBsW|MPq-S{imN{~|lNxB> z=maGfb7%wZRPSpg1MuncUp2NA1ji7Fq#c-p$(F`W%|v6^>=GIwP`C_`oj9_3J}Bo- z;tpNtFuGH*P_h7I&U<`~rVyEb_SX0PNHMdMkn!d!&e_xjc{r^fGAX)+*S^S+Efwo8 zp|$oeDkzgLZO@-Km0)=@rc9B`yjc%+fI86`%87hbhRi*-O3Qpo${8SVhnxBpS|4MU z*YTF6W)EwAsOGGgY_8@B<1Vbe(0_^_a510p(J9iP|KZ|*?yW)*?&T2v+m*y(%j-ZM zQ;uZmu8%Q#X(B9J~gq5vNC*P$)p7qd!4WG}Rkaxz}j(7f85iWhm_mQuTQcU0c1 zw@#5ecDKc5*H0)vpV*(aBPD%JqnqwM@+U+09rFwZ@dMo%cThC;27Q%L%0# zP^4WfvZ=A1Osc6z%9<=3p=t1y_(}|SF8(p0-+7e6uX^t6X-rFzAHHgWSh?3ILUR}l z;7YVtkWOO6D048Xz_>r0uM}XY?BWi)*2kdgo1giG$M}^dVml-a$Ti+xjLEEmoYYZP zSkGVMvXC)PS5gxUu`+WnQ0juh9C((&+?Ln0eosm?vg#O4<%&9-I;YcX^Ov+$c-nD{&ax zor2=oOO_PrxSH81O<~h>V#IVwoa2bV@Bj}ixdaZhYgaY%U%`q93h^9ip=}FYEVR(q z%inRA&#ue-eM5L{Avc8V83l7H_^~v;8{+q>V$FE3Yzg&o*4Ij3SYU@MAM7O`B``VV zLt<~&=lbyYkE_6#C1x|PnrPNS^?K49Wxq^N^;7p`QeK=lTZa1xKDeRn%fY#jG0jJN&NqPmpyCx(^;+u_ZW%gyu_t<9k8UP@+F+0FYzn($M zNP<5=u0wPq9&fU_*tRxDFz66g&g6iaf;XX2N_YUzjHpDJ(h{a1mcn5vru?WT6?}Ug 
zReV1npq8R_bPB4{87Yk0YN1zz=(6|lS#_@qg8nHt9#|oF?cdpqx@4Y^exvX+&;!A> znB8iI;TPw9JW!K?I{VBVzBTa49M>2C4IrvqFwt`xzaPt1#C4lTDMM-Jq>HY*NH*m0 zlmAk^_G?sgyu_X7F3$zjX)IZ}Er58?2#c{*L0mO~)06)u?e z9@_sy@Hop-l389EvLQKr^u)28;=%*r+X*X&J*^x!RGNqUk>qRjkD*w#8a>I$fyAxw z;mtrU!564;8*n*hB3|w!2n89g6gpSgjA@D`d`2G?n|19nOkMsQ)o=M_68inhqRKzy zI~JN+lp23nu)oS#z2r5ZvEFetR(vqVOx4oZM}EN}NBd2| zhOfgAdKO2es-@aEp=MH78q#w1PgQwWoo!&K{H_@ZH;7(Una+p${^?N%v4xQ~7oo(d z1UegUf6I=9S0FefJ}b|Fzpp(1ie@_YyX@$GpX0K@)VD@a!cFu|WH=_xO}H2X=FB~u zXCN<%0jZWc@MPAK)Q&}m{+oSKHY<+G3bS=8Xf##+k9l}T%s z+<6w^Kbw_}YjyFt#)YsB+pnxkA<^n$k#p&FpJnF^zXF5_F(Zwjsw^z07uq65Ue8brcPCQ;U)nT7koPa7;pN}PlzD04+WHEI&{F>4l#|cc*9XBZb zIn*M`wXVTrQ^#8sb}!tQI>C{~dAaV%Hw zO$exl8?6b8fehp_^&F>ZcDGR+ghPCdYRPK#qB4{69NFtaW!UVqe3Y|FLJgD<^qW?eArsDjQ44|rUfFy{wQ_X+#b%zLxd=M=Sc z#KcoIETBR5Q%rC$Lv1vkw}>uF?!=^&d4{WyK4Y*X6vv=!>sjO#3UN)@2cmZJ5WZX7 zD1bqD(UuD-wTeo(>u@!Kr~AOD_9g_uC~ZJ&(S`&KQ0Pf)0-yv#=F!FmwII1PA6%@< z7Uen_HB`ZJ{j>#^Bq;-9_#7-f(3NBWwpZjUw6W!?cB3`A1z){_4lP*H#8!?mDnY)L z?v!Qf0l|r56DlB8jm8q^&Rw68&Xi@=Bt#$Uw#g~ZxO)4FjAx;aWGU@{Zkq>P%Gr;e zbv%mGe{L*&s1F3f(!ZrRLMX!lG3W{yRkrz8rHQ3CtMmE%#mRB1G*73}Vg&$T?*Uh; zm?)S&2fT&k#6RI&C%p3zHwN1dQUTI#39$o4#fPT$j?b_UmzMv8ZL;lqclQVoz#QI` zj5SabuEzvMFAJqEjIfH75HN&V4lglpQ$|vc#58VYCrADyL3OMS};6D zM=mpq(m%*U@3h)JVzS1&fQ6Pt>00xalpbO|@vuCeECqTUAt~onoap>J4`#&-y*nG#UH=J#}M9;>-<_s}|U zqdaExcXVSZ?0B5s2bJ=iyjL;9u$hDN-O-xLE*QFnTfx{Q3W3h4VrvZDeVYNN%MyTG z0!cbQCCZKcps)a|{z=|*Ro~yJxsCn$j`kCs59-tv2$&JH`>GG+vO1O$NXh6Dv-h-S z{K5-EX($ZI_kII(E{_~*_Biii!riPZ!0IAAt3gG1(s}XPejze%N zl>6HPOjjW|Y)cWH#4+&ttwAz%iitcsXdW9 zpEekp>RloZq$$yUW)Li1yie6^+nYt=n~e3(%n^lcbfipOo!P~Q z!ypZ5!|$tJjkGmT6L)M>l@vuw0P^%r2&@#+C12U?CXDkA*k!MJbI zD{85_UEt3eCaFKr(4>H%9vlh-Y-G^a)Rtk+V&bh&(fOAwj-&R)!Bx z+Cx36lY~vnMog*`a`2=!u?{{Q6w`&GsXAPNjwySl=Bh}EXeaILzi?1Ky@M$X&u7u& zPL(j;V`XD@gf8S8-PI=jsE3!Q_UxuXjI6n<&ho#>Fte*py?2@BKob_tyh_{x>m^)^bLnadiywRwK3H)B#=_u3|vFkfd-6S zLSNG+qzRm3b>838UH-G`yr!!OOiWb19aBg5si3|?D+-Pqdpp11IXwJ(L|`5cPrdg! 
zyMy>S2>%y_|M&dP|D4=8)mXF3XZ(Ct$La(4X#*U`Sxs_)tb)3-)pz*~XiP8|-NDo{ zsqDC`nrXUdBbQuF<8x+63WA5lWGcfsop#dQwQ%G24?0g2AQy8WI(y8i(cj!aU4BK= z;M!|501HE0i4(LG`>0+PN8|aA zp5jlQE<1Zdb1_{cpJ`7|w3?RYx>hYDL*ob3@=u&KcTAMmik@EIOIwu8H<96)9m!1emjTZv5J;^$$JoQnj&4! zKCO?k0l`}F@6k=W&Z1*zWf=ryV~CAhm)e@n5&ob_U!b~>LlHAg31zHHL`r02Gdv`P zzQv`vLbgda$K?m|ZJAy;DzU1<8A~EnFIM9?ZJU^7wEynLDmpK@=Yop8M>pcWlJn%L@*!b4w48g zmYYoiW=7SG`Q!zY^2l%ZNxx>5H7QS@(#-E5SEH~WsDa%C%7nZa6P+0n=3QA(HcL)% z1W>cxHLee5ml5l8^j-O8)2M(y8`;r{wB~ee~#& zWLkA*CZ|;-L&3%@8HR~X-TbtIG7Xh%WY(0_X;jaf54ed=rg1E;iRi7UcW_~)TERPzFq~zo_cGqY{q>&r*Vuh zYJDN9mQGZJ4M$yI#9E=D8Z#tX*=Y+V>YFrF>Z3g>0MIUsM zDyks$?+~!6nD*;n(j1i9DjCRSI-`V}XtQg5x;6YR;MJ?hVBQMETa#>P?U>VkSbR|= zUNs=QE)-Xv`sx}@22Azq9HkDzsXNEU>Jj5)*XN8&+2Vwz~cFHd&uKp#W_*g<=VPZH!p1Zvyu z77v2)-R4C74-!Q17YQ*b-z=O0lLLjNFaNh}#Ip<|9u%%FHTE|aCas`-i&OUw4{-q1=r6S3;$<>#Spf-*AK641ZeGol9&=4p0NsBZ-ym`aRblmLEs3XSBG{puip_~_D~lz{g^J(Mv?qWg z3m!1hO0UvCK@j$zASlM4k*M5MS00Rv+h^kjTa6rRG9sJ_o^fh!+#~ZV3*JI1(xNmY zdSxDTIZe1J>q0h%==xCBRiy)@wPa6fl2PifjNCj4=LBVuUyOyeTjo9^_IY6p6(>5e zUOxE^S`X}tmt0?rub_HcFc`b# zXoa&cYwk%GujfeqBzp#~y*23p2Wp^dAZRzqv75@0h)|zTzG(+)Qke`B7F-=`n-(?* zMsl;}YPP-ydq@w+vVees#vX#l#avE6DcqEpwaXO%LLZGTn5YA>kiFvSkl8;$5T3ap zE28oYgLLAhr75fydX>rda0}`F%a@_ZWxgw6f#rSNW(j*S5=-0z&kcvm%LkprFsdFx zr#H|dR-Qlu=;s+AgeM4ta8Xo)<%wss8bU;e@4HH zG;za8!e|f*D2qH4xpfwcovGgX40rCB-Yt!kX3CtDF)Q`JT2-D}J%gewS!SCj<`i1? 
zt5O z31+kio*Q+Up4Kg^Mhk3gCetnO#nu;L_UKH4JHIbp9(;;K_8#tZ{3zq}h>d6}ULJ4b z4mKKw5t0+Cj2xoiUpR>W69*9kK5#U4bA5JzESDb%wC?`NTncR>}A^ z@b6XT%sgF2hB}qF!oh^xI)Upp2CK6>7Y_`_P?y$ z724;Lu5?s64==F9n54I!Zqb5e6%508ijnC)YnQFsi9zd*raK}ubE__BCz65ZT4tc7 z^ad$B2mIce7)t82?OEWwBVgDrMc~8RY-JZzmK5ugWB7MG1%+%gQ(aGMynZd}x;g3h&KHs|s{LeeYfPk_Rh3wZD z;X*Dzp9$bLaJV)U;Q}j+B{gnqjDQoHEaZwz^6E&#D-uqZ;+kgWSG4SbZp;i8y21%OL{Rfx(Y zt1JJo&13m#$W=4jzmB!Teqm;VZL!V1nuY)}QeM}d>Ocs^HlKEcN?Fvera%B+nUD8) z>F6#CSFzBX++H90nEG5?&t+^=^5|Eym68<|dDLWRP?w)ipOV#V4TuS2gFG4Z03+Lk zIL05ZX^3&eYCN`D2rf*Ab@BJG)%fvfiv*D+eM`=R5?%|jHXq%nAgLuV@ET1Al$ZGb zEN4brvgP-4+239D-I>LoA5%y>sPx38tF_{r+`+K%^%ZNAZO#gx0VHnjh` z`gy;B&A-QA4uAbr6E!}UzKOt@5<%qTo19;iI+}ig z+I#^?oC+xZ57|X0}(f0$V}95W6Ze6v2wM{-YKsOI+>cilW;zJ-5#Z{V4qOIH@0{=vSQLA|i1SJXmn zM)D5!wzn=W?n@pBZ~ANUl|ewpnU`zJ<{2(5|4U>gQvcf+sBB&~>D`52=CY;Fa}FLa z$a=qaPFL)8*SkpaIm|K6+>HdT#nW^B4UA0*&H}7XQ<%jJ;z&B|AjLFtY8VuZNiG&V`vyoR#m_|0~@m zE@OTr@PYl2Eh?J5!Btbwbf`N_e7M;^um$oDX@2M;D#M9(-Q8n-`|tD zYx(9kjfztKOsf{Lv@NO0&k!is9T&!)VCea3>DLyfIEjVX)4Sa~SD%ZLO4`}Dh+*!f ziYGQ&y?)0I^?O+E{QV&1RJKHg4D%_wtRqi!)}Jmq9;?@|El`KUBGlAivTDQ4^G}51 z|5w<3K5uSZV|}>XxW@kQ=9AaeXMcEk{~K_E5whx)kx3VLlo|H3m|^J%Mq@ddIlvp$ z0Myf5VH!a+Xjld5Y*%!Rs3%+^Yg7TQ(ZYS+CAvB2ryL?o$^f1(jdtcChGw+$-4L2r z^MEyDoc4xp0{UrY2owGTgB)l*cs*<}D6ODtML#(Tq4gXvsR6a3pCg5C0Q%V%2m^e9 zlgUu8<2wlh-9Yr?BoGFAOM(rAPfkN37I}&qe6$9-f#`=aAPfwX0vib3-~}-b%t0K{ zfNmK2kp>9EQh_IP0lkZ97~+5fbi>fk13(y73fvKo)i7|U534uPH`5~wYn25XhIMN_ zx|7iR=m=9@s31EDY4;$y5$Id!5JpI-p%?+*P={^;ZbwYfLoorog%KQC=tiKAoFW`? 
zL?77*#7c2=6VTUVB21WMf@}gJ!JwOfzOVve0=F&L1b9kDPQ&P?pie?0Oi^{fa0fW8 zp__m^ES7m7n*a_AtR~=&kP=^H6CkMqs}Z;zvBVGA2(Tl-sT|!Ks2vDIFfcF_1v4<9 zcPG$wqqg6WbsL5u>4vrD(6yu2!3YD^L_nO6Tq6f~vjPu90iBvG#qbEo>WT#M02&(F A^8f$< literal 0 HcmV?d00001 diff --git a/src/main.cpp b/src/main.cpp index fd52fbc..673330a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -15,7 +15,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1000000; // feel free to change the size of array +const int SIZE = 1 << 15; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -224,6 +224,147 @@ int main(int argc, char* argv[]) { //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + + // loop 100 tests to get avgs + // make time variables + float time_N_S_POT = 0.0f; // naive pow 2 scan + float time_N_S_NPOT = 0.0f; // naive not pow 2 scan + float time_WE_S_POT = 0.0f; // + float time_WE_S_NPOT = 0.0f; // + float time_WE_C_POT = 0.0f; // + float time_WE_C_NPOT = 0.0f; // + float time_SM_S_POT = 0.0f; // + float time_SM_S_NPOT = 0.0f; // + float time_T_S_POT = 0.0f; // + float time_T_S_NPOT = 0.0f; // + float time_R_S_POT = 0.0f; // + float time_R_S_NPOT = 0.0f; // + float time_CPU_S_POT = 0.0f; + float time_CPU_S_NPOT = 0.0f; + float time_CPU_C_S = 0.0f; + float time_CPU_C_NS = 0.0f; + float time_CPU_C_S_NPOT = 0.0f; + float time_CPU_C_NS_NPOT = 0.0f; + + for (int i = 0; i < 100; i++) { + // gen array + genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case + a[SIZE - 1] = 0; + + // cpu scan POT + zeroArray(SIZE, b); + StreamCompaction::CPU::scan(SIZE, b, a); + time_CPU_S_POT += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + + // cpu scan POT + zeroArray(SIZE, b); + StreamCompaction::CPU::scan(NPOT, b, a); + time_CPU_S_NPOT += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + + // cpu compact w/o scan + zeroArray(SIZE, b); + 
StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + time_CPU_C_NS += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + + // cpu compact w/o scan + zeroArray(SIZE, b); + StreamCompaction::CPU::compactWithoutScan(NPOT, b, a); + time_CPU_C_NS_NPOT += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + + // cpu compact w/ scan + zeroArray(SIZE, b); + StreamCompaction::CPU::compactWithScan(SIZE, b, a); + time_CPU_C_S += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + + // cpu compact w/ scan + zeroArray(SIZE, b); + StreamCompaction::CPU::compactWithScan(NPOT, b, a); + time_CPU_C_S_NPOT += StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(); + + // Naive scan POT + zeroArray(SIZE, b); + StreamCompaction::Naive::scan(SIZE, b, a); + time_N_S_POT += StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(); + + // Naive scan N_POT + zeroArray(SIZE, b); + StreamCompaction::Naive::scan(NPOT, b, a); + time_N_S_NPOT += StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(); + + // WE scan POT + zeroArray(SIZE, b); + StreamCompaction::Efficient::scan(SIZE, b, a); + time_WE_S_POT += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + + // WE scan N_POT + zeroArray(SIZE, b); + StreamCompaction::Efficient::scan(NPOT, b, a); + time_WE_S_NPOT += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + + // WE compact POT + zeroArray(SIZE, b); + StreamCompaction::Efficient::compact(SIZE, b, a); + time_WE_C_POT += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + + // WE compact N_POT + zeroArray(SIZE, b); + StreamCompaction::Efficient::compact(NPOT, b, a); + time_WE_C_NPOT += StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(); + + // SM scan POT + zeroArray(SIZE, b); + StreamCompaction::SharedMem::scan(SIZE, b, a); + time_SM_S_POT += 
StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(); + + // SM scan N_POT + zeroArray(SIZE, b); + StreamCompaction::SharedMem::scan(NPOT, b, a); + time_SM_S_NPOT += StreamCompaction::SharedMem::timer().getGpuElapsedTimeForPreviousOperation(); + + // Thrust scan POT + zeroArray(SIZE, b); + StreamCompaction::Thrust::scan(SIZE, b, a); + time_T_S_POT += StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(); + + // Thrust scan N_POT + zeroArray(SIZE, b); + StreamCompaction::Thrust::scan(NPOT, b, a); + time_T_S_NPOT += StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(); + + // Radix sort POT + zeroArray(SIZE, b); + StreamCompaction::Radix::sort(SIZE, b, a); + time_R_S_POT += StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(); + + // Radic sort N_POT + zeroArray(SIZE, b); + StreamCompaction::Radix::sort(NPOT, b, a); + time_R_S_NPOT += StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(); + + } + + // print avg times + printf("CPU Scan POT: %f\n", time_CPU_S_POT / 100.0f); + printf("CPU Scan NPOT: %f\n", time_CPU_S_NPOT / 100.0f); + printf("CPU Compact POT: %f\n", time_CPU_C_NS / 100.0f); + printf("CPU Scan Compact NPOT: %f\n", time_CPU_C_S_NPOT / 100.0f); + printf("CPU Compact NPOT: %f\n", time_CPU_C_NS_NPOT / 100.0f); + printf("CPU Scan Compact POT: %f\n", time_CPU_C_S / 100.0f); + printf("Naive POT: %f\n", time_N_S_POT / 100.0f); + printf("Naive NPOT: %f\n", time_N_S_NPOT / 100.0f); + printf("WE Scan POT: %f\n", time_WE_S_POT / 100.0f); + printf("WE Scan NPOT: %f\n", time_WE_S_NPOT / 100.0f); + printf("WE Comp POT: %f\n", time_WE_C_POT / 100.0f); + printf("WE Comp NPOT: %f\n", time_WE_C_NPOT / 100.0f); + printf("SM Scan POT: %f\n", time_SM_S_POT / 100.0f); + printf("SM Scan NPOT: %f\n", time_SM_S_NPOT / 100.0f); + printf("Thrust Scan POT: %f\n", time_T_S_POT / 100.0f); + printf("Thrust Scan NPOT: %f\n", time_T_S_NPOT / 100.0f); + printf("Radix POT: %f\n", 
time_R_S_POT / 100.0f); + printf("Radix NPOT: %f\n", time_R_S_NPOT / 100.0f); + + + system("pause"); // stop Win32 console from closing on exit delete[] a; delete[] b; From 3d2670029c8e6c054e8688207fdb5a0e8305ac9a Mon Sep 17 00:00:00 2001 From: risia Date: Fri, 14 Sep 2018 11:32:14 -0400 Subject: [PATCH 26/37] More E.C. documentation --- README.md | 44 +++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 4b8e812..20142a8 100644 --- a/README.md +++ b/README.md @@ -88,26 +88,48 @@ Once the input array has been sorted for each bit, the output is correctly sorte ### Shared Memory Work-Efficient Scan & Compact -An alternative implementation of the work-efficient scan using shared memory to reduce latency is included. Each block stores an array shared among its threads to store the intermediate values before outputting. By reducing global memory accesses and instead using faster shared memory, we can potentially increase thoroughput. -Both the upsweep and downsweep are done in the same kernel as they need to both used the shared memory cache. This means we cannot dynamically change the block and threadcount as we traverse the tree as done in the global memory solution, and we must be careful to synchronize threads between write and read operations to prevent race conditions. Each block essentially performs a scan on a portion of the input data. +An alternative implementation of the work-efficient scan using shared memory to reduce latency is included. Each block stores an array shared among its threads to store the intermediate values before outputting. By reducing global memory accesses and instead using faster shared memory, we can potentially increase thoroughput. Additionally, rather than increasing the scan buffer size to a power of two we need only increase it to a multiple of the buffer size. 
+Both the upsweep and downsweep are done in the same kernel as they need to both used the shared memory cache. This means we cannot dynamically change the block and threadcount as we traverse the tree as done in the global memory solution, and we must be careful to synchronize threads between write and read operations to prevent race conditions. Each block essentially performs a scan on a portion of the input data. The depth level (required level of looping) of this scan is log2(blockSize). While there is some overhead described below that makes the depth comparable to log2(n), the reduced memory latency in this portion of the scan should provide improvement over the global memory equivalent. To allow the merging of the blocks' solutions, while we calculate an exclusive scan through the downsweep, we save the root value of the tree in the index blockSize of the shared memory array. -The blocks must add the root value of all previous blocks to their total to calculate the correct prefix sum values of the array. A second kernel call to do this to stitch together the blocks into the full exclusive scan is used to ensure all blocks have written their data to the device output buffers before attempting to fetch it. +The blocks must add the root value of all previous blocks to their total to calculate the correct prefix sum values of the array. This requires a second scan over all the blocks sums. To simplify the problem of having to recursively scan and stitch together blocks of data for large input data sets, instead of doing a shared memory scan for the second scan, the Work-Efficient scan using global memory was used. This second data set should be much smaller than the first, keeping the extra compute overhead minimal, as log2(n / blockSize) = log2(n) - log2(blockSize) depth level in this scan. 
+Once the root sums were scanned, the outputs of both scans we put through a kernel call to stitch together the block data correctly by adding the correct sum value to each of the original output's values. This requires only a simple parallel addition. ```cpp __global__ void kernStitch(int n, int* in, int* sums) { - int bx = blockIdx.x; - int index = (blockDim.x * bx) + threadIdx.x;; + int bx = blockIdx.x; + int index = (blockDim.x * bx) + threadIdx.x;; - if (bx == 0) return; - if (index >= n) return; - for (int i = 0; i < bx; i++) { - in[index] += sums[i]; - } + if (bx == 0) return; + if (index >= n) return; + in[index] += sums[bx]; } ``` #### Bank Conflict Avoidance -This algorithm is further improved by using offsets on the shared memory access iterators to reduce bank conflicts, events where multiple threads attempt to access a region of shared memory at the same time and thus must wait for the bus to become free. This is done by applying macros to calculate the offset on the index based on the assumed number of memory banks. These are taken from the example code in GPU Gems 3 Ch. 39 linked in the instructions. +This shared memory scan algorithm is further improved by using offsets on the shared memory access iterators to reduce bank conflicts, events where multiple threads attempt to access a region of shared memory at the same time and must wait for the bus to become free. This is done by applying macros to calculate an offset on the shared memory index based on the assumed number of memory banks. These are taken from the example code in GPU Gems 3 Ch. 39 linked in the instructions. The offset macro is reproduced below, as is example code of its use from my scan function. 
+ +```cpp +// for reducing bank conflicts +#define NUM_BANKS 16 // Number of memory banks assumed on SM +#define LOG_NUM_BANKS 4 // log2(NUM_BANKS) +#define CONFLICT_FREE_OFFSET(n) \ + ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS)) + // Offset added to each shared memory index so that more threads accesses through diff bank + // so fewer must wait in line to use the same memory bus (thus less latency) +``` +```cpp +// Upsweep +for (offset = 1; offset < blockSize; offset *=2) { // this offset is for calculating the original indices + access = (2 * offset * (tx + 1)) - 1; // index of shared memory to access + a2 = access - offset; // secondary access index + + a2 += CONFLICT_FREE_OFFSET(a2); // add safe offset to access index + access += CONFLICT_FREE_OFFSET(access); // add safe offset to access index + + if (access < blockSize) sBuf[access] += sBuf[a2]; // manipulate data at offset indices + __syncthreads(); // avoid mem issues +} +``` ## Performance Analysis From f2b4e85de972d9fbd7b9ee89dd701cbaa7fb1919 Mon Sep 17 00:00:00 2001 From: risia Date: Fri, 14 Sep 2018 11:34:41 -0400 Subject: [PATCH 27/37] changed some code spacing --- README.md | 68 +++++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/README.md b/README.md index 20142a8..773b1bb 100644 --- a/README.md +++ b/README.md @@ -25,27 +25,27 @@ Before we can sort, we actually need to find the dataset's maximum value. 
By tak // each thread compares a pair of integers from the input buffer // and selects the greater of the two __global__ void kernFindMax(int n, int offset1, int offset2, int* buff) { - int index = (blockDim.x * blockIdx.x) + threadIdx.x; + int index = (blockDim.x * blockIdx.x) + threadIdx.x; - // compute which index to compare - int access = index * offset2 - 1; - if (access >= n || n < 1 || access < 0) return; + // compute which index to compare + int access = index * offset2 - 1; + if (access >= n || n < 1 || access < 0) return; - // modify in place - if (buff[access] < buff[access - offset1]) { - buff[access] = buff[access - offset1]; - } + // modify in place + if (buff[access] < buff[access - offset1]) { + buff[access] = buff[access - offset1]; + } } ``` ```cpp // The loop iterates deeper into the reduction until the final max value is sorted to the end // This essentially sweeps the max value up to the root of a balanced binary tree for (d = 1; d <= limit; d++) { - offset1 = pow(2, d - 1); - offset2 = pow(2, d); - fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; - kernFindMax << > >(size, offset1, offset2, max_arr); - checkCUDAError("Radix find max fail!"); // error checking + offset1 = pow(2, d - 1); + offset2 = pow(2, d); + fullBlocksPerGrid.x = ((size / offset2) + blockSize) / blockSize; + kernFindMax << > >(size, offset1, offset2, max_arr); + checkCUDAError("Radix find max fail!"); // error checking } ``` @@ -53,16 +53,16 @@ To perform the sort itself efficiently, we generate a a pair of boolean buffers ```cpp __global__ void kernBoolMaps(int n, int k, int* input, int* b_arr, int* f_arr) { - int index = (blockDim.x * blockIdx.x) + threadIdx.x; - if (index >= n) return; + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) return; - // retrieve the kth bit from the input val - int bit = bitK(input[index], k); - // flip the bit - int fBit = flipBit(bit); + // retrieve the kth bit from the input val + int bit = 
bitK(input[index], k); + // flip the bit + int fBit = flipBit(bit); - b_arr[index] = bit; // maps bit k into b_arr - f_arr[index] = fBit; // copy flipped value here for scan + b_arr[index] = bit; // maps bit k into b_arr + f_arr[index] = fBit; // copy flipped value here for scan } ``` @@ -70,15 +70,15 @@ The f_arr is scanned using the work-efficient exclusive scan to generate the "fa ```cpp __global__ void kernRadixScatter(int n, int *out, int *in, int *b_arr, int *f_arr, int *t_arr) { - int index = (blockDim.x * blockIdx.x) + threadIdx.x; - if (index >= n) return; + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) return; - // We compute the index to access by checking the boolean in b_arr - // If true, we use the index in t_arr (true indexing array) - // Else, we choose the index in f_arr (false indexing array) - // The index "access" is where in the output array the input goes to. - int access = b_arr[index] ? t_arr[index] : f_arr[index]; - out[access] = in[index]; + // We compute the index to access by checking the boolean in b_arr + // If true, we use the index in t_arr (true indexing array) + // Else, we choose the index in f_arr (false indexing array) + // The index "access" is where in the output array the input goes to. + int access = b_arr[index] ? 
t_arr[index] : f_arr[index]; + out[access] = in[index]; } ``` @@ -96,12 +96,12 @@ Once the root sums were scanned, the outputs of both scans we put through a kern ```cpp __global__ void kernStitch(int n, int* in, int* sums) { - int bx = blockIdx.x; - int index = (blockDim.x * bx) + threadIdx.x;; + int bx = blockIdx.x; + int index = (blockDim.x * bx) + threadIdx.x;; - if (bx == 0) return; - if (index >= n) return; - in[index] += sums[bx]; + if (bx == 0) return; + if (index >= n) return; + in[index] += sums[bx]; } ``` #### Bank Conflict Avoidance From f9a8cf14a1a67815d39ab09a101eacc0b9e1be1a Mon Sep 17 00:00:00 2001 From: risia Date: Fri, 14 Sep 2018 12:00:24 -0400 Subject: [PATCH 28/37] began performance analysis writeup --- README.md | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 773b1bb..721c7bd 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ CUDA Stream Compaction ## Project Description -This project implements a variety of scan, compact and sort algorithms on the GPU with some comparison tests implemented on the CPU. The base requirements were to implement CPU Scan and Compact Functions, and to implement GPU Naive Scan and Compact and GPU Work-Efficient Scan and Compact. I also created a wrapper function for the Thrust scan implementation on the GPU. +This project implements a variety of scan, compact and sort algorithms on the GPU with some comparison tests implemented on the CPU. The base requirements were to implement CPU Scan and Compact Functions, and to implement GPU Naive Scan and Compact and GPU Work-Efficient Scan and Compact. I also created a wrapper function for the Thrust scan implementation on the GPU. In addition to these base requirements, I implemented all the defined extra credit assignments. 
These were Radix sort, using shared GPU memory in the scan implementation, implementing memory bank conflict avoidance, and improving the work-efficient implementation's efficiency over the CPU implementation. ### Features @@ -82,17 +82,17 @@ __global__ void kernRadixScatter(int n, int *out, int *in, int *b_arr, int *f_ar } ``` -Once the input array has been sorted for each bit, the output is correctly sorted in order of ascending value. This implementation is intended to work on integer values, and currently operates on global device memory, bottlenecking performance. An example of a small array radix sort is depicted: +Once the input array has been sorted for each bit, the output is correctly sorted in order of ascending value. This implementation is intended to work on integer values, and currently operates on global device memory, bottlenecking performance. An example of a small array radix sort is depicted: ![Radix Sort Example](/img/radix_example.PNG) -### Shared Memory Work-Efficient Scan & Compact +### Shared Memory Work-Efficient Scan -An alternative implementation of the work-efficient scan using shared memory to reduce latency is included. Each block stores an array shared among its threads to store the intermediate values before outputting. By reducing global memory accesses and instead using faster shared memory, we can potentially increase thoroughput. Additionally, rather than increasing the scan buffer size to a power of two we need only increase it to a multiple of the buffer size. -Both the upsweep and downsweep are done in the same kernel as they need to both used the shared memory cache. This means we cannot dynamically change the block and threadcount as we traverse the tree as done in the global memory solution, and we must be careful to synchronize threads between write and read operations to prevent race conditions. Each block essentially performs a scan on a portion of the input data. 
The depth level (required level of looping) of this scan is log2(blockSize). While there is some overhead described below that makes the depth comparable to log2(n), the reduced memory latency in this portion of the scan should provide improvement over the global memory equivalent. -To allow the merging of the blocks' solutions, while we calculate an exclusive scan through the downsweep, we save the root value of the tree in the index blockSize of the shared memory array. -The blocks must add the root value of all previous blocks to their total to calculate the correct prefix sum values of the array. This requires a second scan over all the blocks sums. To simplify the problem of having to recursively scan and stitch together blocks of data for large input data sets, instead of doing a shared memory scan for the second scan, the Work-Efficient scan using global memory was used. This second data set should be much smaller than the first, keeping the extra compute overhead minimal, as log2(n / blockSize) = log2(n) - log2(blockSize) depth level in this scan. -Once the root sums were scanned, the outputs of both scans we put through a kernel call to stitch together the block data correctly by adding the correct sum value to each of the original output's values. This requires only a simple parallel addition. +An alternative implementation of the work-efficient scan using shared memory to reduce latency is included. Each block stores an array shared among its threads to store the intermediate values before outputting. By reducing global memory accesses and instead using faster shared memory, we can potentially increase throughput. Additionally, rather than increasing the scan buffer size to a power of two we need only increase it to a multiple of the buffer size. +Both the upsweep and downsweep are done in the same kernel as they both need to use the shared memory cache.
This means we cannot dynamically change the block and thread count as we traverse the tree as done in the global memory solution, and we must be careful to synchronize threads between write and read operations to prevent race conditions. Each block essentially performs a scan on a portion of the input data. The depth level (required level of looping) of this scan is log2(blockSize). While there is some overhead described below that makes the depth comparable to log2(n), the reduced memory latency in this portion of the scan should provide improvement over the global memory equivalent. +To allow the merging of the blocks' solutions, while we calculate an exclusive scan through the downsweep, we save the root value of the tree in the index blockSize of the shared memory array. +The blocks must add the root value of all previous blocks to their total to calculate the correct prefix sum values of the array. This requires a second scan over all the blocks' sums. To simplify the problem of having to recursively scan and stitch together blocks of data for large input data sets, instead of doing a shared memory scan for the second scan, the Work-Efficient scan using global memory was used. This second data set should be much smaller than the first, keeping the extra compute overhead minimal, as the depth level of this scan is log2(n / blockSize) = log2(n) - log2(blockSize). +Once the root sums were scanned, the outputs of both scans were put through a kernel call to stitch together the block data correctly by adding the correct sum value to each of the original output's values. This requires only a simple parallel addition.
```cpp __global__ void kernStitch(int n, int* in, int* sums) { @@ -106,7 +106,7 @@ __global__ void kernStitch(int n, int* in, int* sums) { ``` #### Bank Conflict Avoidance -This shared memory scan algorithm is further improved by using offsets on the shared memory access iterators to reduce bank conflicts, events where multiple threads attempt to access a region of shared memory at the same time and must wait for the bus to become free. This is done by applying macros to calculate an offset on the shared memory index based on the assumed number of memory banks. These are taken from the example code in GPU Gems 3 Ch. 39 linked in the instructions. The offset macro is reproduced below, as is example code of its use from my scan function. +This shared memory scan algorithm is further improved by using offsets on the shared memory access iterators to reduce bank conflicts, events where multiple threads attempt to access a region of shared memory at the same time and must wait for the bus to become free. This is done by applying macros to calculate an offset on the shared memory index based on the assumed number of memory banks. These are taken from the example code in GPU Gems 3 Ch. 39 linked in the instructions. The offset macro is reproduced below, as is example code of its use from my scan function. ```cpp // for reducing bank conflicts @@ -134,3 +134,13 @@ for (offset = 1; offset < blockSize; offset *=2) { // this offset is for calcula ## Performance Analysis +To get a rough estimate of the performance advantages of each method, a loop was added to the main function to generate an array, run each scan and compact algorithm, and measure the time it took to run. This loop repeats 100 times and the average timing data is output afterward. By running this analysis for different block sizes and data set sizes, we can approximate performance in each case.
The radix sort performance is also tested, but is not meant to be compared with the others due to performing a very different function. + +### Varying Block Size + +The performance was first tested on a set of data with 2^15 (32,768) integer values (minus 3 for the non-power-of-two test). The block size was varied at powers of 2 from 32 (the reasonable minimum, the warp size) to 1024 (the max threads per block of this GPU). + +### Varying Data Set Sizes + +Once the algorithms' block sizes were optimized, they could be tested for varying data set sizes. The data size was swept through powers of two from 2^6 (64) to 2^22 (4,194,304) for completeness in examining small to large data sets. + From cdd7ac8cc356184d56339a9aa45e31df9dda48e1 Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Tue, 18 Sep 2018 21:02:46 -0400 Subject: [PATCH 29/37] Speed vs blocksize images --- img/naive_blocksize.PNG | Bin 0 -> 15255 bytes img/radix_blocksize.PNG | Bin 0 -> 14984 bytes img/sm_scan_blocksize.PNG | Bin 0 -> 16777 bytes img/we_compact_blocksize.PNG | Bin 0 -> 16270 bytes img/we_scan_blocksize.PNG | Bin 0 -> 15869 bytes 5 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 img/naive_blocksize.PNG create mode 100644 img/radix_blocksize.PNG create mode 100644 img/sm_scan_blocksize.PNG create mode 100644 img/we_compact_blocksize.PNG create mode 100644 img/we_scan_blocksize.PNG diff --git a/img/naive_blocksize.PNG b/img/naive_blocksize.PNG new file mode 100644 index 0000000000000000000000000000000000000000..aba994a497ab6613ea3708afc116fccb8fb275fb GIT binary patch literal 15255 zcmeHuc~n!^+P7X?tksCrS|t!rX{8QGML;ABB49ui1QaC*I7PxNkf4M~iv_eoYE|MC z=ExXO2m~U5_F_RB1OgcZB&kNoBt%R?AR*yBLA|efZ`W<^`>yZ%{`j)U$~uR0_I~y= z{GMSy`@%kN80d5G=kw;x19^J5`OTa6H~74H^CLdf1Md9&k!%q7nV;YX+c~eM+gJ!( zd=loe*Ja+kd)UP@M?VFw7oGA5Oqe(C3zYUdpL5dm%)EKfO`dKp`_F`o4d06&dct}+ zq1vs8TX&oB?12>FjrSG_>=H#!rY~GgNjtxu8}^FGny{}A>oHwDu=aezZ<+(>z^`3?
zigI~i<>5Qahb8#<_&B$H4pS!IkSVN$5=0-DR@#R1OtG~F+>F1@2mpQ_xcsHd7fHQ+=D3r+7qhdon`({e+Fv^lJtFXvopECB z_FuoudNnpgHVth{QzvZ*X{Y1)$zX$x4+~e6VP-XIY-rl_NM~)EM%lvqx}mFWRxK&u zKlK)~?Yp0D5l^H}y?WNBaGw>)SiJEU3T}FOYY)5BcsMCbBK#`4Dt%_6(;1Vj7$25S z&P+@B1zsVN{xYB0;2xZ1Xj;DNbi*Z0ss(0~d@CF~!Ry32Yi6W91zs~(8mE<-Xve%8 zO2%u7Me$ITYDb(swkc}~JgzbhR$7$7u(BK|(o7D;_V~QCG-QPwdwNe07Xd3xN5i3a z#v_pB*ObL~qs353-s07E|32 z%n(|joteK8#zMK=NLCepVX*g5XDJ zM}&NeUIVdYuYD+O1M@}=tZwhFgNCH%`)%BqSGGD{X#K%OY~hgUn;0=`9YbT6n}xDB zhq~@pS0kscXEnn|UrK0pr@BQWdmWcTgiN+9!O}id+!x>QINIV?v=!xdKy;P(D^U<; zC1O!ue4Lo+A7S(2JMrt%U^)&#faVvu2#V&<%X zuSy zR?#|e(-Fgio`Ocwp5e4HJB7d=9>8w4euB)}gWb!|f!+Y$MR8IJMu*BfYsE-I3+QGa zXtN{xC1bRkfGJniblPzAgKKK5_!DV8HNkD&Rp-1yu@TTp?+!%OLGbI7lw3bjO&kQ> z+--Ef22oQaXUHQx`qd_6IegdbIQ@8^74CU)=LYMaSdFNPEH7FPj;&_*yqYPvlFiPW z(-n8e?RoZ&OD)lysZyf3&)6RRx!pcB@WeDrMhdi{1bjM|koZMI2D7)gUwSzUoP9Nt zarcfu69qyxJe^sED(IZQTam!I%_*ghq@QDlGUBe3ma~P4g5{0pej%6{bvJe_UcvyFMe}b_6Gobw${8r3x_$v8WLD zKyOYbp3i?8xzKv4i4qf1!`O#lvLcHr-nTukVJd-Fq&w?gFmhx{Dx0PM{t zq}7Z{=BuNwDSNu?th*y_4T&Q)fqfXU=I~I$11bX04*F7Jca@nv`evr#Sd{EUCFDoH zjGmT~q9M8x9Rj(q%>M7b(0WSFmSjaVmK6%KBWVVGL+ZEQ5Zd!ni^cssOZ#+j9boL} z-Lgd6(>os0AF-NWoah zrLOG^FQ)Cb#4|afz@k~Co3RDJu#1Vi<<6m?)tH~F5u^Ap#|r;Z2xvGrABEJ_zHCwp)l+cFmi7f(EnRL|5IY@9X+^8}-fFFyNgGCeNt3IiHrGPcbd!N9#)w1l3?{{D7uy9yMU8&z9{pC8N+KG#Wa4b zyEO4BC06RSAEhRCzTr@iBUWG=xvUVvVX4c-hoz3gLFpFGApz>V}y!H@wBkg)zZW zbuR_buxdB%eQM>pRRT&R-rw4S7J-V&r3J#pT39 za%W$}(_fM11Sx8I>SrGdX9;pLrcBQWkv4Fbv7>aTe5k5q_3!!i_GQSxVI0L7b*n*I zN$VJ@#Q(tBZs{*J#%)9!(x-!P(G_^kQ)zaLKTYfa{;F{t797?}3}T1UusG%sdl)Su zn7!l(`WAVqH+e8Q$`9*v;*1;4T_6m4Y99c4^Y2jWVnf$`cN zi{VgAZI(QSO~nMKqiqroiOY{3pesnNJ_*>;#^#dKqw*&mD(Ajwkq^z-QFe_&rM5h1 zO7)keZp3sc({dD^+~xATH5 z7$9hy3;wn%F+Jp1ZX&KqzF!-$8uw(d`uqF)GFXzfSCOmlN|uT_qC0P*|oEFv5ni1!n)$3jdRhiU7jf^q8!Y8MF1=s1h;Ri z2?~{fq6~mf5a&hVH{*rhGj|i^pz%7=aobqxF0!aHL zX?aE-a#dxQE7J}cKF!$S*KnDw)Qk4@LwTfR9nbLM(PPm7D10Duf-Y)UC|NKtTCe2RQE6aB- zxN-|vJxmdi5%5(1{edeo@^bex=^G%;eX)8huOI9omonFxpVwQrZEl?;7q5(~oZ17D 
z{=Qgak>g?I!Pq=@GUJmK_$@W-afDMyn+{WRYS+q*Cs@8_wOl~(jGx2_taA(66rSUvI`ARwl`0z?pi;H zQd8R_ne>V{M|O2`Y;T_&9Y`!Q0^hv741E15pp_1kemx8DPsY|um=yFt9@V9)=9Y^X z@s0(>xFvGmrMy@OCn9oH{6Y)riSmbzdPE7g&G~e4k5(&Kdly_Wf~RY9Y7};ISWzqo zbV!)tY4dPjBj>nH$9OgnII5XsR-BJ24`z;k0ax)*7@y$*oOXwT67t8D3EC!1d)|1AI zavj!7_t`9kgMaBjTqB;_n8Bib@lI&4YqGwgkadiGTSp0PIa7Hb=>?NsOzeUY{z)4! z!c$HcZAL>Z*SETVAUqKKS;>1cN^WrRk~Ax2!v4o?i1Lykyc4`=!TXCj2qDRaQZchk z!0Zd0>vy{{rtT;(+kG>xr9toJvFj#Qe(7rgmT=w9_G$UWwFRzEPX5HSm4p9cn z!*xI8*GjX)yI;Of2Nd1=`TJ%QtVjIx{iPy$;mh9FR{#<_wy|T?V}CDJWJX(*DYBWE ztGR6x9`DV@KGxg!M}n9inbYA6wvY;a)g2|AvpyMl-Bnyvby3xjku3au69HBlmI$PU z*DHSYaLsiJ6A%7cmFvws^|j6F9RA}*Jz^jTfqx%c@b<{?Y`Vd^)DYI0Pb$0b?*bht z0k=-(x&=d#!SEi#k5tIIMd0cw7RUA8^pVi(q=nxoR*|e<1t&X)d}N9Npe@oP@(~(t zB?zvtD<;sw&)#P$B;11g)G+Os;ydRafr!ppe@20+#*^tvZ@@#CRL@~sDx6nHkfKY) zKzZbhQ|cp5GHLQ92ySy?jM^#Q^N}w2Ey&T^2nq%_qTr$2*Fo9xklJE-msU}OGV+|d zpY{EeX#2i={TEu$>E;k>tGp*sX>dgZC8ypXgSVw)K?6 zY}G$O5{hx*N_O@kHh|Yo*GW$0 zsN6K^cdN8IHzpW%uope>baSf~lU%AeOKMBtadW6NNWcG&_rfy*)H#1|G1Sa~C8^|e zKLq_ebDz2Q+O@}m0emjr_V>2*S*LbZ4iY9g-5bgOzBF%lO~WAOU4BFUJ5``CX2fs-~!wk{ro%&51_K~%CY+rcQF7?wZf;-@ zR>X&H!-^j3!;9DKCI=a2RVMX_I^-f_TQ8_JlsMHXruWlzF$IB7m~4Qc-UKUtqvknfG=83vfY zLi4Kcw1lHpAg{M_62b>4)qSAoU%Wqy{^idm{fO;-1K~ja@vynmz(PG=G=bgMQwGE3 zF;n}JA6yLKJcY9Zb>)c>Nn1bXuMx8W#{mSr=`*~?uR?*yxA}!r>R?N*y^!k%qjgAM z+(g(Qb7CK`Qy&$M5Z;%r69A6be~JtEGSUaW{3A-G!p$)Y}1dqCI40;*b63O;6in0f0D8&voo6vSORnv`2?wofXw-_Ssg0v(Xv zdd7byj?!HVtr^+5@!_lbV%KtZiYvP;&@LTFJp0 ztoP?gviz)5gjD#ix=YDz!L{6D%veao)WuX?C7}uMr*iV$q9`H-XW=b!VcC0LY_06-joEpv2@l1$m{P zQho(Ln7R}<2f|9}j%zWlraIy<5F5$MPB;3wvMdV94#XB(N`-U+BJyFeGl=o2?qeY& zg!Ulh>YEEoh)0Dt5$FQh{7U?Pz@X*NBopgj*?gN5QP_tliOgfNd0s#A5WzYM8c1sB zs`?FVlLN)iJdry9*iLoiG>$J0=I_x}%Ym#zwh!ed9Ms=LcqSykNF8G4h>|^_Rw?)~ z5^X#K1c3_@UB!|wfmn=K0C*Jf_*X!LLc9fpasU2qn&+Eh-c^^#A49wuM$c;@QOIS* z=XTcw2Doz0xcyj@V5_5*C1&I?3WUvhnI&;a)S^_U;MkTnQgtiK%DqfX(Up}A#w1zY zGh%yAuKj^gu}F4?!>KEFsoHFsRkK-FyfGrlPM)DgZQGTe2y@NTGo~(bT&1I~0T~K) 
z+r?s>*gE@D-68jLUCpEdm@dVxFBa0x|Bb7wKHDSJz;=DQD#w}L`K^Elv5NW+yhNq* z=Dm{N`i!2-vj`cwQUJs+x1N#ph(XSrXS#wVfL9kn*4|EU-?!j;cTXoYN>_FRITQb6 zcsba4(1o5%em${tNLPJ<0QJ6Wn#RK1BkRGf$8?pQ5U672{s%6#UWBz{cRBz91Inhd z8T7{)(ntBmM_#M6{!QuZs*6`!;GqJ7jV7(XVsJ3?s?5!@=>W z{a$V1c1z)y0sym12;%OH!7~`tSe?xbqWAT?DF35?`(x+Q4b)IOo<*3DM*EXxekGY} z)Iliqv_o@ZAiiwP-Ob`m9r>lL{N^|fp36lFftqcY_zQ668|20j`$;JabqCvGpR?@C zB^cKhCyt9`pzl-(?yK5+zz$dYmBaPb<6fX7LsCfI+a&tq#eF0gRHCTx8{R2`0LgM# zJytYV77g{LaYb{*f6i8bxFU4F-pdd2_hAbl>w|C8hT-pJ_5b>=`e!6J)e_FF$Y6a1 z+QBY&2|t_`xr0Kg&U8xM+bk!q#=7a2xh4)466V8Tbux(uuy$W!BzK9W{Pl4 z^s2xqve)>`nkTz-TOsRMZ{@ajD6G!JKxg1*O#wyZ)_x;~_^|+Oue;;O%RM1fACksO zh0=P|`A7-XAFDGwn8A}JzfmR^c$w--e-%&~uSR_>{sI)3RLj!Ur2`dg0q+Z-itfZY zTB?W`A+$&9st5XT>-~t{L6~j=%X5iz(Qzh2(pS3b{qKc@HI>Es>zSPnU3?+;h>kPX z+9R1+KZjhImNq=M)csvZ_)PKzV9S?Nhh`9qi)xK@jJLLJK{CB^5l)VP>Fd=hrOq}e zA=g_+X9w&HTYnk83Os9j4CEo@dbJ=1ihr#dqFvWoNjByGy|3qh`M3O+k~J zp^J4ix7RQd^32bX06u{4IHgS8o&MrQLn}~rdOGIl3TO~~8&LB>s_{U5tVr%}4%9D( zfhuqj+TZ-I+_k(gw{1mjb`hHZ&Mp2~OVmmomr2uy+k7kJ?p$!~@ck%IATJ)TV~U{N z%jMjy!A)Nb8NcPj_BspHp*pws{={0OBd2byBDn!O2x+0fmii-39S?DBy1ubnM_imd zsYiqXHBMdXoc~8m^Z!rs|DsG53u28D)zB>6HO6XozGxbyYhY$b(&M(KCOcguq|?~7 zp|f!uT`PiCNYLB17fPXSWoufYO`(gI6*B3shA05BFyptoG;^Kj)dq9>F~F8jKd_Og zZ5oM@zwOaddYbZIJ!{V@DAP!TvUtL|?vIh|uX+utcayP++Dr8RelrZvw!Uzy1_D#h z9~(H6r0M3hVojYjL`!^rs#<9i{5N?~2%~4u5CUx{U-sx01b}225pNegyC8>-Gwvky47ni{G_ZV$)Y-a7E%Fn+R z2t3_=jF}@2hY-ETaE)7not)ONUxoTKiqH<~a-_Mt^$sMo6#rFpKQ=-#9V_L(L_EkMi61cIX(k>mOKi%nLjQ*h1XVyS5o^yy&QXzOT<7?V(@+|Hi9x+1x|P`-my z0xobE#Vu~MRZf}91E3akA<=vuzQ7DV>ox>;H0Up^UD|1)$V>f1+9UCx<__^^>K#v4 z)OL=TUNBKjwb-5BdXqc=ESORonl`UE*EQ>Py`WUS;Jc8J zKs3+EPyNTi$wWUxlE%;HVL}9-V4~Pu(38$6`Aj-_(2%@Uor1;XEcpHs&hho+c4y1( z;Fr<$xJEYF+c{k}1>+^!jVFwb)$e?BV&oHGa}el1iu-9U<6i&fGzoqrTZdC6|NaO{ zOE}Q-r@{gqX)_LTD*y#m6Ew@VnJJ}Q+>@|&?If^8nvF=EnVLv?Ww=%UK$t#U!l9~$ z)EnVh-szf|Y5sPDiLd4!3P0Po+CO4h-@JfsV-kvsQn|Yx7{-A~@u*GDqiMiWXg7h* zge2an1Xkez+GzHH2gm2`76Umpli=c2X7kB`4znYOmX;c4eK;Bu`P0z-s1SBa!PPjA 
zTcPV7&s%vf6X@T&w+Un}>w9K*zAGcoI~3C#=R_0(mDwz_RH}`~BRhx3qv)K7@(t2( z%xwd@KhW5A=qt#le!F;u4K`0NWpp_(E_H|_4-FaBUCd*mHz0rxrfJ?X!qCmE4Z}dE zvzrX)_FtPvIC?xycsJ7$WMu{9H31z*%NXC4ieG}hvoT&zUb-D^giZ&_PSaa*$lJ{U zxugSKoyP*Lf$q7VFhWVTOBKssObr;$)eu}FTUw$(Ck)<@ULCa10(`=a#=_L33h%OP z+Dge`Eveu_lqvE|f4uz6+G7@PM-n=K)`3r)FebT`b&46k_%HCsn=CX6&dMwS|1N6W z1?Q%m*h3EDKNb(#53WN&F?+~Mk__Tf(jsG|f2NWh~ndwEDMSMg~jo z9*huFUCLm$)J!|A=c)5e#de?VatnRik^1^wTQA@fn(F_D2atd<{`zU9|LMaIrG;{t z?CP5a+wOR`|E$rLQ{GeHJUcal3(YU7H8a*8o{}WWfFn!={9O+k0E=7#*v2Fm|GNl^ z12&1DowXUa;|uI#_d439#=`7n?1j!ktI%vS~)E;pFH?6Q$VnpHT@ zp59hJn8fgI_O3nS=AlsatuE?p=sN|pQ>r7kxj729L>yTa+~4EXC1wj(WK9y!rS5M} zcAEiI_)I^t3B2_dU_FA=^uvf>A~^pJglE^o)o zEkQshRFFw$h^b;%J9AaBYJV^m(o@Fgbo?4++$4-IMs&!<$)4g;xlkPUt$!JBggHHh zM$EK6RwRp^Vux`P4(_ze6PimgNyf>kZq=JI0$ReY59i!qUQsu>(z}Z?vj(>axW)Yq zTMcn5KxVwv{x~*9lsF1GfHz^=P*Bb%T5n%k&@W(h-aG9)ej&4JOX+yQxMXTohbow> zi1VP=F~po7OxN^>g{S2vjTFQ1)x;XJ;Fba>1|){sYpToBaRr-xuSS3&j;kLP(Q zTsu@$ck;IJso%X!I#b?|a^j^+=)q3jG;k&-W^`<)tO-rE;3>sQ^bng9l1fCNlN+4y ztk)zHe2vOok(>z7!KHA6P;DYjZpdr94%M=IbxYtAcj%{VP6j|Tw>0=K8*J`|ei955 z)8iYK`*z@SSz9@~2#LrbXifZ>dX)q^FmywxWIN<~DR}vSEL}THP+U1*z|Zg)iAcikGq`0t@l#6D~`8FPP9r& zB27C9PHO|M0qSK9uIcF7Ja(P5_4?`020bs$8nM~vd{zl0v(@DEccgqe0pDe#aAmVG zlLDt`ko!3Ax}?()pg*(AQ7|EL%H{W`n%plM``$!_tey2Dn>VDni|NtqUOzUKVKXcl zAgGGd(HQLbm6)t1?D284V+Vl4ZF@_eMRdLF%dJTKR!J7fn#}sxgW;E;i!4eYdo2!% zn^Dl9nr2h!$cfmnL>lkOeN%-Spo~`BHcAg~TApeuA)Ao(<`s<j3|`c z`_y7-r^$=Qe*rKKHk1GW literal 0 HcmV?d00001 diff --git a/img/radix_blocksize.PNG b/img/radix_blocksize.PNG new file mode 100644 index 0000000000000000000000000000000000000000..f5d898ff7276704828b0e3b35980280c70df994c GIT binary patch literal 14984 zcmeHu2~?BU_OD+51#d0T={kg|)}ggtMMXeB5)mr`A_{^6B~ejEAtD4ZAxV2_!Cn~@ z2dW4G0U?ACl|U2>(Nc*MgOEXpB+^tNi6J8S5=cn$P7rz>(pK+!>%Fz!dM=~lm+zc? 
z_VC-kv(G+1Zr$Q-@jmSRY15`z_-xt`IBnYB!lq4|9yikr{Ljy~mA&Av>B)iK>!;PW z+ep9<{}b)$?>TMSrMy|=yWayp|NX$G(Bx^;K29-yrVFF&)22;xv-R2FxjhXj@4J+0 z&2FIfYkk#8pSlGP?sQ46=8(@*cakF?yYD%uoN4FQQG%PzI2~|R=Q4Bi`5(V}WN7`# zYvFX~>$g9TBTwIE7+@PlX$Fl$WcJF|xyg^monn0Q7fP@;^o2h8mcN!P^2ty6(0rLs zetE_k;}hFoJpI7)&n(087y6hdb%HdC=rCQ z#{~n~`tgwtdlZX+*9Q%do15`k)d*AOt{KfwNw5q%+d9xeB3{F(hwR7X3{kPmg9T7R zjx<#kNrK}wW5;Eq&mZ5-LlJclO;E#-Mb=5Q>Ogq<=(BX4UJq&csmfuLEDxa{9Ud05 zS$OnPh=5-k+Tt{>Ua9IteVn^JV@Sa==*2p%nuyOC7|#wtVEP(q10UQq=+!JkV-B{E zqZ`y-^vBn(o?*Vszs&3`_S+hNvapsT&|N?L+lns6N~!s9yfD$e&iOk%4Cm;IeNdfq zF|pF@bTVh1;{`=@f=0}i#ec;-&Qm{~>1~e)_F8AP!)XJj^S(&ni`ibVNXR)d?$at7*;3a_YTe=+~MVE6^aW0#?W}IE*jy4N}#}uQK7mPj=Wj{7hE}n(8P{v z8sk5sG9>X5r(U9D&@&F*2IPOwXjXDgx?(!RnWniWy#5tb(##m<4yZdr$QCuZ_Z4{! z>dM?u1%Chy>D=r4c}XfbF#x{>9eApjBZGI#IrzakJ3O(vpo_$6TNHhN3!dmp#0xA^ zuAwl_mBLzCu$2zOA>m1~gKo0q_F>s&H~}wrns!Vxdd0FeCE7vKU#YDUDDr}(@q;u9 ztKDu@yPR1YP7+0XX=OH$yf2W59waF7_8qCgY-K%1EB>%Ieh<10di+8w%$#d>=`!)D z;qR{69m54tEo$K`mqyL#=(iCXbRQ-(;a^Nio>%XYp#UonaTIYVK14{9)F6ABdZ{4tTp*v;ArdIDbyQhStCowpJBpWzRXVA{ zI|7|@dsvGLw-*UTdIDF@xzsA->D;1cO8a+@7avkUm*7#vWeJM-2SI!)jg}HEKuROU z2~WuiOy3as99A=O5=p_Myzwly!mBR?U!z}J!D0v$yJdITw9;mlgKobM`nIQE_n^6G z*t6a_OL)D(>JGjqwL}U8zb90-iup4R+Ot-j*LJr=R0xOq$p_mraD4-fiM9-Vjej!E za5v3eu)C_cRx>WYz$>)GZz0plmWxzr%cYU@L*vaT?#P2}=UTGZo2Jj^Nrm zwy51|Gqyn<1d&WWG{_lvq29SJ$FWWNn)&6IUDlq&UZmcYC2K~cYoOFJN6Rw8E;}wp z7OC9T7yb2D1Bvb7^-Gj^YPMG};_EAnZJmz=ZA40Kc!2`*EI$Z;k||d7(VF3qxAoHs zwgWUQ-v`0_o(S1K)Z4?2%Bv|v+?TG0R*38vJZxVo{#w>kY}3J6^_dLyz*%WLEizB` z$j0NBoEhYlr64BF5`M!kuC}|!h@Rz(C<$z}sLf;cm4_4Cfg8(Mah1skz5Yn1%a zE;BDR6%ic}!B{SmJz7+$`4Yz#HKFjjq+>Bk2!$OFv=<>myq>3eKb6VHm_G_OCf~;M#T~nr+#U6qn$GHd^q3;FjcrU`bAm z-BUKpzdQ3Gzt66WyYJL}p-xiwzRqQgIse(m%-v7|lYS%R3-M!}E3T@*?%mv+0k+6J}rrDpw_xheK9%d`a_Fo=8MN7we-q-rF5Unc%`5DIb? 
z>xLBG)C{|SX$jnQf$FQZK@8nlL9A|0R&&lBKalJQ=kB;hUuc~#8?1bykqN?+aoDQ0 z8okfJ0U((4JA%Tq-u?hPr^MyXFce<@_oqeEWsV<+H@YOkPJIN#dx{&ObX1D-8(H-q zp*zkhFRZZ4$JA(c+BPLda`-fHPGgw|Uew(-z)@E!hZ#)ZQwarS`g6k#%RLC>yu1Qv zC9-Kce`TvSLPWD`kai$XK)N9)NwQ)`1H(~DYZ?jB49J#c&J8nnAm=uX_`w6*{+)69 zLQ>oUdxM!s^TSQ3B+NPtParu}<)K}EgKtE!TyDVV*axutI*BaDI?A50+`Yo3syPwA zjOohPJxKf+W6)gIYp`N+hrn_dzNa|p%cnHMImfwjAR~t@d!C@csVOuBAprPiT6lpS zi^FCVK;ObIV9DZh0&me174AR+#+nl#fxDqgvjFy-^y6_tpiq%$iy@uUv4vQ}<)aGn z3b+DhD=HaL9)EB`mzV86W4G1kc>P?JNwp(M-EJ*%?L6ShIZ_{R&EJf2{U z#|AuQ5(!8ifG*xyTlT-5NSffD2j^t)_%zf6Ii#g)l#+o(l@J7G&OHF0Xw?d0x>hA0 zliM{)JA&YZ@lkP0mX(poN>&_N8I!<$1Ucv&#XM{$->DQYz`n67CF6G4`wD#aoowfI_=)4EuqWU;#Q6U^ z^TNWHhJ8*W>SS@?4ejJS>t_VPeDXi0SC*eS^J3)nF~=_)U%12*=X>^y>x%)tpB)SH z>B67)SY%P^v8WAD$i-jJa{7De*Jn58r;uU-e7wNN(a|X6djA*i_kGbD)b%0w+`9*) zq;Ea(VOV+WLLc>ys*|g~2Or0#o)Tz)MAVOJAX#1j39eg)YEB;;RuB!^fqJ_+%Qy>P zpS9SWT3K1?GLIg2rP@=<2it?;p}wXQV7q}Lq>qmdAD)ZziHIJ52z#K~PYY+PTo6{} zGjmzj>ilx;2AnT!y>UUzo_SO^zu=Z`qbH5)wF3F|%tjFZ*#; z1V^{GC*8(~J~kn{aDOM(Z=krMdOD@n-IQQ`DV^1I--u_4Wc9)J@bon9ybIW~mvS#v zCe~$AwjgpwVImfhIHWjR`or1pbN!F5wy+h=jl8D;_OP%e0fnZF^UGCGJXzegzq7hh zuDKUPim0AXM&3R3LHO-?)s6aWha;8X-OkVb7x**=0E5pr8oj5@j0(Hh>KFhye56Iq z;$7eF@I7mTRcJwRskAPp&N!QiH?Vguw#Pt?rRY&hN5gPzi&ckiEc5cG+YZY4P8~Y z1(R9bp5n2}gh~6-J3lTnA*R1M6c+o4&!c69xxZ6^MY>L z9K)a{x-#GomM=ZDj~kQ)t=?VK4;vXYz>fdH_~aJm=$(&0!U3*dk$D>ZG8HC9hq2>_6ML6^g$SOe$q%C0ZSHh&VLcl+Q~_f~EUIikQ;fRmlG#N?2=ZZp0R1OYzp` z)Ph>~S*9fXV}x-_4vxMkm0U}Z#@_VaIO+SUC6ieqa@W~2MquYhaf1#)(Fi90s`BPD zb>^feJq?=7sqE_$`7oH)Rz07h=OCuCDBkx4P%G#D6cq(fNXmEPlwc&YME37ZMo3TP zWOB><4!}vBCxsLAJtXLXrlNf>X%cQc%r2ZUS|3h%98cFzrL$Kh0GeU_L?>>OI71Fr zzoXGBQ>fO)czAd)$4a;knY6kv%@>pEx*RZ>+U;g^-+LMrO_v(xfu(9E0qGWe(F;O= z{a~}B5i={0GYtYX%XRd=X|8skH+T*B<(_1Ey`OT*{|3ged z)-lWHbNpaFG%p0h=eNj#`fFn)!8B&I(qU=eQQG}qZ(?JORAOlfvg1`$kGitWAH1nU#$+fh-UsbUBd#uj_31)D>`L-R9_wuCl z_nrbP=@Xt|C$quXe{6M#oeAY{?2>hMB;DBh&7FVnTC$v2NH-HhW{=^ic)oet$^f+?a>Pk74CanL!$y(ZV8GR{6J3vkAH 
zbZJLHAWPr>4Mvx=i9)PT^EG7g6e5;k?D*JZuuctK9$$tz9rfPCj8OjX()akAQR6A! zkuxj17~6>Xn!Fkr;DQTm=t$Dknj4+NqGLnW0h_3{I`~W~7R1!el6pYRmH=nDJ*CMA zYib>^4_G(Yp8sdT{o*Ne`z!TDfK<^)3yQuhCI2Gt{_!o8 zkqD3B9_7Vv?Y&@Dn#?^`-?nB$4#-KFG7cu%^96YQ?jf}~ywJ1%m~c6c-%EzKrN<0P z##I6|f(HFG8QxRQ*HwSt&9ClnMsLo-`b_Ei-qXXKSnz8Ekev~n#dZ>cs2}?%=Plm_ zSR#}b6x9Cv1q%EgIe%d9MCX`O*o2eO^tmq(^7ruhBUjqrsTt`&kg{q};NXI7JT}Oi z(_$EA+aqXWudnZNM|>HwtLEE@2t(#iE!()4;Re_aJw7KIB6PgAu0MIFqWdksn+&#e zuPsM)IDzA5+CEZD7=*3CZcF!w*h%A8i>R0(Vp;NA%Vor>lXSBhSgq#kXHF^H>(F4! z?6;lWnjUwodVahXCFg|J!WyKg&*4*?eF{6Dj`RYYdNWdi<%%>vRA8d+5;3=qLS4oU z<5a{?;X!T3>Q|y0yaN$$cm;gK>-neove)v!lz9%~FZ6gZ7fyxKRGM_!n$r^EacU>+ zK2|B38}O?8Pks3}&E~pST=_yqND;TXw7PwN4!NI#q(D=gdup~R)Sbj7W7@fG_cFv3 zT834aRl6PK8l)F99c;C`JdduP^6IH1_0nMZPfK6+w6T&!lQd&0Ql~_HNms9;r7>Ga z*Ge&=(5L-KsoPXOed{@K#=v!rttkGyd^$y}cs&`K8t2vUZ!=$3l=W)({`Eq8N`gU1 zd3fw(_4XT&`o@HjlZ4j9yT+t!l;c(G2j0O&XVNNX`dIA36?nWqNi|GZ$VeNEI)7bm zD*Qw+t-BhvGYpLmVue%mEs`>hT%~wg&4}J$+GGCI3gQRxc;bLaQ)2CnyU|&a9nBx={XN_wm*$nXIAM)1@vycRUM!q~-hHo1~+r z>}X^ZKwYo-AgH0h=$hEIzB1*Kv+f5#RoIorN71vO{O@xkfQNTpGX6U=H^7)*S&LuP;-Ug<6Hz2CneWfM>F`gg z0DJVi<*((bzw~}k4(SaF_<}d!06ob|db!XDq^ z`*WwcjGU=?*ZA{yjp24@dh7hI!yajt+*t{2f3o>C;5XrtuYm^G`tBKpCAM;iO+j8E z7H~%dRXzK^7{zI4LdRDaX-$yDo_7WUkmlYhYM@tP2&5dHum)D$ijr?7 z^ApEpwrQ7;Fi?#&^{VNJ1vx#gq}pIv*}e;u;<_$b%?7*omy*mBej4{AMl5;F`K-eB zyemwBO^jP%FvzWdLZetuN(sys(DOkO5nkI}{59Z#ku(KA=$h!Q#Cwd0|8Q#NO4QkA$B={poinpn4$l$4T-tP+jMjtZ7WwpnE`lp=O zHWKq+gl+|4Mmp=mW<3alt+bwMQAKdTve(&_DVKP|o91Ove2dzu9ly!TeaEbfo9O%j zc~wrXoIO<`cn1YKhGBTs?RyhCklgqNY0YVe%z<}}22k`dT<*d)UPMxY6?})UsB>=d zW%YJC)~}PLzqihXU>WspWx*N2tc?_8U9cCw?6p#Xi2!+(U|g6>2X$APD*s zzU-dn(?zdxWfL&-20q2711eYq`&PQ2H>$s?;d^n2f|0J0U^YzGn$Um!o2 z%Si9R!S7qob$H|N$G`ty*Aqow6PA5-gZ2MX5Y&N^rm_21+}CsnA!HE^BMY=rY-H3s z6R4Xd5NP%tm@d0p;NV~xh8e32AQ76?qz>J){+15)lXu=1w#}m+94ZOs$2+CC#yo!O z2(@yF;bNqcpJ^zT9!m9)ud#kd@Ojz~j)lEKSn&9@O{d=MKPd(MCua|&S6{3}h=a27 z7slePoMNCeHQNmFH|LSyCl~s}PVm0cVb+}U25r){YlisG 
zyLUut5Y9XcUCnm#^5z}Ye7$eiqN2aj%=O1;FK>b~RX;cq&_9U}CDQuB1}w9CO(Hqg3m ztA(0M!V1Fe(QldfXSL?N-hz8uN^I)zQipb>)D z^rVWV-$`^B+_^NVvoX1TU2Mk4jsCTjcGIvUaRN2OLc-|M=Lv`EQ`zQ^L8EVmvaHwL z@nSHaEbHn`m*Acssa^o9ia8d8>AB-|-DxYLUe&Uw1dDK1}1hdAorwgr2Gfg18= zP~k{Xp0~GdyMrLHVMaHS2;oa=8j%wY@yqay6m7{v>$x634U!skT&;k0+zDo>w8s@< zttjChbF;04I6i)1y-LMHVVn~8Km>@JXU;WTA#=52$lf2b5^kADhO=M-g8F^=VdwSA}|j;XUvlwfAjqnrKs8_t zt%WZF?49mlJfApdbu|WV#F^+nxyjKLF{YGs=cyGUL%ITz?!;{N@dejB!deR)Pz2)@ z3-p)~ApHztUIveyXKzR~G!t~1k$n3O1^;orT~@~F@_n4^K`nP;+^;=6*eRV)0dOrx zt1Ff_?TtA>hl5UtyCt6S?mPPqc2>)2_H+9)1SCP~Tc5JmTZ@7cl$5lil8-1{l%GqNnZWora7<8F-2 z-UhGs5G!wiV|1lb*K2nO-B~?7(S0<5#?Un`q~0(}8TX*7e;>zQ_Fa_CChQTI`K3UG zdO+D(UzrjX4zEiWx3fnt7R`0_B^ zXHsL+F1L+xc3CU{;e>r8>AW6D>IQm7SbA50>k!$Dorbi~+PX}q6Q?rv=Gv{wk3xGl_Oi;S+!cLQyr-5q(p zZUq^qSC6XX0#xB0Y2jVa;+J$C=F*-ZO&;Oes$__Kl9@*Mkd1(}`w- z(fhK?4WQ>bLaQ*Si2+d*mppKrL-&u4nOq$q_E;4yzM@}>GHCd>k4a0LTVZoI4k$-p z0x@5^)KZR{p{>d~3g9F+Su|B~i>-sMz#T2E?FbsPR#y3Ky6e0Yuc z!Z&C>6c;HJZWJ`x>zuNC>Re(Xo@35Q#l10CU|v!^ zQgdyR7rCu8D@!<6+7Y5qN3B9KI4f3JuEHO-uS7>TZG*JpOB)^f@-8#aWzXmy?4pS| zx+{x8C-JdnB8JGO?Y<_`;q9|dHYOhGEXXS~s5If?VLwb4$Dwa?-0Dp?hR ztq!nQ=FnGw)pGcuptpH$cmcr{r}hjxlW}?oX0ao(p}{&B{TcH(cZNZAgZOZ>6w5r$ z5n%d+n|vd9_9DB=<+7X$=tCWT>auJ#E}&0%Ww@T7ZY$6x`Gcj*kF3ISZL(#RVV`;hiDAx+kuB-J&eQdkTmxL_46Qg~ok8htKu1e5); zEZE4o-G%E*+b~V@IRl&|Aj^;2Tm36@9fcb!UFgglN)Zmxs@<|BAqpBBNVf$Fia*~U z_rW||m6oT*kEh`+p!)1qg(tBMil)eR#I38k`;|b?<|Hr*5^0=E(#Sr%u7?!FmnquP ztc&!;ojM^_(#+A#Rk-&E)%UNW*nap;6ws9(QPv-KkLPC{lwc;XD0TjE7%hfUi8Z-F z)*G1~i7Qgh-S;cJ=c+D)vvQMt`zo2GU+UZ%)Z(*7M``fj@Wa(f<7@f`E?I}oQP$^e zfq+>~!3h<)L5w~xKjM&IME8rCNsxXCh9phhq`~ah9t(Lq<);PC7ld1lu90WGjTuqc zdc}u>i7S)Hpd@5tk;lNpJo)3_I8mv6mD;=r7>jA47IF6Fi)Z8`Kc>}1c|n%DBrI+H|XU>yBZgbDQZvs1hjz` z9Dtf zY(6bAyKg@i3V?`GRTw^fZFh3Ju2w1U(k-qW@}uSzG*{}zaMH_uEX%Ba_RX3AlK%rY zApi6pv#R@e*s=$GSC13 literal 0 HcmV?d00001 diff --git a/img/sm_scan_blocksize.PNG b/img/sm_scan_blocksize.PNG new file mode 100644 index 
0000000000000000000000000000000000000000..79b83940471218bdc7901020b41f220defd8ab16 GIT binary patch literal 16777 zcmd^n4Or6W`>(A{w_l;P+Dc35WzB8Ptkg0MjeM(FnrW7h(q<+`Xil*Nff~&$P1;)8 zi;6FquSh0Bp-BBItts;q>>BaSh@AG`` z=e|GB{r229r*?#efd1k14;vdB(AF)1J8f*BW-8MW}Akt z#iPK*yNS^4P#c>@@~o+^-?Q9vcuP#0jm?MYmj4-p2f#;dY@jY%1EEnzas^s0(X5$VGc_(s@pnIU)EU7cnAA!UlR^vfj`p_X?)D#ryXoAc;jY)P)E_=3utq@yFue zOgHRe>_(}yS1P#lG?KOmM^3on=MG`0o00kB?t*hrR0MHK-JzKm9=->%2D7jFxLn#n z_D|8dVRhf zp1%^FKFuQRC7(}Le9~Yc2y?jScKj7rDP93xz^wr;_J;>Wz{4}{at z@~imGWU1+k{&B%C73e;6GX#$obG_^)S=6=$vnosR6E2o!>{Tih2bddel@;S~M8!0T z8kYV`tw_N&w8^mIt{-dhYcTQRX{zJq#gutNmy|;2UcNaQk}|B@ zZ~$fEYIV&S32{}}*{4^nY$n2uUOi}?4wIeU$|>(W@6o5v9*@rwpT&L|?;a#PLry_m zt~BH`my-nL@(127t;}AuF44KSW5y=;&z8qlbt;anZ#%}F31u(ryB)>s>A?KVo3$;r z$|xMt{j=Uk(c{}xEVP)zNPUp?OSg%adoyZ1D~t)|Gz=IP9=vRDiludU4z`U|F`E0R zd1M50*Fl7KWa@&Pp}T+qpYA2_566lI;GrycbR=_;-Dx+ScajA4n7yC6i>SLHPaQ08 z%jj$^IK8T_1SLVi0mQbAhC}k@=rMDFl8i!*wPEYSlOp0LzWUQS&MYqrYonq1M|S zA}OZ*N46h1|+XeI~g>pI(SSC_1pD#vAvz5qN8 z^Yli(_U5-+m72I=d)h8kd>lpBTEv7Ga%px={WP~e<6>R9Qok1aptN~4drRM(9sG0j zxX(h_`*<@V3c2n)xU4kF?lhI;>AX7B`KN!da|*F@FFQo-_d7cJbEt8TxNk+`AS zH;$~XadqdLH3D+&%Bp46HY_v}iE?%6Ij1Wb(Kl{#wTTYraD!qSOmpz{ zMNo45AO-g=7_QrBf2A=Spg$*WzWlv!Hl7`G-X_~hQ}!OuU)zscLsN{Go7|cy(%hpp zX#VZQU}IDw`rty|^bZP+Ixo~Yu$|VTTXR2Jf+HVy!J&z>#9V-669voOqbM!sj(#a> zt_=;c+g{cYFxD)P;f{dhhv^s!D-Sy;a)|Yl2&p2Ph%Bs0V?jWD@L|)Fep>(V7K#55 zj84-{)pzRDS^hL=tYG@+VqLtCzLBr@6P$%A&eTjR91%-&cL%WF^qgz6&%&zfW>)bs z7;$HV>>^^1C}#z*b5NI?;Botr6#2fWfC%=5d~!IG!Wk310uLnHa7o>$Nb3))QS~5E z#tNr)-4lh4zMJ%wF2)#6J~@D*^7Ug*>HZZ|V{byQW{@CzIoXyhCWr}b4F=&*e>9}9 z!MSwARBCdvlkX(?LI&Mf5C{V{3nP)54&5BV(dp*e4mI)C7KmjfQP@B+{)4KzpCWTdbvgTI5CTXTm%=@F3_Ul_A|z7;(lU=-A)m6R<(>6c4x4@EAVmRhYW^)Z3k_pE2t>nXN#aMx)5hqk85bAs>CaPC73S38Ou6Hj4X_? 
z_vz7s>unSm%`nxkQnZ4b+vv`*(U3k0c2$z|C{{e;rl?|g)Il4@0&WJ9V~xxO5JqVG zQbqsmM;K-N<-Y0p&TL{T5wML{T#MrJhG$-j+oAnum8*1ua0>fundV$u*Uz$`M9*ZL zCub8^TVARdJywTbOstR4Q%U0!|D<**+D6~QGMd9Nn`$4q-(o@VVmC6BVQ4UwckDoT zj774qbt_TGB7Kr#yh4qu_iv`6dT(R)!gm@s*yG`n&95Z^}s zI36HcL^)z0jN)QY;X?7+N9QxALs)x-yW>J39f04%6R|T-BSa%Q{Olc1xX9E#?92aTldB zQrYP}f4;b+4Zza1zV*(ZgyJ7hErUsN%dEmmuK#&EB=HyZj=O5&J=I;27u9c@}?$qz1kVOKiP`{l^V#R@lnrADFb4LIL`x)e!b{Q;uQ-UIB zm-~EHiA-_+hqxOZuFR4N?3&sIry|&^mwt8Rbf^_sbbIKkOu)?^#+6|xuO>BH^ z$0lLltOj$D_~k-CQXbygh3OX?E}#4e4L5$J4^3x}P5+3kmtSnN3uN{P`=oWW-R_#0 zhPA4ewu}M2Tvv&rx^QnOQ7<6t_bdD`j=EMR7@PM{CQ$Y`)uO9?(N#N9U-8ka_)iq88d?tQSjZh1^tl#%moXbQ*P*&Al$7FA- zX>!9QURm~>W|?VW@lD?G^r;_$>g&uO*R)X#H?qrpdMf>CCiUSO=BK^9UHlQl4+jW& zAEmM_GhZF_C?}=1f}HZ?rUM^=_843wKKq02;uCnMb8>RLbB61xs;Yn+ftS9(iu&T7KYqwQMcYEhqa^*Aup%uKy3d9s{UVNjTH$F>%X? zObU^-;YF=t3%}XULzYO8HcI;4Y-8JJ!@z!7N5d}qgYi=n>dae)mqWAcVXSSW3%$Hb zf8VmQ2OpFjf3O`0L@VF~UTssP*W#_9Ez*Q0DijqR5aa6UxnZg2>YJwkCZ=~fz`UZ) z9GvL)orh7Dp7($e0I9f$lSl#z(LA~FmJG5Z-qXgUp-fL@)^#FzKjLP?3~QtU@)B-f zz`1#wXP&zZa5kiB(h{m;Y~4V+oemRBI^hTu>CTyY*{LZ_8$I+P2&VV0s8oz7ecAvd z?tKXf0LJ#O18qx28ZGgAH^e7HlFM*C=>YQyt$eoi&SN0HHA~6<<|kPW=WVZWDgM(F zN~Wabq%&wQ{9v^cGw6;Gz+&xE6l%IF_;-Zp50VP93qljCa=N}t`b<=DJZW*eR@yeH znG87wClwH~GRYn>A?3eg5w01kKsx1rq1oYO4|BTJF8V@x%L}m{A(;W%9`-&!;wy&K zkP5oKJ(H3peEJC}&_^_1^m}=SIn&dxRuhW$sB*hYEA`(dJn4T!Q%Fd$H^692@j7JE zEhpmLD$c2D4_Nuq(3BGnukNwKt|#aFq<%PeuEGb+>n6{}1lJKL1ziXFBrh+Yyf(4Y z=~B@gA6`pldhe-oIXQcTNu&bgh1&H+irs$miywE>j=cY;c`&>WP4aBDaJ{U)fN-MP zBlfn1TG=MmFh9aIY}vAJVyhfrslI=l8Cc_BRV8?yJR~Qr9lW9)!RP(!1n>CZ$BIFo((cdL9%>{;CqVvz{2GI4}!^81+QpDbG@ z!1R|^&R9t*IBBm9d*4|mP7f)*vGm_dl7(a&f+{B$Wix^K{xnuzT8U8X;Dn&^2j_tE z-abRuH%XD!O=o?5i|Hxcw*ex`}PBjO()d`0$H^}O{be&xL> zuYgb(J&^tO$z?|pMng^-)|7P}KJ!2lJP>S$bv-5MBJub@Ng2lCQIm?PIG8+X18&EyZb?)58h8|ltlkBixuGesh|IXg=_N8Gd3`vE8c2gYlc3Rm5 zhI`ezaKj@C_u#E=$L85E_t4)~bF@z8#oe0tDhHJhrhEQu$@&a)5~`ab+$WDXppZ@$ z)$pABDWLl|{w9Uz)N%G_dQzYu&4(n8XLS zDwLlsM~90B&p{PK|6WC6nb3Fri8xJB`uIUoG=GVgeFU5@3GQ{gmO3YmS6T@jYqq5V 
zs?5JIi_3c_W3%Y$doQvXlFtDd^!s6+pI{V!vh}5tWfD3b`#5d0h_E&NMcV>&cvvR6 zAiLU!v@7w(Pq>GUROKAlt$*|OZj4OV1uCF&;RwdjR z^_V0$+VU0ic<#s7=53z!e3Y|A$C9gSn=;x_TV1?HURw1o67`lT(p@R`D%fql&3Wa&5o2kLSCh$ z>HKZfBDa06CqBM*`B%7Je@A}zgR-!Jz`JOG_-x=gTNA4Q)pcFVoYoqzw{sl#6uwvS z-is4}2zs0`&$D;9hDC2G_6By_n<+jdlLQ5eHY+-TkpL3lL__5jo!T4un9_mZ1A;@& z_zk-&OT&r)gumg#sSYTr6I7-fy3!PBM|HY7d|M7^>HDS4q_^|ySNXhI>Kl4|d;wjD zyGEECDjd-#0S0`P@M436IBlBLO!&T?rEg&P2ZBal(4nDnJv`}JH?Q*RP>Moo*?@27 z*c+Ym5x@~ZEnW4M-vyP06{Tm$OIP?@u58-vbbUVi1HS-)pOqwNp(~R^WoH%hs2(PLUfQiI^2zV( zxXQcMJ`@0(Pu+PvQk-oi&M>@g_fM&aD4!6bX7xhraB=mOSJS%wj-np03|2l^nI~YT zNX85YCiE(ae@ZyN1GfrD{??@jSP7JM$Ww#wPm5AG?4wU=!RreqHT+7^!C#z+hidQx z{el?lWnj9$W9Fl*@#^v!FPtz3wh;>qjsH4&ac>$N1e2yz3VG+Doo`Id8@=~e`*>gE z^nm?PKk>UN+sX(~l*T=PJKLEAH2ru?7Lm!!CF)Z7QZ~OzL8vxfX?sHw|SIlq8dgS9(;*69zi#WrFtlW5Y4ls&8k&th@0Gkp$Q~-WD zTHiAfJ}H~sJC#{{zP|+x+9k!K5W~+DR?ln?GkDoMUC%H5x_a$&b;@G;eicJ!JF%v? z>%a%rA%DD|{(z^gi9iY|(8>+578!ePk{xNa#{8UAU~Zjn zGMwCJ-Kw5`PKs`wZ5;y(peU%7PpPwq=|aDfCRn*)c01Fb`&l_=Hf&b;UpfeHe2h01 zv&$0FTBiUzOPiNiZAL?h^CD8tQYwf88?4mTfDPBFr5(iSPbW5TOyeRh{9c}Q1ovd= zv|6nJ2og0dwzZa!v-OjPASMYw&+eS02({Kf;PIVDMeyuJ;*3@kkR<9x zqc~dbdl+l^vJNTEwJpzF-S1S;aFd$dFvfv>XPtN~sSIdR)Xeh@i(=E@_>E>{YHPzR zD-i}9;;MSOjg1Gd@r}*emgaVkutMH1pLi~LHDw9E+sqdhowANFUyF;eqHj*cv)aLT zX=R}^h0S+928FvrD!d4>vBb__)*H+wW7-3~MTOAOKxBh776UOtnKIHf3@H)=;b|E< z()B<2)cw?}6xsv8hIrk2q&nFhvkC+^T+R`eLUU}txCvNYRVqb>>I$|RMrcq#xd&V- zdABw2&JSIuKf)`gGSRATw5~BZQakBpeCcGRis`YX8eEUcw*Am`K8=_Zdp z9YhC_qGSE-V`D7=;(r}y{K-xLaZ35pU{aU&q0x>EEIFDGKef(!;U_kGMx9B#Fdu}# z(xO*eKi+f4=$FwZY5)k)NVXCsc;0D|m9~-QhwQt!lI>nMcqNRF{?7W*qrw%R8Rjkp z?KzJkk_F4H)cJ7$$%TI%NdC!Ayh;(>p@JRcyM2Mu5wO31uA_C9GXqEzNoLc)b3X*_ zgA0S=HjRD%I*am$!0%58dBv@ILsha?z1WoOW+L_6wi2_u!E`xg@ab2em`OZLWc}cW zgJk=7uyxCQ9b@S1+)Tj}Ma?g*-4(_B?$RI?w^l|O`oQ{K59GA8E^a|7KXTo_6k9(W z%_q;(Wgj&u?^@|m#k2r1J7v^biklbHwefeR&fNVw!R~AtkZ9ohw^S^(n1DskQ_A$1 zGb(j|ni8#8C$^UFAQ&zsTSx|Z<-~?2a@sxiX+|ngDRp?>P(LNvO5^~l634$h>^1vL zN(pUgG9kd!%>DeX3SQ{()Ap6J;AqMzKO#%+Nu=e=D!LDW|l{-4_Z0!Q=VH~qT 
zAu(r^*@8{jlbKukuY1jYeCOQWG?6-GW3t%%v4le;u9_?XavnhZHR}lo&4~o^9dta9 zokF+3fz%cq4dhq;*Pj|!@EeVBbV#NST4zp8!nzj>&Y=%M4{Y59J9uyWgH?`?Gok4#sdzo>7kz$=X3GoWLgKA^a(zMRsmaTa=%XtePlr z=6S-GRUUgzXVaco>u~xZ?^&C>iL`SAjvI^>?fUJbXXKU;*6eM)k6x({=~Hz(Nmf>p zeNAp0>$i%V$;GYsL3fNBdo&AB2Y%GQW7z;J)&A_D3w@X?rCGgM-Pj94IsM1GkfVnL z0u|$Q>#+eK2l19H>z}2+h#tt9(Kh%#&0nW*)|R-L#=bBi*I3WR71G0%vO@)3hwoYG z18vjv_4M>)0@+`+WO*LPN{1lKx1zEs+UJ}Xh24!~q(`qz$(C4eHTv9F_K|1l;@t2J zh^X52_Z24CWYtlImEH~D4+bUDJ@@qC=l~-@v5jY}^T$wYkicnB+NTO1E5mf)Xmp|? zfDH~!Aqbz=t2I`dzNbLqwQK*VX^)1JR4t)YdhE^gu-=pPfZwK1;AvNy>^VU z_-;%$oUr(N>tKRAM=IbQOYcMnaR|@*O-Hv|$sr);lwEk5;8}&xtVUb~%GU1mKmg6r z@x2D^{NHXs&qG54&U8h`o&m9i{^S>0;8tZB$-GF36_>QYh7ih9WPk_Ya z(w1~MDAV%_1~?P5=9q(h3L5IUc-Ju4H=k7S0!Sh>ls~JVXJz086iASP1F*N9$Ng7% zY3s93Us{SdtUC$$zcYHY%fQI0hmDiiBbQ&D#wy&z4x-r(KpMa5` z$OoanmNyhx3fdkxy^dQRl8Y+SJ4p8VYI$q}t6c&Bs{MQXif8Atc={@>g&(W`!0+Qo&=2BBjev{fTr3of5Y_|H%1r)d7erq3;f`lNck9$(`Ou{W zyTrlnp4xw!a^$7!%o}#nq?aOzx_34ni4*P7-P2bB-6o*z@e5#SaIS=fpGtH_KZRQ! zFjv+{7C-8}H&Nv5FErGzLdl%S{z3YPJ1|@7p8StZgSr2#RDhp)g=5*9syW!Q98T>5 z86^Hl8d>+eRi3M2KgQRt!~~~bYw0yrJaPK)ex;YOnyaJQkF?1*ggSR)ohee?$Z4Q2 ztD5~DVmz)I9K)`b?W^=(S)u-ZBm$A;9^sEugb#m>Ktuv11od2E*~}jVMXTzfQZS-S zQv-Gmt_Ir>*YkaY2;QW@;^#{hv1MMLv+}z3CPBIig$ZyGEXbB>=h1ef`x|$l5#!NKB>TLpmEfgG!(qd$dT^yWbm9c0B<-N>cP&4nAqk zwx#+gypDv^-XotbktB>!WiLbGPQn$5W2iAkTk=>GH!U_Srg3C8%puUOR(&ttoO)YO zPUb|W00n9IHr+d=ooT@e6_VOFYYoUCKl~z9#USq^nDcuTH~dCM#s@0>TvDdDXrFWl z@7=u2Z#@9Ug9>1XrkkPEMnh|MO{3ARPvgb3#K|)+PP>0i_(3plUu)ZS`E$RW5cNu& zUCnIRbcFl;H2Ke!F8T(FJF&OveT|_mPqB@wQnOMwomI0!;wT!$^MQMVm)P_;i4SC2 zrY5&zyj{|Ubumj{@5Oh1t=|1xRinr@EEfClb%}4vYo+j4rM@ZsfG#9t0kv8qiS8Pp zB@nC{WE<5wg^*Gh82F+Sn5fgB9qrNl>~X-Qw#$WXX%8$kt?bYCn+EnRH+0!^H_OvH zyKO3=S9_OSnNeS~0huI8pAHJxr3?kZHu!$5W>^-K@R(E}yPkSu1BzEG8qV8wH~!V+ z<g6qo|Y;F}nyb;Dq1!7r82 z;C5)}orm5pesapzdf{en1BE=K284-2lqx}~m;Y6@-v~ey&_od(_+_rG8-LDDe}nHg zyjo>%25pzV!as@UM01DFj0vn?YPm%rfacmd^3fl~;WfP_g?n_`b5jx+d9+6< zR|npO0af64uOoTg2~OKQ@89=Ep7H~8uG*G3U65;1uknSv!%+b-V&N9iWp3UmL2*43 
z4o?)F7z-hf7A$pN*7XBWe_XI@=KUi6M&zkgi&Fd@zVv!`ms~OOYzv*5`DCS+c31mt z)S#d>{NW9WCf?cZc8+Z2Z#|=LSLbg!O8=*CJrNKHmXQFU6OnvoL!FKWi%rDiXZ}Dx< z9hWNgmC(&(ROK@{czU8EIS0#&@4dB}b(KL9ghI1=Bkf4(w}dycy{p7 zAs2dJ;scUMbb12US%3 zV1J={Z4h(0)SUNfj==gaf$D=K&%I|Z_T`$6ZsOlA%RcF|TE;EsD)y}%jP}gSBW;UC zh-!=%9inuTH>mtd)7eSvTKjDkaB#;UVPGCW!xklF{7BYpDM6^mFW#9hl9x7s9n2bK zqI5i{xL1f^any{pdW2AzG2Bt+-@(-bCDg`pvfiaiW^O+7luC|AftN)WKvxcWIyJOE z8lJh!d6Rt8)dl9sd&G?wWEq$x{0TdS|y|6{YAaPn9wjh~U;;*;~46dEFOR8{P>qf=+@ zzM0|Qj(5e`dEiZ6WNG(nX54=QG#-!RTlbQoOaAG=CS zLBd(}EFneb#frtJv>U5?^(V%08A%8_SlV_jQxYQd%MZl|)_?IvG_gLbNB6A;&q+VK za2iMt3lsYD?H3x3ZoI|I2rjf=D|~UXEl%{b0^jd+dZ1&CssqA5?3MipB2ym(cXTFE zI+BsGym+uu2egDXj&jFrH$qzI?uJN;itjf{$T4~k`z?zGx*T51VU_a* z`LS~UTtBcwpka^Q8TFDfk!({Wvrf5}MR60p=*WUR^}Qf|={-I!EkWLJ3K%xk;a%Km zD&q|acEHKzgdhmy03S9w$FltSLizPZBEH^Oq*rvC@?Jl8qFdkOvx2!XpZ)vCHf#VE zTPlhgChb7SZ+suMkDpMH>HC#fHF&%Z5IWF?XtN6Fq_gq<7TnPa)(76{TUd_2gyY3` z4@vbmeQ0@hlk#s`*^D(A9z9CELDB*=2zPiugfD>BrBLJc@$n#qv|CSaBjh%;=u-FJ zKh0eBcXZ_I-uyrN$N!T;4YVXgBVYH2;5uJ*mi&jB?2WpeO5lC<=2r+tpo&RO%2~o9 t)I_(W^9rYM>%V`wV3BD;Jv{?;$wPT*g-b5*^9eRvgTewEHt+lHe*gyv7JC2y literal 0 HcmV?d00001 diff --git a/img/we_compact_blocksize.PNG b/img/we_compact_blocksize.PNG new file mode 100644 index 0000000000000000000000000000000000000000..5a1cc9500e080108399fba2e9e0d72389affa430 GIT binary patch literal 16270 zcmcJ02~<<(7H%x9cDP{GTH7dad!4Ejyjr3H0$K+I49FNEldTgZhA9Lj7_e1Qu|O4t zHUg*GJvjc$+)9xCjp-z&|$nlL$>*JW747Tj+q<3Jj`}^&Ip0Ig&ov*=Qn?LxA zTOI{f>xYMTOg-z6jxRg11~YZvjZzzo;pEiw#53CD=DUVB4AojEKG#ne z$E~5U`gPLhBCHF|Fx=&WY~9jPMH`QtlujB4t=&UD#7h&5aqn#IU?fhA9eC6`xO|!9 zIBebaYRYoQf3p`n>lbz;v=4d9cOT8ZFSlQ&dDl|%T&`_mByQ0L+Di(@nrQ@f!lZH3 zdhBU;eOjmi-Ri#`fwOi^cwX7eKD3hkWxNeC)2pg&0-tEq5B3Kl{nLac=8K970-F4a z9Eo@($$8{Ww{?>%%lL#2`L+C!k9N93F38FJlYm5%;`1oCpxjRJ0!s|xzLR{<$7rYhh}CXZ`vuRHM@70F1t-5uz~z~R z+*3_Bjuj(M>)2-B(^F4b?57^br{#q%Ewh!}+j2&6&BBn3tJR-6Gug7)7ugs`lHWl& z+fa)MmF@U?&Z2|Hy!&mbxCFM29WvQWRZJ8cuTOj)*Ow45zVk 
zpmVrJF+xII7UT%R2M|ZtKH>Qz<=l**1J|@IU2ox1lb|H~UcvTZ;Y|NmJ)$my7EYPJGl33oC5#2h6RTV@HB>z-l89|Wf7>07r@)E#0&ejnYt zsBM6xB8EA6<#md$I{qZG*(dl^E>PAM zH9Pf;wy|UMJVC5b@_H)tkT1$wLoNIWQgKPmks8Y({juytvOC$Oq($YHlRK*^+XZeJIUNgNwauK;2z{+TG~TC#Uk84azR% zFh$wU7J+8~XMA|I&C zK|{F=qNU!TPduxS%#Bl1vrC`DrOBz50~9%p9@r$TktErqyw#wBuBE;~I$3@#oHd{v z%E>E7s<5^Bf(b-ZJMEBGB<& zxtCyLpulYOBqPIT-qHSpbhr9K?Y>N(dR}dai%}gr92{75^2;8|JSdah-y$aY(Cirs z&mlr(bfwo?Lq&OztBvGBkkFm|J|(yHfbKS*;pu!ttHaInu?=^2P;40fmg>UvilC{+ zO9=OpojNnx-k0BBsNX`xID3;aCvf;dIts$3^{E2uTRUjuih>eFtfnzI!%8?{E%?NF z*O1}>MOqL`=Y@t<4y(~!?iB~WhOPzIyl>0TDV3Eg4YG0~mWZ`Ss+jPc(roP~(sDUB zya7!qjN>H4AK=TV3P&Acu>Q^lfq8ORCY48)mCL(0hiS*m61G*Z*ypepF2scmS-?^8_X>OX7zrwM+0snON))y|KD?&APT(mSid`tHBG!3T$ozO9Y;{=Bv(sU3UASkl#>5nO6 z@Ofx{U+|D^@Da(booV=`XQ`y|Yfb%lG(p9-prD%qbXDY$H@J)*O?_}RIphlQ9`uBn zY6(RNQ`cx5RDpaITqEVZlj>GYIndTzW;Se=m7t(eBG@PQy58-JasA%CUk07qsf#nz zU5~j-amX%!#sp--*v}W72-)_SMu(f1Bxb^a!FfgO>bBm^VHE*OSQf{!~hX!v45OAAv8n4z4 z$knz|MI0$2zoyPWmmN@O#nEGkMAv)a^7|LtWMtiX!!_gQjmKLO91P=^ zC=~%H`R+7)-nde~Sr7)5k#Q^@9jZ!U$)L&~ksVYEu2x~tO;8+I}zCS_eOdc zw=>tV2*#Et(H71z{GPC}cvY2PK>w}2!Va6z(7~!W^pnEqWyvSBb88hN6vqMmflezS ziNeUZ`ou>#ipyLsUtAr`AX4kfiyEm%!^wI8#3`U%+p+G5Hr8XMcM3#w8F1}TNzS*0W>ijx%YQ`(XpETptK@cmFj401rB zNZYS=gktyy?r~W0Iesdwl^@C+V~8S1G#>=Xsfl-oYiVy3|(ut3qIVDM%gCBY%v2$x)zjwSUYQWWh-II$X zj`ASaGIeJ)Vyk(Ak(kHxDh^*ROSNgcsYriTl+Cv|Ofx9va5%NSTRB=8$$8-8fb-;> z1K~JsX@P?P$;>E4ohh*s!t1N9`+0Fyj{RB6qE=!+y#rH6WpiwGILr zbbw&(v;Cg;!7zsaC^MBs?!6Y@b;C9>TqRUUtV&&!%zFOVc$rI#>{&=Kx7LtxKZD|| zQ2C;kk8ku}q|vHM1tdyw>k~HoCyA0NdqP^L#VueT_Dplgy?3!~f?Q4t;MWB^Ejyn? 
zJkb^z{Vv$M z@E~YF=MQZ#ev07*vdYOco@XY0Xf#wl+&f4p(F^0B11+d0i$jBRMl!`Ed@Z}{%2G45 zi4vd?>UeB&&Djh=COI`;a?eq?>r=NwO^9#&YHS$sN^*Q!#rj&%QSrxYsp=)c>*3 zm&)qlTI3i{iDPf9rMJh1G;qe%j=1f%QVT`ADKogGVd=6tQKZe$mFqfeg%$$?`8=~> zs^F_9%0QS5uDfRsn<;`-1z=|)eG&v7iJ)e6es`2La@}gXA0DCOZusPT}!K0xv?4)c1 zikel>H+XpfC5E5TKMyXjLQ`6dSP4>wx8WX%w)uGM&HR-5%?RnH9s{v6dTf61KvpfByE3%O_@%(y(+=ahBUmIOHq)*{;wAV8=}ZD| zSb1AwWh3_*vWoNNlgRwM5NDz!Eo{O=ayY(^-^BE?#gBXX^dl0jM-4$km<_y0XVCqG z5%_CvKt3mZS08d)O+dP`KHs~&I8Gid$-fC3c*}gri7UCC^sBtAKK3_$sKZSX0#jHv zF8>U1)W{CP((Z{rdqcex^i`VxaJfm|U`aXaTU3$ya{t77-7+zuAjzX47=d5xe1>k< z-hM0Z`_9)Q{OmF0!fh?k-W1eM^HzsRAF~_fN6eM2t&67muggVJUj{Scw5dIVHSr?) z=1;Y4pt1NnJ=*`!)=l3MtBjB&PEPP+2iuKjL7UaD5;lwxg1b#E9P4>)ZJIR$iK;q2 z_LRI9G?s~)fNEwz;ig?szn6JB+!o=^>-06#_qDPZ!imXl;YCjZN}^pT4;xDILJq))u5)BWXaV}mlcdoeGfe=~yN(m`pA0#bFwDI%PmohA2@ zICRcL9GA<%X)4S!+c&5yzhWkx7cQOyJCQfxLCbjBGnttSXSKh_MP3Z!Bbyz}GwDU` zzw`}OWf4ZoJ^^jA+|<-|*>1w**04%8q9mQcuEV_H8e+DBpLH&ymDPF_Jj)gs4K&kx zw*NoruV3+9d|?s!X8fO<#qU~=UKqGkJZ)o7|=h*D?Z=~ zQS^cW9dOlTh-ml&q;f+(yh3aagWYl7QytLYH~OTjHc>5%GS9SPMTfZn>&AVXRF7~c zeA5mrI+US>uhA$La~o}~VX&=M;?5TINq1m)JQ6iEMu-sITD%d%$*f{NGmej13s#?c z_p_hA?o}VR4aUX&)D)2|UQ6*>9$rj*FKhSdp?rjGAd96K0>`}fEOio}sjh79dbjx`WMRmRhh9JIlA|z_$>D08~ zZAuEqO;547+>YUzqZoYO6s33>-4+J>L3?kpoiQ@Hn#lwYLTRoOcb`wJ zMFcTi6DJ@rG8k{Zn?)U;4}HqUCWVNo$U?{7y}8r;Vk>K-X8Nz_*z;H$EB7zzX19aXFyxCl}Esk zfBNU?TIWJD0=ld``+;|H)_YpB!QV zWa!3pUFuno5^8!gh8O8Ebat0L#xN2`3-q-92A>z6H*Z?xM(Ko}&pF7xvI#?lewbui z1L4m!fvAm$2e);Jde+qW1G$DBY-#%vpwTrZdB`b#@LGhodx1%tUFX4M=fpkmzM9Ae zzl%j-?GfkNJ!Gh@!;{Q!ThQtH#fuDa-iUbgkUVN_c0&Ih$84eN9N6g32(Qp($}KCu zBkZVVuG@^gl9}wXIgw}ho)#1yOZZlY?#EU$$A>>G$sHRniAK+X-CrQ8S(DO5XRYwl zT<>fNutfxlZwV6i$`X;#yjegwvtd|7cCi1fJdw;~1j4Yvpv5go9I&*Smm zi{%l06V#(d_0mlUw3uH)S!oVC5q}?eFqN2daSx*B?aZq4&`J!5@pYl1{EVVu4mP!? 
zGkFQ@J`T9?r4S1dpHe-2!mZhLefQ#>e&F;J#@8+8>4)(2BZF%*+N0D2b>owlDY2>s zsH}d99lCVe)+uEQpDtgiVu~)4JJ_5a_NEZtTj+`;B0o+=fGL}f0T+1xR)s#>orulo ziig}?2rnba8L<`W7p#dgMO2OaFi*j4%lOmyU>tEa|x{HW8O$a*k1TQ`9}CvMy6X9mLerSY9B zZ!ukQd7UknS6~+#Of$FZ0j$Gpyfa()p?rUzhkpYVd)ae z&f-l6L4Z?OPxW)f}Gq40z48kbm`z)Q=N;r2r_UpFV17b z;;AB0Hxot4Nsw2?oSfHTut@FMX*aSvTh3_L858V|g`HSE2ewu1@UIuHO93bd#p|Gt zbXV@nkN3rkLvC!Cy5tdgwQ)sj1$hU&R=%RnAEXrmZSbQXCTYNz4b;H8rj<7q0<*Guoa z!(iiTch|b;fn^tJA`$Kzn^e4K9=ps(2!?Iw0jrhR;X_<()q;7y!k^wup^Hp8R%5kbdlkT#he9<4BS?ttj*s zkLvEZ$KbGIkoubKZ*)1EAN!`7ZDaRUYFpyTt4t;3*W4#$HJlm;! zu9p}=hG07?p)XJ z`Cy9A2$u51CAayHE9_oAOnE;w+(gM_j9SoW&^+UwGpUfYxNL_kA6m}mW z>OwE9oqDzI*C3T#7A07*-BLc7N}gKt$M&$taP<@0b6u$&j+Fd=!<s!m&FlK)KH)+EnFMsn502MY%~q?46FtxfT| zB}e!}a-$u6mBE+>mAOymwfvb4kxO~RR{0MJJC8~)CC3(9$PET~l*402nyq3aKmm7{|liL!}clb_J#Aa2!9-}>lbTV$;e zzyiqG+qUt(!26=^O8Sb(dape4kRT+kdjQTOVme!fDHW-eZ^w87=!*nSGK2ANT6SFA zJeBIGv=>M}P|(TlVJG5<5E~T>Do`1@eFEd^pGf?Rn-(J6URL!sqNtn~R#XJl7ajH< z^6s2b(u+KP_@Y~n?!v3lhbZW5k$;R`8B7jcl=qu)R?kSoinL$k%l`|kmZZMol!dkZnNwi)Ktx^fUE{RyYvDX6Q14y@sL6M~ ze;FI}6QEycBKB1e0oKpZw#1V4_|}v4GsrY${@an`hfKh9aWl{ASKOQ;&zk~3`f*%7 z;rDo^EJ31bUJ*Z4wGPyDQ^@FdZ^-@)gqJ7GzsjQ)^|ef5Zg;<8+y^4#hPr6*nq|Np zzhc}_5^?anH-&i~5c`)fXL9nnu914vbt@u)c&2yFFch9a?lQpLj@9J;3bTKrjHw9p zaZP?jbW<|(VB-J+uh`EO}UjZVvY}1Wa*ye`* zEAGpL_*EufpX<_HIOsT&|26g)H%?Dct8#CasqF;~lBapwis|G(Ij*e$4K~P;mVC1kq zZndCJSD_et3muwe-y>Cvf`Azkw{-QeYw$AFMUs`f+p?NM3H?z z93@P>Kb?J|#4yatxPzvU{#~bqDI(;TZ4sa7uMQy$VRptZAYxGyI)F+RY{by2k`+b8 z=4$)DZ#IM$rRpg@Cn0gQSPQonm2n1dMZTn8Ok9$uSw*pt!K zGEe)mN__`PLCSQtPg|(Q|Gz;Q;Fz-gOpiV$7rp^AgFUOJsP2pEFkKbbUpB$UifO4{ zwt=S)Oy_t52w1@SrZD&=w$9-A!dm%q6G*?Hg`lI=k$&`=^o*3u*Us3fZi>dfs1`GY zX$GubeJf}ZjRuH6`K;rt!SPL+?d!i*k>ASwpEti+i-6iF<3i$SkS#3&C%plmp`i4? 
zF9#SX1z6KN2qHUU@S8Lz8m$-SI@lagCK*%AA0DMNFRpMwem6`!`+H8Esf+)q=?hvm zHI{P3OuFULEj5XC%FhZv(jl9vm90ZX&A#am;XFclc~u`S4c{?0`}TBf{(aR=Wq45I zOBUePw;ZW#GzOm)2g{e{`ytbhA}|Em2yb6=1a@`$RJ-t^ME<86U#+!>+|GrFKte?H zPEVgjG3N+8G2xaysXnlpS=*Oj#P-$ul*(Botb66+Y5w#|WX^>7e_G@PqgrpS^aRyF zu`5)djL_R)2(M|Lo-aU(AIwmdS&7*Krja&;V{Dk9;Lm99deE{Xn*%`?ul5L^E05N9`+ph}Mt%nWpCVDe zibxVERNB2=enHk86@>vWbm5n5#EB&cWPM*+w(RJ+X{NCUKwZr=bDPQdnNWEN2{Uxd z4r|$h`b=bl5sT&gy`9IWE6=9@VnO3Rg%)C4tMkvX= z2tEJvJ4eK_;~#;E%v8;uMe4CT+4CF7hoBkiJ`3LfEYu6%)5qD;cH>Y&1L=2b)Qmf4 z3dMh6eJ1Z{-%v2LNdS!tqklXzvu(Wief)9yG|!(+wz^j{=Ms(ZH{BZE zXLQeuab{67HTj*srOc)f3Yg*J-)ZhE#aw^c*vvu>cxyyOPjZsY4Db5;KK#wBXBOVG z0-o1$L&oCmY2j4{$MZiMTy4YaW?iy$1*UJB$yJa1@$;{>v!wiO9a{O7T>XegE4z~n ze#a$pRSRbC(y*2YdFGXzbRrhfQlgmKmsL;)I+0>z&#Xh7lAZ*;FT>d7)w z7vvHC1N%#I>#FJIY!(?p1>dSK%%0Q!X4E+gdNaK$q0FAWoW6-n7thYOm6o#yqp6|R zoe0gmS=W`{4vGGMLq~{c;L@y%=wfqa_H-*c3!j6DuHWp-;%_GRf1i9$fBG|hJ()$K z{7=#Kzd4H%Ypw)nGf5pC(4_ThoxxIuDr>N-HZ5@-3!FsAWcYb|OA)h<9Cjr^5DM)0AWPT%q@dcw@Cc@|VFwlzX7Uu(oxB^V89;@PGiaJYZ2lD0&2L z=UiTbTjPWg{Ct5*`T3@R$&ghEC1dkD%0iB?7Eu@Bd@ zUjcXB0UJsy3CGtG$?~N<{q{kR@ay(?M)^iRR)HA|cBztRIvG=N^|xD9FF5Vgkr-5g zX-(+XV$+Ekvn{4C;!`JEc&25DKRPz^c#jeH)6WLzf}+nD8}Puv$S%a$$io7pUnJwr z_15}#w93fUY#;s+{yj&#rm@q*qlG$D^E&yUnr{zLFxG>(Gx5UIPk*%35+-q@(1gg1 zgO@kTiG%8^k!`HdWsdQ+XM`t`-f0T?r*c2vGT-_eZ)9y^@BZsKj{@0wJ${9qiHXW0 zU5`Z}oXE6zL@6P6$t;Hlvuk|i`CeWEX)>WCAd@b&nk=+W^{=%>j9?`GDi;fcQ5Jb0 zS9lmvrTzd-^brO~ap4+PXlf6q>o$5v^=T~SHFXrhu)Jl?XqMSm7FM7+Iys!zdd!iP z;P^4>)8@ns?#YR~pjrhE>rbxf>;uaHK6gYa;$G{>WnxkcLY^?7yB>gQ;CJW5AOH4j z(mPig$Cj*XxaHdS{rBR+n9F*YY`%x=aRu3;5ZV@$R;1E@dVD#1e66HWEJ;;O+;zK3 zyQbV4saW4zSOrf@%ePh~t5}>yj^kjov5K5J->*5M)7t(AoA`Mc^W&rIF1fKzB3=I} z`?7-0)}Q#STCaAP=a0nC*BN(=olUt#usoAW2%tPV;6Lm*9BC-@-yWOgB#$5=-W&Dm z3e{}Vxb(wqe7J~2W!eSpGV6le&Rcfo(VIlXBlpIYn!Koa%j7>;{EK)}(4G*@vM8kU z=oGPr$h%fiH8<^{I-SO6_6(-zijp4q9JPc-bbhT?*Pg`Ix(dB3*RyN6L)A{7gDNL?GW|U;4-9})-tg)kH=fl}<_G!Alr;3_Y=1X##|2bfPcibzo?&fnJu6d}Y zoqdm*8y}4}S1RVjafL@4;s-I?gY74%SY^$A?ss98%FcFf*Dhhe`2ab8P#N}#3tUd% 
zDjgO19k`}X)|*>Kn zO!|{_0_pjC2iInE3hmWXZ^w3qPU65X2ZV2u7za1sT4_o=+^WmsquG@d2Oa7dORwAaMHUX!FdR4(yj?fRH`l zFA%6_2m3;EM-*6fK;B`RdxK@<45FxPi$*Cw0K0AuU~{06X`V?8*@4_=>*iWnu!oxN zpR(7v$pO?0BUeRdp@s8q95r2+N)$yie5nARafSobMY8bGxo*Nkq>E))^zs%m4=RX* zT!h8kUEZUkq=gz@?Q7qHI(q78bKuJk?L z)KSj+rY*>AQ!A)gd(4gOalkq7rsJ0jjQ?0+s`sOt!Tu;%0hp5o%GbX^xQ~q1b#oa9 z!SZMLQf|U)*t}Zxm9~wN^6UXW+f?5s^w_f}T}^Ak>(9)$e&lk0&kd88Y+8Zvi%L$ za|`k}*&`$weodu0Kj>ocfw2VVgV_VaHAR)tP9usgIwoD-B{PK5#y1Y@c}mG98@iCN zw>5sz4~9?ozY4)! zUF>*4_7-yW3H^#Y-fQW6uaCh_y5g;!_S^w{W?m@eR*_Z{MR3mNkG}7Wcb$`wd9+q+ zrF=*)wqhx_xuR0WjbYjGBt_&$ZDGz~p7;dks+@7Pmcu~sbNCr76}`jOO-PieseCW) z`3asL96Ds{`*9_fG|Ll05NGpmB7&Sbj@yQ1b=%bLG0y9I*9U`wpMI{z+Xc)+Q$4cp zmV52*R_lAAfk)Z0;b-LXE*TY*PAs8j;pTDC+;}No^*yQ2K-{Y;mY(lH$vzID1NGgS zuwKrLkRUOiz?VYn>3rOkiDj3m$QObr^7xgdy!hCGm@M|Z$AO@3e<8 zYjI0A*{RE8CqXWIS65DHQc@|6<4upFIo zLKNwPyq%$stMqGHO)VaP^vWh`BSqrn?2R9k7g3s92YEcE?~R~!fr;GqUP~Af>NlEkLhdL*mF8~o|j}|SN zBC~(t?z#W^|($Lph>fz*JoS&M9#ScP!#6A(`yIsv;ANFAH9Da ARsaA1 literal 0 HcmV?d00001 diff --git a/img/we_scan_blocksize.PNG b/img/we_scan_blocksize.PNG new file mode 100644 index 0000000000000000000000000000000000000000..578597952f45e43c7c7a15704dff6639d0824431 GIT binary patch literal 15869 zcmeHu2~?9;*RGcSV5NVIk4aA*pek*A(t#!>b*=J{AcUj;pgCEY49-{izS7vYCLf9 zwa4B=dzUOJA}pUh^R8%afgfkvn1V~-2jZ&6WHflu8mL)tJWTc?|k+s8(1dY1dsQI~ZQPte6kdf5LXAD`7Rum`Ek~EZQZcWrc zo%TO`_^=K)f4gXc*PhLFBKqgI z-rtL0P#Ozqn_Y%P9g=N)O96hl+@HJDQm({xwWv?J&N+EWOYsaE&y;KURwUqZ!~Gn( z5^B*CYgQc-S8?9v`ikjEI(3mV!xeR|8MS62N$H#jKR?`6!SzYvh2uO9z=@f zI1L$L_jKhcF2Rvd9$1IbteDmj?~^%f@s--m4T2wXqjUWF>U-&-UTfDXc!_?BtQ#ta zYO`M&$(0jN&|Nyu@XcD$j%sDFAp>5Ok4Em0NSo;^slKW1n+;s6T z`NmVt(U~#z!7*jpBg4{~uZLH84;|4#C}3UC9=~Jf$^7uT@{Q0;z8R)Vy;NI>4&ax0 z?O2bK8r&pF5XX872XkLyz3#P(xDmsFc$H#5XOeFkEG6VC?7g{6yY8}80j*7EZ)xT$ z(1^h?T0Q8hHR8!gR|~wqpJ_g7<|3q!O;am=9*NSdKTlr=Z)y`S9bW%KL zZ$*b@rUXI z;g*7oMz%1QF^%O?TaB=Wc_TKXMocPuuVb3;KD3>AhrqMDaAGaqg`Q<=`@Gm?D8zKs z^9@R&3l%EO9kCWT_W56NT_pxPoYa=0jE1u`_X_T``)A!G$M 
z22mjx-{1EKHB!CSJB(Tv^3_4ulxQ&GZ4Sw;$0n%}#;VNtVfcc#joC<|c9tubl|UYv z#F0^rI5PGKntJFp_c&jIi;IafuKeDGLKh}rM_dTxWd1q_ZXvIK4qH?R!fdEf z6oeg=6-zHS(gV({U0=Lv8F?lj#db$MFJ{gb;m{a zk)7gWVk1!`M?8?b%lljb7RKCx>SE=pPu!j^%AnGEgRL1mqja7OKTE_acyCT6x#g1f zxj2{z?CAwC@l;Kc{>T!a#uIl^f`=bf zhjdNdJxEg$_Kl0~HV$@-W9r6IkI~vs3w|}ub8jYbXUzK07#b&hyJ;wF>Ic}e2CRBV zxN+S)HOeL1ysTz=$uWxiobyTSZvU$Y)0E=L$kjb&p`~mrIaZf)nmp6ZHKL<&IzDTm z6}i;WUHW9EaA>Y9DyNYkc)+zA6-;%$o+0O|A9PXY(h8kA%MX%;xy@bN)gtva!xMdS z!XSZU%=BJIP*QnSpz2+{5k`NAdlOZzG@9(+YwUEi?@ej2YG|s#ekL*jmmHRjrX!8@ zbj=PA-)<09mKxtaR;ry*CO6vYE4sT6+tX?5z^AQM+SXxWTHZPtC(ag^uyM@_|K+l? zZq9BZxnnOaU8GWRhtKOy%hnRrOc+~n{b5_4bER;JB;;8%9I-MG061qmtj<)Y&zqz~P^ zOxDKt2|E)?VQfi(skWbGOo7Qy^S=3t&1J%D(nW1SV20$it3^dhXBBt2EMagKv2VDk zG(#oWdrw!;4!Dolr8?fVM4Q1?h94HIDsAhx#VW!sV5AW)NMlc)anI|vkj%jXky^z) z)Ea5kMWR71rHtH4$WFLgh^G2JeK(hDBNRcdAWM|f*v!C=jPq#De0MSh2@$#Til~0h zXj&-M&-dY4W{`l|sl;~2Nx3iHB$cB&6>W2uj&#@Uw=!2L9(gjm;cI*_Ye`lUYpD%) zD}obp7r~$Gd^t33kKM60m_LS~#25604jbZHsJqcNd9eUk1Q+_$PFK{vT!TXZ|PfR3($tPA9*9%vd^bklWHH zM7;LP6`)>EK}(~U*zCDEj3Ov|d4yeMyB&QR} zZVjBf6A97keA=;d{@jbzF4L}Jl8LSTR<0LGpQO% z+YBntTGO{66yPq|UA=jV;~fa{a*=+XT&eu|z-lM{9b;{3B(Hp<&!`7;IM_xc$pKn* zCj023dR%^;{vg9YNo^cpCfSbnK31HccNZZ zWOB|&et&MOs5b%TcY?Cn&j|n{-GBe^S;0xe(Ex#@v&hTCC?D3lJK_MuU%2&67R*b< zpW9j`e}$}YA=uaGHV9pXJ^LtOb}n9P51+}t%c*%nqn z=l!r-LCB(FB6IJGxc;>yAfMwqwsB*?|e?}S&# zZv$jbc#d0*cs;naOec4c$(@kcbcTAG+PYuG(QqSVxeO322A@-1v*e$Jp;aq3TXJBz zdMC zfFr}ruga!_iE=so-v&d&;ws|%22W$qa3A`MBaABU^H;@_>fWXTn!}I?jUs6tD>e0z zzAnI{&!&lOq}x!*$%rTpR*1Ic&m3h2bLW1W`%Z1aSj?THtfl!w_Z%w(kqD1#rNZ`p z!95qIqStO!y*${{>seg_N!9?h0k+ke;lxp_@u`5*Hens2ulZ3(L+5t1k7HC}$1z>I zn%R#G>KuINIWJ>cw{_sQPf+(k@0S+f;8(PDU@Jt>Ml6?$5|5>t_u0_fecHJ~i56B1 zwRu2o!ISzYH5c#*u@IKyW&Bj&T1w&-Jv$eWoOC!Zn6}0#MmLG$K8%OP3bUlljik<< zTbv|L8C(lGIWB51Gu&a+!#ZiO?ujfu{-7MVfq(V|3z+~dEzhY7@M@zr{JLuFlCGPz zRre)h%Lhxn?9C&cOoH|E_vXjIT?$K1V~N)EdyFglo*1eWuV3$Xp>idYa2=J&Hl`73>L#5U>vvx%k%+dt zCi4s-cF_B_kkjK66Qz~1!s*;u(4^y$J}xrSENs$?o|yIWpd}I%R$o^MG!b&QbXjjR 
z^^W)evPCrzOI&1~-d6W(VyWa*C?b=LuZo0&jHI3G7K$=;X3_Mq(wh5E@7{+NV`gf) zWrZ2jMF|PLslPJb^X*+S1Sz8YAkvh`ZfukdyeHEmWzi<3X+!s~1pZecVUy$Fl*~MK}gf6qHzpiQ_ zcjHE(42TQEK+?`XJ>U@flbfljsT3z3=|N10?rNAv7W9q+6Gp)i({Eqa3*dE;vKnyM za6rJNP%8FWS#_@g!mNbezK_8QEr~Iyfh+Io5cGfas$`wCvb2XdLZ4hxrvPK^{^~=! zc1wHGEof}IUG#M=2ol=LY5jRYDjSRUQu*PU^3Ho7k52M)D%N-71yWt3>>CC=A3^<{ zj86kK2Py#_S$dufVs&U*a#Et^rgdu~=4Pk;nQvv)ZB4I0A6o3Do^Zy;l5=hoLSRML zp~5}N*RTK6mHF^hOxvSZnfdu(=mPymvy)x^I`vOcy`aR^tto&Uw6NqjG2G$_85%YT z`WKi%<&tUPAYC<4R@K8SfrKbC$j`Q(&;2lGYrlv4i!~7*+4{S@)=h)Lm*U@73s=;J z2GvGPj|=)62|9AEop;tle*?&ZLy%Oz-8ahIo1PI?Z1_Zy6-ekYPOBE2`CJF$=A*wh zKKgb{^7Q!bW#AcH_c@w2_p}JwFu-17sOS!mAie;guwYDJ&CYua`eTJ7&=cc>o=#a3TaR=gw~>nSmhOvA)V2XAF8Xj+6Fk0!v@wri%qux0 zX(rCG75YVYWI*1v8iMd!24+k(c57k{mNaMxyk6vK(0Fe+=bV6D)uP`E1JiZmtsu5K zd!^=;IEpq{xMW8m*8*`H0f^tBD|Q7k{iXNZk}7$)v~q{W76RyA{X_R>o4$jb=#T9d zl?n6J*8^LPe-JNi0rnHt@A09iT0G(OLiBt)z6qZYxjjNA$JLMJX#A}|sw9Kq5&zT! zkm&jZn-5Q_6!YiT6!hJYxst?*3M&3fgvXLe&_{^ zkFXJl-CCHjeH(zNj`~_vn)*A8c>F|w2a5jByq`IKcy3QDh5bSN})ZqA$5a;gLQD7I+EG?^&>xgBo37*$;viZ){^8wJyd_I zHItc`wRF43b1a|xQKzpO*bi(}cN5_DPPrEd)M-Mnbtq;1*6RRAMt zav1#VQ%c$cT70~IO%atYhp(% zcaUz3Fi6)#VEV>zxU&EldlYb+32~Buca#Le$+48Iyf{ndZnt=Gew^hAh%W~C%7n!y z&3cqTbRN6?sUQ(etmm1*QBh#!@X6Ls)Cx0nsB<4>^<~^j+rEB%?-j)f zTXhxD%Xg$!4{!{#li(fu>)P@K0QtSYf&0r~LnMq1P%iuOjY370;)$g}(tj`|IJM%} zbO1y}Fi%Y|77GTtIkFje`q`$uz2541wfGq#P{jJ)xX*QRCtp0;R~Mn}qgITewFclN z6TE>SE2|4vKhrofGU`^$Hi$E{6oW($MuX5lW)DWR0F%!&;nzdbY(f0^BXOhg)vOYG z=)>>vFHGg^t)A2ltsBTRk%`BSy)hyKm~kMhV}T8vHDH7bF26OU_c+PO4%> z9n#x8uW{1qN#tj{0BmS>^+d|kJdHK>PHIs!lM%7)wB+Oj|O!OvN@fZ+6j<(7Izl)au zaF?@cvkTzg4uw9@(faj;SO2DgEYnKYh={`BaLD9z3YQ(+E}c**LfN{{l3cR=2$w4V z3zHbcf&sv+;0B~ z)*K*Y5WWHI;&E{sw6;8_-?9p`B%gD#0(6%S?86(9U-TNzu-99luC7#bo zG9N5G%dh_iUTp^eRT{y6@(sKzKA|NkeW0f!QAE^3>XA-3Rri5efe54^kg>%7u51@< z6U&2qQ(~QosCwd2%(<`UMwae>%N|w^3@eSs0u5%~|4W0P7zsQl86Th{v_*Hcm=E>m znrtl@5n<{VJeyd?fGd_3at365!2V?)fM6-!ZNHW&Kum6hr9xv1rlSSVcAe!#_WKH# 
zVEL1m{?Hgc2|Qk5>wD+;1hWpR#pSbWb=Lp+edl9i0JdZA!S+p>8E~n;cmf2&Cx_Bt z1pxEw2L5{f{gf6!0Z7`a`(JiZXy)eobhaw*(j^NM$IU&!+<%R*>l1(Oh}DvYZIk?< zq^x`)?^v(A=`@|dbWrXR!_g{2%0&J~RZv8j9%R}#hDvz!HH?4p#e*Lf2)1%y( zL$bo`TaZWm;Ikrw1)vLt-kX(4OhwLt&ts=b3U#}&B-D*k(U0pUsa zg+lDmwGm;oP1@)Gt?VG**j0_2qNRJ0AcH_1*g{raqZKeL$nOW~zl!FhGw2jEyrfZ~uMR&miDOjcmz?JI&ocASqqUbPC#J`t;`k{DbM3a%K z#SNGlyz7_G;4DbSi@jekAEh%qyw^LN5)D{ErM;xqv&?|O?O_^z){prQH4$|I$1ShKtb^zq1bF>HWv5zO+M6=&!YzrOMouFYuNIQ6M_E7} zt5Al%Y(LLwmxXL=v%iwB-2$ru)u0STFE0>#wz6YC`Uhf=nts+9W8xI8k&~&Qh|j<^tJrt?8r=q z`X0cdHvx+dj73swCZGv2S>KyamYP|3-UD$BbowrOf5Kz{#QAehFiBBr>n|?eP1hxF zj=QVfe68tIgU*T}Q%7(6d1i!IfWLMi)oWQiSO%sIa)9@}eCl|*ZkHq34^DZjmFmA4 z(nC%kdaDjRZ)Imr4+dnovW#+-MHk2=K&J6m?(zve$3l?C_p;7=o0()jMx?Y{5q@OJ zmJPB&7r=;ruaEbgyieV}gV-5|XK6g$3a$Bv&0@6Brnz1F>SAngBDTQzbQa?jukklT zuF}26u*IkeM8dqjqgfiuSpg@i*rQKz`X7N`2&#$A7i6H7Df?Y2^htCoGp4oC?WKdyf+wm=CQ@t~rx!Vgd_|DoU{{QWH6CDWdujk3{Q~ z#=N1r&dVQT81kt*zlZOqJ^<3{Ku!E61XJNpA$`B;q9x{hCtY?_Ba&NZ5x3Z}$uU~F z^SI5ei?nVTSQE(j$1L4)u~unH%u=cq1S3yRpn0B;0IN^D6bBClJNJ7~IYTzroJJhz zs=O6WE@<9_CjKKL`KQ44H^k;M96btk)4$-GRpQ!zhcZ2$6WLxPpHZfX5Vz}=nr(2Q zU_Op?UjSoJXSf^$3wY*R&~n|Kv4gVI+_j=3XrD^4_6=FI{E|Kl-8u*>Y$V)^LadMH z%Dl%uzD8hi)s#piFGX-q5L<7#5S48c8D1~J@P=$&9COiqIEc#^Q^BmVsowSW?CNr}Fjukv)SYekRvtKbq=Q~G zJJ*Btg4*0j2hNnyHhj-;iEYJii$pOfm`9(oRJ2GfEQRJqO+*U4QxabM=jbBC1V~8( zAKnFpVbIw{(wJ-&7vNb0!E8c!6gBh(|IqbO{_%y}o!7SFe$eCL>k@k)ck7vf z=+Tc;w=ZMd{7B{8NUI#i8D4SZH@RjjC=8E*I_}tFr?3KP?NO|Myu+h-LInaL_{pi_#Gw-xgy`{oxZ z4OlPn{zJvfkx2s49BCM*{{*EaQ#ItJD5&VPB3Q3qCLBdLdDv~*)HgauSCz1nDm7Kv`3U^9i2#|pw|?g0p+U7L)_UIE%PrNA6KdA zqjr-BHs4TDG{vWh1KFTR{1U%%&?we_bR1MQ4o1yf9s`lOx5@N|fzDMTzA(i+s(^9EVP zuh09^1FS5G@_LZVH?^E3_7&@%u^!&iq#ypm+F>NJ3HRgTd=V^XnS5NwenAES4ybMTpF!TO+We2Y{<(I3 zpE)n=qx^r3uAbi$MH3Sff}ZM79VXW&a(1fdTpifXnlDoun4X>%5u@4%#zNkma!_An z1f))D3hHB(bhZiyuEqQ)+a>|)QdsCeGUc_U$cd@2w|v=}@cY%zFNhaZo@RdQ!H$zZ z8mPr{TZl0QNQm1;OO88Bw(s`^3m^B?ix)Oh43f{^^#v(TO4xJgLyU_%FYfu#MRIAi zYOxj92psJ2yj50mDt)HrLshl#)&tIJVdphylh5N^O+Gkyb5FLrt4tVt5L*DvY0$Gp 
zcvS3jiq(P~^)df$H`vPo2SW;hcvkhSbL9spDv`Pd@ed9IakSQxeQ`}wr_2VQRu_O$Tp{GUb9C;UYzR7MvmC!>f_R1#Kwe}0r{ zb5wd_LW4^fs#~QReqZD8Xg%#-h2F)dJO; z@8!jrq+_N+okG%u366wZx(NIBZLVP3X$Ez-e?|Xkjf=Hw8Wq3S^^dladWT2+*k4O- z6JtcX^ZQ#pYy9YQX(05ES*QlP`)P#oW%M_*}jBbZ`$Vwwb@>(9sRFoS_r0PQA1jm_OsEo%CF^6!x;5I zh2sAj{u+bia$A#v@=X#DPF>hd!!G6_ z)B8P}(np!oGtPuNqZ_CMNoIFv`5n#eu7=&$4Bw8azPt~c%D5ZbMy+8#j|d3IAS^j7 z+jApM#F6k(&t>q=xdzw4Qzu7$PU7J!Lxmw3bp9{)()d1ZDclT-H4q#4`{=qR6ZCwD zlS&oqfV5>d3Y%3+LU#2J-dO}8r{@4-XAm?dq+AjJ5de)kdKRV;|f5!p8(0SDx z+S}VHU}2}u6?acY;Igx86mW2@#&wnQSg35DSvvx}{Nz5QB9y&jX2O~d6-RB}oRT4(G<(5u9(nFK+$C=2~&+tx?0j2%iy=HT0 zkM5B56OrtcTlEel7FFdlxBdXz)RfA@J%#G)}o;1s~c&f!s40ir{f^T#y@DCKU__aV#1%Oz8+G;{Z zcO$>aLnJ?Qb~VSVsc7r=~1x9J0Y_cm|fz2 zh>pZ=a&58qTcgmX+(xg5;D8Ig9h-ss*{VAR983&c(+Vs0yrc|qEmOlEioP8eeH3CY z+U-;TK}zhBuqn)0-1_v2g@&?lcTgo(i}^^`?2KAO1HJ!923=QaAIi4Jusyg{@L-06 zEiK4;Y0D*04q&R(Rh97z>P&rU8U5^b*pWN2T6Z+`l(rjqp5VV>?OuDCSZAmfPgjX8 z3`6!yxAh~*;B)2LcTfF?-lzi+=Rfi$-}`Y*CIFhzqu^-m<=TXRi1|4wWq1e#TncyJ z6MKMqSJ_byirh1@hGTMaazG_i>2C1E?QlEFq0!;}+}x`)w$NUNc>gLm*am$xx>Vw- zvPyX#oc9^w%u5+jlr#1U3C8QV15UeFWs1xUtcI{XtdpXd(K<>@jsk5TWO)W$TJw8N zgiG!3=kZ>S8XYgxmFI7Q$5%t_PoEKjvpjM?C11>uJLq2HYqpN}$gI~)(LjQ8Zaue$ z!=&xcUUQriyEA7c?=Bfe_S8cyBw+e~FtM$nQ< ze?}$Osw?PXjeA{51hhkg*w=M53+swX8Z)*QZ|IlAwT8ib@K)W$E0BQ|pd-C}nDpEZ z=J_=FSmP~2jC4A7oyzETC~%_(ev|>WuvsCOx?9@^<8A6zONy&!4Rv#i%GHysqC3_e z1Ktlq*H`w6Y6)^zMx0>m;r{bcwex44YjbYkW^)HsN4E#hW5eMdkuXVcO^xBh`=~W| z+N7~HgW}%YiiaTtD+B2u&h(@|_HI6Fw!h7HrT_!xp;_t`~ohq*#E!PprD{DC#dXilJz@qm4Cb&OpXH^VC@*ZR9B~`o^Ptf`43^F5zb-RJgLrD4x68e4lmCuQd%d5=ErD^Qe5+D9&k29nN3YTeyY5)TB%s zMF{M9S~HSCrt}}zP_6*yMY`c&PR+Kko0xUCmE?{0#6>OVaLAzwEvNDm&|Z6CNhSIe!NL)zlsBO5p_7By+;v#FQ_`4QG_i9X3s2kN*I-VYtX4 zOec?XOM6S~1nK73uz1b^hGjSRC$Vy1{(CFDVeidV+$1R(8b7y^#Wg+|r?9NYM&S+> z4Ig9+wVoHN3x~nfLjVq)2nb-d6?eE4PFPKCuQ-qD2ZPaMU#)9byYFgBSA} z=0s)5!LAfLhF+og4VyY7up9Nj9{dZ0dGA5k6?BAkevde}Ss@kGaBd0bVY*pEA3!s{N_q*#^9pIJi PB?qi+tqLtp{`S8B>z}!Z literal 0 HcmV?d00001 From dd3eb86f406d1715ba6f6a7b3e9e4eed81a29c34 Mon Sep 
17 00:00:00 2001 From: risia Date: Tue, 18 Sep 2018 21:51:23 -0400 Subject: [PATCH 30/37] block size analysis --- README.md | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 721c7bd..e3b3ff5 100644 --- a/README.md +++ b/README.md @@ -138,7 +138,24 @@ To get a rough estimate on the advantages of performance of each method, a loop ### Varying Block Size -The performance was first tested on a set of data with 215 (32,768) integer values (minus 3 for the non-power-of-two test). The block size was varied at powers of 2 from 32 (the reasonable minimum, the warp size) to 1024 (the max threads per block of this GPU). +The performance was first tested on a set of data with 2^15 (32,768) integer values (minus 3 for the non-power-of-two test). The block size was varied at powers of 2 from 32 (the reasonable minimum, the warp size) to 1024 (the max threads per block of this GPU). This performance test was to decide on an optimal blocksize for testing these algorithms against the CPU scan and compact algorithms. We do not test Thrust in this manner as we only wrap its implementation. + +![Naive Scan](img/naive_blocksize.PNG) ![Shared Memory Scan](img/sm_scan_blocksize.PNG) + +![Work-Efficient Scan](img/we_scan_blocksize.PNG) ![Work-Efficient Compact](img/we_compact_blocksize.PNG) + +The average time to execute varies slightly between runs, but we can approximate the optimal blocksize from which size gives the shortest execution times on average. For some, the difference is minimal, making it difficult to choose, but the final selection is as follows: + +* Naive Scan: 256 +* Work-Efficient Scan: 64 +* Shared Memory Scan: 512 + +The larger block size for the shared memory scan is likely more efficient due to both sharing memory across more threads and fewer total blocks to stitch together, reducing the latency from the secondary scan on block sums. 
In contrast, the work-efficient scan is most efficient for smaller block sizes. This may be due to the simplicity of the kernels and the decreasing number of active blocks per iteration allowing better throughput with smaller thread counts per block. + +![Radix Sort](img/radix_blocksize.PNG) + +The Radix sort appears most efficient at 128 or 256 threads per block. Since the average between power-of-two and non-power-of-two array speeds is better for a block size of 128, this value is chosen. + ### Varying Data Set Sizes From f2271e641bdc8b346e659ba3f549b082dce0f34b Mon Sep 17 00:00:00 2001 From: Angelina Risi Date: Tue, 18 Sep 2018 23:12:13 -0400 Subject: [PATCH 31/37] more performance analysis plots --- Project2 Performance Analysis.xlsx | Bin 24706 -> 65855 bytes img/compact_comp1.PNG | Bin 0 -> 17385 bytes img/compact_comp2.PNG | Bin 0 -> 12253 bytes img/scan_comp1.PNG | Bin 0 -> 15807 bytes img/scan_comp2.PNG | Bin 0 -> 12699 bytes stream_compaction/efficient.cu | 2 +- stream_compaction/radix.cu | 2 +- stream_compaction/shared_mem.cu | 6 +++--- 8 files changed, 5 insertions(+), 5 deletions(-) create mode 100644 img/compact_comp1.PNG create mode 100644 img/compact_comp2.PNG create mode 100644 img/scan_comp1.PNG create mode 100644 img/scan_comp2.PNG diff --git a/Project2 Performance Analysis.xlsx b/Project2 Performance Analysis.xlsx index 146eef79853400ea95f93141391fa6af77adc0e4..fc00e9dfcf3b56ffd115e9cc14cbd32d70cf29b3 100644 GIT binary patch delta 32664
zaC?Pm!r^?r9ox%hHqLGCs#1jW=%+P04M*9=;1oDMp5*rBVz7wO4{*hrB31fyDw9)b z^E&L#+B6WSKzY+NyC=LPC0vXqL?QIc*yHCRJ-nyKHt9M_riUcp{dqwa zZbrW#WXSHTgsvmG1Gb9l#froL*aksa?LPWF8%SDK^ya+^FI}ta`0h`3LcABT7x%+5 z(29zwXQT2!4UVGqH$;99*_q}a)19H$Fcf@)v0b2@Q~}jGJtw!X8O6bl89>lvMA2yN z%ajdau~BD&#Ir8~liwMBe*G-9?eamn6A332C~d*ALRv;4qshKn&7cHUXENuP9)D1e z?FFxqy+Jr-)Zd8}7B`U36_jT14>S?W!&l(*v~uoun;zcHO(E{z#*qIQ!{A?PCu z$9j=<(dYd=^GZK*ba5+GRT;7@z$ojgj84t|E$Q%1m)Vk?}kC~MTN0;1GweCXaTIV@Je=&F@U z9Y5$evussiiETcZpGn^7AV2)|?7x?FNI zgPbA2jZH(F{Wd?|&DA5I&P_F+E^-%}Tj6u)wYEWRXg(ehp$KeS^Uj%Fw*E%Lsu}1w zv`*u8V%6xJL^t$MjL^h1L5F^W1MnFKquAu_p^t`lxfQ{`%#LAlHnV?@zrfu@P{>nW zRH24gS`U#vJ{Ybxg8nqH+6;o?o*ya@RQ#ZG{j|pHFwTUu$DIXhLr4<_P^>iSh*2^g zCysx`6)mQU2y6?4nu`|qCI|oO&%t;~u)`7y$c#k-HAFcW-pK0Zc}M<-CLIWBph>?C z5HJ9L&sQ_5>R{{O?8404ghhc2+JuOt z00(qFfu-|@1pygm0|EKZq5c_>nVG@U&h}DQ_b(4I{S1x1f+W(zv=PXoI;YB69S@7K zZ8xB#dBHF7=oex+nu}G`7zi$bJ3~>n>n(UxRsp34d|YVagb86`C%l|Jzpw0tWAFwu z3uz=N8wr+(%{7jI;@m_TM8EQM8m!px)3{63r6rE}t)_Qk|GdR%t}}HFMjQU*B7;Kt zmSxPrQpZ&n79^acp8#opSxHI3m!)nSz`fP@Mr`ql8P0gx@0RUI9Fd+0;s+K0$uMcp zw`t+ao&~_kP}A-rrqFCM3N}H)x2%oqJS#V+HoLaKz!Dz?R+eTu@m67ii&yJnpok6_ zbySiWsl;*%M;zdBrpL;S;fG5^PIXulmptvMus2rHyCjwYXpMQjKe5&w<+zP0;$^B5 zhpO58FnyeS1J>Pib(ywtF}dk22X2lUXNok1czY`SiAEvc)-a>l$XG@HL_+>C#n}}L z^4vZD5i&vw1g_os$SeoOA4IMorQW2da*ZS_IJwh%E?snAc6u{F*|Ug7Cz_Bmcn*9^ z!1CBn1FWws(DG$#ynS-BYK@FgV$0{`U{>#0wC|my2W*k3W`q4f^84Q0Nstk&5+4Fq zEZhu^Me{ttW{*=V`Te0`Bsv|6fz=W!*6HVVG5&P|P>YOkG1M@!WGYLidHF={V`}0O z?E8{$U~Mn>L&l^w;J%F@g94x3=~BxHQL?t0;V8vQF9@X{g^SA>w(%WLy>?@;AJo~p_iBIJwluS+)(eE3m!{qcj}c|8 zl=yCi}# zAz~I=o`!TnMcDVL07QIKW<`dHe$RHOV`LscI4^ky7nID$6q@fPvA+$!@LKECl>1-AzIz~KZIEE-|_dBhA|xWo-p5~-Zl0s6XM zQCYAB!RS9)n>*{QM#iFzN`Bu|X@tvhWSgYI!K|z%iR3L-7s8<)gTS`L+eQ%J*x?C* zCRT-Urf-)4YJTQ}4afTrTMUu*Ex~C%)oLlQdGUX9gR+lZd_W#G50xbu;#5goNxCNY z1xn}b(t$InSbM87^Ib1vl*~8w&HYN`woJy0!St@gN#D&S;SV`b@x_(SqU{T5j za$l2&n_y_c@?>cBrpea~%j-`envBKipO~>Ie6>@M{G8f!MCG5JkCA33U2i=&ZeqBLx>GMOt=m5tv+ zX+&>C<6gH4_rp3jP|{EPH%=Woo5o 
zYhJv!uPUv~2eS5N^sd&x??qy*o_K@1A>XGYNsM}pyGhC8L3!V2c68ZapEk4-57?3A zSW$vFo2~UFi`p51(mBx-j3iNK>C0Q-V$1k!3QaN9mjkvCk#9N{fHk~FD9YpYXiId6 z`fKU4Ak2)lv4{&q*~?Q$mK)LT5E|x+yeD~i7%Jj9;bqs1rvTAp(`}zvY#YYm$8W=U zL;ChjSF>}khNRM7fZQPZltl~m8g^C=PpxMp6ykv@? zTUYtsl;d6-BKkq1<}lZGxP(FqMw-Y}$iyBX+%OrJcG?8+@w|SR;{&IF7@z~RbHi2F z1e(w8pkRZ$4h?O(s4$uYZL(}IY(>qz+d*{@_ZSpCp(MZA7I$&e06uTPAIlII>DzeW z+?w+qBgaBH*7V*B6DsGYhA@x|?6Z?HASagyREl3AL*lt4CdqK=E#} zk&P}jwEGT8+GviR3dWyxohb1xcXZ zK^X2h@N6z;zPrL}MsFrCqp^s*ahOSs*9&~wxmaH+n3vmop7aP?M&P{&NanS}SAYM@ zwN?Pp3;r05VAi6#K&FoF9*rWsUg(q$7&W(LQB765c5T9`tgOq@c|E7=F8IWgoK^n&gMw>Stp~FwPS;!C}B+mn;2t60cw8BK0Fm>|Hf+%hD&2 z_ZQ;?JuR!4-aTxEA?wNJ2zVr{8EnAUWzX2)k*HDd2cyF>-z`T*7sa`ZJ7) z{`bB;J)>-yD*`aFhP9h`yHdbXrU0dgBnMam$3~2!-edKRan_zXyCxU0pqtjb#^0`W z#`bLH<|2!p$B=$y#75(=kWf4$u`@d`mGR3LJW0@SmoPOGAQTZ7+;CHVehN~hYy2d; zB0=RrZX=$MU(QK7$`}O|XuYGJu^_x?prF5rY|+o(B?Y`T`u6F+*fpYZ4rx*hXxUat zjEjZsqY%@TRCOy7rIBSl=>TMbT-ZE$vD{M7J{_)nw>);xVe`Jl&bq#F^WP(z(7B(xP% zNbTkghy#*55bDpNBjVQ?E)MbC34`bMixoKk)~D~w5F)|Hdes2S6@J&{vp*#D_Cs10 zo*dH+v!@TiSU8A)k%|iRg(B?ZJE;N(jLrM4|I&b=mexrA zNOUZJ$&=l=jx;L&CQw5|Et6_s6LSIqCjL-pYsO5s7E7E-Cg9ybw&Y(&dI-Gx4Drnm zjpCgPj!(`nv`=l`HMgso9N;iu_}D%gla%~QQp5%P_tK8(A8Ge5VF>t7HTQ2}$kenf zRSN+`QAlE+`YZapIR9(W$MkQ}r|odSh4zo=(?}1)p(DQ#h<1mHy<~&Rx<*QTM5uRY z7USKRPs}WIc{KX-j>#zLZX1{6dgJb$4K2B^Odh5FQ5jjnDY!Ee;S^-0`*Wq=)ozT< zF=Z7ft~~>FJT`?+%N&zhP$^Vgd)wbVR`d3-u=Djyd2^x}-WUaSv<5<|zBvbd3^0}N zTeW$h<&&(@=WGw`w30swAE3>7f{U~dc+Gy~8;kG`46lI6w?IiRYFzw06ZpJ1Yo5L< zpT%;~W9EjBgf;08c&@b7k*D_{JLfOtVsruOv^^!Sd-2~$n|Lb=O(XhDsS|)E!421E z@dP*pjrY4w6Ec#QXHJ0WX%jJ;k184}12Vb>z~KYSSo$M;q~}!M;i7(xl z4oW~3tQU*reD;GZqh@(OmBrMY!qglRIOmg0J@_@~*Nnv9?nWjeLYOk~PzFeP6?X#0j5R2R_U7GpLb z;BC&!9F=myxLG40p?Z9?C9)eJQh-Ais$7lsQP@vc%UVyXW2lrzV6Io_@?9_0g-aRX zPG$;oIejh(mSvtOH{gw*9^tk>*mk+T0b;w(@_SWijrFX^L`tNKvo^?bIQ78Nx7l^8 zsdm1gt;pQ$)~$n};XCjffD9177S4~{5mG;?ql9>Z|2VK%5wF?ki2{gY9EhXQSrYpm zx@;@OoGAJAqi2^i>4hr%JHE)uL%Etx=BgO!AOdm0ULt)x8C4Xf5(rk~J=DT`HRThW znm@Kl{A;%%rpI}IN{x+r6N}F_-=~@{>kE+}0`T_z 
z_Mo(X(Y(ah;2G@u5GfF%iwN@}J$TGRoDK&wkAy4it+0eUWd%~eJ8D2nNQ@0#B9%IX z197T9*9h0G1cPlFWFTH@z=Ic#(7gxsW0{^y;TE$t&DLhWg3QAa=1idDAVXJ-3Z;ZM~HeXMpex3eG})ol}I%-8xM znB=qboTGo$d>hu9_j_B~lQ-PP4#bc?GU?iT?59S<&-1+vX_O(fHjZVU{Y_Z>;>}Gf ze(5Ww*dTt3f+yf;!#1@upS?+?phrpyZ?jaz93{-Sub@*SUHzakMAvC?h8y2AzYV?* z8Dq@n{9Gpr3O^25V5~0s6PEg$RRB;~>#NHk5*6wAQOHXlJlvo}+sBX4BwP1mPtGv= z`R^L&rdAuy`M<#D`8Tx#9xyJ`YA8dYTz3?T0N~%#&hiKNTbG6no03~&o14S0mfgBE z;G&yXqP2iJbi$`;gGOFCtm(NEWDBi={X?)3%*ea|bvsrXEFB?xj%YsHl}?a%&N z1zfv})>*w192{lVBu7W=H z(iOKfaL((&Q;!1%AI6~heOD13Cp%B|o!osKh?}pF)mqp5V@-&(tsM9@%*kCk@3`Zt zgzrEDzksvRdsaEG=6BSbgAp^4Tk*FJSvaJ+qvh}AZ#8slDVJv*6>t+#7iAKuItFyI z3pq&Co&{Ot#Nv}QS~cNI}1=>>Lf5>>Pi0dCbc+NPT#aUIYh`Bc_vcGsdlXq3_tF{1Xe8Nn`qgi?VGRZR zMq&o_hRer$$en?6eLNj));EoipfIe8o|{l-BuGLylqM<$Rd5x1|>vem`-=Mi$+eM(!_^ z`I=Znl22Z%f9b~w+q3DLsVsV$2@xPZmV=e{Ejf*qOeH?CN{`S6#JH&sFGX>8hzIin%`fVG>k#Y8tPfKQ};L*N5 znI{0-Slq5PS!{**dmA?Gg(az>nv}WKaW~rxw$TrY3N?scG4Bdb9psr0pg3y|GT~6v zwqs*59fLz(>Qz`|`EBK-D9vOmcc|Feg?o87bhw91oRMm*1~x(2!8C!kAKKt{2Ohb` zc{`Oxb~M`9-9FcBwnC=8*e8{R)d>IC4W+em9xxO=6Te z*|4<@02W40PwO6l<6F#v-n$Kuh=&2Htj0?-cK?K$VS3ZPY3694oBQ;EIHmVmV>x0q8%YN^gVmyxh0}ld{q=Jj~S3;sBK~bQ9 za{mv-h=TIJq!GFQ5sOK>*ni-hA*=hrK>rFxs3cPy8sLh&^$-&ZS6IU*S@Vu?I5&T?AT+?tSU`m{#+9P-$U@9NKqxkkjLT)2+cJ(BK|}q_557(J?ee>s^BWv_UnY@L;kR_K zlpAK5-H&I=tyls@EtEI{TAneQleIn)t?X>7TCgZnF(#E|*{U``3e(et#F?D+xe}n)IfNlH`i}1^Gm#pth7`g-Za5Gu%^>l!q=i2Qr2IUQ?eTb>J!?7EYr^vBLi z(ZLz!|BkZvDpyFF*@-Ea%t}jYcUe#XpYi_>CBpv~WfCg^C*Y(M6XIiY`!noafz~F9 zh}Qns8W8+nIdVjNivHWBzX|Y!zney!?C5C^ zHd!r80*@;pAO%8U=yUrnCa{K{JZi2Gmm(sd_uWPJ3+-LBCQjzT$U0&8zN;Mz+bq9R9UMi0}*RME`*{DFT-X_#X(qRVtdn zCex?aLWis`Vc*;C9o+xEg^;P7DvC;87>oof+xv6<8oIE8nMi@59FsUQsAA*wQ|BXa zLRu2irAuyjVujsS0oz^9jgbEK9{?qP0ethMEU0kLO)Ry7^DDw_ zDzm2Q>O$^~&apjbeKjRSSvfadA{>T1ehK(t4!eEan^7x1U zSKp_OQo!z7=s;;_?t@4Cbv>MIt%c`-U0q*}f|llXccv@q9c97`$cT2Yaw&mo%S~dmYqktw!=>6*$1tg{8UBIb;EazjOfq*beC*=?j z0rj+1$)pqg44XcYv2Rk$@V!#h7fQRZnf1Y1jbF@X)QBtVp@yZbD}7dv`{uAFx%l{! 
z!D6br5N;U{Zf|L~_@B4;ueZ3Lo`7vnr0qcNtm|0}ex7~kyKTMmv-aMTfdp{dIGwYg; zruK1?-;d6%dI##NZm2HOR!scbew?1&i5+`+-LnBb1?(kX+r8(or{3SgHb)%2-p7{q zy_S#~b@*mEwZ49j4NYytJiETM6SyzwkGd_k%|0qte*Uf6)Qh7}%As+ptIVYju)JEed&GwxKuOn3^J-^V|)0- z_T_pX*|M#plecIl_l{Sh&TJ=RB`?Q|wV7e2JPQ1h$wRdzX?iYD8JqjS%Pmw%Td zSsV0Oh()J)w&fx*BS2mS;z=Yyp2MbJb0dH6%XAP%E%oHhd>YEz9XM=fV0I}nfENGNRaWVQX|wo70^7gt|=lO#$v=YEme zzLcEZ;;Aa6PIO|xZ@~bV7s8d-&icAdRIxexQc7dlOihs=&bRz)IRMRug3nB9rc?-VfCv# z+Zqm+gqVfV&u9}sr|=IfjX$Umc1MD+$G@TOyenC3-w}a(@eSc8717ALGNDVZ+SVSUK8Jbgasg zY84E8EfLdR)~}4)e&%Eq2|p{MZlx+J7Uagd-6acfx>T|DkmVy?vvX6>LBA-Pca3NJ z`SpUTD%N-_$1jqFdm6}2S|Vpj(BAJ+8W?NV$Pzb!kw2VUPVpcL%ji-Aq3$P#$isYD zUvXNcQYw7I*4o4p*o_!eBN~b#SU9?2o%iFp;cw6mm+2*ba>pRnvk=0271x%_l+Jn| zYc1@iN31A9J`R#xu-u>X2KS0{WW*=%uaP_=eRF9TTI8Kg_7Y8@%DEdQD^VFfyn<1v zP9{zV&YBDBA$Ea1L-~!X{EXD^^dg6) zABsm0o2K=y9BpJhY!3{NWKqrrYRlFP;K(1uvnlmCj;pX&oEo$)^PzXl zQ^))dZHd~Io+w>oT z-NJo_GV5swqgf$9%&30qaw~A}^gn9b`&;$wdxtW;2KTa!R9(R5Dz--fW5&U29j=i9 zii134;Z!I(LPDNbA*lp& zVnPp$5}J1Mh*tcwi0*_ue>p*`xLiq7{=2bTDN5B5Goefl%ljZfXZ}v*W%YrI!FuaX z8e>}2$Kw|mQVU$zWK-O;5b9^8vu6AakX)3raLHQixR+aO7>{cIZ`Ej+o#E%5!fz;3 zkw2x;B5%tI$oP#a4qA`KQNd-yF-$2=k|9&~koaM$)<^g@TdEjvzK;u<0j> zjA3pRsG%!{(if+_EaNKBV_=z808atjDG@Hn>K%nerKv983r6vXId(UM%jIWnq|yQ^ z;glaXgz_gFhX>ruvFawQX|1PK?0mV9ljGIZe~EY*e_R6OEyN#I*e2(EILi^!WJa*)tm|3%c}oY0xx}Su6O&6`Ayk) z+@_GEJN)=?2t*eDoe=TPhd$*v+h51iS>NrEyh5&=|4Uuw3RIvs=gwADGx)7#zG+h?L(|GBUYt&CDq}WlKp~kj9 z0=m#TC2hOlHya3mJT6^>MVKUEKec zo!o0^GYt#{7=y#E`;1s{ywZEojE)pCil@J9Jh*JtZU7oFC{SqpBJHfYr$K#}=k84>)(941-Tk2(O0?)b;K&R0&)`QH4@-{?C^*!{;#xj6rDLtn6 z3P)CwP%~M^y{#>pUcsysI&>TY&tc!mk=Dodo$V4E_&xr+#F}z0H1YLV=Vs9im`#3; zQV)Y$CrrNxh35SflcDy=9@3NvDzSG=c#{QP2PM3X5=VGgM}b;Fry@pyx~`zaew|ui zuzN)+7cn`m+(Wd~2HW`7X#go<=tUB*-!R}5Bx>V3k4RO}M1@83%@&fi?+MKfUY>y}V$*S&j-92(mO0HY zy>k*U)7db(Qw1XCN@$Md^?G-$oupR(xdrJBajKt!V+)~f&kO0z5MDf*uq+I)HHHh2fS0?dp*Oz3rO*9{ z+?&i)Nh_alTkjZf=TqGe0?a>J?%#!CQq8XwEeNK1!szCS>Dt!drMtaLlU87A+DVcC z*EL5wBXVk|=cQ%fS#7&34E;dgssfDc^DW`vzXAqN; 
z?y%-V(;Z()fB?xd$B#dK59GhiGh#CVRtT1B(97jG(mtVp=vLd)D3h}z7y|6$!w@Oc zwQC35yH{0XBy?2+o}!L&sxhgu5O_kfB*L`f=oF5$bBnO2voz_osf*_qB=8^X(aO;# zCm#w_WhFWa)ugQ5t?s!8aEf+X$}%=pHB!`t&pD^1^-%D&me=2X&z^G!It%%L^DVu! zfub@D?X~2x7cx`j<769`xHqG9$nArV#qElPAnrm3>9R1+So6{$VuCCRunaY(Vjf#C z1iQtj>~Zgr4fnZ?C)oq$kvI6hDK9n zb8n6<$5KIf8!68mRZ!@rv#TY2MNY7`Sr$34os7>u2kt^#U6?NE>)`Wv&_c#sEKbn| zCZ%^=Y)4)1u;w!PCj{(%K{Xg;>>2fL{P*Y@ET(aXFno zBt@+%JoAqf@Gk4};|Oj*CxXtspsseLP0^pp3M>4VJRrxIQ+npuV#@W@{9^L1#*-(4 z;25EX(~h!G0TkkWX8)B6?O zki4j1OlPh5t@xoGg&b)(7~BC-(qP1d44;|${YDjlPnntKQ`_nU#_g#f%CR4`90NN zgltQuo$rvB#)L4kP)*03K$m3}vt_y5eJ(HpB>2`D(+^f{DSUPN*bUP^eCB0YrOa=8 zp8LGj<52~I8ut|l%egrCrU@_Zf@Gp`KS#OV<+c?GA@1*)!3OhWLlqI2!IN zs9F1vg(P}-*eFgE<@VW{h`cBa1KswI!Xyy35&=w%AF#`n5>pao!u;?;W`sqYF&!_i zH^lj*e@1;)0$A^OQ7aySElLlq9FJV&K`Wyc-p_5EMg5_s^NU`-*~aBldzpwz%O;9Y$xRZmTeYIysz{YK0Z6S~JTB(9#= z@Fs-&u$lz?b0E$mQ^yQ6*JM|S01Zl>p(e;$jb16*tCy~uK;zUo=-@W{KM)Fzp#-}*ZETbu_!Xgbx0r|Z@;FzZlk6$Me z?A(`SpGDDXMa|`V$%->I^iovdK)j%>rW=^tBoi5REB^r|mgy3ZBkaRCU@F8{Xma44 z_AOqciiJ#DWdW_VXvr@lmJ*VzIk+IP(oE>DmF1o|mXWzfdj*`hEmVB=p40-_Wc_#o z9-FtL_OF3i_ue=~buoGd!^;NKGtS(f0I8^;=ox?+)vY{42uF$)Y#UbRzBWrM#Wg0* zJamTm*U93v>G}!G_1O9jbwe>6ZdjiU5YyXw=wnl?Q#9CJsGcNbs1T+dD^k&Q*nQ96W4#s<9thLv`wr?au`IJxKU-qwkyfort$RSDI{)7)aifUf> zPT&xsW-|p2vqh>08e*i6xIwv#U%(SKBn8X5PDXpYq0vQ>*jxSb$93V!z=rxg#LfgP z4*_LJkB@=lXzfSF%UmzFnu=^vaxOW{wk7Jf4Dl2>c1diE3U%^=>HzS+tvScVp^YK_ z@}PDqo^F@kh3lknsrk2=r3TTU;hdl^BQcN(Sm-L7@!na!7K}zG|0UY8Pns0_J)LqZ zr1HY5wwbL4?hF-k7tNYsebA$Lq!N@-h~@1bb1AHtrV>#T)I>^Xe@Quan>L76SCfs* z+EYAvIbbvC@V>_>Wj}H1wkF=}O~PhSz$b7CBRhd@X)|=sNhc|7Yw`~VP#)7r**>KW zs8n34bz81Szmv#qxczZ>o{19r=V|GX$^!zZG(CZJldm2)U=-mF=0UD5p z^Hb5dI+J48Gh<(adR?6roQqnX#zJX0Mj)93Uf6~>H3VJz5F2PG8dL;)cFw*BNf{i}_?+Fc~N}?7Wst1GQx*a0z$Iev= zY)iI#0>N`)h>krp(*5tYnxpmuZe>9$n_8a)qG$BBV_$sIzP!%Vcj{+7pC~?*y_UyG zJd#_-3N;V}P>U4?`E+ObbiW_+_1R3~$X?o&KTWk1VqaO6xzV+L zFg+p_i(F*e6%f}G#2zdoKjAXh1hhg<;;&ostM^%3{%Ecl_xPmdDvtZ5yLvCgZCpB+ zR0?2k9vp`*u`1&hvI2+q5)k>k0NSr`vW)b1NNi&*ovYA 
zATKII2Zlc+gzljQZXs4#45Db}kZ^!R1)oC)y^tZG#=gRc#xjWooQ@5~t0IXi^zk5o zVqG}&$G?JL?`zG5qQ|_r>2y_&b9+)=RfA1ky(*m|fs|lTv|uNh#(*orI;7i+5FwN)5=D2~p@6k$6x7=a%CN z+4Umm<|4Y{XE#@E5Vhiopz)|%v9PVa)zjaOD6f^&x5Z(CZAKP=pEJ$hTosx7M_KN= zIi8<1L|MEszaF!vo~J+-hmbUqV*GyA32Loh_K4P>Pa9#N0b3Yg0K(0j-cAq!dT_Gu zl0{|@?gjh0eZQubM8APy|3EYS3@k|IhaPX+a_;*)Pu*-2dgx5G%=4P3!P5%3a^tR84JgQGaH2t++@Gs3(xPXcQk}>U`k%FU8dk_h&FQhog z9~WbiCtj3XUFm?BaR>uV4=gj#yVtMy8al-8Hv-XLok^;8yFl&0b(`)j1j2_C&ERWa zKWS=YBOuG{nKcLPxcZ+4UJS00)Mm)quW?p)dLJHY4Q2NP<;tU_fTF;OR|BGY0wQr# z>WX`sEJl%jESBWn3|#;D9kIhI?=dSC1e|`z6K(MhJhTp5X-KnCgFPR&?>B7IV!0-! zN3VYxRlH6z^ni07X>yR}-YP6l4%uQkN^smt3>Ra?-{$6DiqGN_A?wwC(z`A_D9WUY z{NDGSK+pyZH@TRqmYn2Q7w81Y_l{5DC02+;x;77gRh@S6^>{&+&@FzCK!Z(?fP`QD zN))ELrK&NeCDD7UP5fzRQM=>67`Nu(yCmgRGKLR_zYBEM24|M<@TZA00j0q1-aYs> zf@$f|AnvNw5K@k^zw$SGBa96JXXz_D!szHOn0aAA4LF1>z^utA*AooGqK{O%3BNz> z7TW6zrdc@gzgeUt6ooxsF%X1S@ z=uSp)cb;6&D}uq3Ie`aIu`-@@p$!|TaafLs1oo$uV*@ld3o`nw?(Xk~Z!b+f7i2Kg z+IAj}<+cBMZYrlH?n?ziHcqm>7b+m5+$2*4>a)xGIpXF~Sn@cM5iM?wM?p2j%t9Np zg@m`GhZ>uB*rb-fpH`MQT6S#@Qk>r-+uUNnguwmbY));#D}9tE z4oX{?zgBIOra^_?e_hu@X@a64289JRLa3DadzCsQiIEZ$_)it}Qb*U}pdH}HIRBLo z_dZ`3Pe0MfFuJBI8H*Jq&oaQ(1^HXaKh;tfk;0mqrop1&t0xSLyK!uSYoaxMng=;{ zcxqv9Z_fis@P7N{A{*NyT^)x$v~;OK9vll+zR84}4cMEZQ(&cr5BNsf+%M`9MEmf* zkoWb-fCD6_9e%AAgXIvCQbtTH=-BVZckNPd9OSO#XLrl+1l4byDBEJfH3y3Kag z-+b;;39o?6$ATGG)?oWwXZ}1hNucl)N#YrU)YM^;LxF}@tE#TXwTd`kePNC8N@dbn zfV?4RyP+|j$rnmry0MpJ2O}#9H>GOiKR`+-umWc3D@!Rgs6n>Z`{h=MNL-MPSjlOF zE-hH+Q3WyYr2~j*N+F?ZT6*q_=yJ*2b<;4*bq9%0a3)?QkT&sqz}N301NozoUrlqE z^$=%Q=MlSVb2YK64-jd3Nwg5h8}yksuK&(G)Ns5XPmEhDpi!~ZX-iS);T)clkL0(L(Al$1CXB$Ud!5Kz2OowRZrOB{_jn`0oZ(aI(%-3 zb>CPTIhF=CyL3gia_WgiD~WJl)C-ic;BOZXEo7VVJblF#G!O3_;i|uXzP*o#0A4cp z8NzLg`-iW72+?H??q;Defr2W>O~A$)`5|!C#d-Vq$zR=nWI0VIZ~Rjz=5U_ z?BGp{)mUu1MK~ZR3Nfnt zd7XASmUJ@AD3F>ws*F+pU^IY3DvG0w7M+Zi$T^$|f`a-Hfk{?6S|@sVKO!?-xmkRP z4dO{9yZN`D`)s2m`ju3FA&opNkLp9Uk1LIR#r3s!^dWwQ5#+ zsplhjXn#z&?XwcxpbVj??Fp_!%7l!)KLB6}-}JZz{gAAZQ5xcJyD<1dXtQ^Y8M~;w 
z?x`uI$m#U{$5e%HyZDB$5-8L67Jqg94ZPH=KGHiOHCWP4*HN1}R+RFObSAdnR~uz3 zZ!BK00rGy%AD3P;m(%^VUi2Rm_sY=TtKwl5X1RC^VjrFxOo>`nM8qmf(Lc1mWsJ@HC{q(bgGKEZPu=6@uY5>Ik*m2Hr zV!Kit?ZuJ@CV7P{YlA7W!l5bylCQ`zgEP(MMc6W?*_A)-UB%%qUW8k66};VV6~(`b z9snot* zT9&m3cXxMpfvech}&a{DgDgJ2~gwf3p_MUiD2? zcTex8x~J+Z&fI`1{4=Kb(;dlN?@#j`=Qcx<`qwGx)ta>A?<@M{~WD$ETf?0ks&h)&!t_dY~~{lVdlrIQW3 z!Rk7qqIWy%H`jCKAMjGQwI1boXMjqNRM5_nvP;vPGpa)CKD2Pmj9)*VmSec;uqbUH z@l$YB8gpYSH0pb!#4TJJ2>|?!J58`6x>8-chUkH}1Wpr4{#WER`MkD8-bhL1d?Z8I zbq+SMl#=(5uVoY4^$-g$X_yO~lM9feO&DluSwm{@Ds42XB_2}WoAiw;Iy{9*!GOx3 zi8s_t6ih-djnky|1j?wI)>eLgOac0w)k*lo@)HDF*kGcx=#cyvLSWy=nIXWmZBh3S zW3%b34i|*9DNT@|Pn9%dib*4p)zQ9x`e_wj2u-8=*j}YZO#fN!-K&{=_lHBGh{roX zbl+5&aQb*3`!x;;QAX!MN9FyL<2dM!kgk?*8>dKe?$$>C>6mG7Nip*ph)Vmd5&C3$ zH>OB_pXhQ0D&<{xNUYY>y1a}=hLLad*O92511VL;MXuf2;0^Q{wOCPK ztp;1rh4|>1I8#+dF-{Ox_V;CSc;jXt-yY9VVKJm8;1{2e9?y+8gsFbH4*c6b3~a6Z z!pwLLN3gESz8`xMh$U^ZCa0(2nmZwb#*fp#bes`Z_F{=ihZCub=AYOBQ_~Q7mmrea z3Oq!bJugvgWvmXK&;Qrg~EPnJ=J0A05m?i&p(k)EFKW?Casr1_J;PvH}430C1E(A?PSo z=fi?Dw%@Ae!Y{u^VY3rF?9aB*ttD!;K66hHi5w6tYkfeln)E}=ouNB#dtNd!&A|>Ijrefy*Vlb;M<`XUsY-ZS2-qDvdiMGxU_Z-ZI7P% zWKTD@Mn&vNi@xvW)O-ivzE7gM6#!lDsJrqKq7J%bmdBk-_$HOFpD?p&$iNtcFRs@} zlN#<%J4tgScAZ!B#aZ!>Rt?ynxF37=Bb1iMeP2l9Qqp+6WmKnof9R5lmnK(DX<+xp zsx5sj&FBf~z~~hOtbs)}0#*FvuGdw;4lq_k{`34$aqsf?douWrr5fio{a2tOtw$~6 z1QEHJU>c_VxWT(vCqb)vDJNcoe8 zyzjfpzT%C5a|zLt6CWFr@UYPPKrv+np2d%I;%!N{ximSSHE@c^oX^$#|^oGtcQvsMl1E z@6B!eFX7JX|EPU zF9Sgb7Ak2k>D+MiJr*r@F6{n}@q(UR;sI@KYVognl7)O(9v#mbjDlKFm|^&&$}8&| zp4T0IJWRDnEd!an_g_8keGu79rI#}hArzX zAI9p!v{(&~Jp%IZgl7FjgDs1OnmXV*GJ*t*C1_Lzt{VF-ksR&>fuELe%o5U_0MIN+Mg|kx9_t$s7dfO|b_|Hxk>U z2|&yX0X{GEVt7gWy6r7_?bsJLcwKMwWMQL1SEWqv*eIee3Er?mKlFquGdDWw1>jMt z>D9R2q$7lKL7;WOh8bHGk6nfZlh8fBdxX2LaCd-4-Wop3M)g7iz0M1qD}Xv>AZcG# z6~lkuO@%^+NQuMpJ|sC5n`VOE?r7R$4t1REBWg>uOLi*$iNhHUKk%C4wSNd>x`8^&9}RlKV)5?Hq^c4F=D$UY{{Ib*b-efH< z3`V}6UMW&Ih?ft#Fk|Hq&Q&@Hk~SHXo44zCc~(Ys%_*Ca|D{%ORdie&8X}Ae${ren 
z5JuAJv#WMi4=&QL)bKgGw{6La1H3i`w)gV~btnCx-%-ug*<2of?eaF*{@UgJPP?e? zWPJ{P{hu5Ru9qAPgqj;WI;H20-}KAI?s%4bpJ8Ts5)(#dp6$_Mu9wP7)+9Zxe5X6r$!G zgK}JeewDqrJ&&0b;$|7iw3nu;K?(82c6^wi32K98|Gr}Jkdd;E!cSi`TT*Iz#3Fd8 z=)xm*9bq^XQ&ZfzgIx@gt363-SjReL*5v0~>_JMC_{qjS)0!h|<8o!uq3|Y~;y2m( z#0W=M%VSMark2e>O&$=Y8jA~5MY=&@Q|x%*fMe!6jSeT<2)61D1dL>LQO1GYsC2(>IwNfWAraAz7LqF6mdIfOoA|pmNBly_HfWIRlc+m-m6Ev#sN8|>s_%MSL zbkzxgcLQzXFEpDp0i0b`XHKUOH>YooZJNh#A-x}CqdQ@g7y-RArzVr6fY(4{x*H6$ z<@XGPeU-Xuz!9!Of!DF&3e;d{9l^lU4Dny~sJn!8L=bTnKmEi*3vHVNu!%X&h8Yl^ zn<0_78rwM~>aMiQVGJ}MOEx@~V_aJXbLK*<8#792venuae@T8!+Po87`vmgK$w7J? z!q)tFZ}$lZQvsq~j>6$xbHd3ep<8T%pz>o_%u6G1`HO0a4PVZPH`>R3 z=9Cdra`8t11=uG@2v+y|8aTr=I1XODQI|D9j;HZ~)hGa6;c=noxzV#}%*1n2++kX$ zlCqhtW|-Pp%l0$9nW4ZMo;%D9YPi~9dm3!2{FM01fNg=ozf?Chn=6EbK#aaJ)!yd& z5At{w)k^7rlX`JP-usfzx?Zj632I$U5T7xo19qL$L!DXksRFu+Q{w7)M)FGgo_&#~ z0fNftOshJOzpCx{UI;PByZ6(N z7u!NKmcd$#>LBel^2;7Y^z!P5P%g9kLuokPf%9x&p)S*&ZD-Hw2~ioCFhksT87F zAbXB(X&ElArOGZ;abi12S*bkP>TRDb`P<9R-Z!`AK}8-@0xPtb$rQd*UJxVPdLC6+KQ9?$Hi`~~ed3rQ4yzsib z?*|uGGtWY~Uxk0v)s3#2PGQhs>MgcaK;1X=A#f>(;6%8EvAxqf6!&Sl*`@r49% zJ(=VQfPi+S_VER+VyZZ&M32=79^dLa1uKZUW_?5)dVY9VI0+~&hl&7|Uf-ap7tv>` z*H{(ASgG}XS!OTXkmH$yH!0z>RlPgj3(y$J!Ww{7m9&F-opFxF3K^9n-_te98LwBY z41)az@}Nh0T|WyFz_$&m>Q-8!bB2em3U=XyKp$B*Q7zPks0PynF6iDl4J2j)n9jgvq z*BKbmhJTITi%6S~ShG4a0%Joja@!!OfqrRh85djJ>#wRsS_i8gYV=0CGHKU#tVFj^ zpT&($SuT<|iiEmqHRv6P*E;528$?g669kmsiXTK&K|;UFj@yCciT*G?uoWHY-&oRI zPyM!($r_NIvhx@&!QK9(A7uR)IUd20GapG~=YvGf7XVmvpC7Mkr{wZ$2nDxVf-cYm z3#;F~C(KMD8qRs%BKWaQz#5->s4TB7k*CgQOa96`YU;CJfh;UDN$h0AVFqBFoGC(p z$+OC!bYKcDxTnYM*nnmbu#3@;eRZ4&jUy0NLA_P5=I*tCh6H8+QbOOMoU+mfk%JWC zPo9jQ^>Z!Q{HD_D0Qc$2mY=VqKOQ)~67WL|^7IkBuNP)d#Sq=$=fT0uhHYxi{Xj$+cA*2=R;wamX)CG8q?~rlq{DOj za50D*2vy>6hQ;LMA(|s61EmT+7`*wQeBKL5s$9$;`$k*1)a0{?_Hvqr2*~x(rKwYk zN-2_skFaw_MDFL4g+?O@%kd!p-s1scgC?0UDO16hG0z({St8MhlH6-e*%F(G?<75i z0;&6WP{n0~>9d(Z656W*H)`!k0ubs-&0|BWm+U_qOx&+{(#u%n78bw2d@Bp_*BmBL z(f{FzoFs048#@0Quu{zQ%LlY;q4Hr#M4|OL%N`@$(8FlsQtxyq6pSFc=S$qnkZxF; 
zbhQT5`Ei6*({Koz7Hr8J!@D#FjsKB|A*r@Wa`&oTkNVCZdkr$ffu_4hxX7mIMiB-M zdcSUDcFi4SY%>%(e;b7b@Xn1#@Ypp-0`;c_Qo%}e1*Z>;PTaDNP_VwP#3bLR`6LqE zkD@WAQZ$9{+hHm{E-bQe1T+&`#CMvZ)!LMgLqaLn&@)!pH@F&$I&ZP`Dcb5dC@4N9 z;hn1mXR@wB5|J~=NPR1_>gY|?Hqc(UYgjt1_>dR%pR;K0m$PWZSVk0L!sonD_ct#w z6EDeEN4n3HZw;8fBJDrn%EyAM`C^z4So8WjYcsy*S`l$s;qiT_krxz*rAf%0%nyG$ zwQvWU^=f3~AM@xvP{h(Zbq^wcOIeS|{N-8Q7ilAYSjHSBZz@vZ@>fdHdC*`WzYxOkp z{Vb~ESlI)01Wpwge!_R-d5pTISJH(fhXH&6{xBnm%b6#WZRx{R@euY*D~^Gh#@)Ivk?dQv{qpRzd7G#`9wgRB-Oo0 z&bmWp5DaB*yHzoXgT}IjU2(XT#O3$lH6nckIWRGVi7O zg%|WM^v6G9rSD>;>z_4RsoR4}Dl#uVrGYzi5O_B5m`(UG@OU5Dh9?F$cN;_)it3*k zKxj6@4X4yS8lB8S( z(-d}dImC){4B_Y9oA<4eo^fpW>Q_4pyEv+z#T`VPgH$=akXnmH#MD8n{y*g3%T!RR62jxvMeS!V za=M;J!E`;}-caXHFf&>M@XxPqZw%YghQHQZf{IIhc0_uR8s@NJo6Gx3+oG^^Q)Zp%|w=vc#b;G5}65*$T$?7QteBRCt8ZxIgE8^Z9 zLY8OJc3N=qM?30qYo_b7&rd#o@{`+z?c$E0o=tugw_^PxE&&hpXfEm08G^}NJFDr@ z4;ydTc)FS>H^ONJ3fPa?ENUV(la-w0Ombj{6Z*_&fwU%lVTOV(T6n(BU0C16$u{c% zITr)yEqb!`%meCZHRr45_LVBi5HtvrX12a+z;+~U!%b(-vXt3dkatLh#O#0V>8#n1 zHDwC3Y7EM1rF90)DOTX9eXb~cKcwzdvTF7| z-1~{LcX>WV^cw=M(`>1@daqkt#&W1y20o481J>csApMqViO`a*MwNzfs`p=}-}Mu9 z&^i>);RmIRf{s7qBx6*icht3Nc`PyGCDa#)RgGA~x?Mm2mMdwl1`J0b38 zi``>W-ji4#o>BY0f#)2<(qb6Rm21LjV-W&T@W4td^tcaU@in3Vq1Cz=@aXNz+W?xizcE*#$(=V(K>eHo8&YtICW=TCGku!>s@OySBhDTDzVAtn zzTM5y|1c4AiubD?;KRZPqvlG18ecD;M?$E0VdA;h1#w@B0q^dXbx4E(f^bKIO&t(( zjigS1>u(?x1)qNZr|mpHJ@DoK6FIw~ykr!`$v@B;hXxO8Bn zMe|pzT0$w3NUmfrAiQ$51r-Ysge8E&l2b6JdffY%9kmm8rRiCm3RidPX%};1o$tCi z7DaMWW9^&G>y0~aA{e!N&9zE0iS-6}X%?$T0BU0~)qMY@{8~iT(=4))NDvC?aUGov z`;#DPE8B}%6maU@#vTdMd{ScKg{84J}UBy1a&Z0-?*CT!8&i(|%^sHa@I!e7CbbK! 
zAb=zZl}U&ww%+bJXHtKL^AziAbsykp(zE1$J&OjMnf#1!wZtb6Y|~6@R~}W`@VZTC zb1R*Fs+yxOo8T_hOt7>Sf-4emUo(jP_~c-C`7QR?3()gz(d7B}2M@mUOsLI=w-9-> zX9@lo{&+aG<+VO#isOCrlgU%$p|&DgZXoefBY#Rk7ffIAFCaaK7ALxg6YbNco_vT& zs5R{i-BXK50rt-tH%+=dOg2-_l<;Cn3%D36_1xBk3W{eoE_k9U-;Wb?X?5h7$V)^%NdPPkJUubWZWeEoT<7Cl>3} zquIiyid-AsXs%f1mn`)YOsL-C6t)h4(SdJiK)f-6==6 zd;LbV4Ly|7l1sXBwmr4nS)?$J8x(dg{dEGb_mfzBcWi`^PIP`CeVRY~>5@#5A_quvaO5sM zA-;K3YN`GR20E^-lXrk)zJw3Q$j64Nm)D{oY&^3sQ>3xoWDG=x?f!^{%15ePvyewBZS{~;bfdR0MdA&7R)PhN^UtzYp(TUpvlOKE@EMAJYix<@}QuNfJD373Z|=18C0Ov%DaG1 z@HqkKx_yf!aEx4;rFfCJIhhzA_a3v)pCdbFcHL~&FYl%6lu7P>3Xeis zg{j(ix?$(joEG!@uBcVh6BJwfg;N@$yo-f(HTO5$)3^!dFC&)f5g2b0#j$!A(uK(q#6ug&tG`!YfFPF@HN*RI>*)?TS}rR;WOd8)0IYPl~K^y!oKv>CRt89)gC={ z%eUfgH&$i%4an7cKMUobglN_rA&D7K`=uLaGlk@SZm-@q-ne&@OgvH{u1PPm16}Ga zgd4wu@1o|m$VRs-N0TV?$t32@ylf~cWhTlDcY`{333oc{nyK;MbtnyBI`w~?)?Vj_ zcsnk{WPOH#7*fo}h4M28Q;-woHtyr?=(iLK(!QO?kyjlw-;4XwKH;|5YW07hv~5XW zY#xR)iTd?dR0r>~Cp`iU!TWNMhxlF*7#H^(gd8OH;!wX3gSYjgwk7 zndv&qw85J*DZ`F6T4g(nEyq_&H7S%Q0;+eZ~Au;n(g?sk7a zo5nm{0bd?RNw(Kl;}W^$$RT(BUG$eb2Zy*`DgKBBABz*`*aw+{U!Iqvpq8kIGfa>w zP=6!YlAQX>{h6o%r^dBcE2DlKvZlF89C79_QmPSby~}u-iZ@DUb&Z8i;wqN73k^U- zT=WyT#a`*g%3x?$M*fXd%YgKVDI2{EMaB=3F9r*hp2AmP%9N2XY(GZ8F1%; zm_Pg0RjLX}2leoC404t3F=7$a@&w4Mh5h@TIBv|+tvf4{)z<+FQyykZ)T}&Jk?@CJ z1}ZK%l`cw)F&l1!dHq+JMxKp~D0}t@f$6kXo%Y`=ukvQey&9Meg=^5ZE*Nw23HLah z78xAK1RylVki6>dE7;#}T{8jGF!{+%=e=uMcUp6iu+j@vi4D#h_tor6>PA5xUBmrr zl+n<-qP9mxMP(-;b^%tZNJ(sVvnAlrXiws_K4yB`?Ws(oASQq+aoXqj&FpVP6tGhSk!Rq_S-Pm`9 zez}2H1HXp)9-%G^?;+LoknUfBa9U-%Ppz|eE2nO&^7?6*8(aABG@F13vH%6c^$?1X z16AEvu64P|fYZyf=yH#_= z;a=obiI8OWn=sz=&)tSZe_Y-|5#%3vYpT zS{er#9v!<%iZUZbUl{;a#q<;x{Gsz!-y1jsC&ChS>mJ+TTsNDSk<@tOMsI!FHFrOC za)(iW7YGyIG9C^#YOZ@V5eH_^DLJKpLO$$D6vVsAl1fowaqZTqj)^gq( z53%1Q^?M(Yz8XY1?7u8Jvo~}GY)%%D}7`{6*wr7;IAe`o9(3YDr@c| zmCYRYXe;>3ps0e=8%>XM4=G8TdIOVG&=Cwc9o$X;-@M<&u5f z+$8PQW%w2)xgq!+zXe)%u2irzm*% z3slvUTz!&QNTU-Ytg$!q)|7T;!ERs?c3f0SnOrPq1P#_?02Yq?1Gw|G3W+2W@qWK7 
zq?iQi_1b&|(oN0OCE0mmWCyHX+7$Vy=cQRRwQW8eO9R{TdiaCndBxG#`!fZbaz2FH z4;=4jr*$?jSUluqC-o%5abW%UMvCSP$#vT8+}64aYg9HV$&Uy3Mbceer!~%Eeg0gz z2b~Gjt76>;oRs&O!aL;5aP;Q)!fX>g)abeC#v{^NGdzW0tOyUth=b79iAOJrNDBo5G>o}59To9Vq7h_}9`EjL)bu}Vsv7?#Tew-vr& zn*ur2a8cyY9hVOQZRklCdE|2h5v9xMfatNh^2#gSYXF*{3vmscdxP(uOK#|mCt1Zl zOk(|_rdP+>y0^{`^u>EZ_s^KF;@zJGqW2{|-hAnXLw}1~smJ>k*QOVP@qN7qvE$c# zkVCgC-Oha2I)vEO#16LGFK5PIe(!{6&um8wQR%NFlxTW4-a9?kILla$pwTA8pB|ju zMjf6yb8a;xZr0fMx6&mL*Dx>c`;C*{Za4bRzOGvE*ir2r2b{OGqb~9-m^XxHjmfH3 zpyq3tL*pYv&9l}Q+D%AK$iv+!{+J}j0yQc@RnG5ol|u*GYUyUh!6@^u#GnMA3Ec(x zpU$mss4K%_M7#0n?2`OAcNVWt%)%IGM049pFDsmkPp30z$_13ARrx5$+Qr@!BM2^ z7t@{;ra>DFV$D)Dbgc&a55wP{Ea)#+cFYuI58YV|SulvQ>Ge`%0@>^-pz=8(UV zW{X|{Gk>eBk)5k8yhj*yIZowpT?$LaPam&pDHZ$gz%wER^K>OP1)1;gN*77r+Kl2( zQ613ahJS1|j#dw(^0gmmynoWJ>%RTe&6K&-tx6kmeOh+EouXwAr{Kl!!7q{lakC|q z6cp)myl%E<@`P#mSAn~^ANM=H}0<)idS}v5x9{>q*brHv#`QEub3fnY)eZdq218M)yniZm0>N9DK-Pq7lW6yZX_)ZI`cs z#djCayny|wiH^wINyl~{+i7LM=^0qCcLPZ4i!2Dw^e8eaqWT3twB=gJ(do6RJ!p(O z%mL%XnKd6^n_y|QmN(c&$?(ujoqc!r(Bn4VZ7EP8>DMuu`Tpn`Z2Y$PxQp<=ywZP> z0`mNSNdbBOIx^P#w9!R|3;;NYzj(MW)K`CcxSGkqdd$q>xfO4~Vh7ZLVjsjNfqp)D zW((tQ5_M%1!M|PHw_)+7yzk;`GVBD;BubEENaA+AA4k@WvL3x#72jwIv28HFW)HC< zpvrtRPpPO0*R#l5wAqf2LnDXB-2cwkaOp?q?cI-4kQXLMArC1rX_pB141-W6+{(pJ zH-6h(AO(9I#-O@$h)3vqvsvO7t0lu9_RN^{4O!(E93 zi$OEuvrXoO3J-0KcUwY6G;gvSLU=#gvry{WgC;fvqRz|u5(@~-4rQj3Pp}E8PV!Ht zDPXNSWEI6hg9LhCx7sTsLfw-qUDRK%?cW}IT(OVV?FX=#Yh@Y>Xm8JsU(z#pWabuY zB__sbN6vbVg?!QSZCs9^IDySb@0HPA>9Z#aRvNoLqU>^y(7(7@}GLROeNkW;(;wf1CjY*2nOor0}G+uz8rm@~+d7-biM24&)R}$Mm zzsb3NzM=g%yt%jW10n61yl7ZUc%b^7eCOvm$s$Mn_)#sD%&)|rZ;_2)wwWHaXy=2W z0_hN3C_eFo?Bai9{$&{|EDHZ^gD7 zJgmj~6QuQH+eTW|TT-PrK**Uvo%xqNS_F0>?GS4;n~{?hko8N%I88i2-H@*esp>Wz zM=128m3Fcc4@(ubetM4#<%b}WbFDbdJOh2K=XO*Wn1&yu>}p$9#JM&5*6M0R7yAl{ zftzsTmsa3{=JWH;&AXD?{h>Bj4_(sZ)t%FiL9J_Lo)|b@Y`Nl$A4&1=2)NapTAYv* zD+z|e>hO42&_<~eqgJ|Y+WReOYXqZ_<|@&v;_b3ID>%-vp40f~3ye*Zr-PP{lFf43 zQp}AXODJT%v(KQ7X;WtX*p7RALg&QM?)&13XcMzsl|;3{ z4#dNbHw<*YjZ}gXK6mc 
zd1dLvH16sQ5>N8ZCev zUBWeuCLK|Z?>S#;ExC@Q4%E?qs|ec6E<8l7$fq0QEq>Jym#Jus$u?K)%30`^5^skY z><2k5GNCFnT4U1GeaKK`A;9i4Xd}0Etv2!L?NIyK{4+3qg zi3m@78d`Q4i89{S^8wf;Wc$!kcBP3jETcEt;8iqM(WD|h^+ByRM1nd5qeX` zx#g>Cj!D*>VL#W>icAY}1*S?Xw?)rfyV05_a%l5|)e64XGv!0621o)7$i$j?cBmF> zHv|R#+-dwKH~7`iW0{dznaWA|$ zrCmwL*lX)SL5!Nl!A)xUKJ1YoYHBALUux>;Aj$Fl2h2E&fEb~NgCOW|-ViM>|6`pY zE-@ZG_KC~@?r<)NaGsE;(MvV!L@L3`$3$sgC7M<)l}V^fw6c6WIuPDr3LNZAqvx#m z+%^7raQJgtN#r|A*?L^f*qZO$A6_lK0%>mi%NJa!TdtkdgnzKQcH3%Z2`)kOm zc_w=;`WTeNi$tkK0y1NzSF0YKyeWa`#?T_9Bk!np@2HdS zVY+Bvk8Gs0?^KI)XkF^nkN&L90_2}{IHwtqO;}c6t{u+%kXP7^zv=>r1@7!3x0~GB zRZQX{C>j-sS$jcK`r|%iXnc-D->xik=d@ObSqA3G68cMd)qysFiIdskPQbN~8LQKo zS)xPi_~>C@<#c+T9wnfHM{%q9N_=y=_wISP31*RLyKPbt-Bp<%p2fJ6(H4ZZWPlIN{wRNy-9wAF-SuTR^GJ{ z^n8teuPD0L`~`%X18qb*QTWjZx&}2iOg7T^rh106fgu@&|On;cb^t<&&g?wTaZ_N9`k;767PxpLr^Bm6On!lILQh~*WF zyxoEv38Bj;?nCKDfa3<-GWkl+&e+YVc`ab8y>9V|7!uS66t%kPfh{)K6`l2~gN$$k zYe5_N3?2gkZ6>0H2DmXaYA1jy&s^Zcz~f|V?Pk^%a-SWyDdp6nb+e9n{b_P*<8unm zoeXr1)tex=>9>a@ck5nJC%-W6q7}CaH~X{vo`M=$Gb>jDB|;YV1M6TnLvfIEhUVTn z1_E2M(Lhj#3Y=i_K6Z9&)!tc*=9_!dFC>~+f(~}fr&uy3jghh^V^0{7+oCvxbz%h{ zPzkN=yjaD_ByR1h3ycGmJOi&)+65Wx_@y(3T2KxW6?AMS(7oMAR_#vDTt39?Y4xVA z-Tx5QK2a~Sjm*L}#_<~CO}eq=so82l(*=;~Uh zDhWc?a$@2z*J3s1m9{psz75X?ReM??Jc zBNj^ZIdO4-_$-+-k`%7ZTgK-VkRn%S`?^l`Jz2bJ2y@n?D}M>+t`z2?e@@k0%zc`j zfB)#_WgMGYl_Qs4^)K_gG`=Qpf0t9lYEuy5*C8J0PS6r2v!t?insj0z>SaE?y2>qPl`qbOP?Q_@lM{2CN) zaPCeqf~uzW&g>iA{>_I~{4FIxKEL9~W$WhkGy#h9hO(8Wvg-_F{>Ghg4dOk_UWwbi ztiuh1ReWc!shZPWHmYxI0?R=RFcdL66$hw$OKR+#T4srYpFzekTW1>r9z70kxL)emsvdt zcts5HdYChvB0>M~Q#l``R4LypmaTQ?t!4tMGcxStHjYjN;N2HKCN=}L>F*@m=y90k zYP_yGw{t+%jE^f14L{rtf6;G%SE+fQlOcasBK+StSniiw5Il|+LiaOobN@4MGx~oD z7#i7{NLU(K+5hI0bs5(}>|{sxTX*Xa4DzTieh{Ce44@l>3UaYU(vroL{O|$0Y+y}} z9Q0``5PN#q5-3F`U;!gvO^mO-eK5iUIa0J~3dJmGWmB9X1>uIQoUo~qldlo10Zc3) zvPB=$@0mO0Lo1P@lGyRA+27h|iRG~K>5@2AnaPogcypnZ`K3VyPgIM32j-|~|BRKw zZ+~|nxb0~Ye9ZkTMHgH}X(lre5YNx>>RsH$%@wcn8amlJ21y6TB3>j~ma_2gpkU~F 
zz|L;9ga$nX=tCeme(H3_nt)WL*WF3$rvbH=n^x9;f8S~-m>b-Ve)aqxmb(n%%N zqpy%q3xy6@s=zIziCL+W4_E z@}$hgdiMjNSVZA90J>LM0%|}@o-zQ`geq_HONgDi+MH@x<$*WDW!JW&*TI^sEImwb zR!Wc4(j64AS{FUqr7BT;ujMEzMa#Dhn{1=tSjzahz?dM@?i#8r#Z^b-Faz4rE&Yo1 z)hN;BTOs<8cBFBYU+FEhH)z|iIN`}S2gXdbL5IYF>)N_5aq`1Y_t?Q2j2i`S-<* z^(X8<#WtUDIN(KXLP&FL@QpS1?I4=umzdUE_WM*UA?7WDss z{Z-kM9c*JjZvM~4Co}vB{7)UI-;a^9+&@R{-&LXhZj!}w642kp0sbls$_}PCV8t=x z`-kME75vYQU?ol9|5Nf}_Jxe+FRNe`e8T7Bga5fqguzz$G~j(5bmG5D4FCXC|093- zYa{};AovYpgv=KO`xD`TnRN+?U-k@x~ z@xQ?SK^gxKZIcqeVUQf`FSvh@t^WhJCHbO`1H7t3Nc_TR{$H(LNxi^$U$nh&i2qj@ z%-a_j&kO8@g!{k3aHL;g+%K>f2JZg~qmg-mas7sY!McQy+_K4Bz!1L4h=ar@Z=coTNV_v2Owyf2kf|qH* z2Ugc(gRlpG)cgJH5kkfT!R?B;U}ghGNMkRsxE>wr|7OH9{@YHR_n+!s>~Q^_e%PoV z{}jB~;rcyeA(ukIF$#F#56{|>!@;-)w9k2Ogpdo-U@=i#@PR%$@jo-){_K=a@^c;> zBbfae-<|=k;KKbq2ro_MF9ff#;eyGZGw6h#djMAou4KgnuRKeiZ#8+#{%p%%>%vP~ z|G%HW|4jh-KmD{Q|A&g-ivsuWeroL1{wa9r=S$+qzg7PoE` z&!2)9)x5ulP;+eHPr+aPc?lZ*x6!}92miNGz|cQb{O%9m%YxE9^AEwxg2MCC)63aE zB`?i+Id=TF(Z7!-|82B$=}!$WJ>_{Zinad_!HdzC=71y){;%Y}j_<#HmcQqX^Wk4A z{`6q}-R9r!faLQ6W&{ZUVEp}7!1F&Yvm-E(Ar-j8fRJ(R1OPDoV502w!NHl$$id;S WMH~k9_ZtMBf5OlJfUL9McmEG=!wt*; delta 5342 zcmZvgWmpvN*T6~C8QCAp+g!Z1aS#zLApbvluiMc zS|sJ+=bPXE|2)qFU(ZEXsy7gNYknNqC&k@$6d1NFA8NqU32sC;FWwAS2WjdypYHK zJjY_Z^G0AtA{nPOp1 zPsA&c>a_qhn|4|}pbKa25Vu`OSnVG`t=Pfofcs^!y1-23W#;?HqnH3S=jkc#SaBW>H-c(WN z)OG6SExfjtv+Ks=@X1DT^&fa3W~~UgC5qxtf_gC zoP}!08s}hF0#VFkf|6k3$oYMoHI1Ig1dRajL~9AncDC|~z2fey>9S;nYZRmKGzRpN z)x;Y8kdsK#(B!UblnULMHq5m$xl@mN4q|w#+HsVp$ilJ| zv3w?p=ZaKNmmfjD8d{;`$-QQ)+#}=(z+;^dC?bE8Hlt41d8t-+5@Q|NVLG|BtJ;splwT&&lKrlRHx@jb82LAy)&&nI5> zSNMc_o2wM^R*%=U*OCx4zp3Ey$B)7G`W%P{>cDAP3aDj{vLp#8Y?}AZz;*ypWo$5ZavWKg6Ute z5T^t(sanz+{aTKYID5ckO|?C_Umz}ec!yrrrp``>p!%5j`0m{%)6T@xw%T@l2vNRD zvO}7RQdrS6ThBmU^r1**w{H^fnn6}g+8c&+n2%b@O`qJlW0LWiMs+#7#zL`VPNzw1jK6f$mybaT|uQ(9UKBOodi*JdF+ z4`#A8H}8`(<8(t;MVi?&_rr~TH%z-pJ3ZHc(ocMokkEj;bnB!}4ftzg_j-1lqm9{O z?I7my!6j7L3CgxR(sOkPGg_fivAV&c2i6)D{U0NU6lzE~molCiITEz^ldifQbDfJX 
zh-$eytPT6I(a9=uPFY72Mi-R4lN;=N#bn11@V*;k?75MSb!9xikv=Mm@(=L#1IS&d zfqSP=fKr(nFgYax?kaHr_;Sqv&pEWnMgt3_pbo_%`toBF!iNt6vC{+2noLMz=frts z=pT#no6e&?_2=F*jsdSp?=B~V%Lj-cCQpmTGNgDyV)oZ446f0+^{;>T)Q0A--@DALLN;kt3Cz+OqIgEwVD_op%nv zVQFk6UTrgW-!4l(P!-FTi2s#*;04=1PF90Ehtr?twRK1Bbvwf>x?Etzd6s12)*q!n zab>=ATT4WxrM1MD$6GA{oh^{T)2o=02u{ zxqiDu+?A!LcM~^jHLp=*Gt9vGG>K_}!uy4Vq96+SCeu`SZBzZS=Fs2jk5rG1^fOO) zg)(0vs7^xA}l} zI=_PxmU(4s#!xCJxF6E}sd9E8g>xgehq~F+GRtKOTi*HPgAl0dKrFkNI&||;h9z7x!}oQgv;O!HP3Ri58b#}~Kej7uIFKtvP{0^=(-?LQ8Fv3{KmBb{ijWL^ zgJ`H&DkJqndi5hOTND;$8IF`IDC^G{xGdOA@)M}E5u+iitV#(7krXjHTl|TNE&c39 zw0hRKqHFGkbyg8-8etMM2kvmL>OJD0+Z&he#1;uD--raza0`oa*1LRoYvF@EUO=k) z-T7xiH*ovl&iK0>?A(GR21Y5Ki)jf8kwLm7)Ril{k~m0#IXanV7;7??`1^tpcDnxY z!v#WJ{bxO(2Xy=tEn<#B{E^EY^0-g>*S${JRg3EVUaS<$c75Lwa&3Pf&^7tiX|<_K zMQMuei~X6mGw}tqy<@_^xA?BYg3{aFI5NjaXl~ArAHoBx46Xv?*hxF}pOs7>5N(^p zTQS9uAyfmA?+V`9NKvSCV43Ud*bCX)5USIS+d(k4Y3*o{{2~AoZMPui{=>h~-$rIG z>1oQvGH$2w?sC`TSI@F_3OYvJO-xgCmR_2;lUW)lVU4Vr#@63&wz8EXHSeokNIt3n z$N5D#17Xhbtqrzq6{pWE0vEcWu&@t&vNha888aJ5F(J}|SM^QG)w7>BhTR4=yelX2 zPbDqlJfh83M+aB7{Y$5keXF3gV!4_I6=Gz(RNNDV=RRJ}aW^q$5@U{c$7ucM)~a5J z*6-LBLhFrR&etCmIBsaNys%j^V|kHg?m1xQJrJlL{WN>okjNlw`733pnf$0}TTZvr z`=od^#=1BnC)mCIWh<LVbbG9rs6EE+C+-Tir&j5R9%1l zqw&LYar|b465g0z9?Rs5E_|$4%hOsCl=@_0hKv>ZV`I4Em^Y9b5pAaT#=Jj+6h;RC zTcn4)Dbc&Q=CXl4B6Z$%Xg0wi%(h4he2qIXS(GA3z^HXUtU~i4#~oGS^KhIKvl<2o%131JQ@^^KaM(yj1NI61 zC%?;S-WR_Z-L`OiocXMn655vuAz>q4eUXC;NwXiSFU!4iRv1Nl5l6QO?3W6%I=7vP5E(QIms;NxrM}aJi4_IJ(D8(l;C#&Cub7X15$Zzky#EfUk3oOq}8QBpWI=5OG z!eMsquZO+yCLYSO$Lt-Bk(NYa1f$CP*UPowv?xd0M(XI)SK;GVjMXQ$e!Du0Wb|QM z+l|aY3BNOIBeEgEQ6ROPv>B2fpHKIXIp4~g2CE-Q&*cWUeeN;Ef#F?!>zrj%3%X29Ss9RNb5G$^+KK`a!9OCSN@7Ql=Wt%PC$&|W5#eQ6C2 zYBbl6fYd%3!o3a#fyf{r5G@FByMiP4j22vq1aG(Lh#Rc)AGmZL2C6;mRow5JgJ!Qm zLyy8~+-x0yik}@dLgS@B@;MBQeyRtl;YkVGjAeRc@Q(U;mHgN&hwP07hFVcEjqH(2s`hssP% zWr6w!W$_Hi!9B*&y;%=G!UsY~bjrTT_(-#DdA?WmbP6G3NUhAseq-B6Wv$}g(W$zF z9Sw-2PT3!q*+Opuq1vn~yX3XB;8Isdr;E2NqxI7E{S&YrLhhL&jxx_`Csp<@A}K_? 
z)@8wO<%Th~+1x!;oD=xY9J13a{V0b|pjo)rF&CyZa8&MWJH=eclQE6tLTgbf*``Jc zeoTfY3yNB8vk{)q^Bht*W!TfpfP$LFr+3O+z2dNL{`9hnr3<{6Y{5Kq|76%A-|t=6 zE?v+-9I`%Qe#4ck&;1$_=VW3~z)?n?`KPfcb3B2R{b8nJRg`L4#`z%JacJ>TKFcKS zGN%mL8C%YeuYOaIVj4eWiIvODMk#LA!DS=`><&D1wxt%lUQ1h{c&Q}?)=RzbZJAG) zguNG>dLUe0?ScPGexFd1>a* zlz<--{*ZPRsz7|l+bM}Qv-;S66fN{*VAE$bxKVFG%`dP#D-UdQI7oa$75bU!39~e8 zq&-u`z9WagX~sQ~iD2`j^Zr56aQxc98t?s`-rCG1ImUhuKS9}d(ywCZdfsM!p6%pf zE45RA4G^-#&^yf%Hke&;k&Q(j+}H&R-lR%%lA~{icyB;|_uSiwp#*=`@F1=R=dV79 zVg;~Jo!+L{N*PAsJM7iQWhG)Hh$WPrv_MP~8-YTn*D1*N0N zB0=kJbc9%l_vPuNcctj@GI_8pBje(m_oo%4?Vq2{ZPO=lDq}e;5|mfcx25d5#ejD+ zXsTR9*0(`4?_-DJ;u}-M@!o1qg}tP3T2`BXvbgE6Wu_t25QrZ;*Dqd2*7#a?_`C#J z?lyFWUkSSRGVnFE1V@I=C;Z5LMMah3Wy+7&A{Y>qUI9W>hssdIx$$J6wY!lPWJ8rE z{~Es#XYkr`>g;oSfqtJ8Yd)j&J~kn~-tjp_VEQds6{I zp3o0n^T(?lMlb$VcUiW(j*aWJo`33a^u0$(D=2Negamh;JpN_Ok@@3T#1J;Fun#tX4T{0)N&N zS_sOWMEb5vbH9Gv+m)#Lpy_hicF>X5mmYgi=0*29Z_SWgDx+Dgn_6az#cL; zwoYkz<12GKCN3N(^~FaxQ1%OIVCG)-!Oqg@ZL++%!3ODQV1fBS|6dCW1fsnC!2ce= z$l*iq4K_e>B#y-fC?4_A{nHnr!3KeT(SKuC bJ^jB6R!0N(_RoPp@Y`ec_A1u+pS%A7F#5Wk diff --git a/img/compact_comp1.PNG b/img/compact_comp1.PNG new file mode 100644 index 0000000000000000000000000000000000000000..56462ae888488963efe7a623c837c014f3b72fe1 GIT binary patch literal 17385 zcmc(H2Ut_t);0_Sail0Z0wTr1x#}nokzqtiP(jf_K^-X)5xpm-h2e0u-pqfb2KoPE|_<$c#)du`6_ zwYODXsJT!^Mn-wpPMiHQGG8HOWMusoD1e{5bl$xce8`6Ex7{X_Tc_0zF245Nx(EEQ zfS@S#ln2+}1noQ+A|tcp1oTgq=c5}YBV!%C%Vz6=FtmVcl)`pN`LP3UcR#V!=2FiS zePs9JKaK2*?zz^P9{2WpMS6b8b(?mr=9t4_zK^Y~{Fad(x*XiNG^A_ou2UY1zdp16 zx1XoWbZ2|(iKD9Ly?!D?nA{8;G?mhNbe zQ8Hyu0+C`DW;8q0Nhi4Dr$sgJGU-T~SlB*0A)Op0Sj|dDd-n%tx22?>!YH@!A|psQ zSXO!xUMVi}soo$8JZ-LP6&<Tg@OyrM7^Ur&AZi{FY3oM5 zI!$oZ;wa$t3&}aT&O&FRb%?k4R?kj};|rE^#T#x8dmZ2hY5O7DC8af%!xuY!bINpB#G2iii4^x@rE>ws zV|kbaeleDAP+ySbxY56wds{9jDT?0jT60SUp?iJUqcTKxiFZ5wBxS; z6I%+-SOu>rv=?9WM0Qx@EagA+?n`8Q8IzXRWlx_m-=bVwxM#7Ob851S0o{pnK4NRL zT(z21c{%)fl8T1tdNzDKtf6``B3-j>@mNRb(t)d4v}55~GGUX&n4H96+3coRMc8U5 
zn7#Irpv=jRK9xxFZ9zd{Ety##C(|w-IN#A*Nncfhofet)kKc`gv9p@)alGtMct3Mf zthV;zu%V7Y)HrS6Okb3o_+m5GkQVo&-<8hb+|h#=ING+#62wfX9Q?OOg)pjFen;SQ z`)NU{`%Qi|Tia$}TohlNONXhKhf|crl^5b-kg0J!LBD#KIxPDBg82~l`XHP6Q@bl( zyD5wk>xy19-l33)WGyn)Wv6#qa1-*erIp7YF;DI_(V|Gc32@29L*&Sn0xYpJOj7w|+c$C!%HbN9Ir+x8D@Brnx@&GI=JFFn zNyc*m!81}3XV>Cj%YxudI5^K#cOO<;`Zy><=k{5CLRmUXb{CyrfgPh76y}^PVe-af z-WP_7JcVHhUwYuYiin{{T)4bK2^vEt3I+$AQHer)#bi=jv36Q7$%rC|6qLenmY#eg zdW<6(#qYW1$1nW@NxoE)8B>#=;A>-b?7S(RkuMq>`@?g+eSF*@nkm|gUrlPNSrR$P zOWssx+r-@ZRd0Xn-6NZ%$}6&h%Nym2b78Dj$PEweW=7?au8bR+^bPl5waLU_#C1{tP?|uuI6bgc?2)&|-_l2( z>|Afj@liHIif?g>YdJGjLD2^H3yFG86_N3q*9-lt`vc97j1jmKI|bPBM~iw zAU-kK#VWdA%k*~sq@{H@429&&q~*cn(h{+5``J&4V$EhhVN4Ix9cJXnD2YR&5(D{6 zkz8FbV`$wH2r+!p=PQBMvjn({}zp%Q5nKvgByByQ);D&~2q}2#- zH@HTV`+^<#X>?r%O(1Kko9eaM ziEw_~rZfWY?x3Q$3_E?5@QZTVPG!yl(z3b`etQU`G;v0%;dxN3A#3KboJ`CvGxKU^ z^S>E1qSsGpRhr2UKTRJD?NksrH`Nso!Isieild%jTB z&3c7XrGc>4_(P5tbvx9QV;&(gR7wf|9x!4ANI#6$!l;tH?Q(YsH--vS_gib!g@Slb ziGp8G1xZQV@$wQ~k>yGyr!co)r+AC;2c`QCw>X<4XC8p_4e4MsL7FP z6{N`#&y|o4y(qHD#!tW_&1hb0*HUg%$pH&3l<@|PGE(Dt&_OVFt+@dM%DZyL|85CU z5Mn0h6>$xCX+W=ne%;zl9>Gqy7EeV(7f_1c zVcO-XNM8z1?t2#Bh)f0feH9;!^m_94+N2rjXnik1G9-UOg%>;}X`h`5xdX-tvw1sE z4_!?`U+I^!>2Y=$_5OFEO~U?=hrzNKK>@EtMsX8drx3I6{zDck@|! zSKQ5NVnmapy}G)H$S@u1gv1%lwjpyxzUd91ii4Q`{P>DvFHq~KD-tU7fkJf>-}wq! 
zuIvN1E$;zZK9;oz8CG+bS6FH8GNB$PD)`Rc7EHwpQraKjq3m=sf6t4RV)6hEIgy$) zVXr<@rQSE34#iT~?zeOKCD9-fpN!X4Q^=dD6@-!P@XiprSg5>J@Ey7uuc6ZXphnIB zia6D`xVO_?Z^!#@l1<2&qPdE4)f5aP&$(o+($ni2e(O>8x+EF;GW>EKt6ud>;8Ba8 z-VTmZLf+?$%r5U*_0%sVTLc-y?X7H$i3U=q@W@TsGdGT?L^Gs?17_PDjHRw;rZ&#- z^5c8*FU(=m=)2Y%3jS_ZY&kKb*4rlZZ!S3?TK~O_uJynimWYB6&Uxv}cXfm^UW}qE z+nB_3-ZQ|_olu*gNUJh>t7fo(X>4p9@nq0enH=(4si{pG8upy7VJ(Ls@IvRjztcf3 z!haz2@9eggUWU!ElE}y98v4wU#ktGrOn`lvWDKdWbFE6bbMAN{kVq%|dEC}g5PVDKWe2SpH zzg)z8`T|lM>&pE$;9iIwO`-QqC&P^N4rQAE8gxjTJiS?)!n=HF&Il19PUhGaAP6fM zw!}WCn`D(G$3p%`AUoi}F#c$7{ia#IHI?yad+;*VVdudum1nS=@!k_15q@tKfSJsX zcQ1$$dPPkPbxwvJ_58I9Udgge*2o=NuX!w7e}$^*>=W&srUe(+GifSrf&Wpsfi`Jj z!Vfw(NvH6)l0qv-@|LX9vj|<;S!EU=2{8+znHbMGq3C7K$tU^M8+JCE9~9ZE1M*8P zlKW;2Gnk#(7|SPKn!|?NO}>SVi69O&KCU&0*MEHZQHB1)SAc-K;GDqen&9Ll}!*yzgq1J9TRkWawPgn&xVb9Dfo0*c*40K3A2;T72w75Of!Tp{%ooES=L)jJA zK6y6D^vXwF7&li=N?C;5YptBHUJ0#13DUG`AIV$FFL9UWH#AP!w=#iqih_>rO-`b~ zKI*>8qsw0AG?uPh6sEpVc&K^HY4}5eRC9sS&Sly;VJ(%5A2ReTPV{?6gnx1taH0S3 z8Ng8$AJ;F!zf)VE`g=TK@W+XZa6LWJr<~Uy(pgcrv9o^acHVJ;`ioV1ymcQBh5z>&Q)t!hj-=1t2HG04IR3 z66owbh#AO_C#Z_b`Z}LVg%JZ1&t*CIMTTWQMM+7#w6kF(5mB5uAWZ|uR{ z@7B-iZPXk5q>^Z708y+-5T5N1UkB)rWBy6YdAC%C7W_ID{}=L-DBVEGX$m=DWCK+i zdw+m~HBt9*c==@jq@Pes0`sSl>lge+6l1z0QUs#6Q6DWLuHC#~xVjrjILQ#4~%w zvEyfri(`_50W!Y;F4sxL$-dR}2VMgyBw#6i+q9pJB~cgY({eVm!`T`n<`p!Vw?PGzUAH@6L&Lc_PU5BQ18g&w{_D(u#m|D5 z=$QGKfL`Le8gTv&h-$%Ob94+9x1nOKUYE}QBnT<8<*xf-P1WGl;;(Y%X9NYeF_bV= zLL?s%vZDy(0RZ+ul8YPedN=$(CTSr^=z@*KUDfaUJ!FkbAZ=RS2~jz1^IRGNN$4|J zoeUwsKfW7KW>qaGS^Zr{fb8XSJnoFoX^4|WFVBHdv$@sz@%U>=@LIpIvidVgk`6T0 z6v{&0abJ*h?EY~TlotTi{Q+fEW1fn$%TG){4699Cq#JFxOjTS(3k!Y+c|NXXkn}`b zTO`~C?muhMi{*ic*?7CN*D^JQ$XKU3?C?1g1Id-o%&A|bR%fO79|n$|JbF;ceL~}Y z2bWC5Zl`Yl%pqjrYW><>B{VYuXmPfOG5m;?=Z-fls;5~ySuZp6V(BMu&9-ONTcmBLVn0*3V?(V^3WckS zWETdZmG}0vc^4sP91mBO9fzYKRR6~?IXRGTlDy(7dbBP@qQde+?2yh(OGZh1ezQFT z9P#6z@)~Z=Jyc!|at6C#n#o|qjbPg5_Y!p`;CeghzRVSOJD%Ue(KX!HbQy$vjzm-W z@j^_e|hh7?$&#?URc+r6|FDo6AM_}vJev382W80;_e@^59mJ%B->EIeFxlO2_RSm{} 
zPRsya#zf^U98yUw?fzC(T=99n?=&51Ik?J|4YF7C~b^3f<226*P*gjR^Wg9auub3SFPpHAdc|>8Q&rBTYSnRvh3V2bUHl{N9mijAKyD_D8NyL+7N=-#&Ju;U1h)YU*)LwlcbK8nxw08M%8#5Y;< zMD~UNKM7ul9h|A#i_6A4K~2NTVY`LG7d#ssXMr$0vEeiZr55-*uJYY9?OmmQUn%OshwXWe_|UDqTkg*O z<-6YG2zNw68n6>>ceroIU|#K+S#iyj@P)FCBX&^b6ZEt%kQDy%OWD3g`dc#vKKKoN z-a9^v6hfru+ST0YOR}#mOfB$BsjinAt#ViE4R+9&_wM=0%v0$HWDBY-9C6_~R%yu# zg|$J}&rKLzzNMHGAX$1-hZrfQtWl~#_42wR{5YaP11N?1^o8$_Ae*4@QD^x?sBXcp zeJ|@u8#mHR&`_t|KOumdVC9+sn{@G;dnSzQx-&MskJ9tH_#jdsAgQoTp7>2#NI$v7f@5oc+_`?G#S%pasWg(X^~t21_)ex6&pdWQcMYKd}xSoT0a`r$_T zEPk8-4^2eTic=tG^&85RrJ1by$FH<45fNN%RLfg!}|$ z?`+R^2^-Qf-KRGse&O3`N=OF>2e5TAL37o}nP4GSP8d2zi|ATkC&- zPA#_TzEO?jjAYunm-rhrUiwmjz&cs%@A<5v1UaFeWZ@4Ofh^51aC1&HLDc4=NDAdj zNO8p%xmYqxUz;>GrW5%pDRu!~&CR_&&M&n(T;ps0y}B=VSVg*Wv>*=Wt0+9y97-_^ zmfNT7NI8cp@lQX=2<<-k z46q>)wKe*MHnFAm)4ubOd1;N4%ce1#o9MEf{@MYofR+-C5JemJe=m(yPaR=nf9(`N()wX(i=5WNP$;tS4^V@SJ z9Z)ZtU!Zw{{kjd%PR?AX(1XLEDC8EgXG|Q3s+{dv6wSL&_E$c=>_q?5|4t{+#+qNN zInwq{Xw}&apVXAwy>aR7>BwR`V`_+ z74&9=QO9;ZpB?E<@m5U8L$-UDtobh|54NFA^b0iaj%nXPrWoK-u^nF)tS=SxRYE`& z%JNmdJWBIrYk^q%5>XH^5C|8yqNd${46p?Q}?EUagb=K$9NNFO*?(GaV# zkpFbCNfTfih3~eMb)`GkSaa^?w;Yj}iUk3filuF)zH)lKdzm<0mr{JPRr5huQSN7g zOP9jZJ)p({KofHQx%l}Gluc7xhX3teVT+F_MyOCh>fRi;qN(^3**1W^Yx6-k55ON8 z))NdHbD%+Za1O{vyq8U*+|oZ0XqPw@d*?j{DqaFl-xC~n=C9X9QpiSgKD3wa^oeZJ zcXjB7rgymoxZp3KdzSo)XOg+~B< z{}Whz-W*aq`N}~bUqgJJUJdBX6`yc(fO6LJ05Q+t)u50KmYh%t7pM?|4A%tWrt*)& z0tn!QcexZo|FfX_TK&4L47_EIkd%;tF(0A;fT>@D-Wcz{mI46%&bck`cYyxWKK2P+ zfN6aHJ`R1_t3F{7pv)y7PzGQjpv5`0@QI)2TbICY><3H=!$R%|WiG%rV37~i!+~w) z82Up}0lq2k@?Ln%YuFQHYmiYOe}{tRZ=ntJqadwmBn)Nljr+U7e4*`w_uPFxrZ)f( z0>VIC38wg745oN8!az$=$k~j+hyQURpVgpbJ@0a>UWd$l#`i6V%x_NhOh z=#MEkKlo=|+4y&$^`&mEX+*DU3#c7}KPsR_7<~M8^cV|Xl;imEu zp2?>_+tWFFBS#XKxqQa`NwL!8NDrglj#()f6y288bV}~Bth#5}In$@Q$Nx>NIVKEt zy%xZ35V#LmrRKfDh>P)3-G=&`l6$-h2*h9!O7d?W!FW86C@2)U$Yct~zl)+?Xsgd{ zMT=as32~(ge7}lYHIs#gf~-jP&dyQJ$>y3`no}U{Cr;P%@kR!pG-j0D7uCRLb!XMX zIjP`mfJt2X%ggGz`f=Jvd+HwCIvF(cdncm#ciCq&^u*W<)6AzV3?)ZK4(zJ8%qLDp 
z=ey4M7fzF2SvuhejSWF19J^~U;hIQIf*qG&_A5WxyyK^QeM755v)f?Nh-1D2QuDH2 zPfM!6uOuiilzWLpV)49Ut%}GzLUBE{UAyzLAifc+8(vnkF%pGh0ohFkNT}IUqulNb zI)gLI-0En2l15#2P2`x-Hea4jUpYCX|M)G!qUe+C75rwN%Y{7LO(kTlhi|aMhJEMLk2FW_V+(<{HWV&p5DUbpW>-UJvX+vW9y#gvkf*@`tVlGz`1F^?6Q+scuuvY zJ=MKA)3#C4Mat5V;fJ5M`ga#eaeQ;=vvkiuT;~Iv{tUlth7V^P6BgrbOT2OYGuPQV z9E2kWLrY-~?v*?=8H?^a$|?bSt0P`gP0-=}3We65yhrrrMvUu_TY-xAo2#L}9rBGI zUi!89ze~e1mAQtNyF|MP{HCK)>cHayU;dfo(RIy5Da|$K^Gbde=(5Jjy>Wy;5%%kc zW>6e2S<`xL3+~p!I<_lIH3&!Hc9GVMi(C^@Lm7u=vl+vq|E`|~E>So=P*9#Jl{WVj zpug^Q45F=aE1?A;LLb)mRdc%cjRj|T`kAw$x&*ywIDbki%^l5;oPNsAAjhG_Ciz?v z$&%6=Ss@B5VcjcMUYJ5`oCqK9zPE<%w5o8`@KPo76~fys;RD~CTJw-+AE_|bj{B`x zvOx5rC{=R9;dl?4G7+5@eLb*KUH4XcE3^B=gJP5un2y0ZLR;A0)<8Y9pGzlcj2xjZbMUs@M^ z&JY8)M3IX7y3a`NGCD_y(?JJBR3BVtL1lNouitX9X?=cSI%*Mlk1S(^uaR3rkdI&; zl|ClKW<;N-VF)UH{S?MAloFpPIm0+6Y5Ive)>8MIXSS;N>u&9D%ykSy8`kt3eCEyH zb(WDEek};THoH&pxeisMNs-746kiWgAUiA*xCPA|WP`ICN1bWzSWB8!3puEiu}`cj z$up1X7$1|iB@S5xX{(|y6p3g|pj<1VrJ7QD(W+>sAeVooGeyv2R#1#@1HCEyE1rH= ziX}a0>uVWHtGO^YvDNITRVF;7fvwp?GGd}?ekv@i<(|CbbrToKzqbqm5$(9pp|&$= zM0J9(z;Y``@l>gYzySAvw#I63sgkp_$D=lPJ@GC-4WCjmY+XW#<#XxYUBugxpMa~Gx z3ucUO3)L@^XEjPf$4KpM4p{fR#u$65#ndpVxvtpi9&t1;);`i4>mC%{5Vf;&3Yp5E zEK)f)} z-Qv#V#wepMMqPy4fV^*Z4bKbPA8fW?Tg&7e*(%y_igF+=B5hJWT#Y-vjF*XBj*v03 zP@Ce^cs*vJ3J4fXo}aEFS=cvi`s9|NI@NLFDyo5x8Z{-fxo!%?M`bX*BT&(9S9R>_ zm@|nC^eB}z9TVuwH@Q779w=(wBZOHwoM89;WV+<#`lkoOp`J*+E}1uNP)SJ_SN)&r z)VBGDKKv@{3BX))t?@O0gz*_6R!|Ezhd3f&lQUDLVTkjLRb9bgzdb22<~zlTp^+NU zK5rq8G&!>D1n8yV*8eiAK?1GvW1|ObEUtqdi3S~oi<Kv=sDS&_5 z=;qd<09Az*(`!}f>V|m)O@8`nePl&=q@}()gEPP%ZibFdsIf}y$IiAiDM8> zUe_uBrKT!g4^;6WF?I(`ZDwsjUE{T^1buDSd@X$5LVn)X90i?HDpNQlQ_cFp!rzFd z9a=&OP{>V^K!+(^vqS$3TTpRS$AcrIpmE@YOvpsK@iWXnfJrd7wWs693u zGBn*b-s1go&xmxjl%CLkpSapBtH^)J$c^PQV|o${N~x3{z3Yjop8Wp)AD~H`^2|!m zsg`4a18~IO+Z`(-PF$_*FGei#VRZzU*?6KNkTLAm2z5if!4>mf(blTDI|g0?_5(wS zLE5)F?E+zPOX_Sn`LII61on&3u$HdSrw2s0(J`3?rAd^)Hy)(2;kLi>3KI{{okiX) zr(8?Bc&&FUcGf!=8=fA{uaN@mqf`32 
z!V3|yO{20|ezXN*^HZQAwh(rfcI}ZiWfD=$@bqiBivbx!4h){gtSM0OSyozF*%M$T zlT*UH7B%cr#27N38t%c7XI(+jZ!P!XUU94tM*2obz|t1NlNtH?gQ2Lu$Z?n=GCojw z@f7v0_n1X!*+h|JD(b#giY(22Cs#;R`MExi&5}Gm#-%D*AqJ1%#u`UybqfG2;6Xe6 zg}drCmSM3K$G5wm7#leyDpuc9?u{fTkNQQ0sa93U<3}*et&|S4wllCZt)MQwFEUf> zMD_Gxiao|Ebn{TG^{|L<)Qzc3Es0ZMGT7hppLuJf)bvDH1tUW9YVJofRj|d(8g`82 zDJZ#Jpo`TlgYn0Z>KsDbpi6>qX~zTf1^ypqi%uZT1`7^HNXmPM1zXc|MV&L(gtvMo zMy84eQxVLoS-wKjebZXVy`l8 ziAw5_1Z;&jMQ1+)RV+Nm$`cb+&e;absnv8|n7B{c)b2ZTP`Gr6RC7E+mv7-#FOa)S zTACQRG)pwxG5R2muf)=KCS(~ZQs7j05&G8n`oGADBF{?;`!+aQ4xAht>6!{KSU~_y zn$md43*d($pJ9VGqYtmcZutQT^UyC5bv+(Bb;etk-)eTX<7AO73N~cip0Yv*>ZZ3n zYIRGzNB4K4q+2B@p6pFrgzxf*6%C6xUP(F;9u-qTT)P2}cxq$RU z9yqshAugAS;Tz=&PB&sVcg*yScVnAQdSP|06HIcQQ=7e_j+%Gg8#p*&A?PrRL#kw% zQ1He2eWj-vh0dLr13@@!=JNIwcG<*FB+{RJBqIQOt_CbbXTZTIEo`mg0`!XknLdi# z5HtBAol&grSUxH3%7Q0E=sv5VEw1kZ@870wx%Zx36q%IF9*@^$yKXY`K$)W0c4Y4c z%)-=hc_%I>VxeKTXn3mOPjah)q-g;#DyeCjo_R~hLHby{6)4?HsH2Ejgb%Rf^*ldW z={*%T6y8>WP4G*VL#Z8;e9OxA$b80Ug-2LTl5$&pIf6snsmKx^)bPbOvDw`re-E9u z5dAg(YG}VqvbSja0{Bg=%sz@G+8w1kAWt-`IhteZE`4Ae{Zf5*P6&V87xI^8*%)Cc zcN3+FOj1l)@1kDUEy&HcznPoJ?<3KV+H_j*USe@7gSz70ngX+LrEcj3yDbBX&p&V` zh6ra$6M&hKLHPW1&NQpno?BB~UoI{(^o`6YDJ_1G&Jj&olv>`dp)nI1TQW}+nF#`% zQ|0~Pyi76Lmm`wpKV!e}hGP_O*AP0*ILaims&2;XfUQ%b=6A&3tb`TO)GMyDdgTao zhF%W3X`N`>L3qxBsm}USnZdMEh*UEwB!WF{DCCJr6jvlx(c4R~!(4HNS(Qc>bzv^N zx>IwrQld0|%}h7_XYEI-`f==RzGoEYezALTJ#;*(b-#FP<|=IFvUXouhWS-~xaeX4 zCb7-_a>TjG-pbx073`Ir%Bur5(*hSvQM`v;MoY`dor5dtSiEvpc2k5O(jSw3U9wrc zjuMD%@5VsC{~_;GoU_WxhFBHf?EWF$K#0}CmT6hkS;TC2QxxiZc9<#hhK~z#s=Mb) zHrs8gtj+y8HshWphkW8JP9kPzZ`b1pxt> z5=dm82`UgIN)!wzfiOe}gAkH1h4Jh_TW$Y)&$;)UdwJx+$ zVvCNOz~{~0XKnl-kRATQ-*wFE&>#o|=6mk+NsAzd@gAK(=}!+z`%}$m=?e1Zs>Yw( zOjQwR)N930zSjujMSb>L(|3n#S*&|IU`dIQMz7!Q69;Ks$d@qqG0Zgeg44wLXZ0&V zbw5VF>YqNoC_c~e7$%peuP`H)L!{R}d>SRM%(3-W1Q#ytG64TlNDlfP)Ub6Bi1+S0 z$`HuS&EJ7|AUh8_jhs-?-wlD(@AXj-gFr5?U;VK0+cDdVz{mqXIDwll{L;K00{Q)B z90>8}b@KmbH(Z?OvM797njR9F()(qQwfNDw-t{}13PsI(Ve?ZXf~AF78i!dMl81xp 
z>4q)MPYX!o<=_<29A9K$R^9Gt6b9>AoVcQ;G1El!Nn6($8y8OC&dtq@4a`hVPWm2M zne2Z>azxf)&?!4aH;(Q^%*CeH{g$;%UmWe#Smd*5u=cQ(<)v#iGq1N4Cx$7;)-p@S zkM`}}UgRy||9$&rd(`*4e)zG+;N6^8Z(XhUty3KwhsB=WN|IGWp~&Uk`nv;joVES$ zvogZ8^*hksU)KG8FD|@CV6#nUo1Z9RmpTsiplWA2?#S{YpVoP38FwnY$rZa?aX}B8 zsE<+^pq#M{oK}u)az&>yX_LdO{n%$_?==S~rkSpMUw$BZZ)NP#@WraQQi*}7!%1D4 z$3`<7E5<$3LftgCb_RuQ+$&p*iQ)4~)5_+JUR~{NYQyX!hkQBoq!Ii1nuUiy61h+) z$;iWI_m*Nfodwv0-pDSZcFdw@THeyc(YJOgb|z@txNV^kIgp)tXny!Xz_l(b`so9T3wNu2L#8n+~67pPG!F+>XBu#PcM`~|u zZC~8c$-ai?9mBcI;|q?7o|Gbbm?f8}ct$&fRC*@DVv|%guL%mtz9IB5sfnfSIq>SL zzLDt^-!FM-EX@)JPNtj9M7+D+jIY+T`Pk$U0yewu-OnoK)DDeO)3NB*_s&v7tV3ld zWfFQ66yGN2Y*396LVY?3w%u+jtU>bx!T0&(>tku31B&RA+PGIq=g04&wqXjX>H5Rw}KMY1k5PJ7*rD6}-y*g@~(={6TCjDub zNuGYu!h9D_hvYYgg^wSl#joW2povO@Q|?AK&T4ya-BlzWxj|lR!RMgnjDF6_2B&GH>G}J-8*69u%qR$QvrXjKr8Ohw0aORLQM0mOtt=R+_`iC9C&z$Ubf! z)0+9{NA{QwDMh3zc#>T!6dAU-2sl<5t_KbI@;W!AsFb!Vzs3`GJffh$qtJXIzsuyk zuD^@rAkN~Jjxbe-Em$+{rjBzRi`-K(dYY~vFh-Szi*?;EHlE4LI9pOcI#rTaBlGsC zZQy5DKSr$j=(bO`b>%SWQk~g*PpUBQC$(ECv8qJd1P1~MPU(KDF%4WA3{2kOAj3i9 z>pHyQWuchen)cR5%YyQTHNro|8j4HryPR1!Qw5l_ zVEF2~Y)EEcONTC%xAOT(HE<|zfDd`Dpud}V8}>l+@KAitGR|7za-nC$R+4i@g_0petJQ6Z?Tu_ya$iTxPO2S46;3XMNg*cdf_Z2 z8`u0o?q7d@1-zvQwEk>Jz8Xv27FsTIn%io@=pC= zVu6Z$n6JmT9NCx`pJ$qj4|3#Ev^7tRp;8{|6b!xXclU3b;emi)iVl$-1!+sQXTa8C0)9vIpbE1+dH^r9*A5udltW}di-$?jj=CPfjG8C zdhxS`-By$tN**OuH#quLEpEzotQD&{>JC%jYgdM;-RLMh?v`V*@Mh_mqap{l2-{5i zJjJAWPZM7w9ly@Km2kyKG(E9xL{?W^~JgNq3BE#JX&*aN=d0Wd}xK$ zNjv-M!vRFYfPn+q3VOoT`iY5$rB=xnctTER;K=QViL%fs_yXVW_`b~L?V#oxOQL(7 z+>#>*J&()xm#uslG-8^HCn<_}JeoiM3O|T@0?O%<57f2G4SKKZsVlxNaG_Wyw)T=% zR+sa<4q1f0jUF;-j~%+!AW(b5+4QJu7i066@VHG+&MI&*w_Xk&vUWvUpz?R=Px$Eh zd|6j6ymVKrCty-GW>h?tQ{|Er^0H8*8+1PJA65N3U(f_|R~4|@&wS_FCPy=K?V7&1 z_=+Ytc+r)dyJ&wA^WCBY=#D~j{}<}uzm^jRon*yDR%OYF|E)&p_HUjS@Y(!o6sM(4 zBP1wjPTiio0Rnjwh(ZZ%za8)G5Dm{+3b%aZw7W!-(W#!l% zY0|D5U&xsLUZ5RMbYq%%hBxO&%;I*4;G17i+d+VDo*OmKkCDg@`mKpDmMj7ftC_kY=4Ce`i+Vf-@aRhz-%+$H!=Lh+ 
zc76o8arCju(uZHU!3TgYw#Th3aaOb|Ez6CX+xRwu%0qfJ#A-XtGZ9FAK%TyZ6@jte zX+%6>!v4@jfog)@j~{+sqJsQ;>)^N3>7#)m2dGFXJP)C~Nd;@ahx`EKdx-)2!uo8NCG5I!#D;O@1C(wn#5ryl7?pVds0i{(v zsjr31u8No=iLC?o;a9^@aT+l)T+*?roSM{vKq}m4>8r9> z^=96u-;&!0p3#}L1_Vxw-$Ss@!Tbgm19M*TDq&Y^>TH;-DKJFCc-6WmR5CSMKfx_} zMJ{6y%(|ymLVQs7q3`;dE^5h5Cs#QJ+b)q!O_3^vebbn${V#_T>-^n%&kh3SNkwu5v=U(2q;e+J}Qg*bp zlhf_dwpdpTNdEqL@78bVs{{wW;S>DE|EL~Z=b~LtclrCXTTo@8$LCJlCO|z&`nyqJ z*`)}#N=bJQX@untfxIIK zGgB?lvJo{FwIq%r_cd`3KLq?>ts>deLkS0nFRn3do6Y=P&Ib-{Jp_^n|C2sxtz4@U z4I=bu9osvqfImim_(9$FY;58AfS;KVs5MsgsLC55GoQj_ra6b5gdkrITcv=g4OpJ? zA+pI9h@BV^_xUFu?6RfiT zyz5SWnqHU*dJ;jkxVCNEHgeMl3eYHN&2wGV52rYwerejhYvWB_x+-q7{9POGue1v5 z##Mch4E(M-lXM(3AhHE|Dv9&^NDewlW6Ph+7@zTRTsGG%G+8Rv1RO9u0Wvetm*oS} zuCeAg);>se{`Ysi`zr4cYEC0IN{`5Q{Ry$y14d94OFm)he&z-^W*IN)02q)j%;H3b{1_<3uz(5Y zqI}Vd!9VCO3*>aXp~wx8n+~gBHR>nS;ZLq}&n~F8kQ!R8w0|(ZQGYkIeI3NRdNt$1 z8uWCPn>ld-8ZyF#sHy#R1`S#|q+K*3)K ztFB#yQH)UFXv^&c;m(HzlN*eps-p5ugYXx^?Z2%{KFAP~mwbB)Lu;)bY+$bUYK|8YSb` zL%i+Q1}#)f_V?cscJA)GYlTw30~xk7z2iBP9JaCu#7$VpB-kq9kj!T=R&TPLYmaQw z#C38&plWLw4YjX;L-j&2DMdN&p6Pws=RgciN{~oQ{E9ayhpid!21z$B&Y2DSez#Mk z@GE0_;LT|c-Q0S5kNqVxkvp69}RgeYBCi5J;|tyrTsnZ~iHPeh8?@NUC0Z zmevJP?ZYl3tW!GijmKYXfT$LHV@nH^78_9uu4+6-ZR8VRf@kuW}4!U`COu7#2 zW6g^~Zc=mgU*=kxPoG><5z@WWg_jNb-Q8s%km9cqgIOvNPe`SYL?vFvg2wG!QDeWP)*}&(Y7!RHuMC}d`afmCLn_2&l zMZ+u{q#9oOTL=*#^^FL44_Lelp%an?pfSua&BA@>_jJHyH>A({D>Z~2)iF(^w_Lv? 
zZ(Fw34ng@++r#0X@Gb7U(*DX2+d(+Od78RJZPM?URRHX zOV$&?OnLF=bQyoFj4iH)26vib!+%YE--faoVV#E53oQOIPQ~fm7E8sR#RzCUN+1W?cdC7oenwKZ2A(uet{Y2g$iw}fS0?8Rki+5Ip5 zEZ4HhbSBLmG5h!DN{ZQt!UamwilWvzy2)8M$t8QM{kryK^{tTZ;(z!k9BVCl;yzF{ zRvM>+kBvRVi7h#!OY#+eakcS53)CU`PygXOQJx@bkrSaz&`;oaH;Rn(h|3X0ol|wZr_J(W^=|)&9IPS@`dQ>)T;vElPc=YcY-D64 zs~spG&phE^y6;3KfcW3%=n);KL*2uw^ut;8{&@4W*XwUS`Uh*Ot(%hyL~Sx0=OyoE zS33dZ|1Gw+k;*IRRO=Hr8hyz8OFu!ntNs?2;yeqdXF{|?9T&=f#^0UK1v;Zg6G#>1 zzathUR1gKA8&``Ol09#ex^mPN;Ov7pT$)ZVTU#-T+)&^&{jz zbMg{UT3ym0XNX*rNkqZ3uH|oLuJGb&oJ^qpCiDYtbKLgRXHZ``FT4o10PB~TGtKM0 zCGk5kR!VtgCag{cLZOfgdi4K%6x=fW z4pB8vXbaT&C?TK#oW3&_3eW$L0e|H|*sdPI<jkRL%S5Ae7 zh29R2J^*q-arG7elrU(|(*f(H*Hpw`&;Y}Z7T^h0RnJ!a5Z(en0lB^@eJQ2>k8&Y| z;Tqk*+oF683+Qi(1buz0L<^uWE&-8cAVj*ndfPV}M!Z4l*q*+!zsF+}@VyM&60y-R zz!k7ZkpN1W4Y(a)CbR)`kpvttlPNs6c!iA+E`jPc62O_;BhoE9vwwF+XpNo)2embS zHA7JUfOD%xuD!C??*D9@na4y&hVFo^Aw&0`t za#ah~3<+I(`VYJrh)Cs4ituNB)8J5XUbHhK*zcQ)8YSe@CBQlW`#9&%$TF)+K*@dd zhl~HrHx=|&i@Aij>8YuyHrLf-*dFAFW|*pNs<-g0wePnHO(zeX_(#8jhFV|Q6VS}- zJU*N=hm@RG2VW?I0(Y8r`H*$x+&owrdv8WTK}mqj{CAFES2tNpiz6c=!vw{8^9xY} z~@w3j%MD~vIbX!Aa!pEj=RACrT!|IEOX?!m%IECF_a;bB|&o9|mtAX7NcK$l$ z!^T=aS*mEwhUjvJSo=tfGvjp|^*dsxnuf)zT3nvEJoc%yHPwa~j`hSdH!7AE1fOc~ zm`UzbDD`CGx0Nv;f^(6Wj*xS1=GyObWzl+W*1CJo9VeAjDVC+!q%+vDks{B<8@eN& z4OY&x4~KAAf@7hnS_*qw zn_>oSoOZzDKA1cjr_U0ukI(vW`@?30@v3aD20Vz{IY zU32gHnAp|P4GorE_<>wj5Qr3&cUK55jt!h2HtwihcT_gW!K$3;zMY|!q1){B(T#~G zV~XCKQCL6LggmdC=gMG>-Pa~Hik%$lw`26iqi7eHj5^_wx!=!{2@XK($PvfsNuOe&R z`ZFv=DNg)J;Ij7{>5Sl0=hVHXBmAL5q2|cnI>0%9h?!PtZFVQRw~2cxwX>`YW>vZJGx{cr`^iZeT{cNkoYZb;vUC9kt);O*NicP&sxoOcJy+G2GBlJ(utkHS~l|AgwWoLCK zGQ0o9>HIjMy>f^e%fh?zef+-^Iq!2C{|TxVy3;e|tQuDPW`^67gaSuJ*~@{W!L!e& zBbpUbTRM_qX`gF*xEuQ%ZE3qz?64K(82$U*$7ZdVscsI0`Tes2~Bx>&18B0}Rp0#^dd}F`TRdR~`FOIx^#0u+8J9K{WqoXfb+; z{aH6^%o}++7$?9EdEg%9Bx0B`YNoTH?-!GZW~B>Hu0R{E{usS<#S@jUvEuTH$yPL$ zgpW5<<_gDCW8zBVM{j;Ox!qLxzUg&(vX7bx=IO$$c}I2RRpxf)Q1^tJL+yKBa*5_) zSEe@NLk#(0!4hJe5|nX&!kRnx5r&>OM{>@2++%o+CVsT4QCetQ 
zCR>b{STRprveaxXtxZ50(o6S8w?8Sm0aakOy4l9GtNFGRIh!|Y%81{|HzEX9Ik*R9 zX}k>GtAP;$EtkHB?Z`x{$SDof z5zdc|HgSx{n!HhVPrh1%hr= zjoHE&J}mZ&ZS1xJgJE}Vq52e2_?Hg8$w?%2=;p zcdv`Jed?gV-se-InMdY%4>b=fbz<^ys5u(*tUvQ=wzQs+?gW3c4s@0{+K|5-MLBhx z)FBbcW)Y;1w(Xz|g{fmH+C>giO?>(V_d zHguFCsa%T!ZJ*d%qEo3N_-L7jMN5~*s!*w|)VG}WnnnCdf!i!NKnHE`fABuie{tgI z`4VhKr}fr`%$ju$QE*UlS9UN4gOTMg@)p#~_SR#7$UWXqO@z^en{Atwg{NV%ReB=% z9}kq8zcP>?>+kO$L_yra(N@J~QAN=sqCay@-ZOVCSD0^l3J&+Yr!nPcp?eiw1~0O3 zJ6M#!`5qF>+gNFN4=u~JfjaA-9cLA;Wy%p84gb0Kbrcj`#X}I^Ye0nOs(qt^Onsk= z_n@@K$sMmQZLzxl@D!1G}!j#ax@@WWt$g}ib^zA{OH+`;?KwtoCa;6$*f)M3m) z?-Ty#Bx)Mu-IZh>=lK0%UU82K-L$`5+crWGyf}g8dzb2%c7|&~I*tE0NVLz9xp{is zesP8`3_72Zk zw4bm>e?srw=T@AE49~|Xn5db`{j0AGtT60L_he`6u>b?5c5{A&x1Tr&m(M#YYGdm~ zKVY2G519}F3jlhT=ubN+c~c98@Rc>MM8u^>OxphvH8f!G?4yR8GzT zxefv#X&qoua5Uj-aoBM8N$`I*3cO`k7p%3(RmC)`w>$!|Z)w}jc~;v8gZvEtIphOvQj{g=TwlG zilkYK!lhZc4*zbp$5iMWyelrit+{PEdf~<3l5H)L^)vZ+HLLIANNm=%nDq}ktZGHN zW33%b=#oBqK^SHPo<~WQ1P1pI2 zL(?Pd6TiOPzsRLPU$vCPbPWaFw$hu+Nbl#aAHZHg*Q6YE@<24iX0d+o?6pfMja-?Z z)O*!}$7XbPd2yHeu+Uy)POj?QzMgqCRWgj=nAlNz5-W{^y)R5Fr!$~ovq9vU<=3o& zQNIEu64d<%r~X7m>&6z$gS{!;zIWR=QrE$jgmJRz<@=1!yd#(a>MH|Mtv&ok?-uy; zs7E8~4~w3nZ_@I-s_syxpRmw^UwQ(w4NjTOgk2$Ogk^K-K0nx&CEjNcu<)36jvG=& z&vwNQ{zB0*VZpj)tB=+P1#Zw3hNw}{`(+yb`RN*Kvdlr?(b?MjD06HeUP0 znF$}Fj^AZv*&@}q{DnDhQ+fuM`!a8*ag8;bn>|uyl9aLYS@t~z;?=Hwf;)+wSu=vu znx}+;B9OVY+}r78l+b3Dio?%Gw$6sOaYpT6r#waykfmsMWO9?M?xUQsGMy+YjWHxP zpbWAFzvuD6bQaY;G#A1=a1Tow;T;S9eTWcy>&?8~nY{=Ds zu>+TdM+~tPTgKXXYB5H>cYU>LQni`*8BgQr1;gf-uvH3tmVlK>#B{!H92C3ffe2C` z?>Kj~?VeeCR6=op{QKamTH3hb*$Sef+7ol3>y=m=W?7ACSy@iibDJ+lFjKvN`|o%I z7`m>77H^XQwY^3b`K;uP+le7DE9LTiUdcH(l+vsy>{K@4rN_WuZ@&0D9vUK>OPvVzS~{qb|Txy2kj$pbN^Co9-G&D z2^hDl!Opuf)8_Y<%X`H)V3Fg|8W;4+XUYSBzv!@rReuLRj{Ib*Gyx@aUDG_kTa-&E zKS~_?{c5%?DIulTi=r$x3*Nm&3Q3o-Gt=~Zc6=HvuqPbeJ`|5Ld0OY~WQ(*AY*61uHT_$NBg4Jj=_Ct7fW@YALRu1uf{bAAzTu@;}$LyTh{` z3u(1=QDv|)y$Qqe7gN=YeAk*@U+7>U`f=tsHX4+Ju8i?ZdT>TsuNL!+S20*~&K(pd 
z4AH7W@k6**1M>|FJ=Au0SyQwonb?t>BQb5IoodK_>=yd|2v3t$fBLmAZtm2Ps{P&!lR@&G7x1cM!oa$~8)M7f=f|8n#Cgrt^ zp!Ka8c2(@W+Bu6h$7mRn>SXmgXc`E@;Y$ z`N-1CT!TEz8wB1i1bJ*(ZT>nqsGXICFyG=gT!RN1R ZsMq~6;dy*dEa>Wxa|R}-iTams{|D5%54QjS literal 0 HcmV?d00001 diff --git a/img/scan_comp1.PNG b/img/scan_comp1.PNG new file mode 100644 index 0000000000000000000000000000000000000000..90fc4089621ebe046aad07f947fd6335732aa1c8 GIT binary patch literal 15807 zcmd6Od03KJ|9;DiPE+bEy;i1DQ|Zj4*=CuFW;UrgD2C&$pG@*er)Zzd z`EkGKe~FoAiU7aQ-rWNJ7aUK|n(xC|XGxpyleyyAe{hNTU%2yqG@M?(hjj3x}xFh?V% zN(%SBcqBcmlWaj7s*)m}6zbggFv7@gmOMeS%g4U%bZ%9gqjv?C*_xXbIUpqefgCMy~`A#x=8;lgahyS}VCC6o-SZpj{P@93jP zE!h7=l6!qauyOIa_$Zgb&-c_ahnZPXq9W+m=74<^7~VfY7hG&?or?)4;$lL0E2Zh` zwwhjiLAg{eJtR`&tQh4L7DuNzk_G5$LC(l?Ehn@)uV7`dY2x#XcdnUwk8Iy>YJL8_N>DLIQG$Hp8e>D|`OFJehnd2{bH;}eT^gqUG+OV0L42ZkPCOC#r+{7$!Y zSIg@(g5I6k{B2xSvHDsZhZjI>PAZ5`STcQ2ID64hdr2=dDw|GcG!D8nCWZ>5LThC~ zdgwgUog7?;G@f9(SaWA0Jxr&&7E%fKy#u+f%h=)MBs2cR)Dm+b#@N{O97coQ?QVJ* z*xfp3tpHhK%HanS>za9JYZzuu9PEdWq zkWkKd!y}BvORgv5s9Ci%Ypml;8E$J_e~+``JQ7Z@_^c|27OVNelc+f^0*&U3)DD_Q;$_HqAUuV)XQx_4c&AP%mX_B~{7JX3$%EK7hhH*zX|u4@ znG;7!guSY@fk#=js3tGxNs836X1Sn&+9MQlwnacHYtPw@;RrLh(WF-uHXSU5WpUwM zxu!b|;qI-R7bRw#e!QgtZ zP%NIbfLQTE^S*~6d3RrL-#|HLPHL^$2tO}#CGpHSIH%AczETXc<#r-!v!Rw(jkgvF zFepBC|3l|cVQvZw3;#m0`za^%d0s*8rwOm*_mHYawB+_L?PZF*bEeLi)$zwN{TmB~ zi@MxCXailEn{LMlwBo0s3Yj`OjLCbAj;*b%y3E5!ipjmzG<=Jwu<7v^dx|Kwm zrP(!L#J9tB1p!vm)xqm-s?!|Hf+*Ypp9t*l)x95Oll>cPtl!D`nN) zbE|g?#_-&D?g{Xwi+f#&-!;3M7YkBYynJ4WAXl=#)TWZaDgnH=y zBi5FZtm#h`+G4ID+j|9?ENC@V?qnbPT;(WEwx*)YG~LfO0TTPkg6W_0j2Uc&6--j#i;+d7tnVc zBh%A)d@71?D^(f)pwKV3i7}E7O1ie==94X%^Tp-~_!gA{5)R&$^)!-Fs_TD{chwym zEMS|2_lelfm=>t( zwHEXo_XJH-4cBjpK48$D@m*CU3yZ}j>02x8^5*|d)5W=@NYKftOXlgDI`T?;laTEn z>AO7HwSu`BC!XqixK7&g?Poquo=ww^3^v11Y`kVb+*-(PBsrk)BXTMM?f?eu&D9S& zYKoY)Wtu-*tCAA;^>v2-=D;3#-QC?lY_5Ck*vTfapO8fS{uF3%@e_JVlOlP|L(oEQ zjJf~SylJy(PQLa1zkDWB_pxAP5UQldNEVm!?FoM(+y*qpd^xMy18c+Q+wWf&6#U&Mim#eoH 
zMOH;V?t1OW$@{oGHKyg+exwDtBqtQ7(`xoFn7!@9lAXEaxf$dgQyN}*bq(a?hh)cRlU zsZq?UwI~tvZKGuOeCxQKt_-Qm>uETaIsf58{pb$hoMltQk^pXdnX2lvbjZP!T7Q~{ zzts@YhANNzu@^D>Zc(vSN$l~#W*6_w<%tDAgCWpp~a{PJg%VG0xE3`@f0H2t9?B5eS3m zUDO7E^o}vh+`ihS^RZ@8x%c|k0VsKSdz#ORWO$TnKdN&iZ1n)l3eqR~<%(!-?nWR( z^L;|sj}Dodw*7u?QnaqrQ-7V|OTI^j`}YFv!;%$gbCKXuu`T3^FafF@tVNc_4Cc7cA$a{f&SZi{|>mhb6i^L*Nt**f%E0Ps)7tnoUA z9%$?bH~HM)ZCuN<{5q{&&}?`Ab719j`l#K~yYYsFjo}Ia)S)JBZ*wk6gPN6;+(=OV z0D|zXK0Q~`&2gN0+YR9%ct#2>YiZ*ONQ1iRUzie3~2=8bjoTHa) zUn1*Q-c&8~2~&RJY96yBEqu?lpDK_!kHDb#v20ZycVCZSGtz?y!`3k^HOU7M>f-~B zEP_sZEAu2+VOIEokaKPx-loj&PkFqfr6jThR;C5)-(XBZH$@MOXf(jTw6VOn1Oqu~ z98}T-9vRYAt#Tk0?kE;tu#dLLWRsn)$iojP=olXRZ&<>bmjo}_yW~mY1rbcXBju=* z_1WAao#Z+@4E8xsc~2ijLwd{8&1a@Kr3OZw8Tw)*U)SboviH0qbqkcob=XoZ+s6O{ zZ|LRly4sM50t2S3Xij_TXW2E)l*cqX1L+>cK)A6z>r%bfo{=)5=|ZAv>8<%bs%pcv z!aOd)Cmcj)>YL~cL|M(GS!?R|0n`T>$Yp|fI&Y2c+##@(9%CCOsIgD4%#=4ssE655 z+yey1bgu#I-qM0g>RMGg9_|u{o9Z2)Hql0HolUzvql=)1RvrV=PP<^RAK>`@>AtPI zgBBi{`2D!K1|CJ~nqScda-W`g1OG{b+}74sH<;CWuw%Sv1ov8mAo@UWNAp5-~ci1yH*C_Gw_{^7VReJ&h?UG@zZ20p#Nm^B})%7351k4cuh%k+e_kFYJQ(81z&-2qh{3t zSk`XR6WhnZXWW+r?QrklX?Y{4@k3zuMAy~WPZ$|!*zW_xwZeDYK>4FCgi4nL0IRtL z%rb;xJf_0unS?yIFt8$d9mus9@zzdI2l`?LaCWTe%?(G=P#J6rSG&gFAeBFSSj67npf#j7pLp zKg(as%I`s)ApP;y(!BZ*`=j4B?V?8xJ*(Bz!|grOqD*#UmV{o&VonC*hJwLcq`{X_uNBdZI#4*zB& zUp)%Hetq{7LZ0k$!KWnm-PiHy<}j;roa_DEWPRh+1ia|YG__gVT>!1#nJKXyAIW;u z(cXD?4^P|$t?J!Pw~2X6!r6G$Vv&5E50JB)RJ1eoIoK z+Og4xWBymy$^H7u3XYa}Y2fxoF4hd~fJB=_6wbWvl>d9)f*3P##G2(uf$4UeV*I-- z)EQHf#QAkzC0B~;+i9d6Az1n#KX?Ps!VYEh&(nUIrdjo)`Q+)aP$bx-jBxwgbv%^4F)E6XCHfsy7RPpV4PcZRXpjwvkmn2*rJXG}_BeXC<~V z?elEqt2sAu9n8d-QEw2lYFj;$V>R*@~zuukL&n2=i5V2D(Sn8v`w?=AS%gYBOfj*bPUq6CXtz zTIw2gUeMqCYVr0}PrOwZ3jV^V>%zSH2yX7GlKvZW(|?|?IRUaUX2dcd`B^=^p0Rb` z^v-|N)cW{m{rR@$fa10WQE-mtHXO6pmey&VeN-3z5DKqB5_@F9H8AB@Kt&8Qh8t7B zzONjnm9Al3Zpz)4_&8IB5O4TdkNC0Z>Er(h=;^t($syp+WrMT|^8$5MdsU0fQOA2> zSy^PhcfwsguE$66l#NLUdHmfvj3}z5BZ55nefPdO;@;DphxG^=A4JOy7B{^CM!o|g 
z>!TNXy`s*$t7R|VKA{_{yOmDnum2l20C__yvcre)IPe-O@RFAF~0<(nCQ=I2lmmN&H|Cu`qm}4JWEArt?YfM=l|^EUtCVF1K5|77KklA z;H@=?ALw!XK{`SCPRi7m@c8r~&1M(z3G=8572%>6r;;;!<``4PgX_cBnh#Cwmw%Un zf8A>})MQTZ%^;dDIC-}7_Lzc>Aq_z0hfjd0o;mvE&4bIge-3m;4Q*3k}fzgI!NDz>%*%D#qXRuA9=n+Ttc~#K0K&>QO*~qUs*> zE2!sNG8#v&__O9TE9oPJDa$itcMY{>9Axw|zv|-hyw3n#zXG*!G5~qz+9j$&(8%99tZ)g}^zeKe z{l7d@ejYjWM`6+P*z2s(bdanNa@XNqa{%0n@doT+=Zp$r^16&=A!)Eu_t0NN&CjOs z1$k}T=4^998!xl2Wt;NtqScbZ^QND+)TBx`rf-E2RjGaW(hCGXJ&OQi*Y$!1DCIsC{iNa3*uqt+a`Ba z2DpP~K`NgP?R_BUiJ>LAZMe;5Jd6JBI;MMx!V&H6YpYaqhBU$w57w0L(u?P<7t;vHmK#Tk9=d($oHxxVV+-o?_uVJbOpVWfj41LzLcp%DJB^@fp#6->eL%9IJidX||vIIR0u)l)%yWJd4XjO5U5k2tiStY4w(54 z)$d@r%EvevaQ!`KTK{F5PuT#u|C2QRQK)`C{Z6RfacU#yU^XEBc&Gvy0!_gO)1L3+ zVDYDPyouaReUtwVrT@{*FcC4w=6So!ej|kM%i>$?u8bUqt%$dbh$CVG_?{9;{Dp=6< z2_WbjEq-5_-+VF$jr6pvvBu&ZpOUgne|l~XJbO>27^B_Xn2vn_EQXz;v+2Cg3#?Af zfp)zwG4Frcu3t913uAi0TNxAQdB>kIefeIQNXjrs)0iH8hz=?1q_;VyUZ;~++yq(u zQxx9>I)n_Z{tGsIdGp8^-=EvbSN}UUI7N>|-5eB&^nx)Ze%EV#DB9x?<^xKT{X0e1 z(*%gYXxt55wES$SLEjBcHYid+w?3e6|1=Un7ynNV-8sB#u0cdV+*kR+PltkysQ1|V z1XRdzGLD_|p~_>NmF#o~DEW7w2!{x|?&rS^$Ys#pn2%@7e%jmG16r;2HSa0}b`J3^ zLX$x26>2DP5A+pU@BF_GQc$=Kj^cCsndB8oz{c0n@8}GA#P?ueZU44(fxVf^)9>oI z^&kAP=S(~^uJY&+WEc#p_rH+3g(=`+rT&2|{Fq3Y`9Fvs!`pc6KNqX`GX_Nf+<%*W zJvqnLg#bR_OkC43Gg!`MRpY$>e?wV100|kr6~QBR`Um01r0aZKEIj}ezXHl+2PHm# z^qx;s-->&P3-*RPjrD0CE`HpiD^OIU6{k@iM z_>mNri~%0zG3~&^Q+s2|fn~wxnOa&pc!Zy8cwGDMoI`#4d^OYh;WEV8VQ|b;r1{b# z>=oDjFpf-cJB}#&QGgv$XL8NhjG=ut(iN}4iIYM5Z3DMRk|M2u##AUp-J<3)&D9M8 zFJ2|L09w8VzK=uI`Rjg2>)~LjS^*(Idq#z;CgkANWPUQ_g@dKOM4_eVxX?VN8|0}` z2x8r9cjCCIacnD<@~M{jv;az@k`%3qYM&UYuaqu|7c;RCv&#{QAZQ?Ym}W((j44x6 zAwHY+V#jo2%8%FU!UpfvEq%Fm-afJ;eGkJuR2Jay(GRIiv0TI3=Bhd=wQx-+o{<)6 zEUkg^0~`A&Jkdp8p@fqxuxJ+YxcR4&-D4ao!b^zG$CU{$pRNkd5;7jMi>~0{3P&ZK z!u=KFy0P{Znn}YHs@*sORSnI4sGZlL+yrN+JIu>NP+{fPLQT#Nfm%V7bCOwJ@O>rh zM4aGN+ZoluC!M_TX6~~Z!S9Utd9m?)V@FyuID^K-tNT+CBG-7-%0{}fBQu@1_VFf8 zMrr-2x{RZ_(A3^f%J(0v#3qxoq0{T5FZM zGhJR;`RWXE@Q9i&WoAKqkI43#-};E1i%4LFcs2Q|qQ3qV?-lu_ll!+cTSe0 
zum`VMZ*sVtFKcDrP8Dw`g;Hf1X&cn{TBYcCN?aj@J6IX3xrV#u?p*;_$vasa4fgwm z!h}~vg@W|ujeLc7^RXv@C2CKXipkRnuu?RnE>X)h*AsT*ehq@()LlzA zZswy<>d&a@0rO%==z@k?zN0il4ismr(;Ym#U=b=Xmjeb!1%j zH5MoJYl-6~f3qh_cWt+OPJ%BuO&b(rHTW2_OG=>((vXsw7b+7vR)F?q?c;9**|o`e z+o6-&{76Vk@xweqMGfsGZxOR!T->opZWF{*;QH11on&NsZ=C3`ss^E{6zu6BA*f82 z880sU8uC@Ef_V*lYiE0?j4BFZHwV~5E53GJr*^u#;N?Zj3!68iQGxn63=Y)Y2hUvFW+G#6KeLTuoH}&n*JVMR=7;((o#q8Z{WvpLMKqgr ztBUD{EL--yU1(R00&j$Rx#eCp#@eC^-=niaOG}FztC&@+ERNIQY2S z@#qD7aE!ZKnN$Vdswl%pL{=+O16=VYk&Z8E41TvQ)Zf-wNJ|eYCFIC%J>f_CI=g|# zpRFYoKG1}_J;^CPbu(uAtGX7(H2-`Bc53Ebq(w@KcD=h@fk!FfdwgGRH|Kz@*a;jt z*-#2Qbo(?1$yjlEaaG%e5~Rz_Gl~I0@Av7Q3*;$|vI38VCP(eID4}`c$NJ$?&IJFu zX0)+=R$Q~Sye`ZoY~V2z$A8Ll3)61??++8uSmj*P}hB2!!Bp2)F&)SLqhFRK1Z-&#;+B z@V0IGBa=1BEwRn#32TlVZi<<)IqMl5${%s!Q1BXK2nymj) z^8^lQ{I_2V0`1@5fB7ipOPE{Qkp8IZV1b$mj+`~r?(54TWbMt?b~X@UxfLWTitUy# zkJ$+hvOZymCKyxFOIH?uYD~S`2gLzFhE_Ku-8yhqJ6tuwyT0)S#eFCdSBrOutVWaI zUsSp=&6J z+8)HArPmdAW}(phBCY?6Rgf}tSbuJL@bzY5PoCp_oks1B`rhB+Zt~-2UT)8eCEFyszwZF7cLj^iJ^PVi(aCon5@ zlxkU8jL# z?CduUaAnrf(iKx&a2Cfub@)~BkWTQFCOF(mZ0+W09GERtOn>4ghZZfiZ4aH{j<6^f z$Rs8w)BV)Aq*85QGvnU?=`)qO17G!77*sQ)< zsy?Sx_$MHJm>vbalo@GVO9j2pcMBWkDFvO=t-mqZ5{H$qTT5&Ul3p!$7(pg%r0)tZ zpX^D1gPO7w+Q!pZFE9zhVPzpgQ*Kn~K6zm9H7LE|MM7^f-}46#yE=^;Rd zZK-xZtfpKTtJ%CqpnbszB%ox{1C3-EV=&Bh zoxhaW&`0XR#upY=cKS9qHwO?pnzm86ufL9|imkALd;=XAS9qHtq6YFN)-8pci{+0> z;!s3N!|oP~`FaP$CkwYEtUa+1q9 zA=T@C>66>!bmk%A9H2}8P!@~Sa`27OF*LuM&WYd_^5gSWRvIk$J`N%149-&1VwtqY zaL%iJ=dn$+IF(dj!Hd1;+j!$7Iyaj1F|IAn8v|bp@bghyxRio>NLPPb((8y;@9$!f z6k<|*0$u75H-jdeOtNNE8El8(70?V~CWr}|!#XGFO_Jxv`w?5qm+|}vFf#v;b)5(L zG)S~K9J}Y+TR#V1m3#7v`7H=no}{$NdG$q%H`t+7z$2DJr0kPTg0sVtWIq)v$D zhnI?0yr81d!KJfes@>>t;f_b_MVm)6HzVLAJx(P@!1*hDq!AG71&J~p@L?Uh2?43P zSG7uunAXsqN)J>50d;*Bv?tUXL>Ulqy?jq~rC$vkzEQMHS`?n}LOB(FDxEh6Wc2Ly zR{qp)#b%S3+6{#=hZ^75cKXW38Wy3oyb-!Ii8H&7sYfuM{rRRZ9hLk&;i0KQUv_** z7Us@UsD>hC=$)Fv3^Ie<0_3mv#9hj&WY8RVn8Q0z9BQjM@lpJ28K3T0#yx>1dDhy+ zcAN{~dy>0K9!?e1sO19XZ+-k%y%EE{>b-Xx=-xP;i_=eu^0uu6|OEGtEYNno;5v03o 
z=NhC=!RQnw)y(XMt{c^?#MIPW#fFI^EX4>zk3efjMZ(&!Yp)d8Mf{plw^EYUgvmpT zLe=Xkk1P+CgKZVT(H#^sY_vMUsuH~SSDTQ>EN2sdXF-CU8VodeP$g(6#j1O%ZYgeQ zZfdlwdhLiEFxR53t^AmCL^)K7NmbXBfDb>}&H18kRd9r>jR6@!{S z(rNusrdI2~EKW?VO=AXcN@zK+cvwXchTOKQl*)07wu0mTA;HeI#L!6|z z1hV8Mi2^;5^bLLn(9rd?dXD|Hz z{mTA_9`@Z^$J+RorQi*n6HAB>2lM13Y~#B#8UXr4xG;ipx2B^(z7IC~AtaPwibyyi v?Q<(dEr(GSn%%I*f;tG0Y@ZZh?X(LfD#!JCTm5k7NJ@} z84?G=AVUxWVN6h~fB^!DBtjqo5n_yC3SkN%-w9%`-rM`He|_J*YndgE?LJ5IJf&Z4^4m&$RO4{_q;L9gL zyIprfAP-VB=T0sM-&dag5rKn1zBsG?ED;16MnfRmx4G=yeKguv+RM1gzf6$#_Z%MX zUlzXFJW8_VMFpoiqPDi7^NdbIaV^9x?!M)i@hKZ#pBARAs992>Q}Dahm)2hxu=*VD-`^PTJtdPd@`TL%!PRzXSp~b?(19 z(DO4)H79H2QBa2Qp(iDS1LO2K#bhZ9@-`_ou{LMh=-5D`VVIDv92;uQOd_b}m4#KW zgX`)3Fae0VS-9opwrDWbd25T=`bs$3FGa4fOs1kX+|~c2%|;> zQ19PV(*r4aSFX+{CNLcn=I2TU2TI&uLqcC~TtkV%+S6xIc=Kcwc_AzN3zAPd%GAV$0;siXYtX4ftX;#|DPt z7SO(d3ag{OsqhYC>{slC221kCx>*D-{BW3ZSR8RGn(-#cD{RUcRza^Ws$7kv6QltY z*TVQh!;HpyUhL4x%`ta`GS76B^wuw>`z7t%UJgFtP2SLo;ng5?jXJwJaZbq=Kg<@0 zBaFuOwOJlWvGV4$X!fr%y?aJ?So06_Z*c@=ym@P$-0P+2d1a=ePd++{hf;lY^E9(6 zAL=+m+gQcf<#S!EKcHjOVF@yNT!!w(Mc2;hZJ{K^2L_K)-~1vPBZLqv++m%&RwQaa zgJ#DECRhq%L*+fp@C1i2z6mb7F4i^30JeE8#&gmMW&#_56Rdf0ZWN(a^3zsnmSjzj zrjz#YwfPMt*Q?y;YpPx)?so8*--_h4)vnW}1RTitLFZzf&Fj1USD33-*ZfGBuC^qZ zXTayO%swd?s*Hvnt-k7eHQr5Rii-U_o9Xdc$2OmXChnO=U*qR195=Bz=YO6ocsf`a zax(-CJ%)?vP4cF6ksKoDdUoC}JdR|&{UwIPJbmd|T(Frx-ZRGRg>g)7qpl;bH?BD$ z3o>;Hu!4Mjrf%DtsXBh&6}Q?Sq0Y6bGk0%3+~c4u`2Kgxlp&uo?epW0Ong(`CW)Fw zc2P6ME3jXdSvufr!1_t&d)yEj^}4x!#xpZsHtnZ7U(E;WU%R?1`ADkm6K03Z9j<71 zh#xPl7*Jk=Vj7!}sBQb{)+a4 zELdPHwT;`(yi#sjytpti{wjuD0N@#K)sgLIluuzs}7X z$7Lm++?*@v7c1KPG=*xn-D@u)V%jRM%&`g4#MlyC`iNL$gX+T(rXr+freiXwXgCE; z==@^pBzu4>sdVKMgNkA7p$?xNTC8V-)Cf}AQw*%d4?(%o6{Cd;cVnepHGz^)DO-xQ zPHL*7{J48nh^;^)!?e4;-#T^dPuLGsFJo9n5q#r+C-R48j zBMk(qB7W|)p)}$gNjpUe!~9}m{I64zoW^1vHnRxILMChyjb+;6tTQ_|wjK`aWQi_Xpi+I1$2)sQ za0o$^E=7I_->qFz-}1b&+0xsp8is%SIxoq6z9r-8X8m7$-3|R_>U`U-S^F~C))>SV zN+vO^<|wOs?3GAv9U{ckAGKK7*I@RwfB*37y~XRQEY{0@`aR0=&nV5qr(=?ZLN 
z`cAaIRjkWbl)(VQ@apgaV#R<9|8>NX0z=(|Va(F#)u6yy=Uu5Z&#wvIrJ=7&S|Yac z(!2w?`g7{?6FB<4JGDm1Bd2;ey{kYycG5mqILbc}e`UWFOqE-U06bd`igMqUIa|Xd zv-IJg237Rh6&Fk);2-4N>b2@}ZvB1JQV8S)#7GkYIkD`2G%)&R4;K{jggU%zsK7CH zdemk}RR^j$x-NbNSiHMX3!&jEq{qcf_x=orNqcLfv;t=S0SwZlIQknWmas@L*j&jd zxqWc$w`+dFfsqDs@3E%CRfo^*SaHAkAx0ntPE=8k8w{m|9m$s6DvknPI%h4fHNAn7xTbDTXk|xYFLA-`Gac)V+=PSdwXC-rW9A zp+wX`YRZO&Zk_}9-J4y=xK3`!u|HCxeS0XHP6zFd_hVo+7^7YKueLq^KU=Fli}@+x z{EYueeKl}%`!tUsJZbQzSs4qK5UZS(H#VjiCfo;cQ%x&`s(G6Yz+t%hqNs{g@(J=< zxnhK&ik1zvhA1cbVK6l&YyWiXu*CT$R=6;44ID#i0ZWP!@0%9Y?r&oEZeE^83lq+R zHVGCVJ2O7m2)=<<<-i>=C(j)wU?iy4URr;R8+^+*Z5eM>P=@OH4geAhng8rjP2mO8 z#M`2t8~bY%-F=7F@9V`?9K6vv6eA@|=E%YLMHhs49&>M*hG3A4DH z@rRpUn(cu=xbLq?H0rHBe0$u-l(8@WN24s0NZyJDRojRX|Hd*orOwt!F*~iROm6vqe)9sfoCI&PYQV!1%AzGRh3nN_c~9=*&BlbxgDBoMRg@7(k=15L15Wma8o^O^b__;H zn&aFWFdt`ezOz&NPwtN@8{adsMb#e+3`kj=V^E7O^8okL3KkZTyQR^$0^aoy->)%i zZd&h?Px~|?2&1i%tc9G5dmlrFPu0=bQT-2osrR`VMF||U{u%;Nge=5nC9t=za^a5S zD-pB5b>_K~+UA89Y6_nSw|@fZn|;TWL{sJV;5fk>rD94;m`H|L!TN@2Yd73iv)8gd z7i>#-3z6|L_P^!yeAPiXSrw6QkD9Xpm`--6ee|*NchxvZ#|_$hZ2|5v7R;xq6FBs^ zIy;=zVwkXSsd}^1fTF6|0z^oR^Oo(ANfW+l)9RLht=S^(P=E2VQBiyXH>drTD)DuL z73_NotYfF@fV(5u#wEPNNxN(!4;P!%e8>5ui}0SD-9Bist#h0jo3!oB+RVp)r3rxl zQ7U~WT$p4vVv2u3$%rb`%l~!-1y56F_(CcIo^9@Xt?8^(tU}C8>uCg67p+2!UrUvP zh$Q8LbU5@Js`dj*=lRyOp)RKZNeKVPvxD+MW;#j(LAbJQ*)dH-PxDepA9WEqGRs@7 z09qtwh7tU6M3N*0HXj{5=e@>r#&+w9`?ht;9?N?8KJ^$zqz(+(fw-ipJchs4W9MDr}CU(PmUa6GBfLhk??Gm=W+^Noucf17(W1%lkPYo0&{9`HP zoN@tdAPotlK2hZW7b$v?o%5OJ7bv`F?-ylz4G9UVdFXCU$hoiH`2dp9`i^4^xfW5p zo(}GfU!bgyQ<}iM1xDU^1y;BW(m-GQ5tN@QtU41vQ17VDR5A-$Ft;xU{v-%+@5KTk z!LM>4u}SaDu2efV90Ibp^Y%G4X;F)kJ{X1vg_O1+0pF-2%LrDRuGWD*i+9PBR3!R? 
zz8Wac+dOr2YY9zk))6u#+9pZWH^HZc879pcDwF^GvhV2J$iDX)2=p{PcP;rexsSTO z>?oM*oo0hNt3Cy27+i?haZSYhQg(4sV$qC;5ozi@xqzJ*4BTkz=HU8QZQQ95p<%`k zYKl&I1J5gFd}G9R37e6EJ5FiCxzHdx2;}MlGwl686yYfSvI*PLFZ1V)X=Q+@nZDD~ z)reWcq?+)^?dPtEG% z@>%|4uA?6fgY>9HXdJD%X9a%jvukNL&^qcjoF1$0Sh0()V)!9OMse;5xhzhx9SM5FNeH9YOXH7 zS|Z=XoS&pIvWvc{2?2uK=RsXG84#rCn{JTo(nn<4Cvy2}6g?In`w67M`h6Gx{@l;i zl`(k)OnYb=sk;L7iXYt%<{Z~IjWdU%ez-3zpXf!*4s?jx!{z#3$iRF3^9ChkGz8Mf zUW{KE!1L*R+FE)46l$jKc{K@C0X?tx@dn9YcQ;c5@e)>#E7307 z`*>)4;%5a4;RxxYe;mDP(2Z!YW=KgI2Woa&ZGFtuYvk>MKyv=oL&I1eKeBux@9g^0 z^%o}kLCvgu{$Wgm(^Cgo`zqMkn-jPF%@|M@!N;I#m(8X1=cs&cf|B{d=_A8?ow%T# zoljQ{=24LC$8FY9ArLRjzc@c`csUd_S4l#-^W7zo7l%KZ0VD}*n!dhlIs+{HS316D z)7~)6A=Rd%)9lmqM}XWFiwsuxk`a4r6HVhR(w274&3<~rvG9qyEBq__7T6O^OEuOw zGK;gR!I2M-0zT`0?3k`v?cpDp1A2kspTs*6GA$d)Ry{{>+&2men^(w3E&-`fZJdY- zXjQr93+39No%)vx;7#l)fAN~MVNl@Kw}J8!Fb4fl0p6!m>xmmiTMd^%+IZ%d)HL0( zpbzgyXILVtJ)-8yjfBt@kkiD--3t=%F`Xw`hv(aCVTg1&O}( z?`xPFF;RQ((>{}TnL@egsc#;S&cYLXB$pz!5c7QskEl|;zQ837^%$K$)yjHnL9pL5WQdXUyYc6FU z#F_JV;oLm&T6TP9mxf&FSyLX-tVLf}2GKiIyHPEd%|!(uji&N}00M!2t5VLROeDT< zBTX>U^5f}qI}RrnA?C)=5a;&AnK5ccKtrI-Wk8hG#`=1j`ENc76%oC<0Wu@Y23_gd$7W%-y>+WF+A zZ8`1tD5q~ubQPH~OqdCeRzQXke`l(iD=`l+O(8$O(KWh4TeYO8_B}910I`P;0kK-t zp{bW8d(zUZmfbKA*+CkhA&b$pQtO|A`Wp46Ag?Tj?LbqvaU!c6()U^S^6hF)2D*YW zQ{xPb<-XHW=nN~t&S%%#H%9O!&&agliDPSQ55PX8k6LqLo`b8^H-X}C9q4)Ji%zFB z#_0*$CqcQ8zJUKYIcWbR7k$h?JuAt3PshKV?;l_jK78;$1;zx3e(yGLKGSpOKLy1d z=x@zQaQZ?yY~LTGkB~Soa4v5mvu<1ok&8o2t_a0JQ@X z0=)$+NR{hS7s(Ls4_!b4?8xf&OD~Wj{EH0{T#x}EQT(|Jx7;z3xM;3M*rOLSLD@uL z1E)Xo?cY}bHLEE5!&Q_`AA^uq0mJ~L7chv}vfu;TX%E-!LqFpE)zJRaWu}@=IL z$c`G)|A*Vux5|;!MI(W2?f{=|m&GInHl8h2nc(O;VLPPx?3yMSr#B`8x8Y!STw!c2g!CCML#fD4#Cl`H>-3F27} znq*<>dBOB*JWZr`Z7@J~%5%Bb5l190AU@ZNoXsr#eGY#dGiMMTUL?;*0yb1y58uc{ zEcy1+%(@9`0UBFQvq`K@ZB+@lGz)u3S`agtBzifT}(d*}V>fNH@!|sPSBV43UG6EJq z^1^?cExWx;E$M$3rF4M80+(igmxE{`_5ds3psK#hw0Cq;7dBw>JH1-WgA@QuiS)D) zJZuAflYbm|?0W|dHkIHc5VcUa4AhM;#Qbf^9xgkmYzNqzK)kzsl1vlgeu=-Vin;^Z 
z9LT9bP=a3X^wjV$Up&g~O;VqTL;vqj#JjsdP!eX#VY7$31AJ$Gy66G|uUJ|DFW zmEtKb5=(bWWJ*tpZkowAwpJ7xv(ba^=z32;s5uL4jb;0E??N)FQE@E?d@o)MN=O@u z7-?ai)^bZ0UaNe_vf@3Yx)RJh9km&`ma4bK7h8$Jb}R{=2#02{Vu}sG^VBra@)7&6 zso1Rg48AZrbAqGro6ky{OwUG+n7y_gIujNjnm$-rEaiAt=4RJFuMC7uL{Dxsts2x+nsM zzY0_3C|}F|boeyK)nXWBZ@@o2h4UEB+ssp3t24jWsQg3Hr?lhQi(o-rIPXmBlzSrc z7|eyD>{q)7bo$x9?&YwS1&0h!Ppv*8hLqz-QCe_|OAcd}a0|Lab4;yhWy9kM2RQYV z{B#)J>38p4Q(I|yt(anqE{69|!B|YkgH--c)f8N9Cv&FeR=6+6L6_K=kqwfgxR4n0 z3~OCXniR*HnYl{(?~~(yO%n$gCo}Q5n<-tf^YHMLAzYQtbzi^uY&<_`aN?S%*F}~^ zs1J^Ji{h&9DGc;;)Y$+xYU9M`<3C?{}S6TpzIv3kkY0%x~kkrk#s|?`dDw#@HRF?&Lz?+Ymi4NhVS?1XWbve4# zWn+=a9Lw;wvIW!mVnR{Bt9^4=rlJEWT?5H zmLMGq^@0uU`w8ot*?74ss(8XX@9BuEF(u3tpky2xl)n&=b{$>0-E zmn%`MjxD@F+eYdTZln!S*%H@r)kJOba2aiHQ}?g#g!nTv540fqn`9^22k;QHZ9hZ+?~@Eh zW0SHocozGQ6IWTDGG~Q5^1LT2sMw=ueOG?x)9j30DFu2k^mO-jT*#cqcol0M!CT{ARCHNgl?4 z0jCs*PTfzP@*K*qOac53xP5@Gn%pSqjr8Cy&%;}k3XidE_jvs%J=G?y2 z*8+6?nR&OjKTcYjz9*g9XS@ah?{9L*UY7Bw)q5U!a7XLCfc(H0$c$Y{m|k4Xk^VIe zSg>=eLVtN~?~;HuUK-zEeUDsXS0RYzB4#S>=}9JsmO3cUzH|B_)znFQ)DCQSdZ-nn z{Tp$iO@|vY9_G1*ut~#@(`liF%**MJ!CZ^MLEl{H(9aEKdpcg>Y=Y zt4KZj4B5n%RYtJS>aOXF2c1n%5&8SCaasEruwDL}C%*gaR&KM{fpS05;Yd5?=hnwN z{KZq`K{UPE7fG_UILAu_FAfB@+B-Er-2d}nPwJr9hkoS`j<}jXYMD;XxP!S^^kq+A z>!8R6LeT-+U+os!rX_ve0KxBKX&AwcrRQ*9&D*11mms5s>gN#uo3{i%BG~_EjlP$H z`k1MKfyR`S6yISfp9T|v7a2x5-P`8g0{&`Mq^rsN_M5B`(C_QG#t}c^zmkc^>JPb7vH1Ru(JrA?#&9)9@P#xA8^bPj@JP#zDDaUjbt5K|sjKGMyFV@D{Snh2zOve%nCDGzzm$e1-@sI6F> z{V;7DSe7K}bobUal4z*k4!~FFG9W?i9WKo#?RA&LV-8;qMsu!sFx}#RZxv-SYmX%; z_(G4Z1txEHOQe_=0`bCm)7HUA#n(4YWa++H=~o%X7#44`)$A>$mD>+fq3h32_P&*S z=cNxfrjL(xRwdvMWK_ZjWH0IhRI`SvG9=GvalD;P~1_vNJ_fS%U4DZH*%}Huqf3@)#J3=T^-)I93jgUyI483oWUp<2*yt zTa=3dQ+^pFPGH>-oC}Bpj{b-#|=hihRQ6>1BCMO z4jVJ3#710A)x#clY-Gzsv-hctx;6JJ{6@b(mS$A^S%0%mic`vK(RB*Z8LX}DuBqzu zLHyiomK91E{iPFk1NDT})|uNV`*yighJ_8w&sGroG9)%UgeB3b6+dVZJr5W8S+HWA zs&cf80*y%*sL9A(Q;#0IG8BKlLMvbBB}+rvCdDZbU1&17{=x~|ETlv3NmL+qq}hU;+1j>zaNs=VeBFmMAh 
zYzj*s8hWrrFk}pGtVRne{AQm|qbDg+om+}M+%%FTQ+7EQ+xUbGAi=oUr*+Rt9?%%c zVa!|5BMIRKXv%W$PeF>PZRENg$kn-)duFitG?DskeM9UQKKJm-yPR37PHr#)2X#n! zg{!#!2wnle9d!9oa_^*Vk4ZK{kcQtu;#(%c+eT{s04Ec=BRV%|cmtg75H`Y)-9wXl za<+wyIGn*4XdQ38u6o=1hvJ9u7v4$ah%{@Tv?)=R$9O~+SMs%*S&t;IGG((vX9;x^b+%Z8 z5idQK7J$YoI6KfV_JOyZ2Q;2u61OT^kv6meN@S&5hF;~3A9WqxW9G$aJJ~f*0gidF zil9(RZvDhOMvc(hMCv<)1d4VA4=T5IpUHxA`U5^g9A5|l;d60ARIAQ~c#_V!Z$+>O zr;cgcdQ|?)>*W|?zC#NmhW-Iskvn9SgDboF@?^)tBqR9S`J_9QgDtNfK;P+CmqBw`IlvMqfuj+dLs&mAWWQ3b3cZMX%M z=&}zh4DRvXWhjYHCgQR(bH^OEK#4V-}ri z0J}8N5=!H+`&Sm-=2Jf~J1qr)%HcEj(QQdUBmT!t #include -#define blockSize 256 +#define blockSize 128 // macros for bit checks and toggles // define macro to get nth bit of int diff --git a/stream_compaction/shared_mem.cu b/stream_compaction/shared_mem.cu index 729d158..cb7bbd7 100644 --- a/stream_compaction/shared_mem.cu +++ b/stream_compaction/shared_mem.cu @@ -4,11 +4,11 @@ #include "efficient.h" #include "shared_mem.h" -#define blockSize 128 +#define blockSize 512 // for reducing bank conflicts -#define NUM_BANKS 16 -#define LOG_NUM_BANKS 4 +#define NUM_BANKS 32 +#define LOG_NUM_BANKS 5 #define CONFLICT_FREE_OFFSET(n) \ ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS)) From 212a68bd04f2e49fa884d22f6a1901ae22a5d2b4 Mon Sep 17 00:00:00 2001 From: risia Date: Tue, 18 Sep 2018 23:32:55 -0400 Subject: [PATCH 32/37] performance analysis --- README.md | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 106 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e3b3ff5..6153d0c 100644 --- a/README.md +++ b/README.md @@ -110,8 +110,8 @@ This shared memory scan algorithm is further improved by using offsets on the sh ```cpp // for reducing bank conflicts -#define NUM_BANKS 16 // Number of memory banks assumed on SM -#define LOG_NUM_BANKS 4 // log2(NUM_BANKS) +#define NUM_BANKS 32 // Number of memory banks assumed on SM +#define LOG_NUM_BANKS 5 // log2(NUM_BANKS) #define CONFLICT_FREE_OFFSET(n) \ ((n) >> 
NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS)) // Offset added to each shared memory index so that more threads accesses through diff bank @@ -129,7 +129,99 @@ for (offset = 1; offset < blockSize; offset *=2) { // this offset is for calcula if (access < blockSize) sBuf[access] += sBuf[a2]; // manipulate data at offset indices __syncthreads(); // avoid mem issues } -``` +``` + +## Test Output + +''' +**************** +** SCAN TESTS ** +**************** + [ 15 38 45 12 38 26 6 23 30 2 34 33 7 ... 1 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.077432ms (std::chrono Measured) + [ 0 15 53 98 110 148 174 180 203 233 235 269 302 ... 809545 809546 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.058864ms (std::chrono Measured) + [ 0 15 53 98 110 148 174 180 203 233 235 269 302 ... 809458 809482 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.099264ms (CUDA Measured) + [ 0 15 53 98 110 148 174 180 203 233 235 269 302 ... 809545 809546 ] + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.098816ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.164352ms (CUDA Measured) + [ 0 15 53 98 110 148 174 180 203 233 235 269 302 ... 809545 809546 ] + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.156096ms (CUDA Measured) + [ 0 15 53 98 110 148 174 180 203 233 235 269 302 ... 809458 809482 ] + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.57184ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.29728ms (CUDA Measured) + passed +==== Find max, power-of-two ==== + elapsed time: 0.059936ms (CUDA Measured) +max = 49 +==== Find max, non-power-of-two ==== + elapsed time: 0.05984ms (CUDA Measured) +max = 49 +==== Radix sort, power-of-two ==== + elapsed time: 2.60947ms (CUDA Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 
49 49 ] +==== Radix sort, non-power-of-two ==== + elapsed time: 2.29824ms (CUDA Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ] +==== Radix example sort ==== +Test input array: + [ 4 7 2 6 3 5 1 0 ] + elapsed time: 0.464576ms (CUDA Measured) +Sorted Output: + [ 0 1 2 3 4 5 6 7 ] +==== Shared Memory Efficient Scan, power-of-two ==== + elapsed time: 0.0904ms (CUDA Measured) + [ 0 15 53 98 110 148 174 180 203 233 235 269 302 ... 809545 809546 ] + passed +==== Shared Memory Efficient Scan, non-power-of-two ==== + elapsed time: 0.11568ms (CUDA Measured) + [ 0 15 53 98 110 148 174 180 203 233 235 269 302 ... 809458 809482 ] + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 0 3 3 2 3 1 0 3 0 0 3 2 ... 2 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.116543ms (std::chrono Measured) + [ 1 3 3 2 3 1 3 3 2 1 3 3 2 ... 1 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.132346ms (std::chrono Measured) + [ 1 3 3 2 3 1 3 3 2 1 3 3 2 ... 2 2 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.390321ms (std::chrono Measured) + [ 1 3 3 2 3 1 3 3 2 1 3 3 2 ... 1 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.171744ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.192384ms (CUDA Measured) + passed +==== Shared Memory work-efficient compact, power-of-two ==== + elapsed time: 0.234592ms (CUDA Measured) + passed +==== Shared Memory work-efficient compact, non-power-of-two ==== + elapsed time: 0.236352ms (CUDA Measured) + passed +''' ## Performance Analysis @@ -159,5 +251,15 @@ The Radix sort appears most efficient at 128 or 256 threads per block. Since the ### Varying Data Set Sizes -Once the algorithms' block sizes were optimized, they could be tested for varying data set sizes. 
The data size was swept through powers of two from 26 (64) to 222 (4,194,304) for completeness in examining small to large data sets. +Once the algorithms' block sizes were optimized, they could be tested for varying data set sizes. The data size was swept through powers of two from 2^6 (64) to 2^22 (4,194,304) for completeness in examining small to large data sets. + +![Scan Comparison 1](img/scan_comp1.PNG) ![Scan Comparison 2](img/scan_comp2.PNG) + +The scan algorithms were first compared. The plots demonstrate that the CPU implementation is actually significantly faster at first, but is overtaken by all the GPU implementations around an array size of 2^18. This is due to the CPU scan scaling directly with array size, while GPU implementations mitigate this with parallelism. Interesting to note is a sudden jump in thrust scan time at 2^15, but much slower time scaling otherwise. This is assumed to be due to how thrust optimizes scan processing or allocates the arrays. The work efficient scan is actually slower than the naive scan as well, but the shared memory work-efficient scan is faster than naive, and it eventually is faster than naive scan at larger array sizes. + +![Compact Comparison 1](img/compact_comp1.PNG) ![Compact Comparison 2](img/compact_comp2.PNG) + +Compaction was implemented on the CPU with and without scanning and on the GPU with the work efficient scan. The CPU compact without scanning requires less computation and is thus very fast. The GPU manages to become faster than the CPU at an array size of 2^19, when the array size is large enough that iterating over each index is slower than the parallel GPU map, scan, and scatter algorithm. + +These analyses show that unless properly optimized, the CPU can be faster than the GPU on small enough data sets with simple enough computations. 
From 45c7ea80f262ace182f36214237cc0faf070b0b6 Mon Sep 17 00:00:00 2001 From: risia Date: Tue, 18 Sep 2018 23:37:13 -0400 Subject: [PATCH 33/37] Update README.md --- README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6153d0c..4870a6d 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,18 @@ CUDA Stream Compaction This project implements a variety of scan, compact and sort algorithms on the GPU with some comparison tests implemented on the CPU. The base requirements were to implement CPU Scan and Compact Functions, and to implement GPU Naive Scan and Compact and GPU Work-Efficient Scan and Compact. I also created a wrapper function for the Thrust scan implementation on the GPU. In addition to these base requirements, I implemented all the defined extra credit assignments. These were Radix sort, using shared GPU memory in the scan implementation, implementing memory bank conflict avoidance, and improving the work-efficient implementation's efficiency over the CPU implementation. 
-### Features +### Features + +* CPU Scan +* CPU Compact + - With and without scanning +* GPU Naive Scan +* GPU "Work-Efficient" Scan + - Including Shared Memory implementation (Extra Credit) w/ Bank Conflict resolution +* GPU Stream Compaction + - Using Work-efficient scan, but can we replaced with other scan algorithms +* GPU Radix Sort (Extra Credit) + ## Extra Credit From 1d8ca9738b6423f36206a55bf3defbc031f5aa49 Mon Sep 17 00:00:00 2001 From: risia Date: Tue, 18 Sep 2018 23:40:03 -0400 Subject: [PATCH 34/37] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4870a6d..e92d38e 100644 --- a/README.md +++ b/README.md @@ -144,7 +144,7 @@ for (offset = 1; offset < blockSize; offset *=2) { // this offset is for calcula ## Test Output -''' +``` **************** ** SCAN TESTS ** **************** @@ -232,7 +232,7 @@ Sorted Output: ==== Shared Memory work-efficient compact, non-power-of-two ==== elapsed time: 0.236352ms (CUDA Measured) passed -''' +``` ## Performance Analysis From b38fb670602c1cf0e194ea04ae7ad3c398b3fb58 Mon Sep 17 00:00:00 2001 From: risia Date: Tue, 18 Sep 2018 23:42:58 -0400 Subject: [PATCH 35/37] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e92d38e..2309f76 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ In addition to these base requirements, I implemented all the defined extra cred * GPU "Work-Efficient" Scan - Including Shared Memory implementation (Extra Credit) w/ Bank Conflict resolution * GPU Stream Compaction - - Using Work-efficient scan, but can we replaced with other scan algorithms + - Using Work-efficient scan, but can be replaced with other scan algorithms * GPU Radix Sort (Extra Credit) From 98fd2336e9bd7788ecb63a6c53656fb946878320 Mon Sep 17 00:00:00 2001 From: risia Date: Tue, 18 Sep 2018 23:46:36 -0400 Subject: [PATCH 36/37] Update README.md --- README.md | 2 ++ 1 file 
changed, 2 insertions(+) diff --git a/README.md b/README.md index 2309f76..82f43c5 100644 --- a/README.md +++ b/README.md @@ -274,3 +274,5 @@ Compation was implemented on the CPU with and without scanning and on the GPU wi These analyses show that unless properly optimized, the CPU can be faster than the GPU on small enough data sets with simple enough computations. +All performance data recorded can be found in the Performance Analysis Excel file [here](https://github.com/risia/Project2-Stream-Compaction/blob/master/Project2%20Performance%20Analysis.xlsx). + From cd68a7d0c66844cb9d886786a0a2d6db0f7de824 Mon Sep 17 00:00:00 2001 From: risia Date: Tue, 18 Sep 2018 23:56:39 -0400 Subject: [PATCH 37/37] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 82f43c5..6c5d81c 100644 --- a/README.md +++ b/README.md @@ -272,7 +272,7 @@ The scan algorithms were first compared. The plots demonstrate that the CPU impl Compaction was implemented on the CPU with and without scanning and on the GPU with the work efficient scan. The CPU compact without scanning requires less computation and is thus very fast. The GPU manages to become faster than the CPU at an array size of 2^19, when the array size is large enough that iterating over each index is slower than the parallel GPU map, scan, and scatter algorithm. -These analyses show that unless properly optimized, the CPU can be faster than the GPU on small enough data sets with simple enough computations. +These analyses show that unless properly optimized, the CPU can be faster than the GPU on small enough data sets with simple enough computations. Global memory access latency, idle threads, inefficient branching, etc. in the GPU implementations can cause great decreases in throughput, and with more time and effort could be further reduced. The comparison with the thrust implementation shows there are more optimizations to be applied. 
Still, even naive implementations prove useful on large enough data sets. For example, many modern laptop screens have 1920 x 1080 resolution, over 2 million pixels, making parallel GPU computations significantly more efficient. All performance data recorded can be found in the Performance Analysis Excel file [here](https://github.com/risia/Project2-Stream-Compaction/blob/master/Project2%20Performance%20Analysis.xlsx).