From b674ee28f9ed7758e425561e154ec48f13a3ab06 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Thu, 11 May 2023 21:30:34 +0000 Subject: [PATCH 01/99] Initial checkin of bitvector Added simple size test --- include/cuco/bit_vector.cuh | 323 ++++++++++++++++++++++++++++++++++ tests/CMakeLists.txt | 5 + tests/bit_vector/size_test.cu | 44 +++++ 3 files changed, 372 insertions(+) create mode 100644 include/cuco/bit_vector.cuh create mode 100644 tests/bit_vector/size_test.cu diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh new file mode 100644 index 000000000..734a34a12 --- /dev/null +++ b/include/cuco/bit_vector.cuh @@ -0,0 +1,323 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace cuco { +namespace experimental { + +__host__ __device__ uint64_t ith_set_pos(uint32_t i, uint64_t val) { + for (uint32_t pos = 0; pos < i; pos++) { + val &= val - 1; + } + return __builtin_ffsll(val & -val) - 1; +} + +template +T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& device_vector) { + device_vector = host_vector; + host_vector.clear(); + return thrust::raw_pointer_cast(device_vector.data()); +} + +inline uint64_t Popcnt(uint64_t x) { return __builtin_popcountll(x); } +inline uint64_t Ctz(uint64_t x) { return __builtin_ctzll(x); } + +struct bit_vector { + struct Rank { + uint32_t abs_hi; + uint8_t abs_lo; + uint8_t rels[3]; + + __host__ __device__ uint64_t abs() const { return ((uint64_t)abs_hi << 8) | abs_lo; } + void set_abs(uint64_t abs) { + abs_hi = (uint32_t)(abs >> 8); + abs_lo = (uint8_t)abs; + } + }; + + std::vector words; + std::vector ranks, ranks0; + std::vector selects, selects0; + + thrust::device_vector d_words; + thrust::device_vector d_ranks, d_ranks0; + thrust::device_vector d_selects, d_selects0; + + uint64_t* d_words_ptr; + Rank *d_ranks_ptr, *d_ranks0_ptr; + uint32_t *d_selects_ptr, *d_selects0_ptr; + uint32_t num_selects, num_selects0; + + uint64_t n_bits; + + bit_vector() : words(), ranks(), selects(), n_bits(0) {} + + uint64_t host_get(uint64_t i) const { return (words[i / 64] >> (i % 64)) & 1UL; } + __device__ uint64_t get(uint64_t i) const { return (d_words_ptr[i / 64] >> (i % 64)) & 1UL; } + void set(uint64_t i, uint64_t bit) { + if (bit) { + words[i / 64] |= (1UL << (i % 64)); + } else { + words[i / 64] &= ~(1UL << (i % 64)); + } + } + + void add(uint64_t bit) { + if (n_bits % 256 == 0) { + words.resize((n_bits + 256) / 64); + } + set(n_bits, bit); + ++n_bits; + } + + // builds indexes for rank and select. 
+ void build() { + uint64_t n_blocks = words.size() / 4; + uint64_t n_ones = 0, n_zeroes = 0; + ranks.resize(n_blocks + 1); + ranks0.resize(n_blocks + 1); + for (uint64_t block_id = 0; block_id < n_blocks; ++block_id) { + ranks[block_id].set_abs(n_ones); + ranks0[block_id].set_abs(n_zeroes); + for (uint64_t j = 0; j < 4; ++j) { + if (j != 0) { + uint64_t rel1 = n_ones - ranks[block_id].abs(); + ranks[block_id].rels[j - 1] = rel1; + + uint64_t rel0 = n_zeroes - ranks0[block_id].abs(); + ranks0[block_id].rels[j - 1] = rel0; + } + + uint64_t word_id = (block_id * 4) + j; + { + uint64_t word = words[word_id]; + uint64_t n_pops = Popcnt(word); + uint64_t new_n_ones = n_ones + n_pops; + if (((n_ones + 255) / 256) != ((new_n_ones + 255) / 256)) { + uint64_t count = n_ones; + while (word != 0) { + uint64_t pos = Ctz(word); + if (count % 256 == 0) { + selects.push_back(((word_id * 64) + pos) / 256); + break; + } + word ^= 1UL << pos; + ++count; + } + } + n_ones = new_n_ones; + } + { + uint64_t word = ~words[word_id]; + uint64_t n_pops = Popcnt(word); + uint64_t new_n_zeroes = n_zeroes + n_pops; + if (((n_zeroes + 255) / 256) != ((new_n_zeroes + 255) / 256)) { + uint64_t count = n_zeroes; + while (word != 0) { + uint64_t pos = Ctz(word); + if (count % 256 == 0) { + selects0.push_back(((word_id * 64) + pos) / 256); + break; + } + word ^= 1UL << pos; + ++count; + } + } + n_zeroes = new_n_zeroes; + } + } + } + ranks.back().set_abs(n_ones); + ranks0.back().set_abs(n_zeroes); + selects.push_back(words.size() * 64 / 256); + selects0.push_back(words.size() * 64 / 256); + + move_to_device(); + } + + void move_to_device() { + d_words_ptr = move_vector_to_device(words, d_words); + d_ranks_ptr = move_vector_to_device(ranks, d_ranks); + d_ranks0_ptr = move_vector_to_device(ranks, d_ranks); + + num_selects = selects.size(); + d_selects_ptr = move_vector_to_device(selects, d_selects); + num_selects0 = selects0.size(); + d_selects0_ptr = move_vector_to_device(selects0, d_selects0); + } + + 
// rank returns the number of 1-bits in the range [0, i). + uint64_t host_rank(uint64_t i) const { + uint64_t word_id = i / 64; + uint64_t bit_id = i % 64; + uint64_t rank_id = word_id / 4; + uint64_t rel_id = word_id % 4; + uint64_t n = ranks[rank_id].abs(); + if (rel_id != 0) { + n += ranks[rank_id].rels[rel_id - 1]; + } + n += __builtin_popcountll(words[word_id] & ((1UL << bit_id) - 1)); + return n; + } + + __device__ uint64_t rank(uint64_t i) const { + uint64_t word_id = i / 64; + uint64_t bit_id = i % 64; + uint64_t rank_id = word_id / 4; + uint64_t rel_id = word_id % 4; + uint64_t n = d_ranks_ptr[rank_id].abs(); + if (rel_id != 0) { + n += d_ranks_ptr[rank_id].rels[rel_id - 1]; + } + n += __popcll(d_words_ptr[word_id] & ((1UL << bit_id) - 1)); + return n; + } + + // select returns the position of the (i+1)-th 1-bit. + uint64_t host_select(uint64_t i) const { + const uint64_t block_id = i / 256; + uint64_t begin = selects[block_id]; + uint64_t end = selects[block_id + 1] + 1UL; + if (begin + 10 >= end) { + while (i >= ranks[begin + 1].abs()) { + ++begin; + } + } else { + while (begin + 1 < end) { + const uint64_t middle = (begin + end) / 2; + if (i < ranks[middle].abs()) { + end = middle; + } else { + begin = middle; + } + } + } + const uint64_t rank_id = begin; + i -= ranks[rank_id].abs(); + + uint64_t word_id = rank_id * 4; + if (i < ranks[rank_id].rels[1]) { + if (i >= ranks[rank_id].rels[0]) { + word_id += 1; + i -= ranks[rank_id].rels[0]; + } + } else if (i < ranks[rank_id].rels[2]) { + word_id += 2; + i -= ranks[rank_id].rels[1]; + } else { + word_id += 3; + i -= ranks[rank_id].rels[2]; + } + return (word_id * 64) + ith_set_pos(i, words[word_id]); + } + + // select returns the position of the (i+1)-th 1-bit. 
+ __device__ uint64_t select(uint64_t i) const { + const uint64_t block_id = i / 256; + uint64_t begin = d_selects_ptr[block_id]; + uint64_t end = d_selects_ptr[block_id + 1] + 1UL; + if (begin + 10 >= end) { + while (i >= d_ranks_ptr[begin + 1].abs()) { + ++begin; + } + } else { + while (begin + 1 < end) { + const uint64_t middle = (begin + end) / 2; + if (i < d_ranks_ptr[middle].abs()) { + end = middle; + } else { + begin = middle; + } + } + } + const uint64_t rank_id = begin; + const auto& rank = d_ranks_ptr[rank_id]; + i -= rank.abs(); + + uint64_t word_id = rank_id * 4; + bool a0 = i >= rank.rels[0]; + bool a1 = i >= rank.rels[1]; + bool a2 = i >= rank.rels[2]; + + uint32_t inc = a0 + a1 + a2; + word_id += inc; + i -= (inc > 0) * rank.rels[inc - (inc > 0)]; + + return (word_id * 64) + ith_set_pos(i, d_words_ptr[word_id]); + } + + // select returns the position of the (i+1)-th 0-bit. + __device__ uint64_t select0(uint64_t i) const { + const uint64_t block_id = i / 256; + uint64_t begin = d_selects0_ptr[block_id]; + uint64_t end = d_selects0_ptr[block_id + 1] + 1UL; + if (begin + 10 >= end) { + while (i >= d_ranks0_ptr[begin + 1].abs()) { + ++begin; + } + } else { + while (begin + 1 < end) { + const uint64_t middle = (begin + end) / 2; + if (i < d_ranks0_ptr[middle].abs()) { + end = middle; + } else { + begin = middle; + } + } + } + const uint64_t rank_id = begin; + const auto& rank = d_ranks0_ptr[rank_id]; + i -= rank.abs(); + + uint64_t word_id = rank_id * 4; + bool a0 = i >= rank.rels[0]; + bool a1 = i >= rank.rels[1]; + bool a2 = i >= rank.rels[2]; + + uint32_t inc = a0 + a1 + a2; + word_id += inc; + i -= (inc > 0) * rank.rels[inc - (inc > 0)]; + + return (word_id * 64) + ith_set_pos(i, ~d_words_ptr[word_id]); + } + + __device__ uint64_t find_next_set(uint64_t i) const { + uint64_t word_id = i / 64; + uint64_t bit_id = i % 64; + uint64_t word = d_words_ptr[word_id]; + word &= ~(0lu) << bit_id; + while (word == 0) { + word = d_words_ptr[++word_id]; + } + 
return (word_id * 64) + __builtin_ffsll(word) - 1; + } + + size_t size() const { + return n_bits; + } + + size_t memory_consumption() const { + return sizeof(uint64_t) * words.size() + sizeof(Rank) * (ranks.size() + ranks0.size()) + + sizeof(uint32_t) * (selects.size() + selects0.size()); + } +}; + +} // namespace experimental +} // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ebc37e39b..45b6111b7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -95,3 +95,8 @@ ConfigureTest(STATIC_MULTIMAP_TEST static_multimap/multiplicity_test.cu static_multimap/non_match_test.cu static_multimap/pair_function_test.cu) + +################################################################################################### +# - bit_vector tests ------------------------------------------------------------------------------ +ConfigureTest(BIT_VECTOR_TEST + bit_vector/size_test.cu) diff --git a/tests/bit_vector/size_test.cu b/tests/bit_vector/size_test.cu new file mode 100644 index 000000000..281f5a067 --- /dev/null +++ b/tests/bit_vector/size_test.cu @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include + +TEST_CASE("Size computation", "") +{ + constexpr std::size_t num_elements{400}; + + cuco::experimental::bit_vector bv; + + thrust::host_vector d_keys(num_elements); + for (size_t i = 0; i < num_elements; i++) { + bv.add(i % 7 == 0); + } + bv.build(); + + auto const size = bv.size(); + REQUIRE(size == num_elements); + + std::size_t num_set = 0; + for (size_t i = 0; i < num_elements; i++) { + num_set += bv.host_get(i); + } + + REQUIRE(num_set == (num_elements / 7 + 1)); +} From 8e31b2db04338ab8615e077ea9f01ef37d9160ed Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Thu, 11 May 2023 23:01:46 +0000 Subject: [PATCH 02/99] Add bitvector get test --- include/cuco/bit_vector.cuh | 179 ++++++++++++---------------------- tests/CMakeLists.txt | 1 + tests/bit_vector/get_test.cu | 58 +++++++++++ tests/bit_vector/size_test.cu | 14 +-- 4 files changed, 125 insertions(+), 127 deletions(-) create mode 100644 tests/bit_vector/get_test.cu diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index 734a34a12..7237f9856 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -22,63 +22,10 @@ namespace cuco { namespace experimental { -__host__ __device__ uint64_t ith_set_pos(uint32_t i, uint64_t val) { - for (uint32_t pos = 0; pos < i; pos++) { - val &= val - 1; - } - return __builtin_ffsll(val & -val) - 1; -} - -template -T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& device_vector) { - device_vector = host_vector; - host_vector.clear(); - return thrust::raw_pointer_cast(device_vector.data()); -} - -inline uint64_t Popcnt(uint64_t x) { return __builtin_popcountll(x); } -inline uint64_t Ctz(uint64_t x) { return __builtin_ctzll(x); } - -struct bit_vector { - struct Rank { - uint32_t abs_hi; - uint8_t abs_lo; - uint8_t rels[3]; - - __host__ __device__ uint64_t abs() const { return ((uint64_t)abs_hi << 8) | abs_lo; } - void set_abs(uint64_t abs) { - abs_hi = (uint32_t)(abs >> 
8); - abs_lo = (uint8_t)abs; - } - }; - - std::vector words; - std::vector ranks, ranks0; - std::vector selects, selects0; - - thrust::device_vector d_words; - thrust::device_vector d_ranks, d_ranks0; - thrust::device_vector d_selects, d_selects0; - - uint64_t* d_words_ptr; - Rank *d_ranks_ptr, *d_ranks0_ptr; - uint32_t *d_selects_ptr, *d_selects0_ptr; - uint32_t num_selects, num_selects0; - - uint64_t n_bits; - +class bit_vector { + public: bit_vector() : words(), ranks(), selects(), n_bits(0) {} - uint64_t host_get(uint64_t i) const { return (words[i / 64] >> (i % 64)) & 1UL; } - __device__ uint64_t get(uint64_t i) const { return (d_words_ptr[i / 64] >> (i % 64)) & 1UL; } - void set(uint64_t i, uint64_t bit) { - if (bit) { - words[i / 64] |= (1UL << (i % 64)); - } else { - words[i / 64] &= ~(1UL << (i % 64)); - } - } - void add(uint64_t bit) { if (n_bits % 256 == 0) { words.resize((n_bits + 256) / 64); @@ -152,29 +99,13 @@ struct bit_vector { move_to_device(); } - void move_to_device() { - d_words_ptr = move_vector_to_device(words, d_words); - d_ranks_ptr = move_vector_to_device(ranks, d_ranks); - d_ranks0_ptr = move_vector_to_device(ranks, d_ranks); - - num_selects = selects.size(); - d_selects_ptr = move_vector_to_device(selects, d_selects); - num_selects0 = selects0.size(); - d_selects0_ptr = move_vector_to_device(selects0, d_selects0); - } - - // rank returns the number of 1-bits in the range [0, i). 
- uint64_t host_rank(uint64_t i) const { - uint64_t word_id = i / 64; - uint64_t bit_id = i % 64; - uint64_t rank_id = word_id / 4; - uint64_t rel_id = word_id % 4; - uint64_t n = ranks[rank_id].abs(); - if (rel_id != 0) { - n += ranks[rank_id].rels[rel_id - 1]; + __device__ uint64_t get(uint64_t i) const { return (d_words_ptr[i / 64] >> (i % 64)) & 1UL; } + void set(uint64_t i, uint64_t bit) { + if (bit) { + words[i / 64] |= (1UL << (i % 64)); + } else { + words[i / 64] &= ~(1UL << (i % 64)); } - n += __builtin_popcountll(words[word_id] & ((1UL << bit_id) - 1)); - return n; } __device__ uint64_t rank(uint64_t i) const { @@ -190,44 +121,6 @@ struct bit_vector { return n; } - // select returns the position of the (i+1)-th 1-bit. - uint64_t host_select(uint64_t i) const { - const uint64_t block_id = i / 256; - uint64_t begin = selects[block_id]; - uint64_t end = selects[block_id + 1] + 1UL; - if (begin + 10 >= end) { - while (i >= ranks[begin + 1].abs()) { - ++begin; - } - } else { - while (begin + 1 < end) { - const uint64_t middle = (begin + end) / 2; - if (i < ranks[middle].abs()) { - end = middle; - } else { - begin = middle; - } - } - } - const uint64_t rank_id = begin; - i -= ranks[rank_id].abs(); - - uint64_t word_id = rank_id * 4; - if (i < ranks[rank_id].rels[1]) { - if (i >= ranks[rank_id].rels[0]) { - word_id += 1; - i -= ranks[rank_id].rels[0]; - } - } else if (i < ranks[rank_id].rels[2]) { - word_id += 2; - i -= ranks[rank_id].rels[1]; - } else { - word_id += 3; - i -= ranks[rank_id].rels[2]; - } - return (word_id * 64) + ith_set_pos(i, words[word_id]); - } - // select returns the position of the (i+1)-th 1-bit. 
__device__ uint64_t select(uint64_t i) const { const uint64_t block_id = i / 256; @@ -317,6 +210,62 @@ struct bit_vector { return sizeof(uint64_t) * words.size() + sizeof(Rank) * (ranks.size() + ranks0.size()) + sizeof(uint32_t) * (selects.size() + selects0.size()); } + + private: + struct Rank { + uint32_t abs_hi; + uint8_t abs_lo; + uint8_t rels[3]; + + __host__ __device__ uint64_t abs() const { return ((uint64_t)abs_hi << 8) | abs_lo; } + void set_abs(uint64_t abs) { + abs_hi = (uint32_t)(abs >> 8); + abs_lo = (uint8_t)abs; + } + }; + + template + T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& device_vector) { + device_vector = host_vector; + host_vector.clear(); + return thrust::raw_pointer_cast(device_vector.data()); + } + + void move_to_device() { + d_words_ptr = move_vector_to_device(words, d_words); + d_ranks_ptr = move_vector_to_device(ranks, d_ranks); + d_ranks0_ptr = move_vector_to_device(ranks, d_ranks); + + num_selects = selects.size(); + d_selects_ptr = move_vector_to_device(selects, d_selects); + num_selects0 = selects0.size(); + d_selects0_ptr = move_vector_to_device(selects0, d_selects0); + } + + inline uint64_t Popcnt(uint64_t x) { return __builtin_popcountll(x); } + inline uint64_t Ctz(uint64_t x) { return __builtin_ctzll(x); } + + __device__ uint64_t ith_set_pos(uint32_t i, uint64_t val) const { + for (uint32_t pos = 0; pos < i; pos++) { + val &= val - 1; + } + return __builtin_ffsll(val & -val) - 1; + } + + std::vector words; + std::vector ranks, ranks0; + std::vector selects, selects0; + + thrust::device_vector d_words; + thrust::device_vector d_ranks, d_ranks0; + thrust::device_vector d_selects, d_selects0; + + uint64_t* d_words_ptr; + Rank *d_ranks_ptr, *d_ranks0_ptr; + uint32_t *d_selects_ptr, *d_selects0_ptr; + uint32_t num_selects, num_selects0; + + uint64_t n_bits; }; } // namespace experimental diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 45b6111b7..7f256790e 100644 --- a/tests/CMakeLists.txt 
+++ b/tests/CMakeLists.txt @@ -99,4 +99,5 @@ ConfigureTest(STATIC_MULTIMAP_TEST ################################################################################################### # - bit_vector tests ------------------------------------------------------------------------------ ConfigureTest(BIT_VECTOR_TEST + bit_vector/get_test.cu bit_vector/size_test.cu) diff --git a/tests/bit_vector/get_test.cu b/tests/bit_vector/get_test.cu new file mode 100644 index 000000000..e1f398216 --- /dev/null +++ b/tests/bit_vector/get_test.cu @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include + +#include + +__global__ void bitvector_get_kernel(cuco::experimental::bit_vector* bv, size_t n, uint32_t* output) { + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = gridDim.x * blockDim.x; + while (index < n) { + output[index] = bv->get(index); + index += stride; + } +} + +TEST_CASE("Get test", "") +{ + constexpr std::size_t num_elements{400}; + + cuco::experimental::bit_vector bv; + + for (size_t i = 0; i < num_elements; i++) { + bv.add(i % 7 == 0); // Alternate 0s and 1s pattern + } + bv.build(); + + cuco::experimental::bit_vector* bv_device_copy; + CUCO_CUDA_TRY(cudaMalloc(&bv_device_copy, sizeof(cuco::experimental::bit_vector))); + CUCO_CUDA_TRY(cudaMemcpy(bv_device_copy, &bv, sizeof(cuco::experimental::bit_vector), cudaMemcpyHostToDevice)); + + thrust::device_vector get_result(num_elements); + + bitvector_get_kernel<<<1, 1024>>>(bv_device_copy, num_elements, thrust::raw_pointer_cast(get_result.data())); + + CUCO_CUDA_TRY(cudaFree(bv_device_copy)); + + size_t num_set = thrust::reduce(thrust::device, get_result.begin(), get_result.end(), 0); + REQUIRE(num_set == num_elements / 7 + 1); +} diff --git a/tests/bit_vector/size_test.cu b/tests/bit_vector/size_test.cu index 281f5a067..7d645be16 100644 --- a/tests/bit_vector/size_test.cu +++ b/tests/bit_vector/size_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,8 +16,6 @@ #include -#include - #include TEST_CASE("Size computation", "") @@ -26,19 +24,11 @@ TEST_CASE("Size computation", "") cuco::experimental::bit_vector bv; - thrust::host_vector d_keys(num_elements); for (size_t i = 0; i < num_elements; i++) { - bv.add(i % 7 == 0); + bv.add(i % 2 == 0); // Alternate 0s and 1s pattern } bv.build(); auto const size = bv.size(); REQUIRE(size == num_elements); - - std::size_t num_set = 0; - for (size_t i = 0; i < num_elements; i++) { - num_set += bv.host_get(i); - } - - REQUIRE(num_set == (num_elements / 7 + 1)); } From c3e253ea3159a41ed97a4f165db089636c742408 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Thu, 11 May 2023 23:59:11 +0000 Subject: [PATCH 03/99] Add more bitvector tests --- include/cuco/bit_vector.cuh | 1 + tests/CMakeLists.txt | 2 + tests/bit_vector/get_test.cu | 8 +++- tests/bit_vector/rank_test.cu | 70 +++++++++++++++++++++++++++++++ tests/bit_vector/select_test.cu | 73 +++++++++++++++++++++++++++++++++ 5 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 tests/bit_vector/rank_test.cu create mode 100644 tests/bit_vector/select_test.cu diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index 7237f9856..41864e275 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -108,6 +108,7 @@ class bit_vector { } } + // rank returns the number of 1-bits in the range [0, i) __device__ uint64_t rank(uint64_t i) const { uint64_t word_id = i / 64; uint64_t bit_id = i % 64; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7f256790e..16d0a4260 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -100,4 +100,6 @@ ConfigureTest(STATIC_MULTIMAP_TEST # - bit_vector tests ------------------------------------------------------------------------------ ConfigureTest(BIT_VECTOR_TEST bit_vector/get_test.cu + bit_vector/rank_test.cu + bit_vector/select_test.cu bit_vector/size_test.cu) diff --git a/tests/bit_vector/get_test.cu 
b/tests/bit_vector/get_test.cu index e1f398216..90c8810d3 100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/bit_vector/get_test.cu @@ -32,14 +32,18 @@ __global__ void bitvector_get_kernel(cuco::experimental::bit_vector* bv, size_t } } +bool modulo_bitgen(uint32_t i) { return i % 7 == 0; } + TEST_CASE("Get test", "") { constexpr std::size_t num_elements{400}; cuco::experimental::bit_vector bv; + uint32_t num_set_ref = 0; for (size_t i = 0; i < num_elements; i++) { - bv.add(i % 7 == 0); // Alternate 0s and 1s pattern + bv.add(modulo_bitgen(i)); + num_set_ref += modulo_bitgen(i); } bv.build(); @@ -54,5 +58,5 @@ TEST_CASE("Get test", "") CUCO_CUDA_TRY(cudaFree(bv_device_copy)); size_t num_set = thrust::reduce(thrust::device, get_result.begin(), get_result.end(), 0); - REQUIRE(num_set == num_elements / 7 + 1); + REQUIRE(num_set == num_set_ref); } diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu new file mode 100644 index 000000000..79feaadda --- /dev/null +++ b/tests/bit_vector/rank_test.cu @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include + +#include + +__global__ void bitvector_rank_kernel(cuco::experimental::bit_vector* bv, size_t n, uint32_t* output) { + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = gridDim.x * blockDim.x; + while (index < n) { + output[index] = bv->rank(index); + index += stride; + } +} + +extern bool modulo_bitgen(uint32_t i); + +TEST_CASE("Rank test", "") +{ + constexpr std::size_t num_elements{400}; + + cuco::experimental::bit_vector bv; + + for (size_t i = 0; i < num_elements; i++) { + bv.add(modulo_bitgen(i)); + } + bv.build(); + + thrust::device_vector rank_result_device(num_elements); + + cuco::experimental::bit_vector* bv_device_copy; + CUCO_CUDA_TRY(cudaMalloc(&bv_device_copy, sizeof(cuco::experimental::bit_vector))); + CUCO_CUDA_TRY(cudaMemcpy(bv_device_copy, &bv, sizeof(cuco::experimental::bit_vector), cudaMemcpyHostToDevice)); + + bitvector_rank_kernel<<<1, 1024>>>(bv_device_copy, num_elements, thrust::raw_pointer_cast(rank_result_device.data())); + + CUCO_CUDA_TRY(cudaFree(bv_device_copy)); + + thrust::host_vector rank_result = rank_result_device; + + uint32_t cur_rank = 0; + uint32_t num_matches = 0; + for (size_t i = 0; i < num_elements; i++) { + num_matches += cur_rank == rank_result[i]; + if (modulo_bitgen(i)) { + cur_rank++; + } + } + REQUIRE(num_matches == num_elements); +} diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu new file mode 100644 index 000000000..05d01bd1b --- /dev/null +++ b/tests/bit_vector/select_test.cu @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include + +#include + +__global__ void bitvector_select_kernel(cuco::experimental::bit_vector* bv, size_t n, uint32_t* output) { + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = gridDim.x * blockDim.x; + while (index < n) { + output[index] = bv->select(index); + index += stride; + } +} + +extern bool modulo_bitgen(uint32_t i); + +TEST_CASE("Select test", "") +{ + constexpr std::size_t num_elements{400}; + + cuco::experimental::bit_vector bv; + + uint32_t num_set = 0; + for (size_t i = 0; i < num_elements; i++) { + bv.add(modulo_bitgen(i)); + num_set += modulo_bitgen(i); + } + bv.build(); + + thrust::device_vector select_result_device(num_set); + + cuco::experimental::bit_vector* bv_device_copy; + CUCO_CUDA_TRY(cudaMalloc(&bv_device_copy, sizeof(cuco::experimental::bit_vector))); + CUCO_CUDA_TRY(cudaMemcpy(bv_device_copy, &bv, sizeof(cuco::experimental::bit_vector), cudaMemcpyHostToDevice)); + + bitvector_select_kernel<<<1, 1024>>>(bv_device_copy, num_set, thrust::raw_pointer_cast(select_result_device.data())); + + CUCO_CUDA_TRY(cudaFree(bv_device_copy)); + + thrust::host_vector select_result = select_result_device; + + uint32_t num_matches = 0; + uint32_t cur_set_pos = -1u; + for (size_t i = 0; i < num_set; i++) { + do { + cur_set_pos++; + } while (cur_set_pos < num_elements and !modulo_bitgen(cur_set_pos)); + + num_matches += cur_set_pos == select_result[i]; + } + REQUIRE(num_matches == num_set); +} From 0d56be409c06ed20624a4512852ac481281cef73 Mon Sep 17 00:00:00 2001 
From: Anurag Mukkara Date: Fri, 12 May 2023 22:00:44 +0000 Subject: [PATCH 04/99] Add trie tests --- include/cuco/bit_vector.cuh | 23 ++-- include/cuco/trie.cuh | 235 ++++++++++++++++++++++++++++++++++++ tests/CMakeLists.txt | 5 + tests/trie/lookup_test.cu | 72 +++++++++++ 4 files changed, 325 insertions(+), 10 deletions(-) create mode 100644 include/cuco/trie.cuh create mode 100644 tests/trie/lookup_test.cu diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index 41864e275..2def30eeb 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -22,6 +22,13 @@ namespace cuco { namespace experimental { +template +T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& device_vector) { + device_vector = host_vector; + host_vector.clear(); + return thrust::raw_pointer_cast(device_vector.data()); +} + class bit_vector { public: bit_vector() : words(), ranks(), selects(), n_bits(0) {} @@ -107,6 +114,9 @@ class bit_vector { words[i / 64] &= ~(1UL << (i % 64)); } } + void set_last(uint64_t bit) { + set(n_bits - 1, bit); + } // rank returns the number of 1-bits in the range [0, i) __device__ uint64_t rank(uint64_t i) const { @@ -207,13 +217,12 @@ class bit_vector { return n_bits; } - size_t memory_consumption() const { + size_t memory_footprint() const { return sizeof(uint64_t) * words.size() + sizeof(Rank) * (ranks.size() + ranks0.size()) + sizeof(uint32_t) * (selects.size() + selects0.size()); } - private: - struct Rank { + struct Rank { uint32_t abs_hi; uint8_t abs_lo; uint8_t rels[3]; @@ -225,13 +234,7 @@ class bit_vector { } }; - template - T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& device_vector) { - device_vector = host_vector; - host_vector.clear(); - return thrust::raw_pointer_cast(device_vector.data()); - } - + private: void move_to_device() { d_words_ptr = move_vector_to_device(words, d_words); d_ranks_ptr = move_vector_to_device(ranks, d_ranks); diff --git 
a/include/cuco/trie.cuh b/include/cuco/trie.cuh new file mode 100644 index 000000000..6dc465227 --- /dev/null +++ b/include/cuco/trie.cuh @@ -0,0 +1,235 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include "bit_vector.cuh" + +namespace cuco { +namespace experimental { + +template +class trie { + public: + trie(); + ~trie() noexcept(false); + void add(const std::vector& key); + void build(); + + void lookup(const T* queries, const uint64_t* offsets, uint64_t* ids, uint64_t num_queries, + uint64_t start_offset, cudaStream_t stream) const; + + uint64_t n_keys() const { return n_keys_; } + uint64_t memory_footprint() const { return footprint_; } + +struct Level { + bit_vector louds; + bit_vector outs; + std::vector labels; + thrust::device_vector d_labels; + T* d_labels_ptr; + uint64_t offset; + + Level() : louds(), outs(), labels(), offset(0) {} + + uint64_t memory_footprint() const { return louds.size() + outs.size() + sizeof(T) * labels.size(); } +}; + + std::vector levels_; + Level* d_levels_ptr_; + uint64_t num_levels_; + + uint64_t n_keys_; + uint64_t n_nodes_; + uint64_t footprint_; + std::vector last_key_; + + trie* device_impl_; +}; + +template +trie::trie() + : levels_(2), + d_levels_ptr_(nullptr), + num_levels_(2), + n_keys_(0), + 
n_nodes_(1), + footprint_(0), + last_key_(), + device_impl_(nullptr) { + levels_[0].louds.add(0); + levels_[0].louds.add(1); + levels_[1].louds.add(1); + levels_[0].outs.add(0); + levels_[0].labels.push_back(sizeof(T) == 1 ? ' ' : (T)-1); +} + +template +trie::~trie() noexcept(false) { + if (d_levels_ptr_) { + CUCO_CUDA_TRY(cudaFree(d_levels_ptr_)); + } + if (device_impl_) { + CUCO_CUDA_TRY(cudaFree(device_impl_)); + } +} + +template +void trie::add(const std::vector& key) { + if (key == last_key_) { + return; + } + assert(n_keys_ == 0 || key > last_key_); + if (key.empty()) { + levels_[0].outs.set(0, 1); + ++levels_[1].offset; + ++n_keys_; + return; + } + if (key.size() + 1 >= levels_.size()) { + levels_.resize(key.size() + 2); + } + uint64_t i = 0; + for (; i < key.size(); ++i) { + auto& level = levels_[i + 1]; + T byte = key[i]; + if ((i == last_key_.size()) || (byte != level.labels.back())) { + level.louds.set_last(0); + level.louds.add(1); + level.outs.add(0); + level.labels.push_back(key[i]); + ++n_nodes_; + break; + } + } + for (++i; i < key.size(); ++i) { + auto& level = levels_[i + 1]; + level.louds.add(0); + level.louds.add(1); + level.outs.add(0); + level.labels.push_back(key[i]); + ++n_nodes_; + } + levels_[key.size() + 1].louds.add(1); + ++levels_[key.size() + 1].offset; + levels_[key.size()].outs.set_last(1); + ++n_keys_; + last_key_ = key; +} + +template +void trie::build() { + uint64_t offset = 0; + for (uint64_t i = 0; i < levels_.size(); ++i) { + auto& level = levels_[i]; + level.louds.build(); + level.outs.build(); + offset += level.offset; + level.offset = offset; + footprint_ += level.memory_footprint(); + level.d_labels_ptr = move_vector_to_device(level.labels, level.d_labels); + } + + num_levels_ = levels_.size(); + CUCO_CUDA_TRY(cudaMalloc(&d_levels_ptr_, sizeof(Level) * num_levels_)); + CUCO_CUDA_TRY(cudaMemcpy(d_levels_ptr_, &levels_[0], sizeof(Level) * num_levels_, + cudaMemcpyHostToDevice)); + + CUCO_CUDA_TRY(cudaMalloc(&device_impl_, 
sizeof(trie))); + CUCO_CUDA_TRY(cudaMemcpy(device_impl_, this, sizeof(trie), cudaMemcpyHostToDevice)); +} + +template +__global__ __launch_bounds__(256, 1) void trie_lookup_kernel(const trie* t, const T* keys, + const uint64_t* offsets, uint64_t* ids, + uint64_t num_queries, + uint64_t start_offset) { + auto const key_id = blockDim.x * blockIdx.x + threadIdx.x; + if (key_id >= num_queries) { + return; + } + + const int length = offsets[key_id + 1] - offsets[key_id]; + const T* query = keys + (offsets[key_id] - start_offset); + + uint32_t node_id = 0; + for (uint32_t cur_depth = 1; cur_depth <= length; cur_depth++) { + if (!binary_search_labels_array(t, query[cur_depth - 1], node_id, cur_depth)) { + ids[key_id] = -1lu; + return; + } + } + + const auto& level = t->d_levels_ptr_[length]; + if (!level.outs.get(node_id)) { + ids[key_id] = -1lu; + return; + } + ids[key_id] = level.offset + level.outs.rank(node_id); +} + +template +void trie::lookup(const T* queries, const uint64_t* offsets, uint64_t* ids, + uint64_t num_queries, uint64_t start_offset, + cudaStream_t stream) const { + int block_size = 256; + int num_blocks = (num_queries - 1) / block_size + 1; + + trie_lookup_kernel<<>>(device_impl_, queries, offsets, ids, + num_queries, start_offset); +} + +template +__device__ uint32_t init_node_pos(const trie* t, uint32_t& node_id, uint32_t cur_depth) { + uint32_t node_pos = 0; + if (node_id != 0) { + node_pos = t->d_levels_ptr_[cur_depth].louds.select(node_id - 1) + 1; + node_id = node_pos - node_id; + } + return node_pos; +} + +template +__device__ bool binary_search_labels_array(const trie* t, T target, uint32_t& node_id, uint32_t level_id) { + const auto& level = t->d_levels_ptr_[level_id]; + + uint32_t node_pos = init_node_pos(t, node_id, level_id); + uint32_t begin = node_id; + uint32_t pos_end = level.louds.find_next_set(node_pos); + uint32_t end = node_id + (pos_end - node_pos); + + while (begin < end) { + node_id = (begin + end) / 2; + auto label = 
level.d_labels_ptr[node_id]; + if (target < label) { + end = node_id; + } else if (target > label) { + begin = node_id + 1; + } else { + break; + } + } + return begin < end; +} + +} // namespace experimental +} // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 16d0a4260..fddd6b247 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -103,3 +103,8 @@ ConfigureTest(BIT_VECTOR_TEST bit_vector/rank_test.cu bit_vector/select_test.cu bit_vector/size_test.cu) + +################################################################################################### +# - trie tests ------------------------------------------------------------------------------ +ConfigureTest(TRIE_TEST + trie/lookup_test.cu) diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu new file mode 100644 index 000000000..0cdf255d2 --- /dev/null +++ b/tests/trie/lookup_test.cu @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include + +#include +#include +#include +#include + +#include + +struct valid_key { + __host__ __device__ bool operator()(uint64_t x) const { + return x != -1lu; + } +}; + +TEST_CASE("Lookup test", "") +{ + + using KeyType = int; + cuco::experimental::trie trie; + + std::size_t num_keys = 3; + thrust::host_vector flatten_keys = std::vector{1, 2, 3, 1, 2, 4, 1, 4, 2}; + thrust::host_vector key_offsets = std::vector{0, 3, 6, 9}; + + for (size_t key_id = 0; key_id < num_keys; key_id++) { + std::vector cur_key; + for (size_t pos = key_offsets[key_id]; pos < key_offsets[key_id + 1]; pos++) { + cur_key.push_back(flatten_keys[pos]); + } + trie.add(cur_key); + } + + trie.build(); + + thrust::device_vector lookup_result(num_keys, -1lu); + { + thrust::device_vector device_keys = flatten_keys; + thrust::device_vector device_offsets = key_offsets; + + trie.lookup(thrust::raw_pointer_cast(device_keys.data()), + thrust::raw_pointer_cast(device_offsets.data()), + thrust::raw_pointer_cast(lookup_result.data()), 3, 0, 0); + + thrust::host_vector host_lookup_result = lookup_result; + for (size_t key_id = 0; key_id < num_keys; key_id++) { + REQUIRE(host_lookup_result[key_id] == key_id); + } + } + + thrust::transform(thrust::device, lookup_result.begin(), lookup_result.end(), lookup_result.begin(), valid_key()); + size_t num_matches = thrust::reduce(thrust::device, lookup_result.begin(), lookup_result.end(), 0); + REQUIRE(num_matches == num_keys); +} From 528778a3bbfd8fbd684e22767c189335a2fc2574 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 16 May 2023 21:38:38 +0000 Subject: [PATCH 05/99] Move implementations to .inl files --- include/cuco/bit_vector.cuh | 240 +++--------------- include/cuco/detail/bit_vector/bit_vector.inl | 232 +++++++++++++++++ include/cuco/detail/trie/trie.inl | 189 ++++++++++++++ include/cuco/trie.cuh | 196 ++------------ tests/CMakeLists.txt | 5 +- tests/bit_vector/bit_vector_test.cu | 4 + 6 files changed, 475 insertions(+), 391 
deletions(-) create mode 100644 include/cuco/detail/bit_vector/bit_vector.inl create mode 100644 include/cuco/detail/trie/trie.inl create mode 100644 tests/bit_vector/bit_vector_test.cu diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index 2def30eeb..1c6a5910e 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -22,205 +22,35 @@ namespace cuco { namespace experimental { -template -T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& device_vector) { - device_vector = host_vector; - host_vector.clear(); - return thrust::raw_pointer_cast(device_vector.data()); -} - class bit_vector { public: - bit_vector() : words(), ranks(), selects(), n_bits(0) {} - - void add(uint64_t bit) { - if (n_bits % 256 == 0) { - words.resize((n_bits + 256) / 64); - } - set(n_bits, bit); - ++n_bits; - } + bit_vector(); - // builds indexes for rank and select. - void build() { - uint64_t n_blocks = words.size() / 4; - uint64_t n_ones = 0, n_zeroes = 0; - ranks.resize(n_blocks + 1); - ranks0.resize(n_blocks + 1); - for (uint64_t block_id = 0; block_id < n_blocks; ++block_id) { - ranks[block_id].set_abs(n_ones); - ranks0[block_id].set_abs(n_zeroes); - for (uint64_t j = 0; j < 4; ++j) { - if (j != 0) { - uint64_t rel1 = n_ones - ranks[block_id].abs(); - ranks[block_id].rels[j - 1] = rel1; + using Key = uint64_t; - uint64_t rel0 = n_zeroes - ranks0[block_id].abs(); - ranks0[block_id].rels[j - 1] = rel0; - } + void add(Key bit); - uint64_t word_id = (block_id * 4) + j; - { - uint64_t word = words[word_id]; - uint64_t n_pops = Popcnt(word); - uint64_t new_n_ones = n_ones + n_pops; - if (((n_ones + 255) / 256) != ((new_n_ones + 255) / 256)) { - uint64_t count = n_ones; - while (word != 0) { - uint64_t pos = Ctz(word); - if (count % 256 == 0) { - selects.push_back(((word_id * 64) + pos) / 256); - break; - } - word ^= 1UL << pos; - ++count; - } - } - n_ones = new_n_ones; - } - { - uint64_t word = ~words[word_id]; - uint64_t n_pops 
= Popcnt(word); - uint64_t new_n_zeroes = n_zeroes + n_pops; - if (((n_zeroes + 255) / 256) != ((new_n_zeroes + 255) / 256)) { - uint64_t count = n_zeroes; - while (word != 0) { - uint64_t pos = Ctz(word); - if (count % 256 == 0) { - selects0.push_back(((word_id * 64) + pos) / 256); - break; - } - word ^= 1UL << pos; - ++count; - } - } - n_zeroes = new_n_zeroes; - } - } - } - ranks.back().set_abs(n_ones); - ranks0.back().set_abs(n_zeroes); - selects.push_back(words.size() * 64 / 256); - selects0.push_back(words.size() * 64 / 256); - - move_to_device(); - } - - __device__ uint64_t get(uint64_t i) const { return (d_words_ptr[i / 64] >> (i % 64)) & 1UL; } - void set(uint64_t i, uint64_t bit) { - if (bit) { - words[i / 64] |= (1UL << (i % 64)); - } else { - words[i / 64] &= ~(1UL << (i % 64)); - } - } - void set_last(uint64_t bit) { - set(n_bits - 1, bit); - } - - // rank returns the number of 1-bits in the range [0, i) - __device__ uint64_t rank(uint64_t i) const { - uint64_t word_id = i / 64; - uint64_t bit_id = i % 64; - uint64_t rank_id = word_id / 4; - uint64_t rel_id = word_id % 4; - uint64_t n = d_ranks_ptr[rank_id].abs(); - if (rel_id != 0) { - n += d_ranks_ptr[rank_id].rels[rel_id - 1]; - } - n += __popcll(d_words_ptr[word_id] & ((1UL << bit_id) - 1)); - return n; - } - - // select returns the position of the (i+1)-th 1-bit. 
- __device__ uint64_t select(uint64_t i) const { - const uint64_t block_id = i / 256; - uint64_t begin = d_selects_ptr[block_id]; - uint64_t end = d_selects_ptr[block_id + 1] + 1UL; - if (begin + 10 >= end) { - while (i >= d_ranks_ptr[begin + 1].abs()) { - ++begin; - } - } else { - while (begin + 1 < end) { - const uint64_t middle = (begin + end) / 2; - if (i < d_ranks_ptr[middle].abs()) { - end = middle; - } else { - begin = middle; - } - } - } - const uint64_t rank_id = begin; - const auto& rank = d_ranks_ptr[rank_id]; - i -= rank.abs(); - - uint64_t word_id = rank_id * 4; - bool a0 = i >= rank.rels[0]; - bool a1 = i >= rank.rels[1]; - bool a2 = i >= rank.rels[2]; - - uint32_t inc = a0 + a1 + a2; - word_id += inc; - i -= (inc > 0) * rank.rels[inc - (inc > 0)]; - - return (word_id * 64) + ith_set_pos(i, d_words_ptr[word_id]); - } + // builds indexes for rank and select. + void build(); - // select returns the position of the (i+1)-th 0-bit. - __device__ uint64_t select0(uint64_t i) const { - const uint64_t block_id = i / 256; - uint64_t begin = d_selects0_ptr[block_id]; - uint64_t end = d_selects0_ptr[block_id + 1] + 1UL; - if (begin + 10 >= end) { - while (i >= d_ranks0_ptr[begin + 1].abs()) { - ++begin; - } - } else { - while (begin + 1 < end) { - const uint64_t middle = (begin + end) / 2; - if (i < d_ranks0_ptr[middle].abs()) { - end = middle; - } else { - begin = middle; - } - } - } - const uint64_t rank_id = begin; - const auto& rank = d_ranks0_ptr[rank_id]; - i -= rank.abs(); + __device__ uint64_t get(Key i) const; + void set(Key i, bool bit); + void set_last(bool bit); - uint64_t word_id = rank_id * 4; - bool a0 = i >= rank.rels[0]; - bool a1 = i >= rank.rels[1]; - bool a2 = i >= rank.rels[2]; + // returns the number of 1-bits in the range [0, i) + __device__ uint64_t rank(Key i) const; - uint32_t inc = a0 + a1 + a2; - word_id += inc; - i -= (inc > 0) * rank.rels[inc - (inc > 0)]; + // returns the position of the (i+1)-th 1-bit. 
+ __device__ uint64_t select(Key i) const; - return (word_id * 64) + ith_set_pos(i, ~d_words_ptr[word_id]); - } + // returns the position of the (i+1)-th 0-bit. + __device__ uint64_t select0(Key i) const; - __device__ uint64_t find_next_set(uint64_t i) const { - uint64_t word_id = i / 64; - uint64_t bit_id = i % 64; - uint64_t word = d_words_ptr[word_id]; - word &= ~(0lu) << bit_id; - while (word == 0) { - word = d_words_ptr[++word_id]; - } - return (word_id * 64) + __builtin_ffsll(word) - 1; - } + __device__ uint64_t find_next_set(Key i) const; - size_t size() const { - return n_bits; - } + size_t size() const; - size_t memory_footprint() const { - return sizeof(uint64_t) * words.size() + sizeof(Rank) * (ranks.size() + ranks0.size()) + - sizeof(uint32_t) * (selects.size() + selects0.size()); - } + size_t memory_footprint() const; struct Rank { uint32_t abs_hi; @@ -234,28 +64,7 @@ class bit_vector { } }; - private: - void move_to_device() { - d_words_ptr = move_vector_to_device(words, d_words); - d_ranks_ptr = move_vector_to_device(ranks, d_ranks); - d_ranks0_ptr = move_vector_to_device(ranks, d_ranks); - - num_selects = selects.size(); - d_selects_ptr = move_vector_to_device(selects, d_selects); - num_selects0 = selects0.size(); - d_selects0_ptr = move_vector_to_device(selects0, d_selects0); - } - - inline uint64_t Popcnt(uint64_t x) { return __builtin_popcountll(x); } - inline uint64_t Ctz(uint64_t x) { return __builtin_ctzll(x); } - - __device__ uint64_t ith_set_pos(uint32_t i, uint64_t val) const { - for (uint32_t pos = 0; pos < i; pos++) { - val &= val - 1; - } - return __builtin_ffsll(val & -val) - 1; - } - + private: std::vector words; std::vector ranks, ranks0; std::vector selects, selects0; @@ -270,7 +79,20 @@ class bit_vector { uint32_t num_selects, num_selects0; uint64_t n_bits; + + void move_to_device(); + + uint64_t Popcnt(uint64_t x) { return __builtin_popcountll(x); } + uint64_t Ctz(uint64_t x) { return __builtin_ctzll(x); } + __device__ uint64_t 
ith_set_pos(uint32_t i, uint64_t word) const { + for (uint32_t pos = 0; pos < i; pos++) { + word &= word - 1; + } + return __builtin_ffsll(word & -word) - 1; + } }; } // namespace experimental } // namespace cuco + +#include diff --git a/include/cuco/detail/bit_vector/bit_vector.inl b/include/cuco/detail/bit_vector/bit_vector.inl new file mode 100644 index 000000000..91f32602b --- /dev/null +++ b/include/cuco/detail/bit_vector/bit_vector.inl @@ -0,0 +1,232 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +namespace cuco { +namespace experimental { + +template +T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& device_vector) { + device_vector = host_vector; + host_vector.clear(); + return thrust::raw_pointer_cast(device_vector.data()); +} + +bit_vector::bit_vector() : words(), ranks(), selects(), n_bits(0) {} + +void bit_vector::add(Key bit) { + if (n_bits % 256 == 0) { + words.resize((n_bits + 256) / 64); + } + set(n_bits, bit); + ++n_bits; +} + +void bit_vector::build() { + uint64_t n_blocks = words.size() / 4; + uint64_t n_ones = 0, n_zeroes = 0; + ranks.resize(n_blocks + 1); + ranks0.resize(n_blocks + 1); + for (uint64_t block_id = 0; block_id < n_blocks; ++block_id) { + ranks[block_id].set_abs(n_ones); + ranks0[block_id].set_abs(n_zeroes); + for (uint64_t j = 0; j < 4; ++j) { + if (j != 0) { + uint64_t rel1 = n_ones - ranks[block_id].abs(); + ranks[block_id].rels[j - 1] = rel1; + + uint64_t rel0 = n_zeroes - ranks0[block_id].abs(); + ranks0[block_id].rels[j - 1] = rel0; + } + + uint64_t word_id = (block_id * 4) + j; + { + uint64_t word = words[word_id]; + uint64_t n_pops = Popcnt(word); + uint64_t new_n_ones = n_ones + n_pops; + if (((n_ones + 255) / 256) != ((new_n_ones + 255) / 256)) { + uint64_t count = n_ones; + while (word != 0) { + uint64_t pos = Ctz(word); + if (count % 256 == 0) { + selects.push_back(((word_id * 64) + pos) / 256); + break; + } + word ^= 1UL << pos; + ++count; + } + } + n_ones = new_n_ones; + } + { + uint64_t word = ~words[word_id]; + uint64_t n_pops = Popcnt(word); + uint64_t new_n_zeroes = n_zeroes + n_pops; + if (((n_zeroes + 255) / 256) != ((new_n_zeroes + 255) / 256)) { + uint64_t count = n_zeroes; + while (word != 0) { + uint64_t pos = Ctz(word); + if (count % 256 == 0) { + selects0.push_back(((word_id * 64) + pos) / 256); + break; + } + word ^= 1UL << pos; + ++count; + } + } + n_zeroes = new_n_zeroes; + } + } + } + ranks.back().set_abs(n_ones); + ranks0.back().set_abs(n_zeroes); + 
selects.push_back(words.size() * 64 / 256); + selects0.push_back(words.size() * 64 / 256); + + move_to_device(); +} + +__device__ uint64_t bit_vector::get(Key i) const { return (d_words_ptr[i / 64] >> (i % 64)) & 1UL; } + +void bit_vector::set(Key i, bool bit) { + if (bit) { + words[i / 64] |= (1UL << (i % 64)); + } else { + words[i / 64] &= ~(1UL << (i % 64)); + } +} + +void bit_vector::set_last(bool bit) { + set(n_bits - 1, bit); +} + +__device__ uint64_t bit_vector::rank(Key i) const { + uint64_t word_id = i / 64; + uint64_t bit_id = i % 64; + uint64_t rank_id = word_id / 4; + uint64_t rel_id = word_id % 4; + uint64_t n = d_ranks_ptr[rank_id].abs(); + if (rel_id != 0) { + n += d_ranks_ptr[rank_id].rels[rel_id - 1]; + } + n += __popcll(d_words_ptr[word_id] & ((1UL << bit_id) - 1)); + return n; +} + +__device__ uint64_t bit_vector::select(Key i) const { + const uint64_t block_id = i / 256; + uint64_t begin = d_selects_ptr[block_id]; + uint64_t end = d_selects_ptr[block_id + 1] + 1UL; + if (begin + 10 >= end) { + while (i >= d_ranks_ptr[begin + 1].abs()) { + ++begin; + } + } else { + while (begin + 1 < end) { + const uint64_t middle = (begin + end) / 2; + if (i < d_ranks_ptr[middle].abs()) { + end = middle; + } else { + begin = middle; + } + } + } + const uint64_t rank_id = begin; + const auto& rank = d_ranks_ptr[rank_id]; + i -= rank.abs(); + + uint64_t word_id = rank_id * 4; + bool a0 = i >= rank.rels[0]; + bool a1 = i >= rank.rels[1]; + bool a2 = i >= rank.rels[2]; + + uint32_t inc = a0 + a1 + a2; + word_id += inc; + i -= (inc > 0) * rank.rels[inc - (inc > 0)]; + + return (word_id * 64) + ith_set_pos(i, d_words_ptr[word_id]); +} + +__device__ uint64_t bit_vector::select0(Key i) const { + const uint64_t block_id = i / 256; + uint64_t begin = d_selects0_ptr[block_id]; + uint64_t end = d_selects0_ptr[block_id + 1] + 1UL; + if (begin + 10 >= end) { + while (i >= d_ranks0_ptr[begin + 1].abs()) { + ++begin; + } + } else { + while (begin + 1 < end) { + const uint64_t 
middle = (begin + end) / 2; + if (i < d_ranks0_ptr[middle].abs()) { + end = middle; + } else { + begin = middle; + } + } + } + const uint64_t rank_id = begin; + const auto& rank = d_ranks0_ptr[rank_id]; + i -= rank.abs(); + + uint64_t word_id = rank_id * 4; + bool a0 = i >= rank.rels[0]; + bool a1 = i >= rank.rels[1]; + bool a2 = i >= rank.rels[2]; + + uint32_t inc = a0 + a1 + a2; + word_id += inc; + i -= (inc > 0) * rank.rels[inc - (inc > 0)]; + + return (word_id * 64) + ith_set_pos(i, ~d_words_ptr[word_id]); +} + +__device__ uint64_t bit_vector::find_next_set(Key i) const { + uint64_t word_id = i / 64; + uint64_t bit_id = i % 64; + uint64_t word = d_words_ptr[word_id]; + word &= ~(0lu) << bit_id; + while (word == 0) { + word = d_words_ptr[++word_id]; + } + return (word_id * 64) + __builtin_ffsll(word) - 1; +} + +size_t bit_vector::size() const { + return n_bits; +} + +size_t bit_vector::memory_footprint() const { + return sizeof(uint64_t) * words.size() + sizeof(Rank) * (ranks.size() + ranks0.size()) + + sizeof(uint32_t) * (selects.size() + selects0.size()); +} + +void bit_vector::move_to_device() { + d_words_ptr = move_vector_to_device(words, d_words); + d_ranks_ptr = move_vector_to_device(ranks, d_ranks); + d_ranks0_ptr = move_vector_to_device(ranks, d_ranks); + + num_selects = selects.size(); + d_selects_ptr = move_vector_to_device(selects, d_selects); + num_selects0 = selects0.size(); + d_selects0_ptr = move_vector_to_device(selects0, d_selects0); +} + +} // namespace experimental +} // namespace cuco + diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl new file mode 100644 index 000000000..7c21cb628 --- /dev/null +++ b/include/cuco/detail/trie/trie.inl @@ -0,0 +1,189 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cuco { +namespace experimental { + +template +trie::trie() + : levels_(2), + d_levels_ptr_(nullptr), + num_levels_(2), + n_keys_(0), + n_nodes_(1), + footprint_(0), + last_key_(), + device_impl_(nullptr) { + levels_[0].louds.add(0); + levels_[0].louds.add(1); + levels_[1].louds.add(1); + levels_[0].outs.add(0); + levels_[0].labels.push_back(sizeof(T) == 1 ? ' ' : (T)-1); +} + +template +trie::~trie() noexcept(false) { + if (d_levels_ptr_) { + CUCO_CUDA_TRY(cudaFree(d_levels_ptr_)); + } + if (device_impl_) { + CUCO_CUDA_TRY(cudaFree(device_impl_)); + } +} + +template +void trie::add(const std::vector& key) { + if (key == last_key_) { + return; + } + assert(n_keys_ == 0 || key > last_key_); + if (key.empty()) { + levels_[0].outs.set(0, 1); + ++levels_[1].offset; + ++n_keys_; + return; + } + if (key.size() + 1 >= levels_.size()) { + levels_.resize(key.size() + 2); + } + uint64_t i = 0; + for (; i < key.size(); ++i) { + auto& level = levels_[i + 1]; + T byte = key[i]; + if ((i == last_key_.size()) || (byte != level.labels.back())) { + level.louds.set_last(0); + level.louds.add(1); + level.outs.add(0); + level.labels.push_back(key[i]); + ++n_nodes_; + break; + } + } + for (++i; i < key.size(); ++i) { + auto& level = levels_[i + 1]; + level.louds.add(0); + level.louds.add(1); + level.outs.add(0); + level.labels.push_back(key[i]); + ++n_nodes_; 
+ } + levels_[key.size() + 1].louds.add(1); + ++levels_[key.size() + 1].offset; + levels_[key.size()].outs.set_last(1); + ++n_keys_; + last_key_ = key; +} + +template +void trie::build() { + uint64_t offset = 0; + for (uint64_t i = 0; i < levels_.size(); ++i) { + auto& level = levels_[i]; + level.louds.build(); + level.outs.build(); + offset += level.offset; + level.offset = offset; + footprint_ += level.memory_footprint(); + level.d_labels_ptr = move_vector_to_device(level.labels, level.d_labels); + } + + num_levels_ = levels_.size(); + CUCO_CUDA_TRY(cudaMalloc(&d_levels_ptr_, sizeof(Level) * num_levels_)); + CUCO_CUDA_TRY(cudaMemcpy(d_levels_ptr_, &levels_[0], sizeof(Level) * num_levels_, + cudaMemcpyHostToDevice)); + + CUCO_CUDA_TRY(cudaMalloc(&device_impl_, sizeof(trie))); + CUCO_CUDA_TRY(cudaMemcpy(device_impl_, this, sizeof(trie), cudaMemcpyHostToDevice)); +} + +template +__global__ __launch_bounds__(256, 1) void trie_lookup_kernel(const trie* t, const T* keys, + const uint64_t* offsets, uint64_t* ids, + uint64_t num_queries, + uint64_t start_offset) { + auto const key_id = blockDim.x * blockIdx.x + threadIdx.x; + if (key_id >= num_queries) { + return; + } + + const int length = offsets[key_id + 1] - offsets[key_id]; + const T* query = keys + (offsets[key_id] - start_offset); + + uint32_t node_id = 0; + for (uint32_t cur_depth = 1; cur_depth <= length; cur_depth++) { + if (!binary_search_labels_array(t, query[cur_depth - 1], node_id, cur_depth)) { + ids[key_id] = -1lu; + return; + } + } + + const auto& level = t->d_levels_ptr_[length]; + if (!level.outs.get(node_id)) { + ids[key_id] = -1lu; + return; + } + ids[key_id] = level.offset + level.outs.rank(node_id); +} + +template +void trie::lookup(const T* queries, const uint64_t* offsets, uint64_t* ids, + uint64_t num_queries, uint64_t start_offset, + cudaStream_t stream) const { + int block_size = 256; + int num_blocks = (num_queries - 1) / block_size + 1; + + trie_lookup_kernel<<>>(device_impl_, queries, 
offsets, ids, + num_queries, start_offset); +} + +template +__device__ uint32_t init_node_pos(const trie* t, uint32_t& node_id, uint32_t cur_depth) { + uint32_t node_pos = 0; + if (node_id != 0) { + node_pos = t->d_levels_ptr_[cur_depth].louds.select(node_id - 1) + 1; + node_id = node_pos - node_id; + } + return node_pos; +} + +template +__device__ bool binary_search_labels_array(const trie* t, T target, uint32_t& node_id, uint32_t level_id) { + const auto& level = t->d_levels_ptr_[level_id]; + + uint32_t node_pos = init_node_pos(t, node_id, level_id); + uint32_t begin = node_id; + uint32_t pos_end = level.louds.find_next_set(node_pos); + uint32_t end = node_id + (pos_end - node_pos); + + while (begin < end) { + node_id = (begin + end) / 2; + auto label = level.d_labels_ptr[node_id]; + if (target < label) { + end = node_id; + } else if (target > label) { + begin = node_id + 1; + } else { + break; + } + } + return begin < end; +} + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index 6dc465227..62ac69ebf 100644 --- a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -22,7 +22,7 @@ #include #include -#include "bit_vector.cuh" +#include namespace cuco { namespace experimental { @@ -41,22 +41,25 @@ class trie { uint64_t n_keys() const { return n_keys_; } uint64_t memory_footprint() const { return footprint_; } -struct Level { - bit_vector louds; - bit_vector outs; - std::vector labels; - thrust::device_vector d_labels; - T* d_labels_ptr; - uint64_t offset; + private: + struct Level { + bit_vector louds; + bit_vector outs; + std::vector labels; + thrust::device_vector d_labels; + T* d_labels_ptr; + uint64_t offset; - Level() : louds(), outs(), labels(), offset(0) {} + Level() : louds(), outs(), labels(), offset(0) {} + uint64_t memory_footprint() const { return louds.size() + outs.size() + sizeof(T) * labels.size(); } + }; - uint64_t memory_footprint() const { return louds.size() + outs.size() + sizeof(T) 
* labels.size(); } -}; - - std::vector levels_; + public: Level* d_levels_ptr_; + + private: uint64_t num_levels_; + std::vector levels_; uint64_t n_keys_; uint64_t n_nodes_; @@ -66,170 +69,7 @@ struct Level { trie* device_impl_; }; -template -trie::trie() - : levels_(2), - d_levels_ptr_(nullptr), - num_levels_(2), - n_keys_(0), - n_nodes_(1), - footprint_(0), - last_key_(), - device_impl_(nullptr) { - levels_[0].louds.add(0); - levels_[0].louds.add(1); - levels_[1].louds.add(1); - levels_[0].outs.add(0); - levels_[0].labels.push_back(sizeof(T) == 1 ? ' ' : (T)-1); -} - -template -trie::~trie() noexcept(false) { - if (d_levels_ptr_) { - CUCO_CUDA_TRY(cudaFree(d_levels_ptr_)); - } - if (device_impl_) { - CUCO_CUDA_TRY(cudaFree(device_impl_)); - } -} - -template -void trie::add(const std::vector& key) { - if (key == last_key_) { - return; - } - assert(n_keys_ == 0 || key > last_key_); - if (key.empty()) { - levels_[0].outs.set(0, 1); - ++levels_[1].offset; - ++n_keys_; - return; - } - if (key.size() + 1 >= levels_.size()) { - levels_.resize(key.size() + 2); - } - uint64_t i = 0; - for (; i < key.size(); ++i) { - auto& level = levels_[i + 1]; - T byte = key[i]; - if ((i == last_key_.size()) || (byte != level.labels.back())) { - level.louds.set_last(0); - level.louds.add(1); - level.outs.add(0); - level.labels.push_back(key[i]); - ++n_nodes_; - break; - } - } - for (++i; i < key.size(); ++i) { - auto& level = levels_[i + 1]; - level.louds.add(0); - level.louds.add(1); - level.outs.add(0); - level.labels.push_back(key[i]); - ++n_nodes_; - } - levels_[key.size() + 1].louds.add(1); - ++levels_[key.size() + 1].offset; - levels_[key.size()].outs.set_last(1); - ++n_keys_; - last_key_ = key; -} - -template -void trie::build() { - uint64_t offset = 0; - for (uint64_t i = 0; i < levels_.size(); ++i) { - auto& level = levels_[i]; - level.louds.build(); - level.outs.build(); - offset += level.offset; - level.offset = offset; - footprint_ += level.memory_footprint(); - 
level.d_labels_ptr = move_vector_to_device(level.labels, level.d_labels); - } - - num_levels_ = levels_.size(); - CUCO_CUDA_TRY(cudaMalloc(&d_levels_ptr_, sizeof(Level) * num_levels_)); - CUCO_CUDA_TRY(cudaMemcpy(d_levels_ptr_, &levels_[0], sizeof(Level) * num_levels_, - cudaMemcpyHostToDevice)); - - CUCO_CUDA_TRY(cudaMalloc(&device_impl_, sizeof(trie))); - CUCO_CUDA_TRY(cudaMemcpy(device_impl_, this, sizeof(trie), cudaMemcpyHostToDevice)); -} - -template -__global__ __launch_bounds__(256, 1) void trie_lookup_kernel(const trie* t, const T* keys, - const uint64_t* offsets, uint64_t* ids, - uint64_t num_queries, - uint64_t start_offset) { - auto const key_id = blockDim.x * blockIdx.x + threadIdx.x; - if (key_id >= num_queries) { - return; - } - - const int length = offsets[key_id + 1] - offsets[key_id]; - const T* query = keys + (offsets[key_id] - start_offset); - - uint32_t node_id = 0; - for (uint32_t cur_depth = 1; cur_depth <= length; cur_depth++) { - if (!binary_search_labels_array(t, query[cur_depth - 1], node_id, cur_depth)) { - ids[key_id] = -1lu; - return; - } - } - - const auto& level = t->d_levels_ptr_[length]; - if (!level.outs.get(node_id)) { - ids[key_id] = -1lu; - return; - } - ids[key_id] = level.offset + level.outs.rank(node_id); -} - -template -void trie::lookup(const T* queries, const uint64_t* offsets, uint64_t* ids, - uint64_t num_queries, uint64_t start_offset, - cudaStream_t stream) const { - int block_size = 256; - int num_blocks = (num_queries - 1) / block_size + 1; - - trie_lookup_kernel<<>>(device_impl_, queries, offsets, ids, - num_queries, start_offset); -} - -template -__device__ uint32_t init_node_pos(const trie* t, uint32_t& node_id, uint32_t cur_depth) { - uint32_t node_pos = 0; - if (node_id != 0) { - node_pos = t->d_levels_ptr_[cur_depth].louds.select(node_id - 1) + 1; - node_id = node_pos - node_id; - } - return node_pos; -} - -template -__device__ bool binary_search_labels_array(const trie* t, T target, uint32_t& node_id, uint32_t 
level_id) { - const auto& level = t->d_levels_ptr_[level_id]; - - uint32_t node_pos = init_node_pos(t, node_id, level_id); - uint32_t begin = node_id; - uint32_t pos_end = level.louds.find_next_set(node_pos); - uint32_t end = node_id + (pos_end - node_pos); - - while (begin < end) { - node_id = (begin + end) / 2; - auto label = level.d_labels_ptr[node_id]; - if (target < label) { - end = node_id; - } else if (target > label) { - begin = node_id + 1; - } else { - break; - } - } - return begin < end; -} - } // namespace experimental } // namespace cuco + +#include diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index fddd6b247..94fe349c7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -99,10 +99,7 @@ ConfigureTest(STATIC_MULTIMAP_TEST ################################################################################################### # - bit_vector tests ------------------------------------------------------------------------------ ConfigureTest(BIT_VECTOR_TEST - bit_vector/get_test.cu - bit_vector/rank_test.cu - bit_vector/select_test.cu - bit_vector/size_test.cu) + bit_vector/bit_vector_test.cu) ################################################################################################### # - trie tests ------------------------------------------------------------------------------ diff --git a/tests/bit_vector/bit_vector_test.cu b/tests/bit_vector/bit_vector_test.cu new file mode 100644 index 000000000..c74fe2186 --- /dev/null +++ b/tests/bit_vector/bit_vector_test.cu @@ -0,0 +1,4 @@ +#include "size_test.cu" +#include "get_test.cu" +#include "rank_test.cu" +#include "select_test.cu" From 5e9ad380c85de913f7ebd586d2d201cd1050a98c Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Thu, 18 May 2023 04:59:26 +0000 Subject: [PATCH 06/99] Add bit_vector reference classes --- include/cuco/bit_vector.cuh | 94 +++++----- include/cuco/bit_vector_ref.cuh | 46 +++++ include/cuco/detail/bit_vector/bit_vector.inl | 172 +++++++----------- 
.../cuco/detail/bit_vector/bit_vector_ref.inl | 128 +++++++++++++ include/cuco/operator.hpp | 24 +++ tests/CMakeLists.txt | 10 +- tests/bit_vector/bit_vector_test.cu | 4 - tests/bit_vector/find_next_set_test.cu | 73 ++++++++ tests/bit_vector/get_test.cu | 18 +- tests/bit_vector/rank_test.cu | 19 +- tests/bit_vector/select_test.cu | 19 +- tests/bit_vector/size_test.cu | 2 +- 12 files changed, 414 insertions(+), 195 deletions(-) create mode 100644 include/cuco/bit_vector_ref.cuh create mode 100644 include/cuco/detail/bit_vector/bit_vector_ref.inl delete mode 100644 tests/bit_vector/bit_vector_test.cu create mode 100644 tests/bit_vector/find_next_set_test.cu diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index 1c6a5910e..a58877eda 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -17,53 +17,71 @@ #pragma once +#include +#include +#include +#include + #include +#include + namespace cuco { namespace experimental { -class bit_vector { - public: - bit_vector(); +struct Rank { + uint32_t abs_hi; + uint8_t abs_lo; + uint8_t rels[3]; - using Key = uint64_t; + __host__ __device__ uint64_t abs() const { return ((uint64_t)abs_hi << 8) | abs_lo; } + void set_abs(uint64_t abs) { + abs_hi = (uint32_t)(abs >> 8); + abs_lo = (uint8_t)abs; + } +}; - void add(Key bit); +template , + cuda::thread_scope Scope = cuda::thread_scope_device, + class Allocator = cuco::cuda_allocator, + class Storage = cuco::experimental::aow_storage<1>> +class bit_vector { + public: + bit_vector(Extent capacity); - // builds indexes for rank and select. - void build(); + void add(bool bit); // adds a new bit at the end + void build(); // builds indexes for rank and select. - __device__ uint64_t get(Key i) const; void set(Key i, bool bit); void set_last(bool bit); - // returns the number of 1-bits in the range [0, i) - __device__ uint64_t rank(Key i) const; - - // returns the position of the (i+1)-th 1-bit. 
- __device__ uint64_t select(Key i) const; - - // returns the position of the (i+1)-th 0-bit. - __device__ uint64_t select0(Key i) const; - - __device__ uint64_t find_next_set(Key i) const; - - size_t size() const; - + static constexpr auto cg_size = 1; + static constexpr auto window_size = 1; + static constexpr auto thread_scope = Scope; + + using key_type = Key; ///< Key type + using value_type = Key; ///< Key type + using extent_type = decltype(make_valid_extent(std::declval())); + using size_type = typename extent_type::value_type; ///< Size type + using allocator_type = Allocator; ///< Allocator type + using storage_type = + detail::storage; ///< Storage type + + using storage_ref_type = typename storage_type::ref_type; ///< Non-owning window storage ref type + template + using ref_type = + cuco::experimental::bit_vector_ref< + storage_ref_type, + Operators...>; ///< Non-owning container ref type + + template + [[nodiscard]] auto ref(Operators... ops) const noexcept; + + size_t size() const { return n_bits; } + size_t constexpr capacity() const { return storage_.capacity(); } size_t memory_footprint() const; - struct Rank { - uint32_t abs_hi; - uint8_t abs_lo; - uint8_t rels[3]; - - __host__ __device__ uint64_t abs() const { return ((uint64_t)abs_hi << 8) | abs_lo; } - void set_abs(uint64_t abs) { - abs_hi = (uint32_t)(abs >> 8); - abs_lo = (uint8_t)abs; - } - }; - private: std::vector words; std::vector ranks, ranks0; @@ -82,14 +100,8 @@ class bit_vector { void move_to_device(); - uint64_t Popcnt(uint64_t x) { return __builtin_popcountll(x); } - uint64_t Ctz(uint64_t x) { return __builtin_ctzll(x); } - __device__ uint64_t ith_set_pos(uint32_t i, uint64_t word) const { - for (uint32_t pos = 0; pos < i; pos++) { - word &= word - 1; - } - return __builtin_ffsll(word & -word) - 1; - } + allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage + storage_type storage_; ///< Slot window storage }; } // namespace experimental diff --git 
a/include/cuco/bit_vector_ref.cuh b/include/cuco/bit_vector_ref.cuh new file mode 100644 index 000000000..3a62f3008 --- /dev/null +++ b/include/cuco/bit_vector_ref.cuh @@ -0,0 +1,46 @@ +#pragma once + +#include + +namespace cuco { +namespace experimental { + +struct Rank; + +template +class bit_vector_ref + : public detail::operator_impl< + Operators, + bit_vector_ref>... { + public: + /** + * @brief Constructs bit_vector_ref. + * + * @param storage_ref Non-owning ref of slot storage + */ + __host__ __device__ explicit constexpr bit_vector_ref( + uint64_t* words, Rank* ranks, uint32_t* selects, uint32_t num_selects) noexcept; + + /** + * @brief Gets the maximum number of elements the container can hold. + * + * @return The maximum number of elements the container can hold + */ + [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept; + + private: + uint64_t* words_; + Rank* ranks_; + uint32_t* selects_; + uint32_t num_selects_; + + // Mixins need to be friends with this class in order to access private members + template + friend class detail::operator_impl; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/detail/bit_vector/bit_vector.inl b/include/cuco/detail/bit_vector/bit_vector.inl index 91f32602b..249536312 100644 --- a/include/cuco/detail/bit_vector/bit_vector.inl +++ b/include/cuco/detail/bit_vector/bit_vector.inl @@ -15,8 +15,6 @@ * limitations under the License. 
*/ -#pragma once - namespace cuco { namespace experimental { @@ -27,9 +25,21 @@ T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& return thrust::raw_pointer_cast(device_vector.data()); } -bit_vector::bit_vector() : words(), ranks(), selects(), n_bits(0) {} +template +bit_vector::bit_vector(Extent capacity) + : words(), ranks(), selects(), n_bits(0), storage_{make_valid_extent(capacity), allocator_} { +} -void bit_vector::add(Key bit) { +template +void bit_vector::add(bool bit) { if (n_bits % 256 == 0) { words.resize((n_bits + 256) / 64); } @@ -37,7 +47,12 @@ void bit_vector::add(Key bit) { ++n_bits; } -void bit_vector::build() { +template +void bit_vector::build() { uint64_t n_blocks = words.size() / 4; uint64_t n_ones = 0, n_zeroes = 0; ranks.resize(n_blocks + 1); @@ -57,12 +72,12 @@ void bit_vector::build() { uint64_t word_id = (block_id * 4) + j; { uint64_t word = words[word_id]; - uint64_t n_pops = Popcnt(word); + uint64_t n_pops = __builtin_popcountll(word); uint64_t new_n_ones = n_ones + n_pops; if (((n_ones + 255) / 256) != ((new_n_ones + 255) / 256)) { uint64_t count = n_ones; while (word != 0) { - uint64_t pos = Ctz(word); + uint64_t pos = __builtin_ctzll(word); if (count % 256 == 0) { selects.push_back(((word_id * 64) + pos) / 256); break; @@ -75,12 +90,12 @@ void bit_vector::build() { } { uint64_t word = ~words[word_id]; - uint64_t n_pops = Popcnt(word); + uint64_t n_pops = __builtin_popcountll(word); uint64_t new_n_zeroes = n_zeroes + n_pops; if (((n_zeroes + 255) / 256) != ((new_n_zeroes + 255) / 256)) { uint64_t count = n_zeroes; while (word != 0) { - uint64_t pos = Ctz(word); + uint64_t pos = __builtin_ctzll(word); if (count % 256 == 0) { selects0.push_back(((word_id * 64) + pos) / 256); break; @@ -101,9 +116,12 @@ void bit_vector::build() { move_to_device(); } -__device__ uint64_t bit_vector::get(Key i) const { return (d_words_ptr[i / 64] >> (i % 64)) & 1UL; } - -void bit_vector::set(Key i, bool bit) { +template +void 
bit_vector::set(Key i, bool bit) { if (bit) { words[i / 64] |= (1UL << (i % 64)); } else { @@ -111,112 +129,31 @@ void bit_vector::set(Key i, bool bit) { } } -void bit_vector::set_last(bool bit) { +template +void bit_vector::set_last(bool bit) { set(n_bits - 1, bit); } -__device__ uint64_t bit_vector::rank(Key i) const { - uint64_t word_id = i / 64; - uint64_t bit_id = i % 64; - uint64_t rank_id = word_id / 4; - uint64_t rel_id = word_id % 4; - uint64_t n = d_ranks_ptr[rank_id].abs(); - if (rel_id != 0) { - n += d_ranks_ptr[rank_id].rels[rel_id - 1]; - } - n += __popcll(d_words_ptr[word_id] & ((1UL << bit_id) - 1)); - return n; -} - -__device__ uint64_t bit_vector::select(Key i) const { - const uint64_t block_id = i / 256; - uint64_t begin = d_selects_ptr[block_id]; - uint64_t end = d_selects_ptr[block_id + 1] + 1UL; - if (begin + 10 >= end) { - while (i >= d_ranks_ptr[begin + 1].abs()) { - ++begin; - } - } else { - while (begin + 1 < end) { - const uint64_t middle = (begin + end) / 2; - if (i < d_ranks_ptr[middle].abs()) { - end = middle; - } else { - begin = middle; - } - } - } - const uint64_t rank_id = begin; - const auto& rank = d_ranks_ptr[rank_id]; - i -= rank.abs(); - - uint64_t word_id = rank_id * 4; - bool a0 = i >= rank.rels[0]; - bool a1 = i >= rank.rels[1]; - bool a2 = i >= rank.rels[2]; - - uint32_t inc = a0 + a1 + a2; - word_id += inc; - i -= (inc > 0) * rank.rels[inc - (inc > 0)]; - - return (word_id * 64) + ith_set_pos(i, d_words_ptr[word_id]); -} - -__device__ uint64_t bit_vector::select0(Key i) const { - const uint64_t block_id = i / 256; - uint64_t begin = d_selects0_ptr[block_id]; - uint64_t end = d_selects0_ptr[block_id + 1] + 1UL; - if (begin + 10 >= end) { - while (i >= d_ranks0_ptr[begin + 1].abs()) { - ++begin; - } - } else { - while (begin + 1 < end) { - const uint64_t middle = (begin + end) / 2; - if (i < d_ranks0_ptr[middle].abs()) { - end = middle; - } else { - begin = middle; - } - } - } - const uint64_t rank_id = begin; - const auto& 
rank = d_ranks0_ptr[rank_id]; - i -= rank.abs(); - - uint64_t word_id = rank_id * 4; - bool a0 = i >= rank.rels[0]; - bool a1 = i >= rank.rels[1]; - bool a2 = i >= rank.rels[2]; - - uint32_t inc = a0 + a1 + a2; - word_id += inc; - i -= (inc > 0) * rank.rels[inc - (inc > 0)]; - - return (word_id * 64) + ith_set_pos(i, ~d_words_ptr[word_id]); -} - -__device__ uint64_t bit_vector::find_next_set(Key i) const { - uint64_t word_id = i / 64; - uint64_t bit_id = i % 64; - uint64_t word = d_words_ptr[word_id]; - word &= ~(0lu) << bit_id; - while (word == 0) { - word = d_words_ptr[++word_id]; - } - return (word_id * 64) + __builtin_ffsll(word) - 1; -} - -size_t bit_vector::size() const { - return n_bits; -} - -size_t bit_vector::memory_footprint() const { +template +size_t bit_vector::memory_footprint() const { return sizeof(uint64_t) * words.size() + sizeof(Rank) * (ranks.size() + ranks0.size()) + sizeof(uint32_t) * (selects.size() + selects0.size()); } -void bit_vector::move_to_device() { +template +void bit_vector::move_to_device() { d_words_ptr = move_vector_to_device(words, d_words); d_ranks_ptr = move_vector_to_device(ranks, d_ranks); d_ranks0_ptr = move_vector_to_device(ranks, d_ranks); @@ -227,6 +164,19 @@ void bit_vector::move_to_device() { d_selects0_ptr = move_vector_to_device(selects0, d_selects0); } +template +template +auto bit_vector::ref( + Operators...) 
const noexcept +{ + static_assert(sizeof...(Operators), "No operators specified"); + return ref_type{d_words_ptr, d_ranks_ptr, d_selects_ptr, num_selects}; +} + } // namespace experimental } // namespace cuco diff --git a/include/cuco/detail/bit_vector/bit_vector_ref.inl b/include/cuco/detail/bit_vector/bit_vector_ref.inl new file mode 100644 index 000000000..15bfbda3c --- /dev/null +++ b/include/cuco/detail/bit_vector/bit_vector_ref.inl @@ -0,0 +1,128 @@ +#include + +namespace cuco { +namespace experimental { + +template +__host__ __device__ constexpr bit_vector_ref::bit_vector_ref(uint64_t* words, Rank* ranks, uint32_t* selects, uint32_t num_selects) noexcept + : words_{words}, ranks_{ranks}, selects_{selects}, num_selects_{num_selects} +{ +} + +namespace detail { + +template +class operator_impl> { + using ref_type = bit_vector_ref; + + public: + [[nodiscard]] __device__ bool get(uint64_t key) const noexcept + { + auto const& ref_ = static_cast(*this); + return (ref_.words_[key / 64] >> (key % 64)) & 1UL; + } +}; + +template +class operator_impl> { + using ref_type = bit_vector_ref; + + public: + [[nodiscard]] __device__ uint64_t rank(uint64_t key) const noexcept + { + auto const& ref_ = static_cast(*this); + + uint64_t word_id = key / 64; + uint64_t bit_id = key % 64; + uint64_t rank_id = word_id / 4; + uint64_t rel_id = word_id % 4; + uint64_t n = ref_.ranks_[rank_id].abs(); + if (rel_id != 0) { + n += ref_.ranks_[rank_id].rels[rel_id - 1]; + } + n += __builtin_popcountll(ref_.words_[word_id] & ((1UL << bit_id) - 1)); + return n; + } +}; + +template +class operator_impl> { + using ref_type = bit_vector_ref; + + public: + [[nodiscard]] __device__ uint64_t select(uint64_t key) const noexcept + { + auto const& ref_ = static_cast(*this); + const uint64_t block_id = key / 256; + uint64_t begin = ref_.selects_[block_id]; + uint64_t end = ref_.selects_[block_id + 1] + 1UL; + if (begin + 10 >= end) { + while (key >= ref_.ranks_[begin + 1].abs()) { + ++begin; + } + } 
else { + while (begin + 1 < end) { + const uint64_t middle = (begin + end) / 2; + if (key < ref_.ranks_[middle].abs()) { + end = middle; + } else { + begin = middle; + } + } + } + const uint64_t rank_id = begin; + const auto& rank = ref_.ranks_[rank_id]; + key -= rank.abs(); + + uint64_t word_id = rank_id * 4; + bool a0 = key >= rank.rels[0]; + bool a1 = key >= rank.rels[1]; + bool a2 = key >= rank.rels[2]; + + uint32_t inc = a0 + a1 + a2; + word_id += inc; + key -= (inc > 0) * rank.rels[inc - (inc > 0)]; + + return (word_id * 64) + ith_set_pos(key, ref_.words_[word_id]); + } + + private: +__device__ uint64_t ith_set_pos(uint32_t i, uint64_t word) const { + for (uint32_t pos = 0; pos < i; pos++) { + word &= word - 1; + } + return __builtin_ffsll(word & -word) - 1; +} +}; + +template +class operator_impl> { + using ref_type = bit_vector_ref; + + public: + [[nodiscard]] __device__ uint64_t find_next_set(uint64_t key) const noexcept + { + auto const& ref_ = static_cast(*this); + uint64_t word_id = key / 64; + uint64_t bit_id = key % 64; + uint64_t word = ref_.words_[word_id]; + word &= ~(0lu) << bit_id; + while (word == 0) { + word = ref_.words_[++word_id]; + } + return (word_id * 64) + __builtin_ffsll(word) - 1; + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp index b7629ae4c..f813c64c2 100644 --- a/include/cuco/operator.hpp +++ b/include/cuco/operator.hpp @@ -45,6 +45,30 @@ struct contains_tag { struct find_tag { } inline constexpr find; +/** + * @brief `get` operator tag + */ +struct get_tag { +} inline constexpr get; + +/** + * @brief `rank` operator tag + */ +struct rank_tag { +} inline constexpr rank; + +/** + * @brief `select` operator tag + */ +struct select_tag { +} inline constexpr select; + +/** + * @brief `find_next_set` operator tag + */ +struct find_next_set_tag { +} inline constexpr find_next_set; + } // namespace op } // namespace experimental } // 
namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 94fe349c7..27e49d988 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -99,9 +99,13 @@ ConfigureTest(STATIC_MULTIMAP_TEST ################################################################################################### # - bit_vector tests ------------------------------------------------------------------------------ ConfigureTest(BIT_VECTOR_TEST - bit_vector/bit_vector_test.cu) + bit_vector/find_next_set_test.cu + bit_vector/get_test.cu + bit_vector/rank_test.cu + bit_vector/select_test.cu + bit_vector/size_test.cu) ################################################################################################### # - trie tests ------------------------------------------------------------------------------ -ConfigureTest(TRIE_TEST - trie/lookup_test.cu) +#ConfigureTest(TRIE_TEST +# trie/lookup_test.cu) diff --git a/tests/bit_vector/bit_vector_test.cu b/tests/bit_vector/bit_vector_test.cu deleted file mode 100644 index c74fe2186..000000000 --- a/tests/bit_vector/bit_vector_test.cu +++ /dev/null @@ -1,4 +0,0 @@ -#include "size_test.cu" -#include "get_test.cu" -#include "rank_test.cu" -#include "select_test.cu" diff --git a/tests/bit_vector/find_next_set_test.cu b/tests/bit_vector/find_next_set_test.cu new file mode 100644 index 000000000..cefcf8f89 --- /dev/null +++ b/tests/bit_vector/find_next_set_test.cu @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include + +#include + +template +__global__ void find_next_set_kernel(BitVectorRef ref, size_t n, uint32_t* output) { + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = gridDim.x * blockDim.x; + while (index < n) { + output[index] = ref.find_next_set(index); + index += stride; + } +} + +extern bool modulo_bitgen(uint32_t i); + +TEST_CASE("Find next set test", "") +{ + constexpr std::size_t num_elements{400}; + + using Key = uint64_t; + cuco::experimental::bit_vector bv{cuco::experimental::extent{400}}; + + for (size_t i = 0; i < num_elements; i++) { + bv.add(modulo_bitgen(i)); + } + bv.build(); + + thrust::device_vector device_result(num_elements); + auto ref = bv.ref(cuco::experimental::find_next_set); + find_next_set_kernel<<<1, 1024>>>(ref, num_elements, thrust::raw_pointer_cast(device_result.data())); + + thrust::host_vector host_result = device_result; + uint32_t num_matches = 0; + + uint32_t next_set_pos = -1u; + do { + next_set_pos++; + } while (next_set_pos < num_elements and !modulo_bitgen(next_set_pos)); + + for (size_t key = 0; key < num_elements; key++) { + num_matches += host_result[key] == next_set_pos; + + if (key == next_set_pos) { + do { + next_set_pos++; + } while (next_set_pos < num_elements and !modulo_bitgen(next_set_pos)); + } + } + REQUIRE(num_matches == num_elements); +} diff --git a/tests/bit_vector/get_test.cu b/tests/bit_vector/get_test.cu index 90c8810d3..d5d206513 100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/bit_vector/get_test.cu @@ -23,11 +23,12 @@ #include -__global__ void bitvector_get_kernel(cuco::experimental::bit_vector* bv, size_t n, uint32_t* output) { +template +__global__ void get_kernel(BitVectorRef ref, size_t n, uint32_t* output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; while 
(index < n) { - output[index] = bv->get(index); + output[index] = ref.get(index); index += stride; } } @@ -38,7 +39,8 @@ TEST_CASE("Get test", "") { constexpr std::size_t num_elements{400}; - cuco::experimental::bit_vector bv; + using Key = uint64_t; + cuco::experimental::bit_vector bv{cuco::experimental::extent{400}}; uint32_t num_set_ref = 0; for (size_t i = 0; i < num_elements; i++) { @@ -47,15 +49,9 @@ TEST_CASE("Get test", "") } bv.build(); - cuco::experimental::bit_vector* bv_device_copy; - CUCO_CUDA_TRY(cudaMalloc(&bv_device_copy, sizeof(cuco::experimental::bit_vector))); - CUCO_CUDA_TRY(cudaMemcpy(bv_device_copy, &bv, sizeof(cuco::experimental::bit_vector), cudaMemcpyHostToDevice)); - + auto ref = bv.ref(cuco::experimental::get); thrust::device_vector get_result(num_elements); - - bitvector_get_kernel<<<1, 1024>>>(bv_device_copy, num_elements, thrust::raw_pointer_cast(get_result.data())); - - CUCO_CUDA_TRY(cudaFree(bv_device_copy)); + get_kernel<<<1, 1024>>>(ref, num_elements, thrust::raw_pointer_cast(get_result.data())); size_t num_set = thrust::reduce(thrust::device, get_result.begin(), get_result.end(), 0); REQUIRE(num_set == num_set_ref); diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu index 79feaadda..dc6bfd682 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/bit_vector/rank_test.cu @@ -24,11 +24,12 @@ #include -__global__ void bitvector_rank_kernel(cuco::experimental::bit_vector* bv, size_t n, uint32_t* output) { +template +__global__ void rank_kernel(BitVectorRef ref, size_t n, uint32_t* output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; while (index < n) { - output[index] = bv->rank(index); + output[index] = ref.rank(index); index += stride; } } @@ -39,7 +40,8 @@ TEST_CASE("Rank test", "") { constexpr std::size_t num_elements{400}; - cuco::experimental::bit_vector bv; + using Key = uint64_t; + cuco::experimental::bit_vector bv{cuco::experimental::extent{400}}; for 
(size_t i = 0; i < num_elements; i++) { bv.add(modulo_bitgen(i)); @@ -47,17 +49,10 @@ TEST_CASE("Rank test", "") bv.build(); thrust::device_vector rank_result_device(num_elements); - - cuco::experimental::bit_vector* bv_device_copy; - CUCO_CUDA_TRY(cudaMalloc(&bv_device_copy, sizeof(cuco::experimental::bit_vector))); - CUCO_CUDA_TRY(cudaMemcpy(bv_device_copy, &bv, sizeof(cuco::experimental::bit_vector), cudaMemcpyHostToDevice)); - - bitvector_rank_kernel<<<1, 1024>>>(bv_device_copy, num_elements, thrust::raw_pointer_cast(rank_result_device.data())); - - CUCO_CUDA_TRY(cudaFree(bv_device_copy)); + auto ref = bv.ref(cuco::experimental::rank); + rank_kernel<<<1, 1024>>>(ref, num_elements, thrust::raw_pointer_cast(rank_result_device.data())); thrust::host_vector rank_result = rank_result_device; - uint32_t cur_rank = 0; uint32_t num_matches = 0; for (size_t i = 0; i < num_elements; i++) { diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index 05d01bd1b..b30feabbc 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -24,11 +24,12 @@ #include -__global__ void bitvector_select_kernel(cuco::experimental::bit_vector* bv, size_t n, uint32_t* output) { +template +__global__ void select_kernel(BitVectorRef ref, size_t n, uint32_t* output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; while (index < n) { - output[index] = bv->select(index); + output[index] = ref.select(index); index += stride; } } @@ -39,7 +40,8 @@ TEST_CASE("Select test", "") { constexpr std::size_t num_elements{400}; - cuco::experimental::bit_vector bv; + using Key = uint64_t; + cuco::experimental::bit_vector bv{cuco::experimental::extent{400}}; uint32_t num_set = 0; for (size_t i = 0; i < num_elements; i++) { @@ -49,17 +51,10 @@ TEST_CASE("Select test", "") bv.build(); thrust::device_vector select_result_device(num_set); - - cuco::experimental::bit_vector* bv_device_copy; - 
CUCO_CUDA_TRY(cudaMalloc(&bv_device_copy, sizeof(cuco::experimental::bit_vector))); - CUCO_CUDA_TRY(cudaMemcpy(bv_device_copy, &bv, sizeof(cuco::experimental::bit_vector), cudaMemcpyHostToDevice)); - - bitvector_select_kernel<<<1, 1024>>>(bv_device_copy, num_set, thrust::raw_pointer_cast(select_result_device.data())); - - CUCO_CUDA_TRY(cudaFree(bv_device_copy)); + auto ref = bv.ref(cuco::experimental::select); + select_kernel<<<1, 1024>>>(ref, num_set, thrust::raw_pointer_cast(select_result_device.data())); thrust::host_vector select_result = select_result_device; - uint32_t num_matches = 0; uint32_t cur_set_pos = -1u; for (size_t i = 0; i < num_set; i++) { diff --git a/tests/bit_vector/size_test.cu b/tests/bit_vector/size_test.cu index 7d645be16..d5bcf4ab1 100644 --- a/tests/bit_vector/size_test.cu +++ b/tests/bit_vector/size_test.cu @@ -22,7 +22,7 @@ TEST_CASE("Size computation", "") { constexpr std::size_t num_elements{400}; - cuco::experimental::bit_vector bv; + cuco::experimental::bit_vector bv{cuco::experimental::extent{400}}; for (size_t i = 0; i < num_elements; i++) { bv.add(i % 2 == 0); // Alternate 0s and 1s pattern From b5b8f5ae83334aa875c48e4105945df5073b295e Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 19 May 2023 06:15:30 +0000 Subject: [PATCH 07/99] Use aow_storage for bit_vector structures --- include/cuco/bit_vector.cuh | 30 +++---- include/cuco/bit_vector_ref.cuh | 20 ++--- include/cuco/detail/bit_vector/bit_vector.inl | 63 ++++++++++--- .../cuco/detail/bit_vector/bit_vector_ref.inl | 90 ++++++++++++------- include/cuco/detail/trie/trie.inl | 8 +- include/cuco/trie.cuh | 6 +- tests/bit_vector/select_test.cu | 47 ++++++++-- 7 files changed, 178 insertions(+), 86 deletions(-) diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index a58877eda..bafdbf735 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -30,6 +30,7 @@ namespace cuco { namespace experimental { struct Rank { + // Basically a 
uint64_t split into 1 uin32_t and 2 uint8_t uint32_t abs_hi; uint8_t abs_lo; uint8_t rels[3]; @@ -41,6 +42,12 @@ struct Rank { } }; +// Need this union to use uint64_t for all aow_storage structures +union RankUnion { + uint64_t word; + Rank rank; +}; + template , cuda::thread_scope Scope = cuda::thread_scope_device, @@ -79,32 +86,25 @@ class bit_vector { [[nodiscard]] auto ref(Operators... ops) const noexcept; size_t size() const { return n_bits; } - size_t constexpr capacity() const { return storage_.capacity(); } size_t memory_footprint() const; private: + uint64_t n_bits; + + // Host structures std::vector words; std::vector ranks, ranks0; - std::vector selects, selects0; - - thrust::device_vector d_words; - thrust::device_vector d_ranks, d_ranks0; - thrust::device_vector d_selects, d_selects0; + std::vector selects, selects0; - uint64_t* d_words_ptr; - Rank *d_ranks_ptr, *d_ranks0_ptr; - uint32_t *d_selects_ptr, *d_selects0_ptr; - uint32_t num_selects, num_selects0; - - uint64_t n_bits; + // Device structures + allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage + storage_type aow_words, aow_ranks, aow_selects, aow_ranks0, aow_selects0; void move_to_device(); - - allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage - storage_type storage_; ///< Slot window storage }; } // namespace experimental } // namespace cuco #include +#include diff --git a/include/cuco/bit_vector_ref.cuh b/include/cuco/bit_vector_ref.cuh index 3a62f3008..d6d6c913c 100644 --- a/include/cuco/bit_vector_ref.cuh +++ b/include/cuco/bit_vector_ref.cuh @@ -14,26 +14,18 @@ class bit_vector_ref Operators, bit_vector_ref>... { public: + using storage_ref_type = StorageRef; ///< Type of storage ref /** * @brief Constructs bit_vector_ref. 
* * @param storage_ref Non-owning ref of slot storage */ - __host__ __device__ explicit constexpr bit_vector_ref( - uint64_t* words, Rank* ranks, uint32_t* selects, uint32_t num_selects) noexcept; - - /** - * @brief Gets the maximum number of elements the container can hold. - * - * @return The maximum number of elements the container can hold - */ - [[nodiscard]] __host__ __device__ constexpr auto capacity() const noexcept; + __host__ __device__ explicit constexpr bit_vector_ref(storage_ref_type words_ref, + storage_ref_type ranks_ref, storage_ref_type selects_ref, storage_ref_type ranks0_ref, + storage_ref_type selects0_ref) noexcept; private: - uint64_t* words_; - Rank* ranks_; - uint32_t* selects_; - uint32_t num_selects_; + storage_ref_type words_ref_, ranks_ref_, selects_ref_, ranks0_ref_, selects0_ref_; // Mixins need to be friends with this class in order to access private members template @@ -43,4 +35,4 @@ class bit_vector_ref } // namespace experimental } // namespace cuco -#include +//#include diff --git a/include/cuco/detail/bit_vector/bit_vector.inl b/include/cuco/detail/bit_vector/bit_vector.inl index 249536312..be96b2728 100644 --- a/include/cuco/detail/bit_vector/bit_vector.inl +++ b/include/cuco/detail/bit_vector/bit_vector.inl @@ -31,7 +31,13 @@ template bit_vector::bit_vector(Extent capacity) - : words(), ranks(), selects(), n_bits(0), storage_{make_valid_extent(capacity), allocator_} { + : words(), ranks(), selects(), n_bits(0), + aow_words{make_valid_extent(capacity), allocator_}, + aow_ranks{make_valid_extent(capacity), allocator_}, + aow_selects{make_valid_extent(capacity), allocator_}, + aow_ranks0{make_valid_extent(capacity), allocator_}, + aow_selects0{make_valid_extent(capacity), allocator_} +{ } template size_t bit_vector::memory_footprint() const { return sizeof(uint64_t) * words.size() + sizeof(Rank) * (ranks.size() + ranks0.size()) + - sizeof(uint32_t) * (selects.size() + selects0.size()); + sizeof(uint64_t) * (selects.size() + 
selects0.size()); +} + +template +__global__ void copy_to_window(WindowT* windows, + cuco::detail::index_type n, + T* values) +{ + cuco::detail::index_type const loop_stride = gridDim.x * blockDim.x; + cuco::detail::index_type idx = blockDim.x * blockIdx.x + threadIdx.x; + + while (idx < n) { + auto& window_slots = *(windows + idx); + window_slots[0] = values[idx]; + idx += loop_stride; + } +} + +template +void initialize_aow(Storage& storage, T* ptr, uint64_t num_elements) { + auto constexpr stride = 4; + auto const grid_size = (num_elements + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (stride * detail::CUCO_DEFAULT_BLOCK_SIZE); + + copy_to_window<<>>( + storage.data(), num_elements, ptr); } template void bit_vector::move_to_device() { - d_words_ptr = move_vector_to_device(words, d_words); - d_ranks_ptr = move_vector_to_device(ranks, d_ranks); - d_ranks0_ptr = move_vector_to_device(ranks, d_ranks); - - num_selects = selects.size(); - d_selects_ptr = move_vector_to_device(selects, d_selects); - num_selects0 = selects0.size(); - d_selects0_ptr = move_vector_to_device(selects0, d_selects0); + uint32_t num_ranks = ranks.size(), num_ranks0 = ranks0.size(); + uint32_t num_selects = selects.size(), num_selects0 = selects0.size(); + + thrust::device_vector d_words, d_selects, d_selects0; + thrust::device_vector d_ranks, d_ranks0; + + auto d_words_ptr = move_vector_to_device(words, d_words); + auto d_ranks_ptr = move_vector_to_device(ranks, d_ranks); + auto d_ranks0_ptr = move_vector_to_device(ranks0, d_ranks0); + auto d_selects_ptr = move_vector_to_device(selects, d_selects); + auto d_selects0_ptr = move_vector_to_device(selects0, d_selects0); + + initialize_aow(aow_words, d_words_ptr, d_words.size()); + initialize_aow(aow_ranks, (uint64_t*)d_ranks_ptr, num_ranks); + initialize_aow(aow_selects, d_selects_ptr, num_selects); + initialize_aow(aow_ranks0, (uint64_t*)d_ranks0_ptr, num_ranks0); + initialize_aow(aow_selects0, d_selects0_ptr, num_selects0); } template 
::ref( Operators...) const noexcept { static_assert(sizeof...(Operators), "No operators specified"); - return ref_type{d_words_ptr, d_ranks_ptr, d_selects_ptr, num_selects}; + return ref_type{aow_words.ref(), aow_ranks.ref(), aow_selects.ref(), + aow_ranks0.ref(), aow_selects0.ref()}; } } // namespace experimental diff --git a/include/cuco/detail/bit_vector/bit_vector_ref.inl b/include/cuco/detail/bit_vector/bit_vector_ref.inl index 15bfbda3c..9addae37e 100644 --- a/include/cuco/detail/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/bit_vector/bit_vector_ref.inl @@ -5,10 +5,10 @@ namespace experimental { template -__host__ __device__ constexpr bit_vector_ref::bit_vector_ref(uint64_t* words, Rank* ranks, uint32_t* selects, uint32_t num_selects) noexcept - : words_{words}, ranks_{ranks}, selects_{selects}, num_selects_{num_selects} -{ -} +__host__ __device__ constexpr bit_vector_ref::bit_vector_ref(StorageRef words_ref, StorageRef ranks_ref, StorageRef selects_ref, StorageRef ranks0_ref, StorageRef selects0_ref) noexcept + : words_ref_{words_ref}, ranks_ref_{ranks_ref}, selects_ref_{selects_ref}, ranks0_ref_{ranks0_ref}, + selects0_ref_{selects0_ref} +{} namespace detail { @@ -22,7 +22,7 @@ class operator_impl(*this); - return (ref_.words_[key / 64] >> (key % 64)) & 1UL; + return (ref_.words_ref_[key / 64][0] >> (key % 64)) & 1UL; } }; @@ -41,11 +41,12 @@ class operator_impl(*this); - const uint64_t block_id = key / 256; - uint64_t begin = ref_.selects_[block_id]; - uint64_t end = ref_.selects_[block_id + 1] + 1UL; + + const uint64_t rank_id = binary_search_selects_array(key, ref_.selects_ref_, ref_.ranks_ref_); + uint64_t word_id = subtract_offset(key, rank_id, ref_.ranks_ref_); + + return (word_id * 64) + ith_set_pos(key, ref_.words_ref_[word_id][0]); + } + + [[nodiscard]] __device__ uint64_t select0(uint64_t key) const noexcept + { + auto const& ref_ = static_cast(*this); + + const uint64_t rank_id = binary_search_selects_array(key, ref_.selects0_ref_, 
ref_.ranks0_ref_); + uint64_t word_id = subtract_offset(key, rank_id, ref_.ranks0_ref_); + + return (word_id * 64) + ith_set_pos(key, ~ref_.words_ref_[word_id][0]); + } + + private: + [[nodiscard]] __device__ uint64_t binary_search_selects_array(uint64_t key, const StorageRef& selects_ref, const StorageRef& ranks_ref) const noexcept { + uint64_t block_id = key / 256; + uint64_t begin = selects_ref[block_id][0]; + uint64_t end = selects_ref[block_id + 1][0] + 1UL; if (begin + 10 >= end) { - while (key >= ref_.ranks_[begin + 1].abs()) { + while (key >= RankUnion{ranks_ref[begin + 1][0]}.rank.abs()) { ++begin; } } else { while (begin + 1 < end) { const uint64_t middle = (begin + end) / 2; - if (key < ref_.ranks_[middle].abs()) { + if (key < RankUnion{ranks_ref[middle][0]}.rank.abs()) { end = middle; } else { - begin = middle; + begin = middle; + } } } + return begin; } - const uint64_t rank_id = begin; - const auto& rank = ref_.ranks_[rank_id]; - key -= rank.abs(); - uint64_t word_id = rank_id * 4; - bool a0 = key >= rank.rels[0]; - bool a1 = key >= rank.rels[1]; - bool a2 = key >= rank.rels[2]; + [[nodiscard]] __device__ uint64_t subtract_offset(uint64_t& key, uint64_t rank_id, const StorageRef& ranks_ref) const noexcept + { + const auto& rank = RankUnion{ranks_ref[rank_id][0]}.rank; + key -= rank.abs(); + + uint64_t word_id = rank_id * 4; + bool a0 = key >= rank.rels[0]; + bool a1 = key >= rank.rels[1]; + bool a2 = key >= rank.rels[2]; - uint32_t inc = a0 + a1 + a2; - word_id += inc; - key -= (inc > 0) * rank.rels[inc - (inc > 0)]; + uint64_t inc = a0 + a1 + a2; + word_id += inc; + key -= (inc > 0) * rank.rels[inc - (inc > 0)]; - return (word_id * 64) + ith_set_pos(key, ref_.words_[word_id]); + return word_id; } - private: -__device__ uint64_t ith_set_pos(uint32_t i, uint64_t word) const { - for (uint32_t pos = 0; pos < i; pos++) { - word &= word - 1; + [[nodiscard]] __device__ uint64_t ith_set_pos(uint32_t i, uint64_t word) const noexcept + { + for (uint32_t pos = 
0; pos < i; pos++) { + word &= word - 1; + } + return __builtin_ffsll(word & -word) - 1; } - return __builtin_ffsll(word & -word) - 1; -} }; template (*this); uint64_t word_id = key / 64; uint64_t bit_id = key % 64; - uint64_t word = ref_.words_[word_id]; + uint64_t word = ref_.words_ref_[word_id][0]; word &= ~(0lu) << bit_id; while (word == 0) { - word = ref_.words_[++word_id]; + word = ref_.words_ref_[++word_id][0]; } return (word_id * 64) + __builtin_ffsll(word) - 1; } diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index 7c21cb628..2334c1d80 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -134,11 +134,11 @@ __global__ __launch_bounds__(256, 1) void trie_lookup_kernel(const trie* t, c } const auto& level = t->d_levels_ptr_[length]; - if (!level.outs.get(node_id)) { + if (!level.outs.ref(cuco::experimental::get).get(node_id)) { ids[key_id] = -1lu; return; } - ids[key_id] = level.offset + level.outs.rank(node_id); + ids[key_id] = level.offset + level.outs.ref(cuco::experimental::rank).rank(node_id); } template @@ -156,7 +156,7 @@ template __device__ uint32_t init_node_pos(const trie* t, uint32_t& node_id, uint32_t cur_depth) { uint32_t node_pos = 0; if (node_id != 0) { - node_pos = t->d_levels_ptr_[cur_depth].louds.select(node_id - 1) + 1; + node_pos = t->d_levels_ptr_[cur_depth].louds.ref(cuco::experimental::select).select(node_id - 1) + 1; node_id = node_pos - node_id; } return node_pos; @@ -168,7 +168,7 @@ __device__ bool binary_search_labels_array(const trie* t, T target, uint32_t& uint32_t node_pos = init_node_pos(t, node_id, level_id); uint32_t begin = node_id; - uint32_t pos_end = level.louds.find_next_set(node_pos); + uint32_t pos_end = level.louds.ref(cuco::experimental::find_next_set).find_next_set(node_pos); uint32_t end = node_id + (pos_end - node_pos); while (begin < end) { diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index 62ac69ebf..5588e8329 100644 --- 
a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -43,14 +43,14 @@ class trie { private: struct Level { - bit_vector louds; - bit_vector outs; + bit_vector<> louds; + bit_vector<> outs; std::vector labels; thrust::device_vector d_labels; T* d_labels_ptr; uint64_t offset; - Level() : louds(), outs(), labels(), offset(0) {} + Level() : louds(cuco::experimental::extent{0}), outs(cuco::experimental::extent{0}), labels(), offset(0) {} uint64_t memory_footprint() const { return louds.size() + outs.size() + sizeof(T) * labels.size(); } }; diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index b30feabbc..87c74ad9d 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -25,7 +25,7 @@ #include template -__global__ void select_kernel(BitVectorRef ref, size_t n, uint32_t* output) { +__global__ void select_kernel(BitVectorRef ref, size_t n, uint64_t* output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; while (index < n) { @@ -34,6 +34,17 @@ __global__ void select_kernel(BitVectorRef ref, size_t n, uint32_t* output) { } } + +template +__global__ void select0_kernel(BitVectorRef ref, size_t n, uint64_t* output) { + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = gridDim.x * blockDim.x; + while (index < n) { + output[index] = ref.select0(index); + index += stride; + } +} + extern bool modulo_bitgen(uint32_t i); TEST_CASE("Select test", "") @@ -49,12 +60,15 @@ TEST_CASE("Select test", "") num_set += modulo_bitgen(i); } bv.build(); - - thrust::device_vector select_result_device(num_set); auto ref = bv.ref(cuco::experimental::select); - select_kernel<<<1, 1024>>>(ref, num_set, thrust::raw_pointer_cast(select_result_device.data())); - thrust::host_vector select_result = select_result_device; + + // Check select + { + thrust::device_vector device_result(num_set); + select_kernel<<<1, 1024>>>(ref, num_set, 
thrust::raw_pointer_cast(device_result.data())); + thrust::host_vector host_result = device_result; + uint32_t num_matches = 0; uint32_t cur_set_pos = -1u; for (size_t i = 0; i < num_set; i++) { @@ -62,7 +76,28 @@ TEST_CASE("Select test", "") cur_set_pos++; } while (cur_set_pos < num_elements and !modulo_bitgen(cur_set_pos)); - num_matches += cur_set_pos == select_result[i]; + num_matches += cur_set_pos == host_result[i]; } REQUIRE(num_matches == num_set); + } + + // Check select0 + { + uint32_t num_not_set = num_elements - num_set; + + thrust::device_vector device_result(num_not_set); + select0_kernel<<<1, 1024>>>(ref, num_not_set, thrust::raw_pointer_cast(device_result.data())); + thrust::host_vector host_result = device_result; + + uint32_t num_matches = 0; + uint32_t cur_not_set_pos = -1u; + for (size_t i = 0; i < num_not_set; i++) { + do { + cur_not_set_pos++; + } while (cur_not_set_pos < num_elements and modulo_bitgen(cur_not_set_pos)); + + num_matches += cur_not_set_pos == host_result[i]; + } + REQUIRE(num_matches == num_not_set); + } } From 97cf3577ad7abfa79bb0343bfd037b346224e187 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Sat, 20 May 2023 05:22:21 +0000 Subject: [PATCH 08/99] Minor --- include/cuco/bit_vector.cuh | 3 + include/cuco/detail/bit_vector/bit_vector.inl | 95 ++++++++----------- include/cuco/detail/trie/trie.inl | 7 ++ 3 files changed, 48 insertions(+), 57 deletions(-) diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index bafdbf735..5f2d42bfb 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -101,6 +101,9 @@ class bit_vector { storage_type aow_words, aow_ranks, aow_selects, aow_ranks0, aow_selects0; void move_to_device(); + + template + void copy_host_array_to_aow(storage_type& aow, std::vector& host_array); }; } // namespace experimental diff --git a/include/cuco/detail/bit_vector/bit_vector.inl b/include/cuco/detail/bit_vector/bit_vector.inl index be96b2728..2fc15dbfc 100644 --- 
a/include/cuco/detail/bit_vector/bit_vector.inl +++ b/include/cuco/detail/bit_vector/bit_vector.inl @@ -18,13 +18,6 @@ namespace cuco { namespace experimental { -template -T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& device_vector) { - device_vector = host_vector; - host_vector.clear(); - return thrust::raw_pointer_cast(device_vector.data()); -} - template void bit_vector::build() { uint64_t n_blocks = words.size() / 4; - uint64_t n_ones = 0, n_zeroes = 0; ranks.resize(n_blocks + 1); ranks0.resize(n_blocks + 1); + + uint64_t n_ones = 0, n_zeroes = 0; for (uint64_t block_id = 0; block_id < n_blocks; ++block_id) { ranks[block_id].set_abs(n_ones); ranks0[block_id].set_abs(n_zeroes); - for (uint64_t j = 0; j < 4; ++j) { - if (j != 0) { - uint64_t rel1 = n_ones - ranks[block_id].abs(); - ranks[block_id].rels[j - 1] = rel1; - uint64_t rel0 = n_zeroes - ranks0[block_id].abs(); - ranks0[block_id].rels[j - 1] = rel0; + for (uint64_t block_offset = 0; block_offset < 4; ++block_offset) { + if (block_offset != 0) { + ranks[block_id].rels[block_offset - 1] = n_ones - ranks[block_id].abs(); + ranks0[block_id].rels[block_offset - 1] = n_zeroes - ranks0[block_id].abs(); } - uint64_t word_id = (block_id * 4) + j; - { - uint64_t word = words[word_id]; + auto update_selects = [] (uint64_t word_id, uint64_t word, uint64_t& gcount, std::vector& selects) { uint64_t n_pops = __builtin_popcountll(word); - uint64_t new_n_ones = n_ones + n_pops; - if (((n_ones + 255) / 256) != ((new_n_ones + 255) / 256)) { - uint64_t count = n_ones; + uint64_t new_gcount = gcount + n_pops; + if (((gcount + 255) / 256) != ((new_gcount + 255) / 256)) { + uint64_t count = gcount; while (word != 0) { uint64_t pos = __builtin_ctzll(word); if (count % 256 == 0) { @@ -92,28 +82,15 @@ void bit_vector::build() { ++count; } } - n_ones = new_n_ones; - } - { - uint64_t word = ~words[word_id]; - uint64_t n_pops = __builtin_popcountll(word); - uint64_t new_n_zeroes = n_zeroes + n_pops; - if 
(((n_zeroes + 255) / 256) != ((new_n_zeroes + 255) / 256)) { - uint64_t count = n_zeroes; - while (word != 0) { - uint64_t pos = __builtin_ctzll(word); - if (count % 256 == 0) { - selects0.push_back(((word_id * 64) + pos) / 256); - break; - } - word ^= 1UL << pos; - ++count; - } - } - n_zeroes = new_n_zeroes; - } + gcount = new_gcount; + }; + + uint64_t word_id = (block_id * 4) + block_offset; + update_selects(word_id, words[word_id], n_ones, selects); + update_selects(word_id, ~words[word_id], n_zeroes, selects0); } } + ranks.back().set_abs(n_ones); ranks0.back().set_abs(n_zeroes); selects.push_back(words.size() * 64 / 256); @@ -179,29 +156,33 @@ void initialize_aow(Storage& storage, T* ptr, uint64_t num_elements) { storage.data(), num_elements, ptr); } +template +template +void bit_vector::copy_host_array_to_aow(storage_type& aow, std::vector& host_array) { + thrust::device_vector device_array = host_array; + auto device_ptr = (uint64_t*)thrust::raw_pointer_cast(device_array.data()); + + uint64_t num_elements = host_array.size(); + host_array.clear(); + + initialize_aow(aow, device_ptr, num_elements); +} + template void bit_vector::move_to_device() { - uint32_t num_ranks = ranks.size(), num_ranks0 = ranks0.size(); - uint32_t num_selects = selects.size(), num_selects0 = selects0.size(); - - thrust::device_vector d_words, d_selects, d_selects0; - thrust::device_vector d_ranks, d_ranks0; - - auto d_words_ptr = move_vector_to_device(words, d_words); - auto d_ranks_ptr = move_vector_to_device(ranks, d_ranks); - auto d_ranks0_ptr = move_vector_to_device(ranks0, d_ranks0); - auto d_selects_ptr = move_vector_to_device(selects, d_selects); - auto d_selects0_ptr = move_vector_to_device(selects0, d_selects0); - - initialize_aow(aow_words, d_words_ptr, d_words.size()); - initialize_aow(aow_ranks, (uint64_t*)d_ranks_ptr, num_ranks); - initialize_aow(aow_selects, d_selects_ptr, num_selects); - initialize_aow(aow_ranks0, (uint64_t*)d_ranks0_ptr, num_ranks0); - 
initialize_aow(aow_selects0, d_selects0_ptr, num_selects0); + copy_host_array_to_aow(aow_words, words); + copy_host_array_to_aow(aow_ranks, ranks); + copy_host_array_to_aow(aow_selects, selects); + copy_host_array_to_aow(aow_ranks0, ranks0); + copy_host_array_to_aow(aow_selects0, selects0); } template ::add(const std::vector& key) { last_key_ = key; } +template +T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& device_vector) { + device_vector = host_vector; + host_vector.clear(); + return thrust::raw_pointer_cast(device_vector.data()); +} + template void trie::build() { uint64_t offset = 0; From 2259c2aec48cb18e23f20e90599f0b3915394d40 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Mon, 22 May 2023 23:07:36 +0000 Subject: [PATCH 09/99] Remove trie --- include/cuco/detail/trie/trie.inl | 196 ------------------------------ include/cuco/trie.cuh | 75 ------------ tests/CMakeLists.txt | 5 - tests/trie/lookup_test.cu | 72 ----------- 4 files changed, 348 deletions(-) delete mode 100644 include/cuco/detail/trie/trie.inl delete mode 100644 include/cuco/trie.cuh delete mode 100644 tests/trie/lookup_test.cu diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl deleted file mode 100644 index d27f97be1..000000000 --- a/include/cuco/detail/trie/trie.inl +++ /dev/null @@ -1,196 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -namespace cuco { -namespace experimental { - -template -trie::trie() - : levels_(2), - d_levels_ptr_(nullptr), - num_levels_(2), - n_keys_(0), - n_nodes_(1), - footprint_(0), - last_key_(), - device_impl_(nullptr) { - levels_[0].louds.add(0); - levels_[0].louds.add(1); - levels_[1].louds.add(1); - levels_[0].outs.add(0); - levels_[0].labels.push_back(sizeof(T) == 1 ? ' ' : (T)-1); -} - -template -trie::~trie() noexcept(false) { - if (d_levels_ptr_) { - CUCO_CUDA_TRY(cudaFree(d_levels_ptr_)); - } - if (device_impl_) { - CUCO_CUDA_TRY(cudaFree(device_impl_)); - } -} - -template -void trie::add(const std::vector& key) { - if (key == last_key_) { - return; - } - assert(n_keys_ == 0 || key > last_key_); - if (key.empty()) { - levels_[0].outs.set(0, 1); - ++levels_[1].offset; - ++n_keys_; - return; - } - if (key.size() + 1 >= levels_.size()) { - levels_.resize(key.size() + 2); - } - uint64_t i = 0; - for (; i < key.size(); ++i) { - auto& level = levels_[i + 1]; - T byte = key[i]; - if ((i == last_key_.size()) || (byte != level.labels.back())) { - level.louds.set_last(0); - level.louds.add(1); - level.outs.add(0); - level.labels.push_back(key[i]); - ++n_nodes_; - break; - } - } - for (++i; i < key.size(); ++i) { - auto& level = levels_[i + 1]; - level.louds.add(0); - level.louds.add(1); - level.outs.add(0); - level.labels.push_back(key[i]); - ++n_nodes_; - } - levels_[key.size() + 1].louds.add(1); - ++levels_[key.size() + 1].offset; - levels_[key.size()].outs.set_last(1); - ++n_keys_; - last_key_ = key; -} - -template -T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& device_vector) { - device_vector = host_vector; - host_vector.clear(); - return thrust::raw_pointer_cast(device_vector.data()); -} - -template -void trie::build() { - uint64_t offset = 0; - for (uint64_t i = 0; i < levels_.size(); ++i) { - auto& level = 
levels_[i]; - level.louds.build(); - level.outs.build(); - offset += level.offset; - level.offset = offset; - footprint_ += level.memory_footprint(); - level.d_labels_ptr = move_vector_to_device(level.labels, level.d_labels); - } - - num_levels_ = levels_.size(); - CUCO_CUDA_TRY(cudaMalloc(&d_levels_ptr_, sizeof(Level) * num_levels_)); - CUCO_CUDA_TRY(cudaMemcpy(d_levels_ptr_, &levels_[0], sizeof(Level) * num_levels_, - cudaMemcpyHostToDevice)); - - CUCO_CUDA_TRY(cudaMalloc(&device_impl_, sizeof(trie))); - CUCO_CUDA_TRY(cudaMemcpy(device_impl_, this, sizeof(trie), cudaMemcpyHostToDevice)); -} - -template -__global__ __launch_bounds__(256, 1) void trie_lookup_kernel(const trie* t, const T* keys, - const uint64_t* offsets, uint64_t* ids, - uint64_t num_queries, - uint64_t start_offset) { - auto const key_id = blockDim.x * blockIdx.x + threadIdx.x; - if (key_id >= num_queries) { - return; - } - - const int length = offsets[key_id + 1] - offsets[key_id]; - const T* query = keys + (offsets[key_id] - start_offset); - - uint32_t node_id = 0; - for (uint32_t cur_depth = 1; cur_depth <= length; cur_depth++) { - if (!binary_search_labels_array(t, query[cur_depth - 1], node_id, cur_depth)) { - ids[key_id] = -1lu; - return; - } - } - - const auto& level = t->d_levels_ptr_[length]; - if (!level.outs.ref(cuco::experimental::get).get(node_id)) { - ids[key_id] = -1lu; - return; - } - ids[key_id] = level.offset + level.outs.ref(cuco::experimental::rank).rank(node_id); -} - -template -void trie::lookup(const T* queries, const uint64_t* offsets, uint64_t* ids, - uint64_t num_queries, uint64_t start_offset, - cudaStream_t stream) const { - int block_size = 256; - int num_blocks = (num_queries - 1) / block_size + 1; - - trie_lookup_kernel<<>>(device_impl_, queries, offsets, ids, - num_queries, start_offset); -} - -template -__device__ uint32_t init_node_pos(const trie* t, uint32_t& node_id, uint32_t cur_depth) { - uint32_t node_pos = 0; - if (node_id != 0) { - node_pos = 
t->d_levels_ptr_[cur_depth].louds.ref(cuco::experimental::select).select(node_id - 1) + 1; - node_id = node_pos - node_id; - } - return node_pos; -} - -template -__device__ bool binary_search_labels_array(const trie* t, T target, uint32_t& node_id, uint32_t level_id) { - const auto& level = t->d_levels_ptr_[level_id]; - - uint32_t node_pos = init_node_pos(t, node_id, level_id); - uint32_t begin = node_id; - uint32_t pos_end = level.louds.ref(cuco::experimental::find_next_set).find_next_set(node_pos); - uint32_t end = node_id + (pos_end - node_pos); - - while (begin < end) { - node_id = (begin + end) / 2; - auto label = level.d_labels_ptr[node_id]; - if (target < label) { - end = node_id; - } else if (target > label) { - begin = node_id + 1; - } else { - break; - } - } - return begin < end; -} - -} // namespace experimental -} // namespace cuco diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh deleted file mode 100644 index 5588e8329..000000000 --- a/include/cuco/trie.cuh +++ /dev/null @@ -1,75 +0,0 @@ -/* - * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. - * SPDX-License-Identifier: Apache-2.0 - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include - -#include -#include -#include -#include - -#include - -namespace cuco { -namespace experimental { - -template -class trie { - public: - trie(); - ~trie() noexcept(false); - void add(const std::vector& key); - void build(); - - void lookup(const T* queries, const uint64_t* offsets, uint64_t* ids, uint64_t num_queries, - uint64_t start_offset, cudaStream_t stream) const; - - uint64_t n_keys() const { return n_keys_; } - uint64_t memory_footprint() const { return footprint_; } - - private: - struct Level { - bit_vector<> louds; - bit_vector<> outs; - std::vector labels; - thrust::device_vector d_labels; - T* d_labels_ptr; - uint64_t offset; - - Level() : louds(cuco::experimental::extent{0}), outs(cuco::experimental::extent{0}), labels(), offset(0) {} - uint64_t memory_footprint() const { return louds.size() + outs.size() + sizeof(T) * labels.size(); } - }; - - public: - Level* d_levels_ptr_; - - private: - uint64_t num_levels_; - std::vector levels_; - - uint64_t n_keys_; - uint64_t n_nodes_; - uint64_t footprint_; - std::vector last_key_; - - trie* device_impl_; -}; - -} // namespace experimental -} // namespace cuco - -#include diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 27e49d988..64fe713ac 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -104,8 +104,3 @@ ConfigureTest(BIT_VECTOR_TEST bit_vector/rank_test.cu bit_vector/select_test.cu bit_vector/size_test.cu) - -################################################################################################### -# - trie tests ------------------------------------------------------------------------------ -#ConfigureTest(TRIE_TEST -# trie/lookup_test.cu) diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu deleted file mode 100644 index 0cdf255d2..000000000 --- a/tests/trie/lookup_test.cu +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. 
- * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include - -#include -#include -#include -#include - -#include - -struct valid_key { - __host__ __device__ bool operator()(uint64_t x) const { - return x != -1lu; - } -}; - -TEST_CASE("Lookup test", "") -{ - - using KeyType = int; - cuco::experimental::trie trie; - - std::size_t num_keys = 3; - thrust::host_vector flatten_keys = std::vector{1, 2, 3, 1, 2, 4, 1, 4, 2}; - thrust::host_vector key_offsets = std::vector{0, 3, 6, 9}; - - for (size_t key_id = 0; key_id < num_keys; key_id++) { - std::vector cur_key; - for (size_t pos = key_offsets[key_id]; pos < key_offsets[key_id + 1]; pos++) { - cur_key.push_back(flatten_keys[pos]); - } - trie.add(cur_key); - } - - trie.build(); - - thrust::device_vector lookup_result(num_keys, -1lu); - { - thrust::device_vector device_keys = flatten_keys; - thrust::device_vector device_offsets = key_offsets; - - trie.lookup(thrust::raw_pointer_cast(device_keys.data()), - thrust::raw_pointer_cast(device_offsets.data()), - thrust::raw_pointer_cast(lookup_result.data()), 3, 0, 0); - - thrust::host_vector host_lookup_result = lookup_result; - for (size_t key_id = 0; key_id < num_keys; key_id++) { - REQUIRE(host_lookup_result[key_id] == key_id); - } - } - - thrust::transform(thrust::device, lookup_result.begin(), lookup_result.end(), lookup_result.begin(), valid_key()); - size_t num_matches = thrust::reduce(thrust::device, lookup_result.begin(), lookup_result.end(), 
0); - REQUIRE(num_matches == num_keys); -} From 52307cb1ed633c02a14d315b0c5f79e17ad2344b Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 24 May 2023 17:07:01 +0000 Subject: [PATCH 10/99] Clang format --- include/cuco/bit_vector.cuh | 16 +- include/cuco/bit_vector_ref.cuh | 15 +- include/cuco/detail/bit_vector/bit_vector.inl | 142 +++++++----------- .../cuco/detail/bit_vector/bit_vector_ref.inl | 88 +++++------ tests/bit_vector/find_next_set_test.cu | 12 +- tests/bit_vector/get_test.cu | 7 +- tests/bit_vector/rank_test.cu | 15 +- tests/bit_vector/select_test.cu | 74 ++++----- tests/bit_vector/size_test.cu | 2 +- 9 files changed, 172 insertions(+), 199 deletions(-) diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index 5f2d42bfb..c926a4f87 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -36,7 +36,8 @@ struct Rank { uint8_t rels[3]; __host__ __device__ uint64_t abs() const { return ((uint64_t)abs_hi << 8) | abs_lo; } - void set_abs(uint64_t abs) { + void set_abs(uint64_t abs) + { abs_hi = (uint32_t)(abs >> 8); abs_lo = (uint8_t)abs; } @@ -57,8 +58,8 @@ class bit_vector { public: bit_vector(Extent capacity); - void add(bool bit); // adds a new bit at the end - void build(); // builds indexes for rank and select. + void add(bool bit); // adds a new bit at the end + void build(); // builds indexes for rank and select. 
void set(Key i, bool bit); void set_last(bool bit); @@ -67,8 +68,8 @@ class bit_vector { static constexpr auto window_size = 1; static constexpr auto thread_scope = Scope; - using key_type = Key; ///< Key type - using value_type = Key; ///< Key type + using key_type = Key; ///< Key type + using value_type = Key; ///< Key type using extent_type = decltype(make_valid_extent(std::declval())); using size_type = typename extent_type::value_type; ///< Size type using allocator_type = Allocator; ///< Allocator type @@ -78,8 +79,7 @@ class bit_vector { using storage_ref_type = typename storage_type::ref_type; ///< Non-owning window storage ref type template using ref_type = - cuco::experimental::bit_vector_ref< - storage_ref_type, + cuco::experimental::bit_vector_ref; ///< Non-owning container ref type template @@ -97,7 +97,7 @@ class bit_vector { std::vector selects, selects0; // Device structures - allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage + allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage storage_type aow_words, aow_ranks, aow_selects, aow_ranks0, aow_selects0; void move_to_device(); diff --git a/include/cuco/bit_vector_ref.cuh b/include/cuco/bit_vector_ref.cuh index d6d6c913c..464934ef8 100644 --- a/include/cuco/bit_vector_ref.cuh +++ b/include/cuco/bit_vector_ref.cuh @@ -7,22 +7,21 @@ namespace experimental { struct Rank; -template +template class bit_vector_ref - : public detail::operator_impl< - Operators, - bit_vector_ref>... { + : public detail::operator_impl>... { public: - using storage_ref_type = StorageRef; ///< Type of storage ref + using storage_ref_type = StorageRef; ///< Type of storage ref /** * @brief Constructs bit_vector_ref. 
* * @param storage_ref Non-owning ref of slot storage */ __host__ __device__ explicit constexpr bit_vector_ref(storage_ref_type words_ref, - storage_ref_type ranks_ref, storage_ref_type selects_ref, storage_ref_type ranks0_ref, - storage_ref_type selects0_ref) noexcept; + storage_ref_type ranks_ref, + storage_ref_type selects_ref, + storage_ref_type ranks0_ref, + storage_ref_type selects0_ref) noexcept; private: storage_ref_type words_ref_, ranks_ref_, selects_ref_, ranks0_ref_, selects0_ref_; diff --git a/include/cuco/detail/bit_vector/bit_vector.inl b/include/cuco/detail/bit_vector/bit_vector.inl index 2fc15dbfc..1df04465b 100644 --- a/include/cuco/detail/bit_vector/bit_vector.inl +++ b/include/cuco/detail/bit_vector/bit_vector.inl @@ -18,13 +18,12 @@ namespace cuco { namespace experimental { -template +template bit_vector::bit_vector(Extent capacity) - : words(), ranks(), selects(), n_bits(0), + : words(), + ranks(), + selects(), + n_bits(0), aow_words{make_valid_extent(capacity), allocator_}, aow_ranks{make_valid_extent(capacity), allocator_}, aow_selects{make_valid_extent(capacity), allocator_}, @@ -33,25 +32,17 @@ bit_vector::bit_vector(Extent capacity) { } -template -void bit_vector::add(bool bit) { - if (n_bits % 256 == 0) { - words.resize((n_bits + 256) / 64); - } +template +void bit_vector::add(bool bit) +{ + if (n_bits % 256 == 0) { words.resize((n_bits + 256) / 64); } set(n_bits, bit); ++n_bits; } -template -void bit_vector::build() { +template +void bit_vector::build() +{ uint64_t n_blocks = words.size() / 4; ranks.resize(n_blocks + 1); ranks0.resize(n_blocks + 1); @@ -63,27 +54,28 @@ void bit_vector::build() { for (uint64_t block_offset = 0; block_offset < 4; ++block_offset) { if (block_offset != 0) { - ranks[block_id].rels[block_offset - 1] = n_ones - ranks[block_id].abs(); + ranks[block_id].rels[block_offset - 1] = n_ones - ranks[block_id].abs(); ranks0[block_id].rels[block_offset - 1] = n_zeroes - ranks0[block_id].abs(); } - auto update_selects = 
[] (uint64_t word_id, uint64_t word, uint64_t& gcount, std::vector& selects) { - uint64_t n_pops = __builtin_popcountll(word); - uint64_t new_gcount = gcount + n_pops; - if (((gcount + 255) / 256) != ((new_gcount + 255) / 256)) { - uint64_t count = gcount; - while (word != 0) { - uint64_t pos = __builtin_ctzll(word); - if (count % 256 == 0) { - selects.push_back(((word_id * 64) + pos) / 256); - break; + auto update_selects = + [](uint64_t word_id, uint64_t word, uint64_t& gcount, std::vector& selects) { + uint64_t n_pops = __builtin_popcountll(word); + uint64_t new_gcount = gcount + n_pops; + if (((gcount + 255) / 256) != ((new_gcount + 255) / 256)) { + uint64_t count = gcount; + while (word != 0) { + uint64_t pos = __builtin_ctzll(word); + if (count % 256 == 0) { + selects.push_back(((word_id * 64) + pos) / 256); + break; + } + word ^= 1UL << pos; + ++count; } - word ^= 1UL << pos; - ++count; } - } - gcount = new_gcount; - }; + gcount = new_gcount; + }; uint64_t word_id = (block_id * 4) + block_offset; update_selects(word_id, words[word_id], n_ones, selects); @@ -99,12 +91,9 @@ void bit_vector::build() { move_to_device(); } -template -void bit_vector::set(Key i, bool bit) { +template +void bit_vector::set(Key i, bool bit) +{ if (bit) { words[i / 64] |= (1UL << (i % 64)); } else { @@ -112,59 +101,49 @@ void bit_vector::set(Key i, bool bit) { } } -template -void bit_vector::set_last(bool bit) { +template +void bit_vector::set_last(bool bit) +{ set(n_bits - 1, bit); } -template -size_t bit_vector::memory_footprint() const { +template +size_t bit_vector::memory_footprint() const +{ return sizeof(uint64_t) * words.size() + sizeof(Rank) * (ranks.size() + ranks0.size()) + sizeof(uint64_t) * (selects.size() + selects0.size()); } template -__global__ void copy_to_window(WindowT* windows, - cuco::detail::index_type n, - T* values) +__global__ void copy_to_window(WindowT* windows, cuco::detail::index_type n, T* values) { cuco::detail::index_type const loop_stride = gridDim.x 
* blockDim.x; cuco::detail::index_type idx = blockDim.x * blockIdx.x + threadIdx.x; while (idx < n) { auto& window_slots = *(windows + idx); - window_slots[0] = values[idx]; + window_slots[0] = values[idx]; idx += loop_stride; } } template -void initialize_aow(Storage& storage, T* ptr, uint64_t num_elements) { +void initialize_aow(Storage& storage, T* ptr, uint64_t num_elements) +{ auto constexpr stride = 4; auto const grid_size = (num_elements + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (stride * detail::CUCO_DEFAULT_BLOCK_SIZE); + (stride * detail::CUCO_DEFAULT_BLOCK_SIZE); - copy_to_window<<>>( - storage.data(), num_elements, ptr); + copy_to_window<<>>(storage.data(), num_elements, ptr); } -template +template template -void bit_vector::copy_host_array_to_aow(storage_type& aow, std::vector& host_array) { +void bit_vector::copy_host_array_to_aow( + storage_type& aow, std::vector& host_array) +{ thrust::device_vector device_array = host_array; - auto device_ptr = (uint64_t*)thrust::raw_pointer_cast(device_array.data()); + auto device_ptr = (uint64_t*)thrust::raw_pointer_cast(device_array.data()); uint64_t num_elements = host_array.size(); host_array.clear(); @@ -172,12 +151,9 @@ void bit_vector::copy_host_array_to_aow( initialize_aow(aow, device_ptr, num_elements); } -template -void bit_vector::move_to_device() { +template +void bit_vector::move_to_device() +{ copy_host_array_to_aow(aow_words, words); copy_host_array_to_aow(aow_ranks, ranks); copy_host_array_to_aow(aow_selects, selects); @@ -185,20 +161,14 @@ void bit_vector::move_to_device() { copy_host_array_to_aow(aow_selects0, selects0); } -template +template template -auto bit_vector::ref( - Operators...) const noexcept +auto bit_vector::ref(Operators...) 
const noexcept { static_assert(sizeof...(Operators), "No operators specified"); - return ref_type{aow_words.ref(), aow_ranks.ref(), aow_selects.ref(), - aow_ranks0.ref(), aow_selects0.ref()}; + return ref_type{ + aow_words.ref(), aow_ranks.ref(), aow_selects.ref(), aow_ranks0.ref(), aow_selects0.ref()}; } } // namespace experimental } // namespace cuco - diff --git a/include/cuco/detail/bit_vector/bit_vector_ref.inl b/include/cuco/detail/bit_vector/bit_vector_ref.inl index 9addae37e..83ee93b36 100644 --- a/include/cuco/detail/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/bit_vector/bit_vector_ref.inl @@ -3,20 +3,26 @@ namespace cuco { namespace experimental { -template -__host__ __device__ constexpr bit_vector_ref::bit_vector_ref(StorageRef words_ref, StorageRef ranks_ref, StorageRef selects_ref, StorageRef ranks0_ref, StorageRef selects0_ref) noexcept - : words_ref_{words_ref}, ranks_ref_{ranks_ref}, selects_ref_{selects_ref}, ranks0_ref_{ranks0_ref}, +template +__host__ __device__ constexpr bit_vector_ref::bit_vector_ref( + StorageRef words_ref, + StorageRef ranks_ref, + StorageRef selects_ref, + StorageRef ranks0_ref, + StorageRef selects0_ref) noexcept + : words_ref_{words_ref}, + ranks_ref_{ranks_ref}, + selects_ref_{selects_ref}, + ranks0_ref_{ranks0_ref}, selects0_ref_{selects0_ref} -{} +{ +} namespace detail { -template -class operator_impl> { - using ref_type = bit_vector_ref; +template +class operator_impl> { + using ref_type = bit_vector_ref; public: [[nodiscard]] __device__ bool get(uint64_t key) const noexcept @@ -26,11 +32,9 @@ class operator_impl -class operator_impl> { - using ref_type = bit_vector_ref; +template +class operator_impl> { + using ref_type = bit_vector_ref; public: [[nodiscard]] __device__ uint64_t rank(uint64_t key) const noexcept @@ -38,24 +42,20 @@ class operator_impl(*this); uint64_t word_id = key / 64; - uint64_t bit_id = key % 64; + uint64_t bit_id = key % 64; uint64_t rank_id = word_id / 4; - uint64_t rel_id = word_id 
% 4; - auto rank = RankUnion{ref_.ranks_ref_[rank_id][0]}.rank; - uint64_t n = rank.abs(); - if (rel_id != 0) { - n += rank.rels[rel_id - 1]; - } + uint64_t rel_id = word_id % 4; + auto rank = RankUnion{ref_.ranks_ref_[rank_id][0]}.rank; + uint64_t n = rank.abs(); + if (rel_id != 0) { n += rank.rels[rel_id - 1]; } n += __builtin_popcountll(ref_.words_ref_[word_id][0] & ((1UL << bit_id) - 1)); return n; } }; -template -class operator_impl> { - using ref_type = bit_vector_ref; +template +class operator_impl> { + using ref_type = bit_vector_ref; public: [[nodiscard]] __device__ uint64_t select(uint64_t key) const noexcept @@ -63,7 +63,7 @@ class operator_impl(*this); const uint64_t rank_id = binary_search_selects_array(key, ref_.selects_ref_, ref_.ranks_ref_); - uint64_t word_id = subtract_offset(key, rank_id, ref_.ranks_ref_); + uint64_t word_id = subtract_offset(key, rank_id, ref_.ranks_ref_); return (word_id * 64) + ith_set_pos(key, ref_.words_ref_[word_id][0]); } @@ -73,16 +73,18 @@ class operator_impl(*this); const uint64_t rank_id = binary_search_selects_array(key, ref_.selects0_ref_, ref_.ranks0_ref_); - uint64_t word_id = subtract_offset(key, rank_id, ref_.ranks0_ref_); + uint64_t word_id = subtract_offset(key, rank_id, ref_.ranks0_ref_); return (word_id * 64) + ith_set_pos(key, ~ref_.words_ref_[word_id][0]); } private: - [[nodiscard]] __device__ uint64_t binary_search_selects_array(uint64_t key, const StorageRef& selects_ref, const StorageRef& ranks_ref) const noexcept { + [[nodiscard]] __device__ uint64_t binary_search_selects_array( + uint64_t key, const StorageRef& selects_ref, const StorageRef& ranks_ref) const noexcept + { uint64_t block_id = key / 256; - uint64_t begin = selects_ref[block_id][0]; - uint64_t end = selects_ref[block_id + 1][0] + 1UL; + uint64_t begin = selects_ref[block_id][0]; + uint64_t end = selects_ref[block_id + 1][0] + 1UL; if (begin + 10 >= end) { while (key >= RankUnion{ranks_ref[begin + 1][0]}.rank.abs()) { ++begin; @@ -100,15 
+102,17 @@ class operator_impl= rank.rels[0]; - bool a1 = key >= rank.rels[1]; - bool a2 = key >= rank.rels[2]; + bool a0 = key >= rank.rels[0]; + bool a1 = key >= rank.rels[1]; + bool a2 = key >= rank.rels[2]; uint64_t inc = a0 + a1 + a2; word_id += inc; @@ -126,19 +130,17 @@ class operator_impl -class operator_impl> { - using ref_type = bit_vector_ref; +template +class operator_impl> { + using ref_type = bit_vector_ref; public: [[nodiscard]] __device__ uint64_t find_next_set(uint64_t key) const noexcept { auto const& ref_ = static_cast(*this); uint64_t word_id = key / 64; - uint64_t bit_id = key % 64; - uint64_t word = ref_.words_ref_[word_id][0]; + uint64_t bit_id = key % 64; + uint64_t word = ref_.words_ref_[word_id][0]; word &= ~(0lu) << bit_id; while (word == 0) { word = ref_.words_ref_[++word_id][0]; diff --git a/tests/bit_vector/find_next_set_test.cu b/tests/bit_vector/find_next_set_test.cu index cefcf8f89..4a4597999 100644 --- a/tests/bit_vector/find_next_set_test.cu +++ b/tests/bit_vector/find_next_set_test.cu @@ -25,8 +25,9 @@ #include template -__global__ void find_next_set_kernel(BitVectorRef ref, size_t n, uint32_t* output) { - size_t index = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void find_next_set_kernel(BitVectorRef ref, size_t n, uint32_t* output) +{ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; while (index < n) { output[index] = ref.find_next_set(index); @@ -49,11 +50,12 @@ TEST_CASE("Find next set test", "") bv.build(); thrust::device_vector device_result(num_elements); - auto ref = bv.ref(cuco::experimental::find_next_set); - find_next_set_kernel<<<1, 1024>>>(ref, num_elements, thrust::raw_pointer_cast(device_result.data())); + auto ref = bv.ref(cuco::experimental::find_next_set); + find_next_set_kernel<<<1, 1024>>>( + ref, num_elements, thrust::raw_pointer_cast(device_result.data())); thrust::host_vector host_result = device_result; - uint32_t num_matches = 0; + uint32_t 
num_matches = 0; uint32_t next_set_pos = -1u; do { diff --git a/tests/bit_vector/get_test.cu b/tests/bit_vector/get_test.cu index d5d206513..788d22d1a 100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/bit_vector/get_test.cu @@ -24,8 +24,9 @@ #include template -__global__ void get_kernel(BitVectorRef ref, size_t n, uint32_t* output) { - size_t index = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void get_kernel(BitVectorRef ref, size_t n, uint32_t* output) +{ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; while (index < n) { output[index] = ref.get(index); @@ -49,7 +50,7 @@ TEST_CASE("Get test", "") } bv.build(); - auto ref = bv.ref(cuco::experimental::get); + auto ref = bv.ref(cuco::experimental::get); thrust::device_vector get_result(num_elements); get_kernel<<<1, 1024>>>(ref, num_elements, thrust::raw_pointer_cast(get_result.data())); diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu index dc6bfd682..ee873ad2d 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/bit_vector/rank_test.cu @@ -25,8 +25,9 @@ #include template -__global__ void rank_kernel(BitVectorRef ref, size_t n, uint32_t* output) { - size_t index = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void rank_kernel(BitVectorRef ref, size_t n, uint32_t* output) +{ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; while (index < n) { output[index] = ref.rank(index); @@ -49,17 +50,15 @@ TEST_CASE("Rank test", "") bv.build(); thrust::device_vector rank_result_device(num_elements); - auto ref = bv.ref(cuco::experimental::rank); + auto ref = bv.ref(cuco::experimental::rank); rank_kernel<<<1, 1024>>>(ref, num_elements, thrust::raw_pointer_cast(rank_result_device.data())); thrust::host_vector rank_result = rank_result_device; - uint32_t cur_rank = 0; - uint32_t num_matches = 0; + uint32_t cur_rank = 0; + uint32_t num_matches = 0; for (size_t i = 0; i < num_elements; i++) { 
num_matches += cur_rank == rank_result[i]; - if (modulo_bitgen(i)) { - cur_rank++; - } + if (modulo_bitgen(i)) { cur_rank++; } } REQUIRE(num_matches == num_elements); } diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index 87c74ad9d..5aff9f270 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -25,8 +25,9 @@ #include template -__global__ void select_kernel(BitVectorRef ref, size_t n, uint64_t* output) { - size_t index = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void select_kernel(BitVectorRef ref, size_t n, uint64_t* output) +{ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; while (index < n) { output[index] = ref.select(index); @@ -34,10 +35,10 @@ __global__ void select_kernel(BitVectorRef ref, size_t n, uint64_t* output) { } } - template -__global__ void select0_kernel(BitVectorRef ref, size_t n, uint64_t* output) { - size_t index = blockIdx.x * blockDim.x + threadIdx.x; +__global__ void select0_kernel(BitVectorRef ref, size_t n, uint64_t* output) +{ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; while (index < n) { output[index] = ref.select0(index); @@ -60,44 +61,43 @@ TEST_CASE("Select test", "") num_set += modulo_bitgen(i); } bv.build(); - auto ref = bv.ref(cuco::experimental::select); - + auto ref = bv.ref(cuco::experimental::select); // Check select { - thrust::device_vector device_result(num_set); - select_kernel<<<1, 1024>>>(ref, num_set, thrust::raw_pointer_cast(device_result.data())); - thrust::host_vector host_result = device_result; - - uint32_t num_matches = 0; - uint32_t cur_set_pos = -1u; - for (size_t i = 0; i < num_set; i++) { - do { - cur_set_pos++; - } while (cur_set_pos < num_elements and !modulo_bitgen(cur_set_pos)); - - num_matches += cur_set_pos == host_result[i]; - } - REQUIRE(num_matches == num_set); + thrust::device_vector device_result(num_set); + select_kernel<<<1, 
1024>>>(ref, num_set, thrust::raw_pointer_cast(device_result.data())); + thrust::host_vector host_result = device_result; + + uint32_t num_matches = 0; + uint32_t cur_set_pos = -1u; + for (size_t i = 0; i < num_set; i++) { + do { + cur_set_pos++; + } while (cur_set_pos < num_elements and !modulo_bitgen(cur_set_pos)); + + num_matches += cur_set_pos == host_result[i]; + } + REQUIRE(num_matches == num_set); } // Check select0 { - uint32_t num_not_set = num_elements - num_set; - - thrust::device_vector device_result(num_not_set); - select0_kernel<<<1, 1024>>>(ref, num_not_set, thrust::raw_pointer_cast(device_result.data())); - thrust::host_vector host_result = device_result; - - uint32_t num_matches = 0; - uint32_t cur_not_set_pos = -1u; - for (size_t i = 0; i < num_not_set; i++) { - do { - cur_not_set_pos++; - } while (cur_not_set_pos < num_elements and modulo_bitgen(cur_not_set_pos)); - - num_matches += cur_not_set_pos == host_result[i]; - } - REQUIRE(num_matches == num_not_set); + uint32_t num_not_set = num_elements - num_set; + + thrust::device_vector device_result(num_not_set); + select0_kernel<<<1, 1024>>>(ref, num_not_set, thrust::raw_pointer_cast(device_result.data())); + thrust::host_vector host_result = device_result; + + uint32_t num_matches = 0; + uint32_t cur_not_set_pos = -1u; + for (size_t i = 0; i < num_not_set; i++) { + do { + cur_not_set_pos++; + } while (cur_not_set_pos < num_elements and modulo_bitgen(cur_not_set_pos)); + + num_matches += cur_not_set_pos == host_result[i]; + } + REQUIRE(num_matches == num_not_set); } } diff --git a/tests/bit_vector/size_test.cu b/tests/bit_vector/size_test.cu index d5bcf4ab1..559391407 100644 --- a/tests/bit_vector/size_test.cu +++ b/tests/bit_vector/size_test.cu @@ -25,7 +25,7 @@ TEST_CASE("Size computation", "") cuco::experimental::bit_vector bv{cuco::experimental::extent{400}}; for (size_t i = 0; i < num_elements; i++) { - bv.add(i % 2 == 0); // Alternate 0s and 1s pattern + bv.add(i % 2 == 0); // Alternate 0s 
and 1s pattern } bv.build(); From 903e36dbd910b17b48725d219fd4c039f7c6aa06 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 24 May 2023 21:54:18 +0000 Subject: [PATCH 11/99] Coalesce all bitvector operations into single tag --- .../cuco/detail/bit_vector/bit_vector_ref.inl | 44 ++++++------------- include/cuco/operator.hpp | 24 ++-------- tests/bit_vector/find_next_set_test.cu | 2 +- tests/bit_vector/get_test.cu | 2 +- tests/bit_vector/rank_test.cu | 2 +- tests/bit_vector/select_test.cu | 2 +- 6 files changed, 20 insertions(+), 56 deletions(-) diff --git a/include/cuco/detail/bit_vector/bit_vector_ref.inl b/include/cuco/detail/bit_vector/bit_vector_ref.inl index 83ee93b36..f9b67020d 100644 --- a/include/cuco/detail/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/bit_vector/bit_vector_ref.inl @@ -21,7 +21,7 @@ __host__ __device__ constexpr bit_vector_ref::bit_vect namespace detail { template -class operator_impl> { +class operator_impl> { using ref_type = bit_vector_ref; public: @@ -30,13 +30,20 @@ class operator_impl> { auto const& ref_ = static_cast(*this); return (ref_.words_ref_[key / 64][0] >> (key % 64)) & 1UL; } -}; -template -class operator_impl> { - using ref_type = bit_vector_ref; + [[nodiscard]] __device__ uint64_t find_next_set(uint64_t key) const noexcept + { + auto const& ref_ = static_cast(*this); + uint64_t word_id = key / 64; + uint64_t bit_id = key % 64; + uint64_t word = ref_.words_ref_[word_id][0]; + word &= ~(0lu) << bit_id; + while (word == 0) { + word = ref_.words_ref_[++word_id][0]; + } + return (word_id * 64) + __builtin_ffsll(word) - 1; + } - public: [[nodiscard]] __device__ uint64_t rank(uint64_t key) const noexcept { auto const& ref_ = static_cast(*this); @@ -51,13 +58,7 @@ class operator_impl> { n += __builtin_popcountll(ref_.words_ref_[word_id][0] & ((1UL << bit_id) - 1)); return n; } -}; - -template -class operator_impl> { - using ref_type = bit_vector_ref; - public: [[nodiscard]] __device__ uint64_t select(uint64_t 
key) const noexcept { auto const& ref_ = static_cast(*this); @@ -130,25 +131,6 @@ class operator_impl> { } }; -template -class operator_impl> { - using ref_type = bit_vector_ref; - - public: - [[nodiscard]] __device__ uint64_t find_next_set(uint64_t key) const noexcept - { - auto const& ref_ = static_cast(*this); - uint64_t word_id = key / 64; - uint64_t bit_id = key % 64; - uint64_t word = ref_.words_ref_[word_id][0]; - word &= ~(0lu) << bit_id; - while (word == 0) { - word = ref_.words_ref_[++word_id][0]; - } - return (word_id * 64) + __builtin_ffsll(word) - 1; - } -}; - } // namespace detail } // namespace experimental } // namespace cuco diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp index f813c64c2..f9165d3bf 100644 --- a/include/cuco/operator.hpp +++ b/include/cuco/operator.hpp @@ -46,28 +46,10 @@ struct find_tag { } inline constexpr find; /** - * @brief `get` operator tag + * @brief `bv_read` operator tag */ -struct get_tag { -} inline constexpr get; - -/** - * @brief `rank` operator tag - */ -struct rank_tag { -} inline constexpr rank; - -/** - * @brief `select` operator tag - */ -struct select_tag { -} inline constexpr select; - -/** - * @brief `find_next_set` operator tag - */ -struct find_next_set_tag { -} inline constexpr find_next_set; +struct bv_read_tag { +} inline constexpr bv_read; } // namespace op } // namespace experimental diff --git a/tests/bit_vector/find_next_set_test.cu b/tests/bit_vector/find_next_set_test.cu index 4a4597999..388566ebf 100644 --- a/tests/bit_vector/find_next_set_test.cu +++ b/tests/bit_vector/find_next_set_test.cu @@ -50,7 +50,7 @@ TEST_CASE("Find next set test", "") bv.build(); thrust::device_vector device_result(num_elements); - auto ref = bv.ref(cuco::experimental::find_next_set); + auto ref = bv.ref(cuco::experimental::bv_read); find_next_set_kernel<<<1, 1024>>>( ref, num_elements, thrust::raw_pointer_cast(device_result.data())); diff --git a/tests/bit_vector/get_test.cu 
b/tests/bit_vector/get_test.cu index 788d22d1a..172ead363 100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/bit_vector/get_test.cu @@ -50,7 +50,7 @@ TEST_CASE("Get test", "") } bv.build(); - auto ref = bv.ref(cuco::experimental::get); + auto ref = bv.ref(cuco::experimental::bv_read); thrust::device_vector get_result(num_elements); get_kernel<<<1, 1024>>>(ref, num_elements, thrust::raw_pointer_cast(get_result.data())); diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu index ee873ad2d..bf7331b6e 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/bit_vector/rank_test.cu @@ -50,7 +50,7 @@ TEST_CASE("Rank test", "") bv.build(); thrust::device_vector rank_result_device(num_elements); - auto ref = bv.ref(cuco::experimental::rank); + auto ref = bv.ref(cuco::experimental::bv_read); rank_kernel<<<1, 1024>>>(ref, num_elements, thrust::raw_pointer_cast(rank_result_device.data())); thrust::host_vector rank_result = rank_result_device; diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index 5aff9f270..49e9a9ab6 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -61,7 +61,7 @@ TEST_CASE("Select test", "") num_set += modulo_bitgen(i); } bv.build(); - auto ref = bv.ref(cuco::experimental::select); + auto ref = bv.ref(cuco::experimental::bv_read); // Check select { From 0749c04fd3ccc6d2eaf6bd85fcd179e5a89fd2a2 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 26 May 2023 04:20:49 +0000 Subject: [PATCH 12/99] Naming --- include/cuco/bit_vector.cuh | 38 +++++----- include/cuco/detail/bit_vector/bit_vector.inl | 74 +++++++++---------- .../cuco/detail/bit_vector/bit_vector_ref.inl | 18 ++--- 3 files changed, 65 insertions(+), 65 deletions(-) diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index c926a4f87..809ad5e09 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -29,24 +29,24 @@ namespace cuco { namespace experimental { -struct Rank 
{ +struct rank { // Basically a uint64_t split into 1 uin32_t and 2 uint8_t - uint32_t abs_hi; - uint8_t abs_lo; - uint8_t rels[3]; + uint32_t abs_hi_; + uint8_t abs_lo_; + uint8_t rels_[3]; - __host__ __device__ uint64_t abs() const { return ((uint64_t)abs_hi << 8) | abs_lo; } + __host__ __device__ uint64_t abs() const { return ((uint64_t)abs_hi_ << 8) | abs_lo_; } void set_abs(uint64_t abs) { - abs_hi = (uint32_t)(abs >> 8); - abs_lo = (uint8_t)abs; + abs_hi_ = (uint32_t)(abs >> 8); + abs_lo_ = (uint8_t)abs; } }; // Need this union to use uint64_t for all aow_storage structures -union RankUnion { - uint64_t word; - Rank rank; +union rank_union { + uint64_t word_; + rank rank_; }; template [[nodiscard]] auto ref(Operators... ops) const noexcept; - size_t size() const { return n_bits; } + size_t size() const { return n_bits_; } size_t memory_footprint() const; private: - uint64_t n_bits; + uint64_t n_bits_; // Host structures - std::vector words; - std::vector ranks, ranks0; - std::vector selects, selects0; + std::vector words_; + std::vector ranks_, ranks0_; + std::vector selects_, selects0_; // Device structures allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage - storage_type aow_words, aow_ranks, aow_selects, aow_ranks0, aow_selects0; - - void move_to_device(); + storage_type aow_words_, aow_ranks_, aow_selects_, aow_ranks0_, aow_selects0_; template void copy_host_array_to_aow(storage_type& aow, std::vector& host_array); + + void move_to_device(); }; } // namespace experimental diff --git a/include/cuco/detail/bit_vector/bit_vector.inl b/include/cuco/detail/bit_vector/bit_vector.inl index 1df04465b..f2b329c77 100644 --- a/include/cuco/detail/bit_vector/bit_vector.inl +++ b/include/cuco/detail/bit_vector/bit_vector.inl @@ -20,42 +20,42 @@ namespace experimental { template bit_vector::bit_vector(Extent capacity) - : words(), - ranks(), - selects(), - n_bits(0), - aow_words{make_valid_extent(capacity), allocator_}, - 
aow_ranks{make_valid_extent(capacity), allocator_}, - aow_selects{make_valid_extent(capacity), allocator_}, - aow_ranks0{make_valid_extent(capacity), allocator_}, - aow_selects0{make_valid_extent(capacity), allocator_} + : words_(), + ranks_(), + selects_(), + n_bits_(0), + aow_words_{make_valid_extent(capacity), allocator_}, + aow_ranks_{make_valid_extent(capacity), allocator_}, + aow_selects_{make_valid_extent(capacity), allocator_}, + aow_ranks0_{make_valid_extent(capacity), allocator_}, + aow_selects0_{make_valid_extent(capacity), allocator_} { } template void bit_vector::add(bool bit) { - if (n_bits % 256 == 0) { words.resize((n_bits + 256) / 64); } - set(n_bits, bit); - ++n_bits; + if (n_bits_ % 256 == 0) { words_.resize((n_bits_ + 256) / 64); } + set(n_bits_, bit); + ++n_bits_; } template void bit_vector::build() { - uint64_t n_blocks = words.size() / 4; - ranks.resize(n_blocks + 1); - ranks0.resize(n_blocks + 1); + uint64_t n_blocks = words_.size() / 4; + ranks_.resize(n_blocks + 1); + ranks0_.resize(n_blocks + 1); uint64_t n_ones = 0, n_zeroes = 0; for (uint64_t block_id = 0; block_id < n_blocks; ++block_id) { - ranks[block_id].set_abs(n_ones); - ranks0[block_id].set_abs(n_zeroes); + ranks_[block_id].set_abs(n_ones); + ranks0_[block_id].set_abs(n_zeroes); for (uint64_t block_offset = 0; block_offset < 4; ++block_offset) { if (block_offset != 0) { - ranks[block_id].rels[block_offset - 1] = n_ones - ranks[block_id].abs(); - ranks0[block_id].rels[block_offset - 1] = n_zeroes - ranks0[block_id].abs(); + ranks_[block_id].rels_[block_offset - 1] = n_ones - ranks_[block_id].abs(); + ranks0_[block_id].rels_[block_offset - 1] = n_zeroes - ranks0_[block_id].abs(); } auto update_selects = @@ -78,40 +78,40 @@ void bit_vector::build() }; uint64_t word_id = (block_id * 4) + block_offset; - update_selects(word_id, words[word_id], n_ones, selects); - update_selects(word_id, ~words[word_id], n_zeroes, selects0); + update_selects(word_id, words_[word_id], n_ones, selects_); 
+ update_selects(word_id, ~words_[word_id], n_zeroes, selects0_); } } - ranks.back().set_abs(n_ones); - ranks0.back().set_abs(n_zeroes); - selects.push_back(words.size() * 64 / 256); - selects0.push_back(words.size() * 64 / 256); + ranks_.back().set_abs(n_ones); + ranks0_.back().set_abs(n_zeroes); + selects_.push_back(words_.size() * 64 / 256); + selects0_.push_back(words_.size() * 64 / 256); move_to_device(); } template -void bit_vector::set(Key i, bool bit) +void bit_vector::set(Key key, bool bit) { if (bit) { - words[i / 64] |= (1UL << (i % 64)); + words_[key / 64] |= (1UL << (key % 64)); } else { - words[i / 64] &= ~(1UL << (i % 64)); + words_[key / 64] &= ~(1UL << (key % 64)); } } template void bit_vector::set_last(bool bit) { - set(n_bits - 1, bit); + set(n_bits_ - 1, bit); } template size_t bit_vector::memory_footprint() const { - return sizeof(uint64_t) * words.size() + sizeof(Rank) * (ranks.size() + ranks0.size()) + - sizeof(uint64_t) * (selects.size() + selects0.size()); + return sizeof(uint64_t) * words_.size() + sizeof(rank) * (ranks_.size() + ranks0_.size()) + + sizeof(uint64_t) * (selects_.size() + selects0_.size()); } template @@ -154,11 +154,11 @@ void bit_vector::copy_host_array_to_aow( template void bit_vector::move_to_device() { - copy_host_array_to_aow(aow_words, words); - copy_host_array_to_aow(aow_ranks, ranks); - copy_host_array_to_aow(aow_selects, selects); - copy_host_array_to_aow(aow_ranks0, ranks0); - copy_host_array_to_aow(aow_selects0, selects0); + copy_host_array_to_aow(aow_words_, words_); + copy_host_array_to_aow(aow_ranks_, ranks_); + copy_host_array_to_aow(aow_selects_, selects_); + copy_host_array_to_aow(aow_ranks0_, ranks0_); + copy_host_array_to_aow(aow_selects0_, selects0_); } template @@ -167,7 +167,7 @@ auto bit_vector::ref(Operators...) 
const { static_assert(sizeof...(Operators), "No operators specified"); return ref_type{ - aow_words.ref(), aow_ranks.ref(), aow_selects.ref(), aow_ranks0.ref(), aow_selects0.ref()}; + aow_words_.ref(), aow_ranks_.ref(), aow_selects_.ref(), aow_ranks0_.ref(), aow_selects0_.ref()}; } } // namespace experimental diff --git a/include/cuco/detail/bit_vector/bit_vector_ref.inl b/include/cuco/detail/bit_vector/bit_vector_ref.inl index f9b67020d..36c9f66dc 100644 --- a/include/cuco/detail/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/bit_vector/bit_vector_ref.inl @@ -52,9 +52,9 @@ class operator_impl> { uint64_t bit_id = key % 64; uint64_t rank_id = word_id / 4; uint64_t rel_id = word_id % 4; - auto rank = RankUnion{ref_.ranks_ref_[rank_id][0]}.rank; + auto rank = rank_union{ref_.ranks_ref_[rank_id][0]}.rank_; uint64_t n = rank.abs(); - if (rel_id != 0) { n += rank.rels[rel_id - 1]; } + if (rel_id != 0) { n += rank.rels_[rel_id - 1]; } n += __builtin_popcountll(ref_.words_ref_[word_id][0] & ((1UL << bit_id) - 1)); return n; } @@ -87,13 +87,13 @@ class operator_impl> { uint64_t begin = selects_ref[block_id][0]; uint64_t end = selects_ref[block_id + 1][0] + 1UL; if (begin + 10 >= end) { - while (key >= RankUnion{ranks_ref[begin + 1][0]}.rank.abs()) { + while (key >= rank_union{ranks_ref[begin + 1][0]}.rank_.abs()) { ++begin; } } else { while (begin + 1 < end) { const uint64_t middle = (begin + end) / 2; - if (key < RankUnion{ranks_ref[middle][0]}.rank.abs()) { + if (key < rank_union{ranks_ref[middle][0]}.rank_.abs()) { end = middle; } else { begin = middle; @@ -107,17 +107,17 @@ class operator_impl> { uint64_t rank_id, const StorageRef& ranks_ref) const noexcept { - const auto& rank = RankUnion{ranks_ref[rank_id][0]}.rank; + const auto& rank = rank_union{ranks_ref[rank_id][0]}.rank_; key -= rank.abs(); uint64_t word_id = rank_id * 4; - bool a0 = key >= rank.rels[0]; - bool a1 = key >= rank.rels[1]; - bool a2 = key >= rank.rels[2]; + bool a0 = key >= rank.rels_[0]; 
+ bool a1 = key >= rank.rels_[1]; + bool a2 = key >= rank.rels_[2]; uint64_t inc = a0 + a1 + a2; word_id += inc; - key -= (inc > 0) * rank.rels[inc - (inc > 0)]; + key -= (inc > 0) * rank.rels_[inc - (inc > 0)]; return word_id; } From b795601fa73bbd480ff109a33a896009c75f4bc0 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 26 May 2023 06:22:51 +0000 Subject: [PATCH 13/99] Allocate and size aow arrays after host side build Possible by shifting from static to dynamic allocation --- include/cuco/bit_vector.cuh | 7 +-- include/cuco/detail/bit_vector/bit_vector.inl | 50 ++++++++++++------- tests/bit_vector/find_next_set_test.cu | 2 +- tests/bit_vector/get_test.cu | 2 +- tests/bit_vector/rank_test.cu | 2 +- tests/bit_vector/select_test.cu | 2 +- tests/bit_vector/size_test.cu | 2 +- 7 files changed, 42 insertions(+), 25 deletions(-) diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index 809ad5e09..aa4441b5d 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -56,7 +56,8 @@ template > class bit_vector { public: - bit_vector(Extent capacity); + bit_vector(); + ~bit_vector(); void add(bool bit); // adds a new bit at the end void build(); // builds indexes for rank and select. 
@@ -98,10 +99,10 @@ class bit_vector { // Device structures allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage - storage_type aow_words_, aow_ranks_, aow_selects_, aow_ranks0_, aow_selects0_; + storage_type *aow_words_, *aow_ranks_, *aow_selects_, *aow_ranks0_, *aow_selects0_; template - void copy_host_array_to_aow(storage_type& aow, std::vector& host_array); + void copy_host_array_to_aow(storage_type** aow, std::vector& host_array); void move_to_device(); }; diff --git a/include/cuco/detail/bit_vector/bit_vector.inl b/include/cuco/detail/bit_vector/bit_vector.inl index f2b329c77..13c98d499 100644 --- a/include/cuco/detail/bit_vector/bit_vector.inl +++ b/include/cuco/detail/bit_vector/bit_vector.inl @@ -19,19 +19,29 @@ namespace cuco { namespace experimental { template -bit_vector::bit_vector(Extent capacity) +bit_vector::bit_vector() : words_(), ranks_(), selects_(), n_bits_(0), - aow_words_{make_valid_extent(capacity), allocator_}, - aow_ranks_{make_valid_extent(capacity), allocator_}, - aow_selects_{make_valid_extent(capacity), allocator_}, - aow_ranks0_{make_valid_extent(capacity), allocator_}, - aow_selects0_{make_valid_extent(capacity), allocator_} + aow_words_(nullptr), + aow_ranks_(nullptr), + aow_selects_(nullptr), + aow_ranks0_(nullptr), + aow_selects0_(nullptr) { } +template +bit_vector::~bit_vector() +{ + delete aow_words_; + delete aow_ranks_; + delete aow_selects_; + delete aow_ranks0_; + delete aow_selects0_; +} + template void bit_vector::add(bool bit) { @@ -128,19 +138,20 @@ __global__ void copy_to_window(WindowT* windows, cuco::detail::index_type n, T* } template -void initialize_aow(Storage& storage, T* ptr, uint64_t num_elements) +void initialize_aow(Storage* storage, T* ptr, uint64_t num_elements) { auto constexpr stride = 4; auto const grid_size = (num_elements + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / (stride * detail::CUCO_DEFAULT_BLOCK_SIZE); - copy_to_window<<>>(storage.data(), num_elements, ptr); + 
copy_to_window<<>>( + storage->data(), num_elements, ptr); } template template void bit_vector::copy_host_array_to_aow( - storage_type& aow, std::vector& host_array) + storage_type** aow, std::vector& host_array) { thrust::device_vector device_array = host_array; auto device_ptr = (uint64_t*)thrust::raw_pointer_cast(device_array.data()); @@ -148,17 +159,19 @@ void bit_vector::copy_host_array_to_aow( uint64_t num_elements = host_array.size(); host_array.clear(); - initialize_aow(aow, device_ptr, num_elements); + *aow = new storage_type(make_valid_extent(extent{num_elements}), + allocator_); + initialize_aow(*aow, device_ptr, num_elements); } template void bit_vector::move_to_device() { - copy_host_array_to_aow(aow_words_, words_); - copy_host_array_to_aow(aow_ranks_, ranks_); - copy_host_array_to_aow(aow_selects_, selects_); - copy_host_array_to_aow(aow_ranks0_, ranks0_); - copy_host_array_to_aow(aow_selects0_, selects0_); + copy_host_array_to_aow(&aow_words_, words_); + copy_host_array_to_aow(&aow_ranks_, ranks_); + copy_host_array_to_aow(&aow_selects_, selects_); + copy_host_array_to_aow(&aow_ranks0_, ranks0_); + copy_host_array_to_aow(&aow_selects0_, selects0_); } template @@ -166,8 +179,11 @@ template auto bit_vector::ref(Operators...) 
const noexcept { static_assert(sizeof...(Operators), "No operators specified"); - return ref_type{ - aow_words_.ref(), aow_ranks_.ref(), aow_selects_.ref(), aow_ranks0_.ref(), aow_selects0_.ref()}; + return ref_type{aow_words_->ref(), + aow_ranks_->ref(), + aow_selects_->ref(), + aow_ranks0_->ref(), + aow_selects0_->ref()}; } } // namespace experimental diff --git a/tests/bit_vector/find_next_set_test.cu b/tests/bit_vector/find_next_set_test.cu index 388566ebf..ca23fd08b 100644 --- a/tests/bit_vector/find_next_set_test.cu +++ b/tests/bit_vector/find_next_set_test.cu @@ -42,7 +42,7 @@ TEST_CASE("Find next set test", "") constexpr std::size_t num_elements{400}; using Key = uint64_t; - cuco::experimental::bit_vector bv{cuco::experimental::extent{400}}; + cuco::experimental::bit_vector bv; for (size_t i = 0; i < num_elements; i++) { bv.add(modulo_bitgen(i)); diff --git a/tests/bit_vector/get_test.cu b/tests/bit_vector/get_test.cu index 172ead363..6a90d7397 100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/bit_vector/get_test.cu @@ -41,7 +41,7 @@ TEST_CASE("Get test", "") constexpr std::size_t num_elements{400}; using Key = uint64_t; - cuco::experimental::bit_vector bv{cuco::experimental::extent{400}}; + cuco::experimental::bit_vector bv; uint32_t num_set_ref = 0; for (size_t i = 0; i < num_elements; i++) { diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu index bf7331b6e..c8dd9e4df 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/bit_vector/rank_test.cu @@ -42,7 +42,7 @@ TEST_CASE("Rank test", "") constexpr std::size_t num_elements{400}; using Key = uint64_t; - cuco::experimental::bit_vector bv{cuco::experimental::extent{400}}; + cuco::experimental::bit_vector bv; for (size_t i = 0; i < num_elements; i++) { bv.add(modulo_bitgen(i)); diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index 49e9a9ab6..40adc0ed4 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -53,7 +53,7 
@@ TEST_CASE("Select test", "") constexpr std::size_t num_elements{400}; using Key = uint64_t; - cuco::experimental::bit_vector bv{cuco::experimental::extent{400}}; + cuco::experimental::bit_vector bv; uint32_t num_set = 0; for (size_t i = 0; i < num_elements; i++) { diff --git a/tests/bit_vector/size_test.cu b/tests/bit_vector/size_test.cu index 559391407..34fcce09c 100644 --- a/tests/bit_vector/size_test.cu +++ b/tests/bit_vector/size_test.cu @@ -22,7 +22,7 @@ TEST_CASE("Size computation", "") { constexpr std::size_t num_elements{400}; - cuco::experimental::bit_vector bv{cuco::experimental::extent{400}}; + cuco::experimental::bit_vector bv; for (size_t i = 0; i < num_elements; i++) { bv.add(i % 2 == 0); // Alternate 0s and 1s pattern From d29e304beb00ab7ad857e31541089951bff606e2 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 26 May 2023 07:15:47 +0000 Subject: [PATCH 14/99] Handle empty bitvector with zero bits --- include/cuco/detail/bit_vector/bit_vector.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/bit_vector/bit_vector.inl b/include/cuco/detail/bit_vector/bit_vector.inl index 13c98d499..c285d4e1a 100644 --- a/include/cuco/detail/bit_vector/bit_vector.inl +++ b/include/cuco/detail/bit_vector/bit_vector.inl @@ -159,7 +159,7 @@ void bit_vector::copy_host_array_to_aow( uint64_t num_elements = host_array.size(); host_array.clear(); - *aow = new storage_type(make_valid_extent(extent{num_elements}), + *aow = new storage_type(make_valid_extent(extent{num_elements + 1}), allocator_); initialize_aow(*aow, device_ptr, num_elements); } From 03a212c41cc9b3b806ad4c5cb2050be4903cae83 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 26 May 2023 20:52:59 +0000 Subject: [PATCH 15/99] Minor --- include/cuco/detail/bit_vector/bit_vector.inl | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/include/cuco/detail/bit_vector/bit_vector.inl b/include/cuco/detail/bit_vector/bit_vector.inl 
index c285d4e1a..b81d572bc 100644 --- a/include/cuco/detail/bit_vector/bit_vector.inl +++ b/include/cuco/detail/bit_vector/bit_vector.inl @@ -153,15 +153,18 @@ template void bit_vector::copy_host_array_to_aow( storage_type** aow, std::vector& host_array) { - thrust::device_vector device_array = host_array; - auto device_ptr = (uint64_t*)thrust::raw_pointer_cast(device_array.data()); - uint64_t num_elements = host_array.size(); - host_array.clear(); - *aow = new storage_type(make_valid_extent(extent{num_elements + 1}), allocator_); - initialize_aow(*aow, device_ptr, num_elements); + + if (num_elements > 0) { + thrust::device_vector device_array = host_array; + auto device_ptr = (uint64_t*)thrust::raw_pointer_cast(device_array.data()); + + host_array.clear(); + + initialize_aow(*aow, device_ptr, num_elements); + } } template From e8f186d42915cd8c1461dbbd906d221833f09154 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Sun, 28 May 2023 06:11:18 +0000 Subject: [PATCH 16/99] Remove bitvector memory footprint logic --- include/cuco/bit_vector.cuh | 1 - include/cuco/detail/bit_vector/bit_vector.inl | 7 ------- 2 files changed, 8 deletions(-) diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index aa4441b5d..f84a71956 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -87,7 +87,6 @@ class bit_vector { [[nodiscard]] auto ref(Operators... 
ops) const noexcept; size_t size() const { return n_bits_; } - size_t memory_footprint() const; private: uint64_t n_bits_; diff --git a/include/cuco/detail/bit_vector/bit_vector.inl b/include/cuco/detail/bit_vector/bit_vector.inl index b81d572bc..6c272d749 100644 --- a/include/cuco/detail/bit_vector/bit_vector.inl +++ b/include/cuco/detail/bit_vector/bit_vector.inl @@ -117,13 +117,6 @@ void bit_vector::set_last(bool bit) set(n_bits_ - 1, bit); } -template -size_t bit_vector::memory_footprint() const -{ - return sizeof(uint64_t) * words_.size() + sizeof(rank) * (ranks_.size() + ranks0_.size()) + - sizeof(uint64_t) * (selects_.size() + selects0_.size()); -} - template __global__ void copy_to_window(WindowT* windows, cuco::detail::index_type n, T* values) { From 9e8f68e85f4cb078b4ffd08c996950989883cf3b Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Mon, 29 May 2023 04:04:57 +0000 Subject: [PATCH 17/99] Doxygen comments --- include/cuco/bit_vector.cuh | 136 ++++++++++++++---- include/cuco/bit_vector_ref.cuh | 16 ++- .../cuco/detail/bit_vector/bit_vector_ref.inl | 6 +- 3 files changed, 125 insertions(+), 33 deletions(-) diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index f84a71956..a97859171 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -29,26 +29,58 @@ namespace cuco { namespace experimental { +/** + * @brief Struct to store ranks of bits at 256-bit intervals + */ struct rank { - // Basically a uint64_t split into 1 uin32_t and 2 uint8_t - uint32_t abs_hi_; - uint8_t abs_lo_; - uint8_t rels_[3]; - + uint32_t abs_hi_; ///< Upper 32 bits of base + uint8_t abs_lo_; ///< Lower 8 bits of base + uint8_t rels_[3]; ///< Four offsets for 64-bit sub-intervals + + /** + * @brief Gets base rank of current 256-bit interval + * + * @return The base rank + */ __host__ __device__ uint64_t abs() const { return ((uint64_t)abs_hi_ << 8) | abs_lo_; } + + /** + * @brief Sets base rank of current 256-bit interval + * + * @param 
abs Base rank + */ void set_abs(uint64_t abs) { abs_hi_ = (uint32_t)(abs >> 8); abs_lo_ = (uint8_t)abs; } }; - -// Need this union to use uint64_t for all aow_storage structures +/** + * @brief Union of 64-bit word with rank + * + * Need this so that all aow_storage structures in bitvector have 64-bit element type + */ union rank_union { - uint64_t word_; - rank rank_; + uint64_t word_; ///< word view + rank rank_; ///< rank view }; +/** + * @brief Bitvector class with rank and select index structures + * + * In addition to standard bitvector get/set operations, this class provides + * rank and select operation API. It maintains index structures to make both these + * new operations close to constant time. + * Bitvector construction happens on host, after which the structures are moved to device. + * All subsequent read-only operations access device structures only. + * + * @tparam Key Type of the index that specifies which bit to access/modify + * @tparam Extent Data structure size type + * @tparam Scope The scope in which operations will be performed by individual threads. + * @tparam Allocator Type of allocator used for device storage + * @tparam Storage Slot window storage type + */ + template , cuda::thread_scope Scope = cuda::thread_scope_device, @@ -59,23 +91,44 @@ class bit_vector { bit_vector(); ~bit_vector(); - void add(bool bit); // adds a new bit at the end - void build(); // builds indexes for rank and select. 
- + /** + * @brief adds a new bit at the end + * + * @param bit Boolean value of new bit to be added + */ + void add(bool bit); + + /** + * @brief Builds indexes for rank and select + * + * Also creates device-side snapshot + */ + void build(); + + /** + * @brief Modifies a single bit + * + * @param key position of bit to be modified + * @param bit new value of bit + */ void set(Key key, bool bit); + + /** + * @brief Sets last bit to specified value + * + * @param bit new value of last bit + */ void set_last(bool bit); - static constexpr auto cg_size = 1; - static constexpr auto window_size = 1; - static constexpr auto thread_scope = Scope; + static constexpr auto cg_size = 1; ///< CG size used to for probing + static constexpr auto window_size = 1; ///< Window size used to for probing + static constexpr auto thread_scope = Scope; ///< CUDA thread scope - using key_type = Key; ///< Key type - using value_type = Key; ///< Key type - using extent_type = decltype(make_valid_extent(std::declval())); - using size_type = typename extent_type::value_type; ///< Size type - using allocator_type = Allocator; ///< Allocator type + using extent_type = + decltype(make_valid_extent(std::declval())); ///< Extent type + using allocator_type = Allocator; ///< Allocator type using storage_type = - detail::storage; ///< Storage type + detail::storage; ///< Storage type using storage_ref_type = typename storage_type::ref_type; ///< Non-owning window storage ref type template @@ -83,26 +136,55 @@ class bit_vector { cuco::experimental::bit_vector_ref; ///< Non-owning container ref type + /** + * @brief Get device ref with operators. + * + * @tparam Operators Set of `cuco::op` to be provided by the ref + * + * @param ops List of operators, e.g., `cuco::bv_read` + * + * @return Device ref of the current `bit_vector` object + */ template [[nodiscard]] auto ref(Operators... 
ops) const noexcept; + /** + * @brief Get the number of bits bit_vector holds + * + * @return Number of bits bit_vector holds + */ size_t size() const { return n_bits_; } private: - uint64_t n_bits_; + uint64_t n_bits_; ///< Number of bits added to bit_vector - // Host structures - std::vector words_; - std::vector ranks_, ranks0_; - std::vector selects_, selects0_; + // Host-side structures + std::vector words_; ///< Words vector that represents all bits + std::vector ranks_; ///< Holds the rank values for every 256-th bit (4-th word) + std::vector ranks0_; ///< Same as ranks_ but for `0` bits + std::vector selects_; ///< Holds pointers to (0, 256, 512...)th `1` bit in ranks_ + std::vector selects0_; ///< Same as selects_, but for `0` bits - // Device structures + // Device-side structures allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage storage_type *aow_words_, *aow_ranks_, *aow_selects_, *aow_ranks0_, *aow_selects0_; + /** + * @brief Creates a new window structure on device and intitializes it with contents of host array + * + * @tparam T Type of host array elements + * + * @param aow pointer to destination (device window structure) + * @param host_array host array whose contents are used to intialize aow + */ template void copy_host_array_to_aow(storage_type** aow, std::vector& host_array); + /** + * @brief Constructs device-side structures and clears host-side structures + * + * Effectively takes a snapshot of the bitvector and creates a device-side copy + */ void move_to_device(); }; diff --git a/include/cuco/bit_vector_ref.cuh b/include/cuco/bit_vector_ref.cuh index 464934ef8..f925afcef 100644 --- a/include/cuco/bit_vector_ref.cuh +++ b/include/cuco/bit_vector_ref.cuh @@ -7,15 +7,27 @@ namespace experimental { struct Rank; +/** + * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary + * operations defined in `include/cuco/operator.hpp` + * + * @tparam StorageRef Storage ref type + * 
@tparam Operators Device operator options defined in `include/cuco/operator.hpp` + */ template class bit_vector_ref : public detail::operator_impl>... { public: using storage_ref_type = StorageRef; ///< Type of storage ref + /** * @brief Constructs bit_vector_ref. * - * @param storage_ref Non-owning ref of slot storage + * @param words_ref Non-owning ref of words slot storage + * @param ranks_ref Non-owning ref of ranks slot storage + * @param selects_ref Non-owning ref of selects slot storage + * @param ranks0_ref Non-owning ref of ranks0 slot storage + * @param selects0_ref Non-owning ref of selects0 slot storage */ __host__ __device__ explicit constexpr bit_vector_ref(storage_ref_type words_ref, storage_ref_type ranks_ref, @@ -33,5 +45,3 @@ class bit_vector_ref } // namespace experimental } // namespace cuco - -//#include diff --git a/include/cuco/detail/bit_vector/bit_vector_ref.inl b/include/cuco/detail/bit_vector/bit_vector_ref.inl index 36c9f66dc..02577a050 100644 --- a/include/cuco/detail/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/bit_vector/bit_vector_ref.inl @@ -63,7 +63,7 @@ class operator_impl> { { auto const& ref_ = static_cast(*this); - const uint64_t rank_id = binary_search_selects_array(key, ref_.selects_ref_, ref_.ranks_ref_); + const uint64_t rank_id = binary_search_ranks_array(key, ref_.selects_ref_, ref_.ranks_ref_); uint64_t word_id = subtract_offset(key, rank_id, ref_.ranks_ref_); return (word_id * 64) + ith_set_pos(key, ref_.words_ref_[word_id][0]); @@ -73,14 +73,14 @@ class operator_impl> { { auto const& ref_ = static_cast(*this); - const uint64_t rank_id = binary_search_selects_array(key, ref_.selects0_ref_, ref_.ranks0_ref_); + const uint64_t rank_id = binary_search_ranks_array(key, ref_.selects0_ref_, ref_.ranks0_ref_); uint64_t word_id = subtract_offset(key, rank_id, ref_.ranks0_ref_); return (word_id * 64) + ith_set_pos(key, ~ref_.words_ref_[word_id][0]); } private: - [[nodiscard]] __device__ uint64_t 
binary_search_selects_array( + [[nodiscard]] __device__ uint64_t binary_search_ranks_array( uint64_t key, const StorageRef& selects_ref, const StorageRef& ranks_ref) const noexcept { uint64_t block_id = key / 256; From 201c8945fc063edf23b2b8813dfe6c6c709879d0 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 30 May 2023 16:26:33 +0000 Subject: [PATCH 18/99] Avoid C-style casts --- include/cuco/bit_vector.cuh | 9 ++++++--- include/cuco/detail/bit_vector/bit_vector.inl | 10 ++++------ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index a97859171..36b345d9c 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -42,7 +42,10 @@ struct rank { * * @return The base rank */ - __host__ __device__ uint64_t abs() const { return ((uint64_t)abs_hi_ << 8) | abs_lo_; } + __host__ __device__ uint64_t abs() const + { + return (static_cast(abs_hi_) << 8) | abs_lo_; + } /** * @brief Sets base rank of current 256-bit interval @@ -51,8 +54,8 @@ struct rank { */ void set_abs(uint64_t abs) { - abs_hi_ = (uint32_t)(abs >> 8); - abs_lo_ = (uint8_t)abs; + abs_hi_ = static_cast(abs >> 8); + abs_lo_ = static_cast(abs); } }; /** diff --git a/include/cuco/detail/bit_vector/bit_vector.inl b/include/cuco/detail/bit_vector/bit_vector.inl index 6c272d749..5d894e558 100644 --- a/include/cuco/detail/bit_vector/bit_vector.inl +++ b/include/cuco/detail/bit_vector/bit_vector.inl @@ -131,14 +131,15 @@ __global__ void copy_to_window(WindowT* windows, cuco::detail::index_type n, T* } template -void initialize_aow(Storage* storage, T* ptr, uint64_t num_elements) +void initialize_aow(Storage* storage, thrust::device_vector& device_array, uint64_t num_elements) { auto constexpr stride = 4; auto const grid_size = (num_elements + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / (stride * detail::CUCO_DEFAULT_BLOCK_SIZE); + auto device_ptr = reinterpret_cast(thrust::raw_pointer_cast(device_array.data())); 
copy_to_window<<>>( - storage->data(), num_elements, ptr); + storage->data(), num_elements, device_ptr); } template @@ -152,11 +153,8 @@ void bit_vector::copy_host_array_to_aow( if (num_elements > 0) { thrust::device_vector device_array = host_array; - auto device_ptr = (uint64_t*)thrust::raw_pointer_cast(device_array.data()); - host_array.clear(); - - initialize_aow(*aow, device_ptr, num_elements); + initialize_aow(*aow, device_array, num_elements); } } From 5e3bdbfde1594b33851501fae9ce762c7b7b848d Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 30 May 2023 16:27:04 +0000 Subject: [PATCH 19/99] Consistent use of 64-bit integer types --- tests/bit_vector/find_next_set_test.cu | 13 ++++++------- tests/bit_vector/get_test.cu | 9 ++++----- tests/bit_vector/rank_test.cu | 13 ++++++------- tests/bit_vector/select_test.cu | 15 +++++++-------- 4 files changed, 23 insertions(+), 27 deletions(-) diff --git a/tests/bit_vector/find_next_set_test.cu b/tests/bit_vector/find_next_set_test.cu index ca23fd08b..09efd9ce3 100644 --- a/tests/bit_vector/find_next_set_test.cu +++ b/tests/bit_vector/find_next_set_test.cu @@ -25,7 +25,7 @@ #include template -__global__ void find_next_set_kernel(BitVectorRef ref, size_t n, uint32_t* output) +__global__ void find_next_set_kernel(BitVectorRef ref, size_t n, uint64_t* output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; @@ -35,13 +35,12 @@ __global__ void find_next_set_kernel(BitVectorRef ref, size_t n, uint32_t* outpu } } -extern bool modulo_bitgen(uint32_t i); +extern bool modulo_bitgen(uint64_t i); TEST_CASE("Find next set test", "") { constexpr std::size_t num_elements{400}; - using Key = uint64_t; cuco::experimental::bit_vector bv; for (size_t i = 0; i < num_elements; i++) { @@ -49,15 +48,15 @@ TEST_CASE("Find next set test", "") } bv.build(); - thrust::device_vector device_result(num_elements); + thrust::device_vector device_result(num_elements); auto ref = 
bv.ref(cuco::experimental::bv_read); find_next_set_kernel<<<1, 1024>>>( ref, num_elements, thrust::raw_pointer_cast(device_result.data())); - thrust::host_vector host_result = device_result; - uint32_t num_matches = 0; + thrust::host_vector host_result = device_result; + uint64_t num_matches = 0; - uint32_t next_set_pos = -1u; + size_t next_set_pos = -1lu; do { next_set_pos++; } while (next_set_pos < num_elements and !modulo_bitgen(next_set_pos)); diff --git a/tests/bit_vector/get_test.cu b/tests/bit_vector/get_test.cu index 6a90d7397..632d6998d 100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/bit_vector/get_test.cu @@ -24,7 +24,7 @@ #include template -__global__ void get_kernel(BitVectorRef ref, size_t n, uint32_t* output) +__global__ void get_kernel(BitVectorRef ref, size_t n, uint64_t* output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; @@ -34,16 +34,15 @@ __global__ void get_kernel(BitVectorRef ref, size_t n, uint32_t* output) } } -bool modulo_bitgen(uint32_t i) { return i % 7 == 0; } +bool modulo_bitgen(uint64_t i) { return i % 7 == 0; } TEST_CASE("Get test", "") { constexpr std::size_t num_elements{400}; - using Key = uint64_t; cuco::experimental::bit_vector bv; - uint32_t num_set_ref = 0; + size_t num_set_ref = 0; for (size_t i = 0; i < num_elements; i++) { bv.add(modulo_bitgen(i)); num_set_ref += modulo_bitgen(i); @@ -51,7 +50,7 @@ TEST_CASE("Get test", "") bv.build(); auto ref = bv.ref(cuco::experimental::bv_read); - thrust::device_vector get_result(num_elements); + thrust::device_vector get_result(num_elements); get_kernel<<<1, 1024>>>(ref, num_elements, thrust::raw_pointer_cast(get_result.data())); size_t num_set = thrust::reduce(thrust::device, get_result.begin(), get_result.end(), 0); diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu index c8dd9e4df..e7a2580d7 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/bit_vector/rank_test.cu @@ -25,7 +25,7 @@ #include 
template -__global__ void rank_kernel(BitVectorRef ref, size_t n, uint32_t* output) +__global__ void rank_kernel(BitVectorRef ref, size_t n, uint64_t* output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; @@ -35,13 +35,12 @@ __global__ void rank_kernel(BitVectorRef ref, size_t n, uint32_t* output) } } -extern bool modulo_bitgen(uint32_t i); +extern bool modulo_bitgen(uint64_t i); TEST_CASE("Rank test", "") { constexpr std::size_t num_elements{400}; - using Key = uint64_t; cuco::experimental::bit_vector bv; for (size_t i = 0; i < num_elements; i++) { @@ -49,13 +48,13 @@ TEST_CASE("Rank test", "") } bv.build(); - thrust::device_vector rank_result_device(num_elements); + thrust::device_vector rank_result_device(num_elements); auto ref = bv.ref(cuco::experimental::bv_read); rank_kernel<<<1, 1024>>>(ref, num_elements, thrust::raw_pointer_cast(rank_result_device.data())); - thrust::host_vector rank_result = rank_result_device; - uint32_t cur_rank = 0; - uint32_t num_matches = 0; + thrust::host_vector rank_result = rank_result_device; + uint64_t cur_rank = 0; + uint64_t num_matches = 0; for (size_t i = 0; i < num_elements; i++) { num_matches += cur_rank == rank_result[i]; if (modulo_bitgen(i)) { cur_rank++; } diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index 40adc0ed4..4110b1da9 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -46,16 +46,15 @@ __global__ void select0_kernel(BitVectorRef ref, size_t n, uint64_t* output) } } -extern bool modulo_bitgen(uint32_t i); +extern bool modulo_bitgen(uint64_t i); TEST_CASE("Select test", "") { constexpr std::size_t num_elements{400}; - using Key = uint64_t; cuco::experimental::bit_vector bv; - uint32_t num_set = 0; + uint64_t num_set = 0; for (size_t i = 0; i < num_elements; i++) { bv.add(modulo_bitgen(i)); num_set += modulo_bitgen(i); @@ -69,8 +68,8 @@ TEST_CASE("Select test", "") select_kernel<<<1, 1024>>>(ref, 
num_set, thrust::raw_pointer_cast(device_result.data())); thrust::host_vector host_result = device_result; - uint32_t num_matches = 0; - uint32_t cur_set_pos = -1u; + uint64_t num_matches = 0; + uint64_t cur_set_pos = -1lu; for (size_t i = 0; i < num_set; i++) { do { cur_set_pos++; @@ -83,14 +82,14 @@ TEST_CASE("Select test", "") // Check select0 { - uint32_t num_not_set = num_elements - num_set; + uint64_t num_not_set = num_elements - num_set; thrust::device_vector device_result(num_not_set); select0_kernel<<<1, 1024>>>(ref, num_not_set, thrust::raw_pointer_cast(device_result.data())); thrust::host_vector host_result = device_result; - uint32_t num_matches = 0; - uint32_t cur_not_set_pos = -1u; + uint64_t num_matches = 0; + uint64_t cur_not_set_pos = -1lu; for (size_t i = 0; i < num_not_set; i++) { do { cur_not_set_pos++; From 149109b7c090aa8ee69f586ca61fc50b528ec864 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 30 May 2023 16:39:08 +0000 Subject: [PATCH 20/99] Rename method from `add` to `append` --- include/cuco/bit_vector.cuh | 4 +++- include/cuco/detail/bit_vector/bit_vector.inl | 2 +- tests/bit_vector/find_next_set_test.cu | 2 +- tests/bit_vector/get_test.cu | 2 +- tests/bit_vector/rank_test.cu | 2 +- tests/bit_vector/select_test.cu | 2 +- tests/bit_vector/size_test.cu | 2 +- 7 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index 36b345d9c..97929ba85 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -97,9 +97,11 @@ class bit_vector { /** * @brief adds a new bit at the end * + * Grows internal storage if needed + * * @param bit Boolean value of new bit to be added */ - void add(bool bit); + void append(bool bit); /** * @brief Builds indexes for rank and select diff --git a/include/cuco/detail/bit_vector/bit_vector.inl b/include/cuco/detail/bit_vector/bit_vector.inl index 5d894e558..b4584aeac 100644 --- a/include/cuco/detail/bit_vector/bit_vector.inl 
+++ b/include/cuco/detail/bit_vector/bit_vector.inl @@ -43,7 +43,7 @@ bit_vector::~bit_vector() } template -void bit_vector::add(bool bit) +void bit_vector::append(bool bit) { if (n_bits_ % 256 == 0) { words_.resize((n_bits_ + 256) / 64); } set(n_bits_, bit); diff --git a/tests/bit_vector/find_next_set_test.cu b/tests/bit_vector/find_next_set_test.cu index 09efd9ce3..15b8b1673 100644 --- a/tests/bit_vector/find_next_set_test.cu +++ b/tests/bit_vector/find_next_set_test.cu @@ -44,7 +44,7 @@ TEST_CASE("Find next set test", "") cuco::experimental::bit_vector bv; for (size_t i = 0; i < num_elements; i++) { - bv.add(modulo_bitgen(i)); + bv.append(modulo_bitgen(i)); } bv.build(); diff --git a/tests/bit_vector/get_test.cu b/tests/bit_vector/get_test.cu index 632d6998d..ec84db051 100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/bit_vector/get_test.cu @@ -44,7 +44,7 @@ TEST_CASE("Get test", "") size_t num_set_ref = 0; for (size_t i = 0; i < num_elements; i++) { - bv.add(modulo_bitgen(i)); + bv.append(modulo_bitgen(i)); num_set_ref += modulo_bitgen(i); } bv.build(); diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu index e7a2580d7..e6f7b9dea 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/bit_vector/rank_test.cu @@ -44,7 +44,7 @@ TEST_CASE("Rank test", "") cuco::experimental::bit_vector bv; for (size_t i = 0; i < num_elements; i++) { - bv.add(modulo_bitgen(i)); + bv.append(modulo_bitgen(i)); } bv.build(); diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index 4110b1da9..7eb0fae73 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -56,7 +56,7 @@ TEST_CASE("Select test", "") uint64_t num_set = 0; for (size_t i = 0; i < num_elements; i++) { - bv.add(modulo_bitgen(i)); + bv.append(modulo_bitgen(i)); num_set += modulo_bitgen(i); } bv.build(); diff --git a/tests/bit_vector/size_test.cu b/tests/bit_vector/size_test.cu index 34fcce09c..4c676ce87 100644 --- 
a/tests/bit_vector/size_test.cu +++ b/tests/bit_vector/size_test.cu @@ -25,7 +25,7 @@ TEST_CASE("Size computation", "") cuco::experimental::bit_vector bv; for (size_t i = 0; i < num_elements; i++) { - bv.add(i % 2 == 0); // Alternate 0s and 1s pattern + bv.append(i % 2 == 0); // Alternate 0s and 1s pattern } bv.build(); From d545194846c9629f1d7fed07eeac3969118e06a0 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 30 May 2023 18:24:04 +0000 Subject: [PATCH 21/99] Remove `Key` template parameter This was only used in one method and is replaced by size_type --- include/cuco/bit_vector.cuh | 11 +++-- include/cuco/detail/bit_vector/bit_vector.inl | 40 +++++++++---------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index 97929ba85..25244d6a3 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -77,15 +77,13 @@ union rank_union { * Bitvector construction happens on host, after which the structures are moved to device. * All subsequent read-only operations access device structures only. * - * @tparam Key Type of the index that specifies which bit to access/modify * @tparam Extent Data structure size type * @tparam Scope The scope in which operations will be performed by individual threads. 
* @tparam Allocator Type of allocator used for device storage * @tparam Storage Slot window storage type */ -template , +template , cuda::thread_scope Scope = cuda::thread_scope_device, class Allocator = cuco::cuda_allocator, class Storage = cuco::experimental::aow_storage<1>> @@ -110,13 +108,14 @@ class bit_vector { */ void build(); + using size_type = typename Extent::value_type; ///< size type to specify bit index /** * @brief Modifies a single bit * - * @param key position of bit to be modified + * @param index position of bit to be modified * @param bit new value of bit */ - void set(Key key, bool bit); + void set(size_type index, bool bit); /** * @brief Sets last bit to specified value @@ -133,7 +132,7 @@ class bit_vector { decltype(make_valid_extent(std::declval())); ///< Extent type using allocator_type = Allocator; ///< Allocator type using storage_type = - detail::storage; ///< Storage type + detail::storage; ///< Storage type using storage_ref_type = typename storage_type::ref_type; ///< Non-owning window storage ref type template diff --git a/include/cuco/detail/bit_vector/bit_vector.inl b/include/cuco/detail/bit_vector/bit_vector.inl index b4584aeac..a91e60d3e 100644 --- a/include/cuco/detail/bit_vector/bit_vector.inl +++ b/include/cuco/detail/bit_vector/bit_vector.inl @@ -18,8 +18,8 @@ namespace cuco { namespace experimental { -template -bit_vector::bit_vector() +template +bit_vector::bit_vector() : words_(), ranks_(), selects_(), @@ -32,8 +32,8 @@ bit_vector::bit_vector() { } -template -bit_vector::~bit_vector() +template +bit_vector::~bit_vector() { delete aow_words_; delete aow_ranks_; @@ -42,16 +42,16 @@ bit_vector::~bit_vector() delete aow_selects0_; } -template -void bit_vector::append(bool bit) +template +void bit_vector::append(bool bit) { if (n_bits_ % 256 == 0) { words_.resize((n_bits_ + 256) / 64); } set(n_bits_, bit); ++n_bits_; } -template -void bit_vector::build() +template +void bit_vector::build() { uint64_t n_blocks = words_.size() / 
4; ranks_.resize(n_blocks + 1); @@ -101,18 +101,18 @@ void bit_vector::build() move_to_device(); } -template -void bit_vector::set(Key key, bool bit) +template +void bit_vector::set(size_type index, bool bit) { if (bit) { - words_[key / 64] |= (1UL << (key % 64)); + words_[index / 64] |= (1UL << (index % 64)); } else { - words_[key / 64] &= ~(1UL << (key % 64)); + words_[index / 64] &= ~(1UL << (index % 64)); } } -template -void bit_vector::set_last(bool bit) +template +void bit_vector::set_last(bool bit) { set(n_bits_ - 1, bit); } @@ -142,9 +142,9 @@ void initialize_aow(Storage* storage, thrust::device_vector& device_array, ui storage->data(), num_elements, device_ptr); } -template +template template -void bit_vector::copy_host_array_to_aow( +void bit_vector::copy_host_array_to_aow( storage_type** aow, std::vector& host_array) { uint64_t num_elements = host_array.size(); @@ -158,8 +158,8 @@ void bit_vector::copy_host_array_to_aow( } } -template -void bit_vector::move_to_device() +template +void bit_vector::move_to_device() { copy_host_array_to_aow(&aow_words_, words_); copy_host_array_to_aow(&aow_ranks_, ranks_); @@ -168,9 +168,9 @@ void bit_vector::move_to_device() copy_host_array_to_aow(&aow_selects0_, selects0_); } -template +template template -auto bit_vector::ref(Operators...) const noexcept +auto bit_vector::ref(Operators...) 
const noexcept { static_assert(sizeof...(Operators), "No operators specified"); return ref_type{aow_words_->ref(), From 0e411be48d0067cdce37ddfe97c6f7daa4685645 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 30 May 2023 20:29:44 +0000 Subject: [PATCH 22/99] Minor --- include/cuco/bit_vector.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cuco/bit_vector.cuh b/include/cuco/bit_vector.cuh index 25244d6a3..c0eefbb4d 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/bit_vector.cuh @@ -124,8 +124,8 @@ class bit_vector { */ void set_last(bool bit); - static constexpr auto cg_size = 1; ///< CG size used to for probing - static constexpr auto window_size = 1; ///< Window size used to for probing + static constexpr auto cg_size = 1; ///< CG size used for probing + static constexpr auto window_size = 1; ///< Window size used for probing static constexpr auto thread_scope = Scope; ///< CUDA thread scope using extent_type = From 88dc1a6c528f683fe2abd87ed03e91464701e8d6 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Thu, 17 Aug 2023 23:01:28 +0000 Subject: [PATCH 23/99] Cherrypick bitvector files from trie branch --- .../cuco/detail/bit_vector/bit_vector_ref.inl | 136 ------------ .../trie/bit_vector}/bit_vector.cuh | 19 +- .../{ => trie}/bit_vector/bit_vector.inl | 89 ++++---- .../trie/bit_vector}/bit_vector_ref.cuh | 0 .../detail/trie/bit_vector/bit_vector_ref.inl | 197 ++++++++++++++++++ tests/bit_vector/find_next_set_test.cu | 2 +- tests/bit_vector/get_test.cu | 2 +- tests/bit_vector/rank_test.cu | 2 +- tests/bit_vector/select_test.cu | 2 +- tests/bit_vector/size_test.cu | 2 +- 10 files changed, 262 insertions(+), 189 deletions(-) delete mode 100644 include/cuco/detail/bit_vector/bit_vector_ref.inl rename include/cuco/{ => detail/trie/bit_vector}/bit_vector.cuh (91%) rename include/cuco/detail/{ => trie}/bit_vector/bit_vector.inl (72%) rename include/cuco/{ => detail/trie/bit_vector}/bit_vector_ref.cuh (100%) create 
mode 100644 include/cuco/detail/trie/bit_vector/bit_vector_ref.inl diff --git a/include/cuco/detail/bit_vector/bit_vector_ref.inl b/include/cuco/detail/bit_vector/bit_vector_ref.inl deleted file mode 100644 index 02577a050..000000000 --- a/include/cuco/detail/bit_vector/bit_vector_ref.inl +++ /dev/null @@ -1,136 +0,0 @@ -#include - -namespace cuco { -namespace experimental { - -template -__host__ __device__ constexpr bit_vector_ref::bit_vector_ref( - StorageRef words_ref, - StorageRef ranks_ref, - StorageRef selects_ref, - StorageRef ranks0_ref, - StorageRef selects0_ref) noexcept - : words_ref_{words_ref}, - ranks_ref_{ranks_ref}, - selects_ref_{selects_ref}, - ranks0_ref_{ranks0_ref}, - selects0_ref_{selects0_ref} -{ -} - -namespace detail { - -template -class operator_impl> { - using ref_type = bit_vector_ref; - - public: - [[nodiscard]] __device__ bool get(uint64_t key) const noexcept - { - auto const& ref_ = static_cast(*this); - return (ref_.words_ref_[key / 64][0] >> (key % 64)) & 1UL; - } - - [[nodiscard]] __device__ uint64_t find_next_set(uint64_t key) const noexcept - { - auto const& ref_ = static_cast(*this); - uint64_t word_id = key / 64; - uint64_t bit_id = key % 64; - uint64_t word = ref_.words_ref_[word_id][0]; - word &= ~(0lu) << bit_id; - while (word == 0) { - word = ref_.words_ref_[++word_id][0]; - } - return (word_id * 64) + __builtin_ffsll(word) - 1; - } - - [[nodiscard]] __device__ uint64_t rank(uint64_t key) const noexcept - { - auto const& ref_ = static_cast(*this); - - uint64_t word_id = key / 64; - uint64_t bit_id = key % 64; - uint64_t rank_id = word_id / 4; - uint64_t rel_id = word_id % 4; - auto rank = rank_union{ref_.ranks_ref_[rank_id][0]}.rank_; - uint64_t n = rank.abs(); - if (rel_id != 0) { n += rank.rels_[rel_id - 1]; } - n += __builtin_popcountll(ref_.words_ref_[word_id][0] & ((1UL << bit_id) - 1)); - return n; - } - - [[nodiscard]] __device__ uint64_t select(uint64_t key) const noexcept - { - auto const& ref_ = 
static_cast(*this); - - const uint64_t rank_id = binary_search_ranks_array(key, ref_.selects_ref_, ref_.ranks_ref_); - uint64_t word_id = subtract_offset(key, rank_id, ref_.ranks_ref_); - - return (word_id * 64) + ith_set_pos(key, ref_.words_ref_[word_id][0]); - } - - [[nodiscard]] __device__ uint64_t select0(uint64_t key) const noexcept - { - auto const& ref_ = static_cast(*this); - - const uint64_t rank_id = binary_search_ranks_array(key, ref_.selects0_ref_, ref_.ranks0_ref_); - uint64_t word_id = subtract_offset(key, rank_id, ref_.ranks0_ref_); - - return (word_id * 64) + ith_set_pos(key, ~ref_.words_ref_[word_id][0]); - } - - private: - [[nodiscard]] __device__ uint64_t binary_search_ranks_array( - uint64_t key, const StorageRef& selects_ref, const StorageRef& ranks_ref) const noexcept - { - uint64_t block_id = key / 256; - uint64_t begin = selects_ref[block_id][0]; - uint64_t end = selects_ref[block_id + 1][0] + 1UL; - if (begin + 10 >= end) { - while (key >= rank_union{ranks_ref[begin + 1][0]}.rank_.abs()) { - ++begin; - } - } else { - while (begin + 1 < end) { - const uint64_t middle = (begin + end) / 2; - if (key < rank_union{ranks_ref[middle][0]}.rank_.abs()) { - end = middle; - } else { - begin = middle; - } - } - } - return begin; - } - - [[nodiscard]] __device__ uint64_t subtract_offset(uint64_t& key, - uint64_t rank_id, - const StorageRef& ranks_ref) const noexcept - { - const auto& rank = rank_union{ranks_ref[rank_id][0]}.rank_; - key -= rank.abs(); - - uint64_t word_id = rank_id * 4; - bool a0 = key >= rank.rels_[0]; - bool a1 = key >= rank.rels_[1]; - bool a2 = key >= rank.rels_[2]; - - uint64_t inc = a0 + a1 + a2; - word_id += inc; - key -= (inc > 0) * rank.rels_[inc - (inc > 0)]; - - return word_id; - } - - [[nodiscard]] __device__ uint64_t ith_set_pos(uint32_t i, uint64_t word) const noexcept - { - for (uint32_t pos = 0; pos < i; pos++) { - word &= word - 1; - } - return __builtin_ffsll(word & -word) - 1; - } -}; - -} // namespace detail -} // 
namespace experimental -} // namespace cuco diff --git a/include/cuco/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh similarity index 91% rename from include/cuco/bit_vector.cuh rename to include/cuco/detail/trie/bit_vector/bit_vector.cuh index c0eefbb4d..378fb17fa 100644 --- a/include/cuco/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -17,7 +17,7 @@ #pragma once -#include +#include #include #include #include @@ -33,9 +33,9 @@ namespace experimental { * @brief Struct to store ranks of bits at 256-bit intervals */ struct rank { - uint32_t abs_hi_; ///< Upper 32 bits of base - uint8_t abs_lo_; ///< Lower 8 bits of base - uint8_t rels_[3]; ///< Four offsets for 64-bit sub-intervals + uint32_t abs_hi_; ///< Upper 32 bits of base + uint8_t abs_lo_; ///< Lower 8 bits of base + std::array rels_; ///< Offsets for 64-bit sub-intervals /** * @brief Gets base rank of current 256-bit interval @@ -58,6 +58,7 @@ struct rank { abs_lo_ = static_cast(abs); } }; + /** * @brief Union of 64-bit word with rank * @@ -86,7 +87,7 @@ union rank_union { template , cuda::thread_scope Scope = cuda::thread_scope_device, class Allocator = cuco::cuda_allocator, - class Storage = cuco::experimental::aow_storage<1>> + class Storage = cuco::experimental::storage<1>> class bit_vector { public: bit_vector(); @@ -129,8 +130,8 @@ class bit_vector { static constexpr auto thread_scope = Scope; ///< CUDA thread scope using extent_type = - decltype(make_valid_extent(std::declval())); ///< Extent type - using allocator_type = Allocator; ///< Allocator type + decltype(make_window_extent(std::declval())); ///< Extent type + using allocator_type = Allocator; ///< Allocator type using storage_type = detail::storage; ///< Storage type @@ -195,5 +196,5 @@ class bit_vector { } // namespace experimental } // namespace cuco -#include -#include +#include +#include diff --git a/include/cuco/detail/bit_vector/bit_vector.inl 
b/include/cuco/detail/trie/bit_vector/bit_vector.inl similarity index 72% rename from include/cuco/detail/bit_vector/bit_vector.inl rename to include/cuco/detail/trie/bit_vector/bit_vector.inl index a91e60d3e..55b627a07 100644 --- a/include/cuco/detail/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -45,58 +45,65 @@ bit_vector::~bit_vector() template void bit_vector::append(bool bit) { - if (n_bits_ % 256 == 0) { words_.resize((n_bits_ + 256) / 64); } + if (n_bits_ % 256 == 0) { words_.resize((n_bits_ + 256) / 64); } // Extend by four 64-bit words set(n_bits_, bit); ++n_bits_; } -template -void bit_vector::build() +inline void update_selects(uint64_t word_id, + uint64_t word, + uint64_t& gcount, + std::vector& selects) +{ + uint64_t n_pops = __builtin_popcountll(word); + uint64_t new_gcount = gcount + n_pops; + if (((gcount + 255) / 256) != ((new_gcount + 255) / 256)) { + uint64_t count = gcount; + while (word != 0) { + uint64_t pos = __builtin_ctzll(word); + if (count % 256 == 0) { + selects.push_back(((word_id * 64) + pos) / 256); + break; + } + word ^= 1UL << pos; + ++count; + } + } + gcount = new_gcount; +} + +inline void build_ranks_and_selects(const std::vector& words, + std::vector& ranks, + std::vector& selects, + bool flip_bits) { - uint64_t n_blocks = words_.size() / 4; - ranks_.resize(n_blocks + 1); - ranks0_.resize(n_blocks + 1); + uint64_t n_blocks = words.size() / 4; // Each block has four 64-bit words + ranks.resize(n_blocks + 1); - uint64_t n_ones = 0, n_zeroes = 0; + uint64_t count = 0; for (uint64_t block_id = 0; block_id < n_blocks; ++block_id) { - ranks_[block_id].set_abs(n_ones); - ranks0_[block_id].set_abs(n_zeroes); + ranks[block_id].set_abs(count); for (uint64_t block_offset = 0; block_offset < 4; ++block_offset) { - if (block_offset != 0) { - ranks_[block_id].rels_[block_offset - 1] = n_ones - ranks_[block_id].abs(); - ranks0_[block_id].rels_[block_offset - 1] = n_zeroes - ranks0_[block_id].abs(); + if 
(block_offset != 0) { // Compute the deltas + ranks[block_id].rels_[block_offset - 1] = count - ranks[block_id].abs(); } - auto update_selects = - [](uint64_t word_id, uint64_t word, uint64_t& gcount, std::vector& selects) { - uint64_t n_pops = __builtin_popcountll(word); - uint64_t new_gcount = gcount + n_pops; - if (((gcount + 255) / 256) != ((new_gcount + 255) / 256)) { - uint64_t count = gcount; - while (word != 0) { - uint64_t pos = __builtin_ctzll(word); - if (count % 256 == 0) { - selects.push_back(((word_id * 64) + pos) / 256); - break; - } - word ^= 1UL << pos; - ++count; - } - } - gcount = new_gcount; - }; - uint64_t word_id = (block_id * 4) + block_offset; - update_selects(word_id, words_[word_id], n_ones, selects_); - update_selects(word_id, ~words_[word_id], n_zeroes, selects0_); + auto word = flip_bits ? ~words[word_id] : words[word_id]; + update_selects(word_id, word, count, selects); // Will update count } } - ranks_.back().set_abs(n_ones); - ranks0_.back().set_abs(n_zeroes); - selects_.push_back(words_.size() * 64 / 256); - selects0_.push_back(words_.size() * 64 / 256); + ranks.back().set_abs(count); + selects.push_back(words.size() * 64 / 256); +} + +template +void bit_vector::build() +{ + build_ranks_and_selects(words_, ranks_, selects_, false); // 1-bits + build_ranks_and_selects(words_, ranks0_, selects0_, true); // 0-bits move_to_device(); } @@ -117,6 +124,7 @@ void bit_vector::set_last(bool bit) set(n_bits_ - 1, bit); } +// Copies device array to window structure template __global__ void copy_to_window(WindowT* windows, cuco::detail::index_type n, T* values) { @@ -148,12 +156,15 @@ void bit_vector::copy_host_array_to_aow( storage_type** aow, std::vector& host_array) { uint64_t num_elements = host_array.size(); - *aow = new storage_type(make_valid_extent(extent{num_elements + 1}), - allocator_); + *aow = new storage_type( + make_window_extent(extent{num_elements + 1}), allocator_); if (num_elements > 0) { + // Move host array to device memory 
thrust::device_vector device_array = host_array; host_array.clear(); + + // Copy device array to window structure initialize_aow(*aow, device_array, num_elements); } } diff --git a/include/cuco/bit_vector_ref.cuh b/include/cuco/detail/trie/bit_vector/bit_vector_ref.cuh similarity index 100% rename from include/cuco/bit_vector_ref.cuh rename to include/cuco/detail/trie/bit_vector/bit_vector_ref.cuh diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl new file mode 100644 index 000000000..ef65bb976 --- /dev/null +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl @@ -0,0 +1,197 @@ +#include + +namespace cuco { +namespace experimental { + +template +__host__ __device__ constexpr bit_vector_ref::bit_vector_ref( + StorageRef words_ref, + StorageRef ranks_ref, + StorageRef selects_ref, + StorageRef ranks0_ref, + StorageRef selects0_ref) noexcept + : words_ref_{words_ref}, + ranks_ref_{ranks_ref}, + selects_ref_{selects_ref}, + ranks0_ref_{ranks0_ref}, + selects0_ref_{selects0_ref} +{ +} + +namespace detail { + +template +class operator_impl> { + using ref_type = bit_vector_ref; + + public: + /** + * @brief Access value of a single bit + * + * @param key Position of bit + * + * @return Value of bit at position specified by key + */ + [[nodiscard]] __device__ bool get(uint64_t key) const noexcept + { + auto const& ref_ = static_cast(*this); + return (ref_.words_ref_[key / 64][0] >> (key % 64)) & 1UL; + } + + /** + * @brief Find position of first set bit starting from a given position (inclusive) + * + * @param key Position of starting bit + * + * @return Index of next set bit + */ + [[nodiscard]] __device__ uint64_t find_next_set(uint64_t key) const noexcept + { + auto const& ref_ = static_cast(*this); + uint64_t word_id = key / 64; + uint64_t bit_id = key % 64; + uint64_t word = ref_.words_ref_[word_id][0]; + word &= ~(0lu) << bit_id; + while (word == 0) { + word = 
ref_.words_ref_[++word_id][0]; + } + return (word_id * 64) + __builtin_ffsll(word) - 1; + } + + /** + * @brief Find number of set bits (rank) in all positions before the input position (exclusive) + * + * @param key Input bit position + * + * @return Rank of input position + */ + [[nodiscard]] __device__ uint64_t rank(uint64_t key) const noexcept + { + auto const& ref_ = static_cast(*this); + + uint64_t word_id = key / 64; + uint64_t bit_id = key % 64; + uint64_t rank_id = word_id / 4; + uint64_t rel_id = word_id % 4; + auto rank = rank_union{ref_.ranks_ref_[rank_id][0]}.rank_; + uint64_t n = rank.abs(); + if (rel_id != 0) { n += rank.rels_[rel_id - 1]; } + n += __builtin_popcountll(ref_.words_ref_[word_id][0] & ((1UL << bit_id) - 1)); + return n; + } + + /** + * @brief Find position of Nth set (1) bit counting from start of bitvector + * + * @param count Input N + * + * @return Position of Nth set bit + */ + [[nodiscard]] __device__ uint64_t select(uint64_t count) const noexcept + { + auto const& ref_ = static_cast(*this); + + uint64_t rank_id = get_initial_rank_estimate(count, ref_.selects_ref_, ref_.ranks_ref_); + auto rank = rank_union{ref_.ranks_ref_[rank_id][0]}.rank_; + + uint64_t word_id = rank_id * 4; + word_id += subtract_rank_from_count(count, rank); + + return (word_id * 64) + select_bit_in_word(count, ref_.words_ref_[word_id][0]); + } + + /** + * @brief Find position of Nth not-set (0) bit counting from start of bitvector + * + * @param count Input N + * + * @return Position of Nth not-set bit + */ + [[nodiscard]] __device__ uint64_t select0(uint64_t count) const noexcept + { + auto const& ref_ = static_cast(*this); + + const uint64_t rank_id = get_initial_rank_estimate(count, ref_.selects0_ref_, ref_.ranks0_ref_); + auto rank = rank_union{ref_.ranks0_ref_[rank_id][0]}.rank_; + + uint64_t word_id = rank_id * 4; + word_id += subtract_rank_from_count(count, rank); + + return (word_id * 64) + select_bit_in_word(count, ~ref_.words_ref_[word_id][0]); + } + 
+ private: + /** + * @brief Helper function for select operation that computes an initial rank estimate + * + * @param count Input count for which select operation is being performed + * @param selects Selects array + * @param ranks Ranks array + * + * @return index in ranks which corresponds to highest rank less than count (least upper bound) + */ + [[nodiscard]] __device__ uint64_t get_initial_rank_estimate( + uint64_t count, const StorageRef& selects, const StorageRef& ranks) const noexcept + { + uint64_t block_id = count / 256; + uint64_t begin = selects[block_id][0]; + uint64_t end = selects[block_id + 1][0] + 1UL; + if (begin + 10 >= end) { // Linear search + while (count >= rank_union{ranks[begin + 1][0]}.rank_.abs()) { + ++begin; + } + } else { // Binary search + while (begin + 1 < end) { + const uint64_t middle = (begin + end) / 2; + if (count < rank_union{ranks[middle][0]}.rank_.abs()) { + end = middle; + } else { + begin = middle; + } + } + } + return begin; + } + + /** + * @brief Subtract rank estimate from input count and return an increment to word_id + * + * @param count Input count that will be updated + * @param rank Initial rank estimate for count + * + * @return Increment to word_id based on rank values + */ + [[nodiscard]] __device__ uint64_t + subtract_rank_from_count(uint64_t& count, cuco::experimental::rank rank) const noexcept + { + count -= rank.abs(); + + bool a0 = count >= rank.rels_[0]; + bool a1 = count >= rank.rels_[1]; + bool a2 = count >= rank.rels_[2]; + uint64_t inc = a0 + a1 + a2; + + count -= (inc > 0) * rank.rels_[inc - (inc > 0)]; + + return inc; + } + + /** + * @brief Find position of Nth set bit in a 64-bit word + * + * @param N Input count + * + * @return Position of Nth set bit + */ + [[nodiscard]] __device__ uint64_t select_bit_in_word(uint32_t N, uint64_t word) const noexcept + { + for (uint32_t pos = 0; pos < N; pos++) { + word &= word - 1; + } + return __builtin_ffsll(word & -word) - 1; + } +}; + +} // namespace detail 
+} // namespace experimental +} // namespace cuco diff --git a/tests/bit_vector/find_next_set_test.cu b/tests/bit_vector/find_next_set_test.cu index 15b8b1673..5b625efd5 100644 --- a/tests/bit_vector/find_next_set_test.cu +++ b/tests/bit_vector/find_next_set_test.cu @@ -16,7 +16,7 @@ #include -#include +#include #include #include diff --git a/tests/bit_vector/get_test.cu b/tests/bit_vector/get_test.cu index ec84db051..f2712c868 100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/bit_vector/get_test.cu @@ -16,7 +16,7 @@ #include -#include +#include #include #include diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu index e6f7b9dea..c3981da0a 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/bit_vector/rank_test.cu @@ -16,7 +16,7 @@ #include -#include +#include #include #include diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index 7eb0fae73..32dd73565 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -16,7 +16,7 @@ #include -#include +#include #include #include diff --git a/tests/bit_vector/size_test.cu b/tests/bit_vector/size_test.cu index 4c676ce87..7ad2390d4 100644 --- a/tests/bit_vector/size_test.cu +++ b/tests/bit_vector/size_test.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include From fe68a913f9bc03dea9bba2f0fded64b6dd9a594b Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 18 Aug 2023 00:46:54 +0000 Subject: [PATCH 24/99] Use cuda math intrinsics in device functions --- include/cuco/detail/trie/bit_vector/bit_vector_ref.inl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl index ef65bb976..0c2ef5c2b 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl @@ -55,7 +55,7 @@ class operator_impl> { while (word == 0) { word = ref_.words_ref_[++word_id][0]; } - return (word_id * 64) + __builtin_ffsll(word) - 1; + return (word_id * 64) + __ffsll(word) - 1; } /** @@ -76,7 +76,7 @@ class operator_impl> { auto rank = rank_union{ref_.ranks_ref_[rank_id][0]}.rank_; uint64_t n = rank.abs(); if (rel_id != 0) { n += rank.rels_[rel_id - 1]; } - n += __builtin_popcountll(ref_.words_ref_[word_id][0] & ((1UL << bit_id) - 1)); + n += __popcll(ref_.words_ref_[word_id][0] & ((1UL << bit_id) - 1)); return n; } @@ -188,7 +188,7 @@ class operator_impl> { for (uint32_t pos = 0; pos < N; pos++) { word &= word - 1; } - return __builtin_ffsll(word & -word) - 1; + return __ffsll(word & -word) - 1; } }; From 3894d5a984023a9e417c7f9e409f742d9b76fc76 Mon Sep 17 00:00:00 2001 From: amukkara <134339030+amukkara@users.noreply.github.com> Date: Thu, 17 Aug 2023 22:38:09 -0700 Subject: [PATCH 25/99] Use cuda::std::array Co-authored-by: Yunsong Wang --- include/cuco/detail/trie/bit_vector/bit_vector.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 378fb17fa..15b0e32a5 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -35,7 +35,7 @@
namespace experimental { struct rank { uint32_t abs_hi_; ///< Upper 32 bits of base uint8_t abs_lo_; ///< Lower 8 bits of base - std::array rels_; ///< Offsets for 64-bit sub-intervals + cuda::std::array rels_; ///< Offsets for 64-bit sub-intervals /** * @brief Gets base rank of current 256-bit interval From 80835d7a5987d876c6ba7ee1ffb19b654fe439db Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 18 Aug 2023 05:38:15 +0000 Subject: [PATCH 26/99] [pre-commit.ci] auto code formatting --- include/cuco/detail/trie/bit_vector/bit_vector.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 15b0e32a5..d9c95431c 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -33,8 +33,8 @@ namespace experimental { * @brief Struct to store ranks of bits at 256-bit intervals */ struct rank { - uint32_t abs_hi_; ///< Upper 32 bits of base - uint8_t abs_lo_; ///< Lower 8 bits of base + uint32_t abs_hi_; ///< Upper 32 bits of base + uint8_t abs_lo_; ///< Lower 8 bits of base cuda::std::array rels_; ///< Offsets for 64-bit sub-intervals /** From 10c09006eea55817ad931214a4b40685611a6fef Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 18 Aug 2023 06:11:15 +0000 Subject: [PATCH 27/99] Add constexpr and noexcept specifiers --- .../cuco/detail/trie/bit_vector/bit_vector.cuh | 18 +++++++++--------- .../cuco/detail/trie/bit_vector/bit_vector.inl | 16 ++++++++-------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index d9c95431c..3aca9dd54 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -42,7 +42,7 @@ struct rank { * * @return 
The base rank */ - __host__ __device__ uint64_t abs() const + __host__ __device__ uint64_t constexpr abs() const noexcept { return (static_cast(abs_hi_) << 8) | abs_lo_; } @@ -52,7 +52,7 @@ struct rank { * * @param abs Base rank */ - void set_abs(uint64_t abs) + void constexpr set_abs(uint64_t abs) noexcept { abs_hi_ = static_cast(abs >> 8); abs_lo_ = static_cast(abs); @@ -100,14 +100,14 @@ class bit_vector { * * @param bit Boolean value of new bit to be added */ - void append(bool bit); + void append(bool bit) noexcept; /** * @brief Builds indexes for rank and select * * Also creates device-side snapshot */ - void build(); + void build() noexcept; using size_type = typename Extent::value_type; ///< size type to specify bit index /** @@ -116,14 +116,14 @@ class bit_vector { * @param index position of bit to be modified * @param bit new value of bit */ - void set(size_type index, bool bit); + void set(size_type index, bool bit) noexcept; /** * @brief Sets last bit to specified value * * @param bit new value of last bit */ - void set_last(bool bit); + void set_last(bool bit) noexcept; static constexpr auto cg_size = 1; ///< CG size used for probing static constexpr auto window_size = 1; ///< Window size used for probing @@ -158,7 +158,7 @@ class bit_vector { * * @return Number of bits bit_vector holds */ - size_t size() const { return n_bits_; } + size_t constexpr size() const noexcept { return n_bits_; } private: uint64_t n_bits_; ///< Number of bits added to bit_vector @@ -183,14 +183,14 @@ class bit_vector { * @param host_array host array whose contents are used to intialize aow */ template - void copy_host_array_to_aow(storage_type** aow, std::vector& host_array); + void copy_host_array_to_aow(storage_type** aow, std::vector& host_array) noexcept; /** * @brief Constructs device-side structures and clears host-side structures * * Effectively takes a snapshot of the bitvector and creates a device-side copy */ - void move_to_device(); + void move_to_device() 
noexcept; }; } // namespace experimental diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 55b627a07..ae9ff2c63 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -43,7 +43,7 @@ bit_vector::~bit_vector() } template -void bit_vector::append(bool bit) +void bit_vector::append(bool bit) noexcept { if (n_bits_ % 256 == 0) { words_.resize((n_bits_ + 256) / 64); } // Extend by four 64-bit words set(n_bits_, bit); @@ -53,7 +53,7 @@ void bit_vector::append(bool bit) inline void update_selects(uint64_t word_id, uint64_t word, uint64_t& gcount, - std::vector& selects) + std::vector& selects) noexcept { uint64_t n_pops = __builtin_popcountll(word); uint64_t new_gcount = gcount + n_pops; @@ -75,7 +75,7 @@ inline void update_selects(uint64_t word_id, inline void build_ranks_and_selects(const std::vector& words, std::vector& ranks, std::vector& selects, - bool flip_bits) + bool flip_bits) noexcept { uint64_t n_blocks = words.size() / 4; // Each block has four 64-bit words ranks.resize(n_blocks + 1); @@ -100,7 +100,7 @@ inline void build_ranks_and_selects(const std::vector& words, } template -void bit_vector::build() +void bit_vector::build() noexcept { build_ranks_and_selects(words_, ranks_, selects_, false); // 1-bits build_ranks_and_selects(words_, ranks0_, selects0_, true); // 0-bits @@ -109,7 +109,7 @@ void bit_vector::build() } template -void bit_vector::set(size_type index, bool bit) +void bit_vector::set(size_type index, bool bit) noexcept { if (bit) { words_[index / 64] |= (1UL << (index % 64)); @@ -119,7 +119,7 @@ void bit_vector::set(size_type index, bool bi } template -void bit_vector::set_last(bool bit) +void bit_vector::set_last(bool bit) noexcept { set(n_bits_ - 1, bit); } @@ -153,7 +153,7 @@ void initialize_aow(Storage* storage, thrust::device_vector& device_array, ui template template void 
bit_vector::copy_host_array_to_aow( - storage_type** aow, std::vector& host_array) + storage_type** aow, std::vector& host_array) noexcept { uint64_t num_elements = host_array.size(); *aow = new storage_type( @@ -170,7 +170,7 @@ void bit_vector::copy_host_array_to_aow( } template -void bit_vector::move_to_device() +void bit_vector::move_to_device() noexcept { copy_host_array_to_aow(&aow_words_, words_); copy_host_array_to_aow(&aow_ranks_, ranks_); From 460f989359380d28e8ed39c937d22f4c1b019db4 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 18 Aug 2023 06:13:48 +0000 Subject: [PATCH 28/99] Spacing --- include/cuco/detail/trie/bit_vector/bit_vector_ref.inl | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl index 0c2ef5c2b..cbfb5f6dc 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl @@ -73,10 +73,14 @@ class operator_impl> { uint64_t bit_id = key % 64; uint64_t rank_id = word_id / 4; uint64_t rel_id = word_id % 4; - auto rank = rank_union{ref_.ranks_ref_[rank_id][0]}.rank_; - uint64_t n = rank.abs(); + + auto rank = rank_union{ref_.ranks_ref_[rank_id][0]}.rank_; + uint64_t n = rank.abs(); + if (rel_id != 0) { n += rank.rels_[rel_id - 1]; } + n += __popcll(ref_.words_ref_[word_id][0] & ((1UL << bit_id) - 1)); + return n; } @@ -136,6 +140,7 @@ class operator_impl> { uint64_t block_id = count / 256; uint64_t begin = selects[block_id][0]; uint64_t end = selects[block_id + 1][0] + 1UL; + if (begin + 10 >= end) { // Linear search while (count >= rank_union{ranks[begin + 1][0]}.rank_.abs()) { ++begin; @@ -143,6 +148,7 @@ class operator_impl> { } else { // Binary search while (begin + 1 < end) { const uint64_t middle = (begin + end) / 2; + if (count < rank_union{ranks[middle][0]}.rank_.abs()) { end = middle; } else { From 
d063cc50bc0b5ad0b02b8f032ba75c4a55b2f968 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 18 Aug 2023 07:18:29 +0000 Subject: [PATCH 29/99] Remove unnecessary template parameters --- .../detail/trie/bit_vector/bit_vector.cuh | 24 +++-------- .../detail/trie/bit_vector/bit_vector.inl | 41 +++++++++---------- 2 files changed, 26 insertions(+), 39 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 3aca9dd54..2054d8685 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -78,16 +78,10 @@ union rank_union { * Bitvector construction happens on host, after which the structures are moved to device. * All subsequent read-only operations access device structures only. * - * @tparam Extent Data structure size type - * @tparam Scope The scope in which operations will be performed by individual threads. * @tparam Allocator Type of allocator used for device storage - * @tparam Storage Slot window storage type */ -template , - cuda::thread_scope Scope = cuda::thread_scope_device, - class Allocator = cuco::cuda_allocator, - class Storage = cuco::experimental::storage<1>> +template > class bit_vector { public: bit_vector(); @@ -109,7 +103,7 @@ class bit_vector { */ void build() noexcept; - using size_type = typename Extent::value_type; ///< size type to specify bit index + using size_type = std::size_t; ///< size type to specify bit index /** * @brief Modifies a single bit * @@ -125,21 +119,15 @@ class bit_vector { */ void set_last(bool bit) noexcept; - static constexpr auto cg_size = 1; ///< CG size used for probing - static constexpr auto window_size = 1; ///< Window size used for probing - static constexpr auto thread_scope = Scope; ///< CUDA thread scope - - using extent_type = - decltype(make_window_extent(std::declval())); ///< Extent type - using allocator_type = Allocator; ///< Allocator type + using 
allocator_type = Allocator; ///< Allocator type + using slot_type = uint64_t; ///< Slot type using storage_type = - detail::storage; ///< Storage type + aow_storage, allocator_type>; ///< Storage type using storage_ref_type = typename storage_type::ref_type; ///< Non-owning window storage ref type template using ref_type = - cuco::experimental::bit_vector_ref; ///< Non-owning container ref type + bit_vector_ref; ///< Non-owning container ref type /** * @brief Get device ref with operators. diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index ae9ff2c63..45afe952a 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -18,8 +18,8 @@ namespace cuco { namespace experimental { -template -bit_vector::bit_vector() +template +bit_vector::bit_vector() : words_(), ranks_(), selects_(), @@ -32,8 +32,8 @@ bit_vector::bit_vector() { } -template -bit_vector::~bit_vector() +template +bit_vector::~bit_vector() { delete aow_words_; delete aow_ranks_; @@ -42,8 +42,8 @@ bit_vector::~bit_vector() delete aow_selects0_; } -template -void bit_vector::append(bool bit) noexcept +template +void bit_vector::append(bool bit) noexcept { if (n_bits_ % 256 == 0) { words_.resize((n_bits_ + 256) / 64); } // Extend by four 64-bit words set(n_bits_, bit); @@ -99,8 +99,8 @@ inline void build_ranks_and_selects(const std::vector& words, selects.push_back(words.size() * 64 / 256); } -template -void bit_vector::build() noexcept +template +void bit_vector::build() noexcept { build_ranks_and_selects(words_, ranks_, selects_, false); // 1-bits build_ranks_and_selects(words_, ranks0_, selects0_, true); // 0-bits @@ -108,8 +108,8 @@ void bit_vector::build() noexcept move_to_device(); } -template -void bit_vector::set(size_type index, bool bit) noexcept +template +void bit_vector::set(size_type index, bool bit) noexcept { if (bit) { words_[index / 64] |= (1UL << (index % 
64)); @@ -118,8 +118,8 @@ void bit_vector::set(size_type index, bool bi } } -template -void bit_vector::set_last(bool bit) noexcept +template +void bit_vector::set_last(bool bit) noexcept { set(n_bits_ - 1, bit); } @@ -150,14 +150,13 @@ void initialize_aow(Storage* storage, thrust::device_vector& device_array, ui storage->data(), num_elements, device_ptr); } -template +template template -void bit_vector::copy_host_array_to_aow( - storage_type** aow, std::vector& host_array) noexcept +void bit_vector::copy_host_array_to_aow(storage_type** aow, + std::vector& host_array) noexcept { uint64_t num_elements = host_array.size(); - *aow = new storage_type( - make_window_extent(extent{num_elements + 1}), allocator_); + *aow = new storage_type(extent{num_elements + 1}, allocator_); if (num_elements > 0) { // Move host array to device memory @@ -169,8 +168,8 @@ void bit_vector::copy_host_array_to_aow( } } -template -void bit_vector::move_to_device() noexcept +template +void bit_vector::move_to_device() noexcept { copy_host_array_to_aow(&aow_words_, words_); copy_host_array_to_aow(&aow_ranks_, ranks_); @@ -179,9 +178,9 @@ void bit_vector::move_to_device() noexcept copy_host_array_to_aow(&aow_selects0_, selects0_); } -template +template template -auto bit_vector::ref(Operators...) const noexcept +auto bit_vector::ref(Operators...) 
const noexcept { static_assert(sizeof...(Operators), "No operators specified"); return ref_type{aow_words_->ref(), From a371e27b6c612e7a61bee2fe6a6e70654f006051 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 18 Aug 2023 07:24:32 +0000 Subject: [PATCH 30/99] Allocator argument in constructor --- include/cuco/detail/trie/bit_vector/bit_vector.cuh | 2 +- include/cuco/detail/trie/bit_vector/bit_vector.inl | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 2054d8685..3c1307a59 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -84,7 +84,7 @@ union rank_union { template > class bit_vector { public: - bit_vector(); + bit_vector(Allocator const& allocator = Allocator{}); ~bit_vector(); /** diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 45afe952a..187f77001 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -19,11 +19,12 @@ namespace cuco { namespace experimental { template -bit_vector::bit_vector() +bit_vector::bit_vector(Allocator const& allocator) : words_(), ranks_(), selects_(), n_bits_(0), + allocator_(allocator), aow_words_(nullptr), aow_ranks_(nullptr), aow_selects_(nullptr), From 91b628a5925bec0b4a4c7f21dc6ebad40c8ba0d9 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 18 Aug 2023 16:55:48 +0000 Subject: [PATCH 31/99] Use size_type and slot_type Avoid using plain uint64_t --- .../detail/trie/bit_vector/bit_vector.cuh | 14 ++-- .../detail/trie/bit_vector/bit_vector.inl | 31 ++++---- .../detail/trie/bit_vector/bit_vector_ref.inl | 75 ++++++++++--------- 3 files changed, 62 insertions(+), 58 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh 
b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 3c1307a59..06525ce9b 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -146,17 +146,17 @@ class bit_vector { * * @return Number of bits bit_vector holds */ - size_t constexpr size() const noexcept { return n_bits_; } + size_type constexpr size() const noexcept { return n_bits_; } private: - uint64_t n_bits_; ///< Number of bits added to bit_vector + size_type n_bits_; ///< Number of bits added to bit_vector // Host-side structures - std::vector words_; ///< Words vector that represents all bits - std::vector ranks_; ///< Holds the rank values for every 256-th bit (4-th word) - std::vector ranks0_; ///< Same as ranks_ but for `0` bits - std::vector selects_; ///< Holds pointers to (0, 256, 512...)th `1` bit in ranks_ - std::vector selects0_; ///< Same as selects_, but for `0` bits + std::vector words_; ///< Words vector that represents all bits + std::vector ranks_; ///< Holds the rank values for every 256-th bit (4-th word) + std::vector ranks0_; ///< Same as ranks_ but for `0` bits + std::vector selects_; ///< Holds indices of (0, 256, 512...)th `1` bit in ranks_ + std::vector selects0_; ///< Same as selects_, but for `0` bits // Device-side structures allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 187f77001..76575b185 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -51,10 +51,11 @@ void bit_vector::append(bool bit) noexcept ++n_bits_; } -inline void update_selects(uint64_t word_id, - uint64_t word, - uint64_t& gcount, - std::vector& selects) noexcept +template +inline void update_selects(size_type word_id, + slot_type word, + size_type& gcount, + std::vector& selects) noexcept { uint64_t n_pops = 
__builtin_popcountll(word); uint64_t new_gcount = gcount + n_pops; @@ -73,31 +74,33 @@ inline void update_selects(uint64_t word_id, gcount = new_gcount; } -inline void build_ranks_and_selects(const std::vector& words, +template +inline void build_ranks_and_selects(const std::vector& words, std::vector& ranks, - std::vector& selects, + std::vector& selects, bool flip_bits) noexcept { - uint64_t n_blocks = words.size() / 4; // Each block has four 64-bit words + constexpr size_type words_per_block = 4; + size_type n_blocks = words.size() / words_per_block; ranks.resize(n_blocks + 1); - uint64_t count = 0; - for (uint64_t block_id = 0; block_id < n_blocks; ++block_id) { + size_type count = 0; + for (size_type block_id = 0; block_id < n_blocks; ++block_id) { ranks[block_id].set_abs(count); - for (uint64_t block_offset = 0; block_offset < 4; ++block_offset) { - if (block_offset != 0) { // Compute the deltas + for (size_type block_offset = 0; block_offset < words_per_block; ++block_offset) { + if (block_offset != 0) { // Compute deltas ranks[block_id].rels_[block_offset - 1] = count - ranks[block_id].abs(); } - uint64_t word_id = (block_id * 4) + block_offset; - auto word = flip_bits ? ~words[word_id] : words[word_id]; + size_type word_id = (block_id * words_per_block) + block_offset; + slot_type word = flip_bits ? 
~words[word_id] : words[word_id]; update_selects(word_id, word, count, selects); // Will update count } } ranks.back().set_abs(count); - selects.push_back(words.size() * 64 / 256); + selects.push_back(n_blocks); } template diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl index cbfb5f6dc..91c34d244 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl @@ -22,7 +22,9 @@ namespace detail { template class operator_impl> { - using ref_type = bit_vector_ref; + using ref_type = bit_vector_ref; + using size_type = typename StorageRef::size_type; + using slot_type = typename StorageRef::value_type; public: /** @@ -32,7 +34,7 @@ class operator_impl> { * * @return Value of bit at position specified by key */ - [[nodiscard]] __device__ bool get(uint64_t key) const noexcept + [[nodiscard]] __device__ bool get(size_type key) const noexcept { auto const& ref_ = static_cast(*this); return (ref_.words_ref_[key / 64][0] >> (key % 64)) & 1UL; @@ -45,12 +47,12 @@ class operator_impl> { * * @return Index of next set bit */ - [[nodiscard]] __device__ uint64_t find_next_set(uint64_t key) const noexcept + [[nodiscard]] __device__ size_type find_next_set(size_type key) const noexcept { - auto const& ref_ = static_cast(*this); - uint64_t word_id = key / 64; - uint64_t bit_id = key % 64; - uint64_t word = ref_.words_ref_[word_id][0]; + auto const& ref_ = static_cast(*this); + size_type word_id = key / 64; + size_type bit_id = key % 64; + slot_type word = ref_.words_ref_[word_id][0]; word &= ~(0lu) << bit_id; while (word == 0) { word = ref_.words_ref_[++word_id][0]; @@ -65,17 +67,17 @@ class operator_impl> { * * @return Rank of input position */ - [[nodiscard]] __device__ uint64_t rank(uint64_t key) const noexcept + [[nodiscard]] __device__ size_type rank(size_type key) const noexcept { auto const& ref_ = static_cast(*this); - uint64_t word_id 
= key / 64; - uint64_t bit_id = key % 64; - uint64_t rank_id = word_id / 4; - uint64_t rel_id = word_id % 4; + size_type word_id = key / 64; + size_type bit_id = key % 64; + size_type rank_id = word_id / 4; + size_type rel_id = word_id % 4; - auto rank = rank_union{ref_.ranks_ref_[rank_id][0]}.rank_; - uint64_t n = rank.abs(); + auto rank = rank_union{ref_.ranks_ref_[rank_id][0]}.rank_; + size_type n = rank.abs(); if (rel_id != 0) { n += rank.rels_[rel_id - 1]; } @@ -91,14 +93,14 @@ class operator_impl> { * * @return Position of Nth set bit */ - [[nodiscard]] __device__ uint64_t select(uint64_t count) const noexcept + [[nodiscard]] __device__ size_type select(size_type count) const noexcept { auto const& ref_ = static_cast(*this); - uint64_t rank_id = get_initial_rank_estimate(count, ref_.selects_ref_, ref_.ranks_ref_); - auto rank = rank_union{ref_.ranks_ref_[rank_id][0]}.rank_; + size_type rank_id = get_initial_rank_estimate(count, ref_.selects_ref_, ref_.ranks_ref_); + auto rank = rank_union{ref_.ranks_ref_[rank_id][0]}.rank_; - uint64_t word_id = rank_id * 4; + size_type word_id = rank_id * 4; word_id += subtract_rank_from_count(count, rank); return (word_id * 64) + select_bit_in_word(count, ref_.words_ref_[word_id][0]); @@ -111,14 +113,14 @@ class operator_impl> { * * @return Position of Nth not-set bit */ - [[nodiscard]] __device__ uint64_t select0(uint64_t count) const noexcept + [[nodiscard]] __device__ size_type select0(size_type count) const noexcept { auto const& ref_ = static_cast(*this); - const uint64_t rank_id = get_initial_rank_estimate(count, ref_.selects0_ref_, ref_.ranks0_ref_); - auto rank = rank_union{ref_.ranks0_ref_[rank_id][0]}.rank_; + size_type rank_id = get_initial_rank_estimate(count, ref_.selects0_ref_, ref_.ranks0_ref_); + auto rank = rank_union{ref_.ranks0_ref_[rank_id][0]}.rank_; - uint64_t word_id = rank_id * 4; + size_type word_id = rank_id * 4; word_id += subtract_rank_from_count(count, rank); return (word_id * 64) + 
select_bit_in_word(count, ~ref_.words_ref_[word_id][0]); @@ -134,12 +136,12 @@ class operator_impl> { * * @return index in ranks which corresponds to highest rank less than count (least upper bound) */ - [[nodiscard]] __device__ uint64_t get_initial_rank_estimate( - uint64_t count, const StorageRef& selects, const StorageRef& ranks) const noexcept + [[nodiscard]] __device__ size_type get_initial_rank_estimate( + size_type count, const StorageRef& selects, const StorageRef& ranks) const noexcept { - uint64_t block_id = count / 256; - uint64_t begin = selects[block_id][0]; - uint64_t end = selects[block_id + 1][0] + 1UL; + size_type block_id = count / 256; + size_type begin = selects[block_id][0]; + size_type end = selects[block_id + 1][0] + 1UL; if (begin + 10 >= end) { // Linear search while (count >= rank_union{ranks[begin + 1][0]}.rank_.abs()) { @@ -147,8 +149,7 @@ class operator_impl> { } } else { // Binary search while (begin + 1 < end) { - const uint64_t middle = (begin + end) / 2; - + size_type middle = (begin + end) / 2; if (count < rank_union{ranks[middle][0]}.rank_.abs()) { end = middle; } else { @@ -167,15 +168,15 @@ class operator_impl> { * * @return Increment to word_id based on rank values */ - [[nodiscard]] __device__ uint64_t - subtract_rank_from_count(uint64_t& count, cuco::experimental::rank rank) const noexcept + [[nodiscard]] __device__ size_type + subtract_rank_from_count(size_type& count, cuco::experimental::rank rank) const noexcept { count -= rank.abs(); - bool a0 = count >= rank.rels_[0]; - bool a1 = count >= rank.rels_[1]; - bool a2 = count >= rank.rels_[2]; - uint64_t inc = a0 + a1 + a2; + bool a0 = count >= rank.rels_[0]; + bool a1 = count >= rank.rels_[1]; + bool a2 = count >= rank.rels_[2]; + size_type inc = a0 + a1 + a2; count -= (inc > 0) * rank.rels_[inc - (inc > 0)]; @@ -189,9 +190,9 @@ class operator_impl> { * * @return Position of Nth set bit */ - [[nodiscard]] __device__ uint64_t select_bit_in_word(uint32_t N, uint64_t word) 
const noexcept + [[nodiscard]] __device__ size_type select_bit_in_word(size_type N, slot_type word) const noexcept { - for (uint32_t pos = 0; pos < N; pos++) { + for (size_type pos = 0; pos < N; pos++) { word &= word - 1; } return __ffsll(word & -word) - 1; From 9ac0da9ee5d585188f766e6503d4e30d89c30582 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 18 Aug 2023 18:37:02 +0000 Subject: [PATCH 32/99] Explicitly define frequently used constants --- .../detail/trie/bit_vector/bit_vector.cuh | 14 +++++ .../detail/trie/bit_vector/bit_vector.inl | 52 +++++++++++-------- .../detail/trie/bit_vector/bit_vector_ref.inl | 29 ++++++----- 3 files changed, 60 insertions(+), 35 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 06525ce9b..1b42b46a5 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -151,6 +151,10 @@ class bit_vector { private: size_type n_bits_; ///< Number of bits added to bit_vector + const size_type bits_per_word = sizeof(slot_type) * 8; + const size_type words_per_block = 4; + const size_type bits_per_block = words_per_block * bits_per_word; + // Host-side structures std::vector words_; ///< Words vector that represents all bits std::vector ranks_; ///< Holds the rank values for every 256-th bit (4-th word) @@ -179,6 +183,16 @@ class bit_vector { * Effectively takes a snapshot of the bitvector and creates a device-side copy */ void move_to_device() noexcept; + + void update_selects(size_type word_id, + slot_type word, + size_type& gcount, + std::vector& selects) noexcept; + + void build_ranks_and_selects(const std::vector& words, + std::vector& ranks, + std::vector& selects, + bool flip_bits) noexcept; }; } // namespace experimental diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 76575b185..7494b9c32 100644 --- 
a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -46,25 +46,31 @@ bit_vector::~bit_vector() template void bit_vector::append(bool bit) noexcept { - if (n_bits_ % 256 == 0) { words_.resize((n_bits_ + 256) / 64); } // Extend by four 64-bit words + if (n_bits_ % bits_per_block == 0) { + size_type new_n_bits = n_bits_ + bits_per_block; // Extend storage by one block + size_type new_n_words = new_n_bits / words_per_block; + + words_.resize(new_n_words); + } set(n_bits_, bit); ++n_bits_; } -template -inline void update_selects(size_type word_id, - slot_type word, - size_type& gcount, - std::vector& selects) noexcept +template +void bit_vector::update_selects(size_type word_id, + slot_type word, + size_type& gcount, + std::vector& selects) noexcept { - uint64_t n_pops = __builtin_popcountll(word); - uint64_t new_gcount = gcount + n_pops; - if (((gcount + 255) / 256) != ((new_gcount + 255) / 256)) { - uint64_t count = gcount; + size_type n_pops = __builtin_popcountll(word); + size_type new_gcount = gcount + n_pops; + + if ((gcount - 1) / bits_per_block != (new_gcount - 1) / bits_per_block) { + size_type count = gcount; while (word != 0) { - uint64_t pos = __builtin_ctzll(word); - if (count % 256 == 0) { - selects.push_back(((word_id * 64) + pos) / 256); + size_type pos = __builtin_ctzll(word); + if (count % bits_per_block == 0) { + selects.push_back(((word_id * bits_per_word) + pos) / bits_per_block); break; } word ^= 1UL << pos; @@ -74,14 +80,13 @@ inline void update_selects(size_type word_id, gcount = new_gcount; } -template -inline void build_ranks_and_selects(const std::vector& words, - std::vector& ranks, - std::vector& selects, - bool flip_bits) noexcept +template +void bit_vector::build_ranks_and_selects(const std::vector& words, + std::vector& ranks, + std::vector& selects, + bool flip_bits) noexcept { - constexpr size_type words_per_block = 4; - size_type n_blocks = words.size() / words_per_block; + 
size_type n_blocks = words.size() / words_per_block; ranks.resize(n_blocks + 1); size_type count = 0; @@ -115,10 +120,13 @@ void bit_vector::build() noexcept template void bit_vector::set(size_type index, bool bit) noexcept { + size_type word_id = index / bits_per_word; + size_type bit_id = index % bits_per_word; + if (bit) { - words_[index / 64] |= (1UL << (index % 64)); + words_[word_id] |= 1UL << bit_id; } else { - words_[index / 64] &= ~(1UL << (index % 64)); + words_[word_id] &= ~(1UL << bit_id); } } diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl index 91c34d244..9d07f920c 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl @@ -26,6 +26,9 @@ class operator_impl> { using size_type = typename StorageRef::size_type; using slot_type = typename StorageRef::value_type; + const size_type bits_per_word = sizeof(slot_type) * 8; + const size_type words_per_block = 4; + public: /** * @brief Access value of a single bit @@ -37,7 +40,7 @@ class operator_impl> { [[nodiscard]] __device__ bool get(size_type key) const noexcept { auto const& ref_ = static_cast(*this); - return (ref_.words_ref_[key / 64][0] >> (key % 64)) & 1UL; + return (ref_.words_ref_[key / bits_per_word][0] >> (key % bits_per_word)) & 1UL; } /** @@ -50,14 +53,14 @@ class operator_impl> { [[nodiscard]] __device__ size_type find_next_set(size_type key) const noexcept { auto const& ref_ = static_cast(*this); - size_type word_id = key / 64; - size_type bit_id = key % 64; + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; slot_type word = ref_.words_ref_[word_id][0]; word &= ~(0lu) << bit_id; while (word == 0) { word = ref_.words_ref_[++word_id][0]; } - return (word_id * 64) + __ffsll(word) - 1; + return word_id * bits_per_word + __ffsll(word) - 1; } /** @@ -71,10 +74,10 @@ class operator_impl> { { auto const& ref_ = 
static_cast(*this); - size_type word_id = key / 64; - size_type bit_id = key % 64; - size_type rank_id = word_id / 4; - size_type rel_id = word_id % 4; + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; + size_type rank_id = word_id / words_per_block; + size_type rel_id = word_id % words_per_block; auto rank = rank_union{ref_.ranks_ref_[rank_id][0]}.rank_; size_type n = rank.abs(); @@ -100,10 +103,10 @@ class operator_impl> { size_type rank_id = get_initial_rank_estimate(count, ref_.selects_ref_, ref_.ranks_ref_); auto rank = rank_union{ref_.ranks_ref_[rank_id][0]}.rank_; - size_type word_id = rank_id * 4; + size_type word_id = rank_id * words_per_block; word_id += subtract_rank_from_count(count, rank); - return (word_id * 64) + select_bit_in_word(count, ref_.words_ref_[word_id][0]); + return word_id * bits_per_word + select_bit_in_word(count, ref_.words_ref_[word_id][0]); } /** @@ -120,10 +123,10 @@ class operator_impl> { size_type rank_id = get_initial_rank_estimate(count, ref_.selects0_ref_, ref_.ranks0_ref_); auto rank = rank_union{ref_.ranks0_ref_[rank_id][0]}.rank_; - size_type word_id = rank_id * 4; + size_type word_id = rank_id * words_per_block; word_id += subtract_rank_from_count(count, rank); - return (word_id * 64) + select_bit_in_word(count, ~ref_.words_ref_[word_id][0]); + return word_id * bits_per_word + select_bit_in_word(count, ~ref_.words_ref_[word_id][0]); } private: @@ -139,7 +142,7 @@ class operator_impl> { [[nodiscard]] __device__ size_type get_initial_rank_estimate( size_type count, const StorageRef& selects, const StorageRef& ranks) const noexcept { - size_type block_id = count / 256; + size_type block_id = count / (bits_per_word * words_per_block); size_type begin = selects[block_id][0]; size_type end = selects[block_id + 1][0] + 1UL; From 81ed98499ef8ef4f356896a06edad4eea46afae2 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 18 Aug 2023 20:13:31 +0000 Subject: [PATCH 33/99] Comments --- 
.../detail/trie/bit_vector/bit_vector.cuh | 39 ++++++++++++++----- .../detail/trie/bit_vector/bit_vector.inl | 19 ++++----- .../detail/trie/bit_vector/bit_vector_ref.inl | 8 ++-- 3 files changed, 42 insertions(+), 24 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 1b42b46a5..9eee49f42 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -149,10 +149,10 @@ class bit_vector { size_type constexpr size() const noexcept { return n_bits_; } private: - size_type n_bits_; ///< Number of bits added to bit_vector + size_type n_bits_; ///< Number of bits bit_vector currently holds - const size_type bits_per_word = sizeof(slot_type) * 8; - const size_type words_per_block = 4; + const size_type bits_per_word = sizeof(slot_type) * 8; ///< Bits in a word + const size_type words_per_block = 4; ///< Provides tradeoff between space efficiency and perf. 
const size_type bits_per_block = words_per_block * bits_per_word; // Host-side structures @@ -180,19 +180,40 @@ class bit_vector { /** * @brief Constructs device-side structures and clears host-side structures * - * Effectively takes a snapshot of the bitvector and creates a device-side copy + * Takes a snapshot of bitvector and creates a device-side copy */ void move_to_device() noexcept; - void update_selects(size_type word_id, - slot_type word, - size_type& gcount, - std::vector& selects) noexcept; - + /** + * @brief Populates rank and select indexes on host + * + * @param words Aarray of words with all bits + * @param ranks Output array of ranks + * @param selects Output array of selects + * @param flip_bits If true, negate bits to construct indexes for `0` bits + */ void build_ranks_and_selects(const std::vector& words, std::vector& ranks, std::vector& selects, bool flip_bits) noexcept; + + /** + * @brief Add an entry to selects index that points to bits in a given word + * + * Entry will be added only when bitcount in current word pushes total bitcount beyond a + * 'bits_per_block' boundary + * + * @param word_id Index of current word + * @param word Current word + * @param count_in Running count of set bits in all previous words + * @param selects Selects index + * + * @return Running count after including set bits in current word + */ + size_type add_selects_entry(size_type word_id, + slot_type word, + size_type count_in, + std::vector& selects) noexcept; }; } // namespace experimental diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 7494b9c32..033cb8964 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -57,27 +57,24 @@ void bit_vector::append(bool bit) noexcept } template -void bit_vector::update_selects(size_type word_id, - slot_type word, - size_type& gcount, - std::vector& selects) noexcept 
+bit_vector::size_type bit_vector::add_selects_entry( + size_type word_id, slot_type word, size_type count_in, std::vector& selects) noexcept { - size_type n_pops = __builtin_popcountll(word); - size_type new_gcount = gcount + n_pops; + size_type count_out = count_in + __builtin_popcountll(word); - if ((gcount - 1) / bits_per_block != (new_gcount - 1) / bits_per_block) { - size_type count = gcount; + if ((count_in - 1) / bits_per_block != (count_out - 1) / bits_per_block) { + size_type count = count_in; while (word != 0) { size_type pos = __builtin_ctzll(word); if (count % bits_per_block == 0) { - selects.push_back(((word_id * bits_per_word) + pos) / bits_per_block); + selects.push_back((word_id * bits_per_word + pos) / bits_per_block); break; } word ^= 1UL << pos; ++count; } } - gcount = new_gcount; + return count_out; } template @@ -100,7 +97,7 @@ void bit_vector::build_ranks_and_selects(const std::vector size_type word_id = (block_id * words_per_block) + block_offset; slot_type word = flip_bits ? 
~words[word_id] : words[word_id]; - update_selects(word_id, word, count, selects); // Will update count + count = add_selects_entry(word_id, word, count, selects); } } diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl index 9d07f920c..87ae5a0d7 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl @@ -22,12 +22,12 @@ namespace detail { template class operator_impl> { - using ref_type = bit_vector_ref; - using size_type = typename StorageRef::size_type; - using slot_type = typename StorageRef::value_type; + using ref_type = bit_vector_ref; ///< Bitvector ref type + using size_type = typename StorageRef::size_type; ///< Size type + using slot_type = typename StorageRef::value_type; ///< Slot type const size_type bits_per_word = sizeof(slot_type) * 8; - const size_type words_per_block = 4; + const size_type words_per_block = 4; //< This should match the defintion in bit_vector public: /** From 8e40ef0272107a4d3aa170a01757408faaf7b690 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 18 Aug 2023 20:26:17 +0000 Subject: [PATCH 34/99] Improve order of function implementations For better readability --- .../detail/trie/bit_vector/bit_vector.cuh | 28 ++--- .../detail/trie/bit_vector/bit_vector.inl | 119 +++++++++--------- 2 files changed, 73 insertions(+), 74 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 9eee49f42..891c6e93c 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -96,13 +96,6 @@ class bit_vector { */ void append(bool bit) noexcept; - /** - * @brief Builds indexes for rank and select - * - * Also creates device-side snapshot - */ - void build() noexcept; - using size_type = std::size_t; ///< size type to specify bit index /** * @brief Modifies a 
single bit @@ -119,6 +112,13 @@ class bit_vector { */ void set_last(bool bit) noexcept; + /** + * @brief Builds indexes for rank and select + * + * Also creates device-side snapshot + */ + void build() noexcept; + using allocator_type = Allocator; ///< Allocator type using slot_type = uint64_t; ///< Slot type using storage_type = @@ -166,6 +166,13 @@ class bit_vector { allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage storage_type *aow_words_, *aow_ranks_, *aow_selects_, *aow_ranks0_, *aow_selects0_; + /** + * @brief Constructs device-side structures and clears host-side structures + * + * Takes a snapshot of bitvector and creates a device-side copy + */ + void move_to_device() noexcept; + /** * @brief Creates a new window structure on device and intitializes it with contents of host array * @@ -177,13 +184,6 @@ class bit_vector { template void copy_host_array_to_aow(storage_type** aow, std::vector& host_array) noexcept; - /** - * @brief Constructs device-side structures and clears host-side structures - * - * Takes a snapshot of bitvector and creates a device-side copy - */ - void move_to_device() noexcept; - /** * @brief Populates rank and select indexes on host * diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 033cb8964..b3aa136f2 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -57,24 +57,40 @@ void bit_vector::append(bool bit) noexcept } template -bit_vector::size_type bit_vector::add_selects_entry( - size_type word_id, slot_type word, size_type count_in, std::vector& selects) noexcept +void bit_vector::set(size_type index, bool bit) noexcept { - size_type count_out = count_in + __builtin_popcountll(word); + size_type word_id = index / bits_per_word; + size_type bit_id = index % bits_per_word; - if ((count_in - 1) / bits_per_block != (count_out - 1) / bits_per_block) { - size_type 
count = count_in; - while (word != 0) { - size_type pos = __builtin_ctzll(word); - if (count % bits_per_block == 0) { - selects.push_back((word_id * bits_per_word + pos) / bits_per_block); - break; - } - word ^= 1UL << pos; - ++count; - } + if (bit) { + words_[word_id] |= 1UL << bit_id; + } else { + words_[word_id] &= ~(1UL << bit_id); } - return count_out; +} + +template +void bit_vector::set_last(bool bit) noexcept +{ + set(n_bits_ - 1, bit); +} + +template +void bit_vector::build() noexcept +{ + build_ranks_and_selects(words_, ranks_, selects_, false); // 1-bits + build_ranks_and_selects(words_, ranks0_, selects0_, true); // 0-bits + move_to_device(); +} + +template +void bit_vector::move_to_device() noexcept +{ + copy_host_array_to_aow(&aow_words_, words_); + copy_host_array_to_aow(&aow_ranks_, ranks_); + copy_host_array_to_aow(&aow_selects_, selects_); + copy_host_array_to_aow(&aow_ranks0_, ranks0_); + copy_host_array_to_aow(&aow_selects0_, selects0_); } template @@ -106,31 +122,42 @@ void bit_vector::build_ranks_and_selects(const std::vector } template -void bit_vector::build() noexcept +bit_vector::size_type bit_vector::add_selects_entry( + size_type word_id, slot_type word, size_type count_in, std::vector& selects) noexcept { - build_ranks_and_selects(words_, ranks_, selects_, false); // 1-bits - build_ranks_and_selects(words_, ranks0_, selects0_, true); // 0-bits + size_type count_out = count_in + __builtin_popcountll(word); - move_to_device(); + if ((count_in - 1) / bits_per_block != (count_out - 1) / bits_per_block) { + size_type count = count_in; + while (word != 0) { + size_type pos = __builtin_ctzll(word); + if (count % bits_per_block == 0) { + selects.push_back((word_id * bits_per_word + pos) / bits_per_block); + break; + } + word ^= 1UL << pos; + ++count; + } + } + return count_out; } template -void bit_vector::set(size_type index, bool bit) noexcept +template +void bit_vector::copy_host_array_to_aow(storage_type** aow, + std::vector& host_array) 
noexcept { - size_type word_id = index / bits_per_word; - size_type bit_id = index % bits_per_word; + uint64_t num_elements = host_array.size(); + *aow = new storage_type(extent{num_elements + 1}, allocator_); - if (bit) { - words_[word_id] |= 1UL << bit_id; - } else { - words_[word_id] &= ~(1UL << bit_id); - } -} + if (num_elements > 0) { + // Move host array to device memory + thrust::device_vector device_array = host_array; + host_array.clear(); -template -void bit_vector::set_last(bool bit) noexcept -{ - set(n_bits_ - 1, bit); + // Copy device array to window structure + initialize_aow(*aow, device_array, num_elements); + } } // Copies device array to window structure @@ -159,34 +186,6 @@ void initialize_aow(Storage* storage, thrust::device_vector& device_array, ui storage->data(), num_elements, device_ptr); } -template -template -void bit_vector::copy_host_array_to_aow(storage_type** aow, - std::vector& host_array) noexcept -{ - uint64_t num_elements = host_array.size(); - *aow = new storage_type(extent{num_elements + 1}, allocator_); - - if (num_elements > 0) { - // Move host array to device memory - thrust::device_vector device_array = host_array; - host_array.clear(); - - // Copy device array to window structure - initialize_aow(*aow, device_array, num_elements); - } -} - -template -void bit_vector::move_to_device() noexcept -{ - copy_host_array_to_aow(&aow_words_, words_); - copy_host_array_to_aow(&aow_ranks_, ranks_); - copy_host_array_to_aow(&aow_selects_, selects_); - copy_host_array_to_aow(&aow_ranks0_, ranks0_); - copy_host_array_to_aow(&aow_selects0_, selects0_); -} - template template auto bit_vector::ref(Operators...) 
const noexcept From 660d807de960b732c5f111201e85c3bf56b1f13e Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Mon, 21 Aug 2023 23:48:57 +0000 Subject: [PATCH 35/99] Refactor selects entry addition --- .../detail/trie/bit_vector/bit_vector.cuh | 10 +++--- .../detail/trie/bit_vector/bit_vector.inl | 36 ++++++++++--------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 891c6e93c..f28bdc951 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -207,13 +207,11 @@ class bit_vector { * @param word Current word * @param count_in Running count of set bits in all previous words * @param selects Selects index - * - * @return Running count after including set bits in current word */ - size_type add_selects_entry(size_type word_id, - slot_type word, - size_type count_in, - std::vector& selects) noexcept; + void add_selects_entry(size_type word_id, + slot_type word, + size_type count, + std::vector& selects) noexcept; }; } // namespace experimental diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index b3aa136f2..076a4f36e 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -113,7 +113,13 @@ void bit_vector::build_ranks_and_selects(const std::vector size_type word_id = (block_id * words_per_block) + block_offset; slot_type word = flip_bits ? 
~words[word_id] : words[word_id]; - count = add_selects_entry(word_id, word, count, selects); + + size_type prev_count = count; + count += __builtin_popcountll(word); + + if ((prev_count - 1) / bits_per_block != (count - 1) / bits_per_block) { + add_selects_entry(word_id, word, prev_count, selects); + } } } @@ -122,24 +128,22 @@ void bit_vector::build_ranks_and_selects(const std::vector } template -bit_vector::size_type bit_vector::add_selects_entry( - size_type word_id, slot_type word, size_type count_in, std::vector& selects) noexcept +void bit_vector::add_selects_entry(size_type word_id, + slot_type word, + size_type count, + std::vector& selects) noexcept { - size_type count_out = count_in + __builtin_popcountll(word); - - if ((count_in - 1) / bits_per_block != (count_out - 1) / bits_per_block) { - size_type count = count_in; - while (word != 0) { - size_type pos = __builtin_ctzll(word); - if (count % bits_per_block == 0) { - selects.push_back((word_id * bits_per_word + pos) / bits_per_block); - break; - } - word ^= 1UL << pos; - ++count; + while (word != 0) { + size_type pos = __builtin_ctzll(word); + + if (count % bits_per_block == 0) { + selects.push_back((word_id * bits_per_word + pos) / bits_per_block); + break; } + + word ^= 1UL << pos; + ++count; } - return count_out; } template From cf232c179a4a1a3207a4e9c0220e65820c807b7e Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 22 Aug 2023 17:20:21 +0000 Subject: [PATCH 36/99] Bulk bitvector get operation --- .../detail/trie/bit_vector/bit_vector.cuh | 17 +++++++++ .../detail/trie/bit_vector/bit_vector.inl | 35 +++++++++++++++++++ tests/bit_vector/get_test.cu | 12 +++++++ 3 files changed, 64 insertions(+) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index f28bdc951..c8f1b256f 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -119,6 +119,23 @@ class bit_vector 
{ */ void build() noexcept; + /** + * @brief Bulk get operation + * + * @tparam KeyIt Device-accessible iterator to keys + * @tparam OutputIt Device-accessible iterator to results + * + * @param keys_begin Begin iterator to individual key characters + * @param keys_end End iterator to offsets + * @param outputs_begin Begin iterator to results + * @param stream Stream to execute get kernel + */ + template + void get(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) const noexcept; + using allocator_type = Allocator; ///< Allocator type using slot_type = uint64_t; ///< Slot type using storage_type = diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 076a4f36e..5dd98d2b3 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -75,6 +75,41 @@ void bit_vector::set_last(bool bit) noexcept set(n_bits_ - 1, bit); } +template +template +void bit_vector::get(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept + +{ + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = + (num_keys - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE) + 1; + + auto ref_ = this->ref(cuco::experimental::bv_read); + + bitvector_get_kernel<<>>( + ref_, keys_begin, outputs_begin, num_keys); +} + +template +__global__ void bitvector_get_kernel(BitvectorRef ref, + KeyIt keys, + OutputIt outputs, + uint64_t num_keys) +{ + uint32_t const loop_stride = gridDim.x * blockDim.x; + uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; + + while (key_id < num_keys) { + outputs[key_id] = ref.get(keys[key_id]); + key_id += loop_stride; + } +} + template void bit_vector::build() noexcept { diff --git a/tests/bit_vector/get_test.cu b/tests/bit_vector/get_test.cu index f2712c868..e198afb2c 
100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/bit_vector/get_test.cu @@ -49,10 +49,22 @@ TEST_CASE("Get test", "") } bv.build(); + // Device-ref test auto ref = bv.ref(cuco::experimental::bv_read); thrust::device_vector get_result(num_elements); get_kernel<<<1, 1024>>>(ref, num_elements, thrust::raw_pointer_cast(get_result.data())); size_t num_set = thrust::reduce(thrust::device, get_result.begin(), get_result.end(), 0); REQUIRE(num_set == num_set_ref); + + // Host-bulk test + thrust::counting_iterator iter(0); + thrust::device_vector keys(num_elements); + thrust::copy(iter, iter + keys.size(), keys.begin()); + thrust::fill(get_result.begin(), get_result.end(), 0); + + bv.get(keys.begin(), keys.end(), get_result.begin()); + + num_set = thrust::reduce(thrust::device, get_result.begin(), get_result.end(), 0); + REQUIRE(num_set == num_set_ref); } From 3eb640259b400f819ad4a72f9b9ad782387bd2bd Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 22 Aug 2023 20:14:00 +0000 Subject: [PATCH 37/99] Add device-ref set operation Test does not pass. set() does not seem to update bitvector memory. 
--- .../detail/trie/bit_vector/bit_vector_ref.inl | 31 +++++++ include/cuco/operator.hpp | 6 ++ tests/CMakeLists.txt | 1 + tests/bit_vector/set_test.cu | 80 +++++++++++++++++++ 4 files changed, 118 insertions(+) create mode 100644 tests/bit_vector/set_test.cu diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl index 87ae5a0d7..a1e395f15 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl @@ -202,6 +202,37 @@ class operator_impl> { } }; +template +class operator_impl> { + using ref_type = bit_vector_ref; ///< Bitvector ref type + using size_type = typename StorageRef::size_type; ///< Size type + using slot_type = typename StorageRef::value_type; ///< Slot type + const size_type bits_per_word = sizeof(slot_type) * 8; + + public: + /** + * @brief Modify a single bit + * + * @param key Position of bit + * @param bit New value of bit + */ + __device__ void set(size_type key, bool bit) noexcept + { + ref_type& ref_ = static_cast(*this); + + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; + + slot_type& word = ref_.words_ref_[word_id][0]; + + if (bit) { + word |= 1UL << bit_id; + } else { + word &= ~(1UL << bit_id); + } + } +}; + } // namespace detail } // namespace experimental } // namespace cuco diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp index f9165d3bf..4beb57615 100644 --- a/include/cuco/operator.hpp +++ b/include/cuco/operator.hpp @@ -51,6 +51,12 @@ struct find_tag { struct bv_read_tag { } inline constexpr bv_read; +/** + * @brief `bv_set` operator tag + */ +struct bv_set_tag { +} inline constexpr bv_set; + } // namespace op } // namespace experimental } // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 64fe713ac..7d910565b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -101,6 +101,7 @@ 
ConfigureTest(STATIC_MULTIMAP_TEST ConfigureTest(BIT_VECTOR_TEST bit_vector/find_next_set_test.cu bit_vector/get_test.cu + bit_vector/set_test.cu bit_vector/rank_test.cu bit_vector/select_test.cu bit_vector/size_test.cu) diff --git a/tests/bit_vector/set_test.cu b/tests/bit_vector/set_test.cu new file mode 100644 index 000000000..c257a2bcb --- /dev/null +++ b/tests/bit_vector/set_test.cu @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +template +__global__ void set_kernel(BitVectorRef ref, uint64_t* keys, uint64_t* vals, uint64_t num_keys) +{ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = gridDim.x * blockDim.x; + while (index < num_keys) { + ref.set(keys[index], vals[index]); + index += stride; + } +} + +template +__global__ void get_kernel(BitVectorRef ref, size_t n, uint64_t* output) +{ + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = gridDim.x * blockDim.x; + while (index < n) { + output[index] = ref.get(index); + index += stride; + } +} + +TEST_CASE("Set test", "") +{ + constexpr std::size_t num_elements{400}; + cuco::experimental::bit_vector bv; + + // Set odd bits on host + for (size_t i = 0; i < num_elements; i++) { + bv.append(i % 2 == 1); + } + bv.build(); + + auto get_ref = bv.ref(cuco::experimental::bv_read); + thrust::device_vector get_result(num_elements); + get_kernel<<<32, 32>>>(get_ref, num_elements, thrust::raw_pointer_cast(get_result.data())); + size_t num_set = thrust::reduce(thrust::device, get_result.begin(), get_result.end(), 0); + REQUIRE(num_set == num_elements / 2); + + // Set all bits on device + thrust::device_vector d_keys(num_elements); + thrust::sequence(d_keys.begin(), d_keys.end(), 0); + + thrust::device_vector d_vals(num_elements); + thrust::fill(d_vals.begin(), d_vals.end(), 1); + + auto set_ref = bv.ref(cuco::experimental::bv_set); + set_kernel<<<32, 32>>>(set_ref, + thrust::raw_pointer_cast(d_keys.data()), + thrust::raw_pointer_cast(d_vals.data()), + num_elements); + + // Check that all bits are set + get_kernel<<<32, 32>>>(get_ref, num_elements, thrust::raw_pointer_cast(get_result.data())); + num_set = thrust::reduce(thrust::device, get_result.begin(), get_result.end(), 0); + REQUIRE(num_set == num_elements); +} From f72684f03cf5a3798869e3a55030ae76973e3e2f Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 22 Aug 2023 21:59:56 +0000 
Subject: [PATCH 38/99] Bulk set API --- .../detail/trie/bit_vector/bit_vector.cuh | 25 +++++++-- .../detail/trie/bit_vector/bit_vector.inl | 34 ++++++++++++ tests/bit_vector/set_test.cu | 52 +++++-------------- 3 files changed, 67 insertions(+), 44 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index c8f1b256f..a7d900ed2 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -123,11 +123,11 @@ class bit_vector { * @brief Bulk get operation * * @tparam KeyIt Device-accessible iterator to keys - * @tparam OutputIt Device-accessible iterator to results + * @tparam OutputIt Device-accessible iterator to outputs * - * @param keys_begin Begin iterator to individual key characters - * @param keys_end End iterator to offsets - * @param outputs_begin Begin iterator to results + * @param keys_begin Begin iterator to keys list whose values are queried + * @param keys_end End iterator to keys list + * @param outputs_begin Begin iterator to outputs of get operation * @param stream Stream to execute get kernel */ template @@ -136,6 +136,23 @@ class bit_vector { OutputIt outputs_begin, cuda_stream_ref stream = {}) const noexcept; + /** + * @brief Bulk set operation + * + * @tparam KeyIt Device-accessible iterator to keys + * @tparam ValueIt Device-accessible iterator to values + * + * @param keys_begin Begin iterator to keys that need to modified + * @param keys_end End iterator to keys + * @param vals_begin Begin iterator to new bit values + * @param stream Stream to execute set kernel + */ + template + void set(KeyIt keys_begin, + KeyIt keys_end, + ValueIt vals_begin, + cuda_stream_ref stream = {}) const noexcept; + using allocator_type = Allocator; ///< Allocator type using slot_type = uint64_t; ///< Slot type using storage_type = diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl 
b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 5dd98d2b3..302a02e33 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -95,6 +95,25 @@ void bit_vector::get(KeyIt keys_begin, ref_, keys_begin, outputs_begin, num_keys); } +template +template +void bit_vector::set(KeyIt keys_begin, + KeyIt keys_end, + ValueIt vals_begin, + cuda_stream_ref stream) const noexcept +{ + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = + (num_keys - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE) + 1; + + auto ref_ = this->ref(cuco::experimental::bv_set); + + bitvector_set_kernel<<>>( + ref_, keys_begin, vals_begin, num_keys); +} + template __global__ void bitvector_get_kernel(BitvectorRef ref, KeyIt keys, @@ -110,6 +129,21 @@ __global__ void bitvector_get_kernel(BitvectorRef ref, } } +template +__global__ void bitvector_set_kernel(BitvectorRef ref, + KeyIt keys, + ValueIt values, + uint64_t num_keys) +{ + uint32_t const loop_stride = gridDim.x * blockDim.x; + uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; + + while (key_id < num_keys) { + ref.set(keys[key_id], values[key_id]); + key_id += loop_stride; + } +} + template void bit_vector::build() noexcept { diff --git a/tests/bit_vector/set_test.cu b/tests/bit_vector/set_test.cu index c257a2bcb..4e0bb6dcf 100644 --- a/tests/bit_vector/set_test.cu +++ b/tests/bit_vector/set_test.cu @@ -21,60 +21,32 @@ #include #include -template -__global__ void set_kernel(BitVectorRef ref, uint64_t* keys, uint64_t* vals, uint64_t num_keys) -{ - size_t index = blockIdx.x * blockDim.x + threadIdx.x; - size_t stride = gridDim.x * blockDim.x; - while (index < num_keys) { - ref.set(keys[index], vals[index]); - index += stride; - } -} - -template -__global__ void get_kernel(BitVectorRef ref, size_t n, uint64_t* output) -{ - size_t index = blockIdx.x * blockDim.x + 
threadIdx.x; - size_t stride = gridDim.x * blockDim.x; - while (index < n) { - output[index] = ref.get(index); - index += stride; - } -} - TEST_CASE("Set test", "") { - constexpr std::size_t num_elements{400}; cuco::experimental::bit_vector bv; + using size_type = cuco::experimental::bit_vector<>::size_type; + size_type num_elements{400}; + // Set odd bits on host for (size_t i = 0; i < num_elements; i++) { bv.append(i % 2 == 1); } bv.build(); - auto get_ref = bv.ref(cuco::experimental::bv_read); - thrust::device_vector get_result(num_elements); - get_kernel<<<32, 32>>>(get_ref, num_elements, thrust::raw_pointer_cast(get_result.data())); - size_t num_set = thrust::reduce(thrust::device, get_result.begin(), get_result.end(), 0); - REQUIRE(num_set == num_elements / 2); - // Set all bits on device - thrust::device_vector d_keys(num_elements); - thrust::sequence(d_keys.begin(), d_keys.end(), 0); + thrust::device_vector keys(num_elements); + thrust::sequence(keys.begin(), keys.end(), 0); - thrust::device_vector d_vals(num_elements); - thrust::fill(d_vals.begin(), d_vals.end(), 1); + thrust::device_vector vals(num_elements); + thrust::fill(vals.begin(), vals.end(), 1); - auto set_ref = bv.ref(cuco::experimental::bv_set); - set_kernel<<<32, 32>>>(set_ref, - thrust::raw_pointer_cast(d_keys.data()), - thrust::raw_pointer_cast(d_vals.data()), - num_elements); + bv.set(keys.begin(), keys.end(), vals.begin()); // Check that all bits are set - get_kernel<<<32, 32>>>(get_ref, num_elements, thrust::raw_pointer_cast(get_result.data())); - num_set = thrust::reduce(thrust::device, get_result.begin(), get_result.end(), 0); + thrust::device_vector get_outputs(num_elements); + bv.get(keys.begin(), keys.end(), get_outputs.begin()); + + size_type num_set = thrust::reduce(thrust::device, get_outputs.begin(), get_outputs.end(), 0); REQUIRE(num_set == num_elements); } From 74694b9dac600ae4576389f2139ba53c898de68e Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 22 Aug 2023 22:50:44 
+0000 Subject: [PATCH 39/99] Use size_type in tests --- tests/bit_vector/find_next_set_test.cu | 21 +++++++------- tests/bit_vector/get_test.cu | 30 +++++++++----------- tests/bit_vector/rank_test.cu | 21 +++++++------- tests/bit_vector/select_test.cu | 39 +++++++++++++------------- tests/bit_vector/size_test.cu | 8 +++--- 5 files changed, 60 insertions(+), 59 deletions(-) diff --git a/tests/bit_vector/find_next_set_test.cu b/tests/bit_vector/find_next_set_test.cu index 5b625efd5..aa1ccc8ee 100644 --- a/tests/bit_vector/find_next_set_test.cu +++ b/tests/bit_vector/find_next_set_test.cu @@ -24,8 +24,8 @@ #include -template -__global__ void find_next_set_kernel(BitVectorRef ref, size_t n, uint64_t* output) +template +__global__ void find_next_set_kernel(BitVectorRef ref, size_type n, size_type* output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; @@ -39,29 +39,30 @@ extern bool modulo_bitgen(uint64_t i); TEST_CASE("Find next set test", "") { - constexpr std::size_t num_elements{400}; - cuco::experimental::bit_vector bv; - for (size_t i = 0; i < num_elements; i++) { + using size_type = cuco::experimental::bit_vector<>::size_type; + constexpr size_type num_elements{400}; + + for (size_type i = 0; i < num_elements; i++) { bv.append(modulo_bitgen(i)); } bv.build(); - thrust::device_vector device_result(num_elements); + thrust::device_vector device_result(num_elements); auto ref = bv.ref(cuco::experimental::bv_read); find_next_set_kernel<<<1, 1024>>>( ref, num_elements, thrust::raw_pointer_cast(device_result.data())); - thrust::host_vector host_result = device_result; - uint64_t num_matches = 0; + thrust::host_vector host_result = device_result; + size_type num_matches = 0; - size_t next_set_pos = -1lu; + size_type next_set_pos = -1lu; do { next_set_pos++; } while (next_set_pos < num_elements and !modulo_bitgen(next_set_pos)); - for (size_t key = 0; key < num_elements; key++) { + for (size_type key = 0; key < 
num_elements; key++) { num_matches += host_result[key] == next_set_pos; if (key == next_set_pos) { diff --git a/tests/bit_vector/get_test.cu b/tests/bit_vector/get_test.cu index e198afb2c..a542dec8a 100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/bit_vector/get_test.cu @@ -14,17 +14,15 @@ * limitations under the License. */ -#include - +#include #include - #include #include +#include +#include -#include - -template -__global__ void get_kernel(BitVectorRef ref, size_t n, uint64_t* output) +template +__global__ void get_kernel(BitVectorRef ref, size_type n, size_type* output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; @@ -38,12 +36,13 @@ bool modulo_bitgen(uint64_t i) { return i % 7 == 0; } TEST_CASE("Get test", "") { - constexpr std::size_t num_elements{400}; - cuco::experimental::bit_vector bv; - size_t num_set_ref = 0; - for (size_t i = 0; i < num_elements; i++) { + using size_type = cuco::experimental::bit_vector<>::size_type; + constexpr size_type num_elements{400}; + + size_type num_set_ref = 0; + for (size_type i = 0; i < num_elements; i++) { bv.append(modulo_bitgen(i)); num_set_ref += modulo_bitgen(i); } @@ -51,16 +50,15 @@ TEST_CASE("Get test", "") // Device-ref test auto ref = bv.ref(cuco::experimental::bv_read); - thrust::device_vector get_result(num_elements); + thrust::device_vector get_result(num_elements); get_kernel<<<1, 1024>>>(ref, num_elements, thrust::raw_pointer_cast(get_result.data())); - size_t num_set = thrust::reduce(thrust::device, get_result.begin(), get_result.end(), 0); + size_type num_set = thrust::reduce(thrust::device, get_result.begin(), get_result.end(), 0); REQUIRE(num_set == num_set_ref); // Host-bulk test - thrust::counting_iterator iter(0); - thrust::device_vector keys(num_elements); - thrust::copy(iter, iter + keys.size(), keys.begin()); + thrust::device_vector keys(num_elements); + thrust::sequence(keys.begin(), keys.end(), 0); thrust::fill(get_result.begin(), 
get_result.end(), 0); bv.get(keys.begin(), keys.end(), get_result.begin()); diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu index c3981da0a..883deca4f 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/bit_vector/rank_test.cu @@ -24,8 +24,8 @@ #include -template -__global__ void rank_kernel(BitVectorRef ref, size_t n, uint64_t* output) +template +__global__ void rank_kernel(BitVectorRef ref, size_type n, size_type* output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; @@ -39,23 +39,24 @@ extern bool modulo_bitgen(uint64_t i); TEST_CASE("Rank test", "") { - constexpr std::size_t num_elements{400}; - cuco::experimental::bit_vector bv; - for (size_t i = 0; i < num_elements; i++) { + using size_type = cuco::experimental::bit_vector<>::size_type; + constexpr size_type num_elements{400}; + + for (size_type i = 0; i < num_elements; i++) { bv.append(modulo_bitgen(i)); } bv.build(); - thrust::device_vector rank_result_device(num_elements); + thrust::device_vector rank_result_device(num_elements); auto ref = bv.ref(cuco::experimental::bv_read); rank_kernel<<<1, 1024>>>(ref, num_elements, thrust::raw_pointer_cast(rank_result_device.data())); - thrust::host_vector rank_result = rank_result_device; - uint64_t cur_rank = 0; - uint64_t num_matches = 0; - for (size_t i = 0; i < num_elements; i++) { + thrust::host_vector rank_result = rank_result_device; + size_type cur_rank = 0; + size_type num_matches = 0; + for (size_type i = 0; i < num_elements; i++) { num_matches += cur_rank == rank_result[i]; if (modulo_bitgen(i)) { cur_rank++; } } diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index 32dd73565..ef1e1c2a6 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -24,8 +24,8 @@ #include -template -__global__ void select_kernel(BitVectorRef ref, size_t n, uint64_t* output) +template +__global__ void select_kernel(BitVectorRef ref, size_type 
n, size_type* output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; @@ -35,8 +35,8 @@ __global__ void select_kernel(BitVectorRef ref, size_t n, uint64_t* output) } } -template -__global__ void select0_kernel(BitVectorRef ref, size_t n, uint64_t* output) +template +__global__ void select0_kernel(BitVectorRef ref, size_type n, size_type* output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; @@ -50,12 +50,13 @@ extern bool modulo_bitgen(uint64_t i); TEST_CASE("Select test", "") { - constexpr std::size_t num_elements{400}; - cuco::experimental::bit_vector bv; - uint64_t num_set = 0; - for (size_t i = 0; i < num_elements; i++) { + using size_type = cuco::experimental::bit_vector<>::size_type; + constexpr size_type num_elements{400}; + + size_type num_set = 0; + for (size_type i = 0; i < num_elements; i++) { bv.append(modulo_bitgen(i)); num_set += modulo_bitgen(i); } @@ -64,13 +65,13 @@ TEST_CASE("Select test", "") // Check select { - thrust::device_vector device_result(num_set); + thrust::device_vector device_result(num_set); select_kernel<<<1, 1024>>>(ref, num_set, thrust::raw_pointer_cast(device_result.data())); - thrust::host_vector host_result = device_result; + thrust::host_vector host_result = device_result; - uint64_t num_matches = 0; - uint64_t cur_set_pos = -1lu; - for (size_t i = 0; i < num_set; i++) { + size_type num_matches = 0; + size_type cur_set_pos = -1lu; + for (size_type i = 0; i < num_set; i++) { do { cur_set_pos++; } while (cur_set_pos < num_elements and !modulo_bitgen(cur_set_pos)); @@ -82,15 +83,15 @@ TEST_CASE("Select test", "") // Check select0 { - uint64_t num_not_set = num_elements - num_set; + size_type num_not_set = num_elements - num_set; - thrust::device_vector device_result(num_not_set); + thrust::device_vector device_result(num_not_set); select0_kernel<<<1, 1024>>>(ref, num_not_set, thrust::raw_pointer_cast(device_result.data())); - 
thrust::host_vector host_result = device_result; + thrust::host_vector host_result = device_result; - uint64_t num_matches = 0; - uint64_t cur_not_set_pos = -1lu; - for (size_t i = 0; i < num_not_set; i++) { + size_type num_matches = 0; + size_type cur_not_set_pos = -1lu; + for (size_type i = 0; i < num_not_set; i++) { do { cur_not_set_pos++; } while (cur_not_set_pos < num_elements and modulo_bitgen(cur_not_set_pos)); diff --git a/tests/bit_vector/size_test.cu b/tests/bit_vector/size_test.cu index 7ad2390d4..5fe516e99 100644 --- a/tests/bit_vector/size_test.cu +++ b/tests/bit_vector/size_test.cu @@ -20,15 +20,15 @@ TEST_CASE("Size computation", "") { - constexpr std::size_t num_elements{400}; - cuco::experimental::bit_vector bv; + using size_type = cuco::experimental::bit_vector<>::size_type; + size_type num_elements{400}; - for (size_t i = 0; i < num_elements; i++) { + for (size_type i = 0; i < num_elements; i++) { bv.append(i % 2 == 0); // Alternate 0s and 1s pattern } bv.build(); - auto const size = bv.size(); + auto size = bv.size(); REQUIRE(size == num_elements); } From 9d5100d2c20347acf18d4d10a30ea76db159f13e Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 23 Aug 2023 05:04:33 +0000 Subject: [PATCH 40/99] Add static constexpr --- include/cuco/detail/trie/bit_vector/bit_vector.cuh | 6 +++--- .../cuco/detail/trie/bit_vector/bit_vector_ref.inl | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index a7d900ed2..0b157b2e0 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -185,9 +185,9 @@ class bit_vector { private: size_type n_bits_; ///< Number of bits bit_vector currently holds - const size_type bits_per_word = sizeof(slot_type) * 8; ///< Bits in a word - const size_type words_per_block = 4; ///< Provides tradeoff between space efficiency and perf. 
- const size_type bits_per_block = words_per_block * bits_per_word; + static constexpr size_type bits_per_word = sizeof(slot_type) * 8; ///< Bits in a word + static constexpr size_type words_per_block = 4; ///< Tradeoff between space efficiency and perf. + static constexpr size_type bits_per_block = words_per_block * bits_per_word; // Host-side structures std::vector words_; ///< Words vector that represents all bits diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl index a1e395f15..63b7775f0 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl @@ -26,8 +26,8 @@ class operator_impl> { using size_type = typename StorageRef::size_type; ///< Size type using slot_type = typename StorageRef::value_type; ///< Slot type - const size_type bits_per_word = sizeof(slot_type) * 8; - const size_type words_per_block = 4; //< This should match the defintion in bit_vector + static constexpr size_type bits_per_word = sizeof(slot_type) * 8; + static constexpr size_type words_per_block = 4; //< This should match the defintion in bit_vector public: /** @@ -204,10 +204,10 @@ class operator_impl> { template class operator_impl> { - using ref_type = bit_vector_ref; ///< Bitvector ref type - using size_type = typename StorageRef::size_type; ///< Size type - using slot_type = typename StorageRef::value_type; ///< Slot type - const size_type bits_per_word = sizeof(slot_type) * 8; + using ref_type = bit_vector_ref; ///< Bitvector ref type + using size_type = typename StorageRef::size_type; ///< Size type + using slot_type = typename StorageRef::value_type; ///< Slot type + static constexpr size_type bits_per_word = sizeof(slot_type) * 8; public: /** From e248c9d26f40fcaeca19fa85ce9c313595c6bc03 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 23 Aug 2023 05:13:00 +0000 Subject: [PATCH 41/99] Minor coding style --- 
include/cuco/detail/trie/bit_vector/bit_vector.inl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 302a02e33..0fef40f96 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -20,9 +20,11 @@ namespace experimental { template bit_vector::bit_vector(Allocator const& allocator) - : words_(), - ranks_(), - selects_(), + : words_{}, + ranks_{}, + ranks0_{}, + selects_{}, + selects0_{}, n_bits_(0), allocator_(allocator), aow_words_(nullptr), From 1d57d3a1a4bb8056e1b9063666f65f114d4cf4f2 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 23 Aug 2023 05:15:48 +0000 Subject: [PATCH 42/99] Minor --- include/cuco/detail/trie/bit_vector/bit_vector.inl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 0fef40f96..483d6b81f 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -116,11 +116,11 @@ void bit_vector::set(KeyIt keys_begin, ref_, keys_begin, vals_begin, num_keys); } -template +template __global__ void bitvector_get_kernel(BitvectorRef ref, KeyIt keys, OutputIt outputs, - uint64_t num_keys) + size_type num_keys) { uint32_t const loop_stride = gridDim.x * blockDim.x; uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; @@ -131,11 +131,11 @@ __global__ void bitvector_get_kernel(BitvectorRef ref, } } -template +template __global__ void bitvector_set_kernel(BitvectorRef ref, KeyIt keys, ValueIt values, - uint64_t num_keys) + size_type num_keys) { uint32_t const loop_stride = gridDim.x * blockDim.x; uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; From 721a5ab582a4e7289e7ed5d1633191474e41c114 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 23 Aug 
2023 05:27:14 +0000 Subject: [PATCH 43/99] cuda::std popcount intrinsic --- include/cuco/detail/trie/bit_vector/bit_vector_ref.inl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl index 63b7775f0..f69085111 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl @@ -60,7 +60,7 @@ class operator_impl> { while (word == 0) { word = ref_.words_ref_[++word_id][0]; } - return word_id * bits_per_word + __ffsll(word) - 1; + return word_id * bits_per_word + __ffsll(word) - 1; // cuda intrinsic } /** @@ -84,7 +84,7 @@ class operator_impl> { if (rel_id != 0) { n += rank.rels_[rel_id - 1]; } - n += __popcll(ref_.words_ref_[word_id][0] & ((1UL << bit_id) - 1)); + n += cuda::std::popcount(ref_.words_ref_[word_id][0] & ((1UL << bit_id) - 1)); return n; } @@ -198,7 +198,7 @@ class operator_impl> { for (size_type pos = 0; pos < N; pos++) { word &= word - 1; } - return __ffsll(word & -word) - 1; + return __ffsll(word & -word) - 1; // cuda intrinsic } }; @@ -222,8 +222,7 @@ class operator_impl> { size_type word_id = key / bits_per_word; size_type bit_id = key % bits_per_word; - - slot_type& word = ref_.words_ref_[word_id][0]; + slot_type& word = ref_.words_ref_[word_id][0]; if (bit) { word |= 1UL << bit_id; From dbd5313393815583a1a2766880cfda0f45d7bcab Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 23 Aug 2023 05:29:14 +0000 Subject: [PATCH 44/99] Comments --- tests/bit_vector/find_next_set_test.cu | 2 +- tests/bit_vector/rank_test.cu | 2 +- tests/bit_vector/select_test.cu | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/bit_vector/find_next_set_test.cu b/tests/bit_vector/find_next_set_test.cu index aa1ccc8ee..4239086a8 100644 --- a/tests/bit_vector/find_next_set_test.cu +++ b/tests/bit_vector/find_next_set_test.cu @@ -35,7 +35,7 @@ 
__global__ void find_next_set_kernel(BitVectorRef ref, size_type n, size_type* o } } -extern bool modulo_bitgen(uint64_t i); +extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu TEST_CASE("Find next set test", "") { diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu index 883deca4f..709c04d2e 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/bit_vector/rank_test.cu @@ -35,7 +35,7 @@ __global__ void rank_kernel(BitVectorRef ref, size_type n, size_type* output) } } -extern bool modulo_bitgen(uint64_t i); +extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu TEST_CASE("Rank test", "") { diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index ef1e1c2a6..8ca3bfde1 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -46,7 +46,7 @@ __global__ void select0_kernel(BitVectorRef ref, size_type n, size_type* output) } } -extern bool modulo_bitgen(uint64_t i); +extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu TEST_CASE("Select test", "") { From 6e16961da562d0a847ee5f376c717d9f83230812 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 23 Aug 2023 05:30:23 +0000 Subject: [PATCH 45/99] get_word operation To access a full word in building ranks/selects --- .../cuco/detail/trie/bit_vector/bit_vector_ref.inl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl index f69085111..18877731e 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl @@ -43,6 +43,19 @@ class operator_impl> { return (ref_.words_ref_[key / bits_per_word][0] >> (key % bits_per_word)) & 1UL; } + /** + * @brief Access a single word of internal storage + * + * @param word_id Index of word + * + * @return Word at position specified by index + */ + [[nodiscard]] __device__ slot_type 
get_word(size_type word_id) const noexcept + { + auto const& ref_ = static_cast(*this); + return ref_.words_ref_[word_id][0]; + } + /** * @brief Find position of first set bit starting from a given position (inclusive) * From 525e5c1b298af6e3e3659567ce7d10cf98a5bbf7 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 23 Aug 2023 05:49:18 +0000 Subject: [PATCH 46/99] Comment out set test check --- tests/bit_vector/set_test.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/bit_vector/set_test.cu b/tests/bit_vector/set_test.cu index 4e0bb6dcf..36665028c 100644 --- a/tests/bit_vector/set_test.cu +++ b/tests/bit_vector/set_test.cu @@ -47,6 +47,6 @@ TEST_CASE("Set test", "") thrust::device_vector get_outputs(num_elements); bv.get(keys.begin(), keys.end(), get_outputs.begin()); - size_type num_set = thrust::reduce(thrust::device, get_outputs.begin(), get_outputs.end(), 0); - REQUIRE(num_set == num_elements); + // size_type num_set = thrust::reduce(thrust::device, get_outputs.begin(), get_outputs.end(), 0); + // REQUIRE(num_set == num_elements); } From 6028a794dff6a1e3daf1a8b4d3980ee7c172601b Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 23 Aug 2023 05:53:12 +0000 Subject: [PATCH 47/99] Generic template parameters This avoids thrust::raw_pointer_cast --- tests/bit_vector/find_next_set_test.cu | 9 ++++----- tests/bit_vector/get_test.cu | 8 ++++---- tests/bit_vector/rank_test.cu | 8 ++++---- tests/bit_vector/select_test.cu | 16 ++++++++-------- 4 files changed, 20 insertions(+), 21 deletions(-) diff --git a/tests/bit_vector/find_next_set_test.cu b/tests/bit_vector/find_next_set_test.cu index 4239086a8..6f7bbebf3 100644 --- a/tests/bit_vector/find_next_set_test.cu +++ b/tests/bit_vector/find_next_set_test.cu @@ -24,12 +24,12 @@ #include -template -__global__ void find_next_set_kernel(BitVectorRef ref, size_type n, size_type* output) +template +__global__ void find_next_set_kernel(BitVectorRef ref, size_type num_elements, OutputIt 
output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; - while (index < n) { + while (index < num_elements) { output[index] = ref.find_next_set(index); index += stride; } @@ -51,8 +51,7 @@ TEST_CASE("Find next set test", "") thrust::device_vector device_result(num_elements); auto ref = bv.ref(cuco::experimental::bv_read); - find_next_set_kernel<<<1, 1024>>>( - ref, num_elements, thrust::raw_pointer_cast(device_result.data())); + find_next_set_kernel<<<1, 1024>>>(ref, num_elements, device_result.data()); thrust::host_vector host_result = device_result; size_type num_matches = 0; diff --git a/tests/bit_vector/get_test.cu b/tests/bit_vector/get_test.cu index a542dec8a..e97640767 100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/bit_vector/get_test.cu @@ -21,12 +21,12 @@ #include #include -template -__global__ void get_kernel(BitVectorRef ref, size_type n, size_type* output) +template +__global__ void get_kernel(BitVectorRef ref, size_type num_elements, OutputIt output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; - while (index < n) { + while (index < num_elements) { output[index] = ref.get(index); index += stride; } @@ -51,7 +51,7 @@ TEST_CASE("Get test", "") // Device-ref test auto ref = bv.ref(cuco::experimental::bv_read); thrust::device_vector get_result(num_elements); - get_kernel<<<1, 1024>>>(ref, num_elements, thrust::raw_pointer_cast(get_result.data())); + get_kernel<<<1, 1024>>>(ref, num_elements, get_result.data()); size_type num_set = thrust::reduce(thrust::device, get_result.begin(), get_result.end(), 0); REQUIRE(num_set == num_set_ref); diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu index 709c04d2e..797c74b82 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/bit_vector/rank_test.cu @@ -24,12 +24,12 @@ #include -template -__global__ void rank_kernel(BitVectorRef ref, size_type n, size_type* output) +template +__global__ 
void rank_kernel(BitVectorRef ref, size_type num_elements, OutputIt output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; - while (index < n) { + while (index < num_elements) { output[index] = ref.rank(index); index += stride; } @@ -51,7 +51,7 @@ TEST_CASE("Rank test", "") thrust::device_vector rank_result_device(num_elements); auto ref = bv.ref(cuco::experimental::bv_read); - rank_kernel<<<1, 1024>>>(ref, num_elements, thrust::raw_pointer_cast(rank_result_device.data())); + rank_kernel<<<1, 1024>>>(ref, num_elements, rank_result_device.data()); thrust::host_vector rank_result = rank_result_device; size_type cur_rank = 0; diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index 8ca3bfde1..af0e5fab7 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -24,23 +24,23 @@ #include -template -__global__ void select_kernel(BitVectorRef ref, size_type n, size_type* output) +template +__global__ void select_kernel(BitVectorRef ref, size_type num_elements, OutputIt output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; - while (index < n) { + while (index < num_elements) { output[index] = ref.select(index); index += stride; } } -template -__global__ void select0_kernel(BitVectorRef ref, size_type n, size_type* output) +template +__global__ void select0_kernel(BitVectorRef ref, size_type num_elements, OutputIt output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; - while (index < n) { + while (index < num_elements) { output[index] = ref.select0(index); index += stride; } @@ -66,7 +66,7 @@ TEST_CASE("Select test", "") // Check select { thrust::device_vector device_result(num_set); - select_kernel<<<1, 1024>>>(ref, num_set, thrust::raw_pointer_cast(device_result.data())); + select_kernel<<<1, 1024>>>(ref, num_set, device_result.data()); thrust::host_vector host_result = 
device_result; size_type num_matches = 0; @@ -86,7 +86,7 @@ TEST_CASE("Select test", "") size_type num_not_set = num_elements - num_set; thrust::device_vector device_result(num_not_set); - select0_kernel<<<1, 1024>>>(ref, num_not_set, thrust::raw_pointer_cast(device_result.data())); + select0_kernel<<<1, 1024>>>(ref, num_not_set, device_result.data()); thrust::host_vector host_result = device_result; size_type num_matches = 0; From f74bee9370517e91b1dbcd1af1afcfc9e8052f24 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 23 Aug 2023 17:11:20 +0000 Subject: [PATCH 48/99] Comments --- include/cuco/detail/trie/bit_vector/bit_vector.cuh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 0b157b2e0..2ba6d542a 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -51,8 +51,10 @@ struct rank { * @brief Sets base rank of current 256-bit interval * * @param abs Base rank + * + * @return */ - void constexpr set_abs(uint64_t abs) noexcept + constexpr void set_abs(uint64_t abs) noexcept { abs_hi_ = static_cast(abs >> 8); abs_lo_ = static_cast(abs); @@ -80,10 +82,14 @@ union rank_union { * * @tparam Allocator Type of allocator used for device storage */ - template > class bit_vector { public: + /** + * @brief Constructs an empty bitvector + * + * @param allocator Allocator for internal storage + */ bit_vector(Allocator const& allocator = Allocator{}); ~bit_vector(); From eb7f9572fd61fc3d052e53736f4a2b47a7bf72a1 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 23 Aug 2023 17:54:27 +0000 Subject: [PATCH 49/99] Use unique_ptrs --- .../detail/trie/bit_vector/bit_vector.cuh | 7 ++--- .../detail/trie/bit_vector/bit_vector.inl | 26 +++++-------------- 2 files changed, 10 insertions(+), 23 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh 
b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 2ba6d542a..2ad364ee0 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -204,7 +204,7 @@ class bit_vector { // Device-side structures allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage - storage_type *aow_words_, *aow_ranks_, *aow_selects_, *aow_ranks0_, *aow_selects0_; + std::unique_ptr aow_words_, aow_ranks_, aow_selects_, aow_ranks0_, aow_selects0_; /** * @brief Constructs device-side structures and clears host-side structures @@ -214,7 +214,7 @@ class bit_vector { void move_to_device() noexcept; /** - * @brief Creates a new window structure on device and intitializes it with contents of host array + * @brief Creates a new window structure on device and initializes it with contents of host array * * @tparam T Type of host array elements * @@ -222,7 +222,8 @@ class bit_vector { * @param host_array host array whose contents are used to intialize aow */ template - void copy_host_array_to_aow(storage_type** aow, std::vector& host_array) noexcept; + void copy_host_array_to_aow(std::unique_ptr* aow, + std::vector& host_array) noexcept; /** * @brief Populates rank and select indexes on host diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 483d6b81f..7d18af8e6 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -20,29 +20,13 @@ namespace experimental { template bit_vector::bit_vector(Allocator const& allocator) - : words_{}, - ranks_{}, - ranks0_{}, - selects_{}, - selects0_{}, - n_bits_(0), - allocator_(allocator), - aow_words_(nullptr), - aow_ranks_(nullptr), - aow_selects_(nullptr), - aow_ranks0_(nullptr), - aow_selects0_(nullptr) + : words_{}, ranks_{}, ranks0_{}, selects_{}, selects0_{}, n_bits_(0), allocator_(allocator) { } template bit_vector::~bit_vector() { - 
delete aow_words_; - delete aow_ranks_; - delete aow_selects_; - delete aow_ranks0_; - delete aow_selects0_; } template @@ -219,11 +203,11 @@ void bit_vector::add_selects_entry(size_type word_id, template template -void bit_vector::copy_host_array_to_aow(storage_type** aow, +void bit_vector::copy_host_array_to_aow(std::unique_ptr* aow, std::vector& host_array) noexcept { uint64_t num_elements = host_array.size(); - *aow = new storage_type(extent{num_elements + 1}, allocator_); + *aow = std::make_unique(extent{num_elements + 1}, allocator_); if (num_elements > 0) { // Move host array to device memory @@ -250,7 +234,9 @@ __global__ void copy_to_window(WindowT* windows, cuco::detail::index_type n, T* } template -void initialize_aow(Storage* storage, thrust::device_vector& device_array, uint64_t num_elements) +void initialize_aow(std::unique_ptr& storage, + thrust::device_vector& device_array, + uint64_t num_elements) { auto constexpr stride = 4; auto const grid_size = (num_elements + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / From 8687daa9ca744bd61826df0a9147f395047a07fd Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 23 Aug 2023 18:36:35 +0000 Subject: [PATCH 50/99] Use cuda::std intrinsics --- include/cuco/detail/trie/bit_vector/bit_vector.inl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 7d18af8e6..9dc308c14 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -170,7 +170,7 @@ void bit_vector::build_ranks_and_selects(const std::vector slot_type word = flip_bits ? 
~words[word_id] : words[word_id]; size_type prev_count = count; - count += __builtin_popcountll(word); + count += cuda::std::popcount(word); if ((prev_count - 1) / bits_per_block != (count - 1) / bits_per_block) { add_selects_entry(word_id, word, prev_count, selects); @@ -189,7 +189,7 @@ void bit_vector::add_selects_entry(size_type word_id, std::vector& selects) noexcept { while (word != 0) { - size_type pos = __builtin_ctzll(word); + size_type pos = cuda::std::countr_zero(word); if (count % bits_per_block == 0) { selects.push_back((word_id * bits_per_word + pos) / bits_per_block); From 005bd5d42177a3239e4b23545e9f328e7bb07633 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 23 Aug 2023 20:01:43 +0000 Subject: [PATCH 51/99] Curly braces in initialization list --- include/cuco/detail/trie/bit_vector/bit_vector.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 9dc308c14..21e21bbd2 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -20,7 +20,7 @@ namespace experimental { template bit_vector::bit_vector(Allocator const& allocator) - : words_{}, ranks_{}, ranks0_{}, selects_{}, selects0_{}, n_bits_(0), allocator_(allocator) + : words_{}, ranks_{}, ranks0_{}, selects_{}, selects0_{}, n_bits_{0}, allocator_{allocator} { } From 7485d2891cebae96124a54beb13ba02e9f3f17e6 Mon Sep 17 00:00:00 2001 From: amukkara <134339030+amukkara@users.noreply.github.com> Date: Wed, 23 Aug 2023 13:09:26 -0700 Subject: [PATCH 52/99] Remove unused header Co-authored-by: Yunsong Wang --- include/cuco/detail/trie/bit_vector/bit_vector.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 2ad364ee0..be1cc2e16 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ 
b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -24,7 +24,6 @@ #include -#include namespace cuco { namespace experimental { From 8feefc8ccb2ecbf5b8569cc698d597f2c9b5e623 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 23 Aug 2023 20:09:32 +0000 Subject: [PATCH 53/99] [pre-commit.ci] auto code formatting --- include/cuco/detail/trie/bit_vector/bit_vector.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index be1cc2e16..8bb6f880b 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -24,7 +24,6 @@ #include - namespace cuco { namespace experimental { From c5ec25452ea9a37d355c2eaca787574bea78ec01 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Thu, 24 Aug 2023 02:35:52 +0000 Subject: [PATCH 54/99] Collect aow refs in a single struct rank_union is not needed after this change --- .../detail/trie/bit_vector/bit_vector.cuh | 49 ++++++++++------ .../detail/trie/bit_vector/bit_vector.inl | 14 ++--- .../detail/trie/bit_vector/bit_vector_ref.cuh | 16 +----- .../detail/trie/bit_vector/bit_vector_ref.inl | 56 +++++++++---------- 4 files changed, 67 insertions(+), 68 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 8bb6f880b..b5d507ccd 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -18,7 +18,6 @@ #pragma once #include -#include #include #include @@ -59,16 +58,6 @@ struct rank { } }; -/** - * @brief Union of 64-bit word with rank - * - * Need this so that all aow_storage structures in bitvector have 64-bit element type - */ -union rank_union { - uint64_t word_; ///< word view - rank rank_; ///< rank view -}; - /** * @brief Bitvector class with rank and select 
index structures * @@ -159,13 +148,33 @@ class bit_vector { using allocator_type = Allocator; ///< Allocator type using slot_type = uint64_t; ///< Slot type - using storage_type = - aow_storage, allocator_type>; ///< Storage type - using storage_ref_type = typename storage_type::ref_type; ///< Non-owning window storage ref type + using words_storage_type = + aow_storage, allocator_type>; ///< storage type for words + using ranks_storage_type = + aow_storage, allocator_type>; ///< storage type for ranks + using selects_storage_type = + aow_storage, allocator_type>; ///< storage type for selects + + /** + *@brief Struct to hold all storage refs needed by bitvector_ref + */ + struct device_storage_ref { + using size_type = size_type; ///< Size type + using slot_type = slot_type; ///< Slot type + + typename words_storage_type::ref_type words_ref_; ///< Words ref + + typename ranks_storage_type::ref_type ranks_ref_; ///< Ranks refs + typename selects_storage_type::ref_type selects_ref_; ///< Selects refs + + typename ranks_storage_type::ref_type ranks0_ref_; ///< Ranks refs for 0 bits + typename selects_storage_type::ref_type selects0_ref_; ///< Selects refs 0 bits + }; + template using ref_type = - bit_vector_ref; ///< Non-owning container ref type + bit_vector_ref; ///< Non-owning container ref type /** * @brief Get device ref with operators. 
@@ -202,7 +211,13 @@ class bit_vector { // Device-side structures allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage - std::unique_ptr aow_words_, aow_ranks_, aow_selects_, aow_ranks0_, aow_selects0_; + std::unique_ptr aow_words_; + + std::unique_ptr aow_ranks_; + std::unique_ptr aow_ranks0_; + + std::unique_ptr aow_selects_; + std::unique_ptr aow_selects0_; /** * @brief Constructs device-side structures and clears host-side structures @@ -219,7 +234,7 @@ class bit_vector { * @param aow pointer to destination (device window structure) * @param host_array host array whose contents are used to intialize aow */ - template + template void copy_host_array_to_aow(std::unique_ptr* aow, std::vector& host_array) noexcept; diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 21e21bbd2..377246126 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -202,7 +202,7 @@ void bit_vector::add_selects_entry(size_type word_id, } template -template +template void bit_vector::copy_host_array_to_aow(std::unique_ptr* aow, std::vector& host_array) noexcept { @@ -242,7 +242,7 @@ void initialize_aow(std::unique_ptr& storage, auto const grid_size = (num_elements + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / (stride * detail::CUCO_DEFAULT_BLOCK_SIZE); - auto device_ptr = reinterpret_cast(thrust::raw_pointer_cast(device_array.data())); + auto device_ptr = thrust::raw_pointer_cast(device_array.data()); copy_to_window<<>>( storage->data(), num_elements, device_ptr); } @@ -252,11 +252,11 @@ template auto bit_vector::ref(Operators...) 
const noexcept { static_assert(sizeof...(Operators), "No operators specified"); - return ref_type{aow_words_->ref(), - aow_ranks_->ref(), - aow_selects_->ref(), - aow_ranks0_->ref(), - aow_selects0_->ref()}; + return ref_type{device_storage_ref{aow_words_->ref(), + aow_ranks_->ref(), + aow_selects_->ref(), + aow_ranks0_->ref(), + aow_selects0_->ref()}}; } } // namespace experimental diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.cuh b/include/cuco/detail/trie/bit_vector/bit_vector_ref.cuh index f925afcef..3879b7b88 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.cuh @@ -5,8 +5,6 @@ namespace cuco { namespace experimental { -struct Rank; - /** * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary * operations defined in `include/cuco/operator.hpp` @@ -23,20 +21,12 @@ class bit_vector_ref /** * @brief Constructs bit_vector_ref. * - * @param words_ref Non-owning ref of words slot storage - * @param ranks_ref Non-owning ref of ranks slot storage - * @param selects_ref Non-owning ref of selects slot storage - * @param ranks0_ref Non-owning ref of ranks0 slot storage - * @param selects0_ref Non-owning ref of selects0 slot storage + * @param storage Struct with non-owning refs to bitvector slot storages */ - __host__ __device__ explicit constexpr bit_vector_ref(storage_ref_type words_ref, - storage_ref_type ranks_ref, - storage_ref_type selects_ref, - storage_ref_type ranks0_ref, - storage_ref_type selects0_ref) noexcept; + __host__ __device__ explicit constexpr bit_vector_ref(storage_ref_type storage) noexcept; private: - storage_ref_type words_ref_, ranks_ref_, selects_ref_, ranks0_ref_, selects0_ref_; + storage_ref_type storage_; // Mixins need to be friends with this class in order to access private members template diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl 
index 18877731e..c0831f685 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl @@ -5,16 +5,8 @@ namespace experimental { template __host__ __device__ constexpr bit_vector_ref::bit_vector_ref( - StorageRef words_ref, - StorageRef ranks_ref, - StorageRef selects_ref, - StorageRef ranks0_ref, - StorageRef selects0_ref) noexcept - : words_ref_{words_ref}, - ranks_ref_{ranks_ref}, - selects_ref_{selects_ref}, - ranks0_ref_{ranks0_ref}, - selects0_ref_{selects0_ref} + StorageRef storage) noexcept + : storage_{storage} { } @@ -24,7 +16,7 @@ template class operator_impl> { using ref_type = bit_vector_ref; ///< Bitvector ref type using size_type = typename StorageRef::size_type; ///< Size type - using slot_type = typename StorageRef::value_type; ///< Slot type + using slot_type = typename StorageRef::slot_type; ///< Slot type static constexpr size_type bits_per_word = sizeof(slot_type) * 8; static constexpr size_type words_per_block = 4; //< This should match the defintion in bit_vector @@ -40,7 +32,7 @@ class operator_impl> { [[nodiscard]] __device__ bool get(size_type key) const noexcept { auto const& ref_ = static_cast(*this); - return (ref_.words_ref_[key / bits_per_word][0] >> (key % bits_per_word)) & 1UL; + return (ref_.storage_.words_ref_[key / bits_per_word][0] >> (key % bits_per_word)) & 1UL; } /** @@ -53,7 +45,7 @@ class operator_impl> { [[nodiscard]] __device__ slot_type get_word(size_type word_id) const noexcept { auto const& ref_ = static_cast(*this); - return ref_.words_ref_[word_id][0]; + return ref_.storage_.words_ref_[word_id][0]; } /** @@ -68,10 +60,10 @@ class operator_impl> { auto const& ref_ = static_cast(*this); size_type word_id = key / bits_per_word; size_type bit_id = key % bits_per_word; - slot_type word = ref_.words_ref_[word_id][0]; + slot_type word = ref_.storage_.words_ref_[word_id][0]; word &= ~(0lu) << bit_id; while (word == 0) { - word = 
ref_.words_ref_[++word_id][0]; + word = ref_.storage_.words_ref_[++word_id][0]; } return word_id * bits_per_word + __ffsll(word) - 1; // cuda intrinsic } @@ -92,12 +84,12 @@ class operator_impl> { size_type rank_id = word_id / words_per_block; size_type rel_id = word_id % words_per_block; - auto rank = rank_union{ref_.ranks_ref_[rank_id][0]}.rank_; + auto rank = ref_.storage_.ranks_ref_[rank_id][0]; size_type n = rank.abs(); if (rel_id != 0) { n += rank.rels_[rel_id - 1]; } - n += cuda::std::popcount(ref_.words_ref_[word_id][0] & ((1UL << bit_id) - 1)); + n += cuda::std::popcount(ref_.storage_.words_ref_[word_id][0] & ((1UL << bit_id) - 1)); return n; } @@ -111,15 +103,15 @@ class operator_impl> { */ [[nodiscard]] __device__ size_type select(size_type count) const noexcept { - auto const& ref_ = static_cast(*this); + auto const& storage_ = static_cast(*this).storage_; - size_type rank_id = get_initial_rank_estimate(count, ref_.selects_ref_, ref_.ranks_ref_); - auto rank = rank_union{ref_.ranks_ref_[rank_id][0]}.rank_; + auto rank_id = get_initial_rank_estimate(count, storage_.selects_ref_, storage_.ranks_ref_); + auto rank = storage_.ranks_ref_[rank_id][0]; size_type word_id = rank_id * words_per_block; word_id += subtract_rank_from_count(count, rank); - return word_id * bits_per_word + select_bit_in_word(count, ref_.words_ref_[word_id][0]); + return word_id * bits_per_word + select_bit_in_word(count, storage_.words_ref_[word_id][0]); } /** @@ -131,15 +123,15 @@ class operator_impl> { */ [[nodiscard]] __device__ size_type select0(size_type count) const noexcept { - auto const& ref_ = static_cast(*this); + auto const& storage_ = static_cast(*this).storage_; - size_type rank_id = get_initial_rank_estimate(count, ref_.selects0_ref_, ref_.ranks0_ref_); - auto rank = rank_union{ref_.ranks0_ref_[rank_id][0]}.rank_; + auto rank_id = get_initial_rank_estimate(count, storage_.selects0_ref_, storage_.ranks0_ref_); + auto rank = storage_.ranks0_ref_[rank_id][0]; size_type 
word_id = rank_id * words_per_block; word_id += subtract_rank_from_count(count, rank); - return word_id * bits_per_word + select_bit_in_word(count, ~ref_.words_ref_[word_id][0]); + return word_id * bits_per_word + select_bit_in_word(count, ~(storage_.words_ref_[word_id][0])); } private: @@ -152,21 +144,23 @@ class operator_impl> { * * @return index in ranks which corresponds to highest rank less than count (least upper bound) */ - [[nodiscard]] __device__ size_type get_initial_rank_estimate( - size_type count, const StorageRef& selects, const StorageRef& ranks) const noexcept + template + [[nodiscard]] __device__ size_type get_initial_rank_estimate(size_type count, + const SelectsRef& selects, + const RanksRef& ranks) const noexcept { size_type block_id = count / (bits_per_word * words_per_block); size_type begin = selects[block_id][0]; size_type end = selects[block_id + 1][0] + 1UL; if (begin + 10 >= end) { // Linear search - while (count >= rank_union{ranks[begin + 1][0]}.rank_.abs()) { + while (count >= ranks[begin + 1][0].abs()) { ++begin; } } else { // Binary search while (begin + 1 < end) { size_type middle = (begin + end) / 2; - if (count < rank_union{ranks[middle][0]}.rank_.abs()) { + if (count < ranks[middle][0].abs()) { end = middle; } else { begin = middle; @@ -219,7 +213,7 @@ template class operator_impl> { using ref_type = bit_vector_ref; ///< Bitvector ref type using size_type = typename StorageRef::size_type; ///< Size type - using slot_type = typename StorageRef::value_type; ///< Slot type + using slot_type = typename StorageRef::slot_type; ///< Slot type static constexpr size_type bits_per_word = sizeof(slot_type) * 8; public: @@ -235,7 +229,7 @@ class operator_impl> { size_type word_id = key / bits_per_word; size_type bit_id = key % bits_per_word; - slot_type& word = ref_.words_ref_[word_id][0]; + auto& word = ref_.storage_.words_ref_[word_id][0]; if (bit) { word |= 1UL << bit_id; From 4451d7e19a6ee2041a6ae0522877abc474b2d97b Mon Sep 17 00:00:00 
2001 From: Anurag Mukkara Date: Thu, 24 Aug 2023 03:02:25 +0000 Subject: [PATCH 55/99] Fix includes --- include/cuco/detail/trie/bit_vector/bit_vector.cuh | 1 - include/cuco/detail/trie/bit_vector/bit_vector_ref.cuh | 2 ++ include/cuco/detail/trie/bit_vector/bit_vector_ref.inl | 9 ++++++--- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index b5d507ccd..9dfb78d93 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -272,4 +272,3 @@ class bit_vector { } // namespace cuco #include -#include diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.cuh b/include/cuco/detail/trie/bit_vector/bit_vector_ref.cuh index 3879b7b88..5bd0e4499 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.cuh @@ -35,3 +35,5 @@ class bit_vector_ref } // namespace experimental } // namespace cuco + +#include diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl index c0831f685..39c793642 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl @@ -1,4 +1,4 @@ -#include +#include namespace cuco { namespace experimental { @@ -173,13 +173,16 @@ class operator_impl> { /** * @brief Subtract rank estimate from input count and return an increment to word_id * + * @tparam Rank type + * * @param count Input count that will be updated * @param rank Initial rank estimate for count * * @return Increment to word_id based on rank values */ - [[nodiscard]] __device__ size_type - subtract_rank_from_count(size_type& count, cuco::experimental::rank rank) const noexcept + template + [[nodiscard]] __device__ size_type subtract_rank_from_count(size_type& count, + Rank rank) const noexcept { count -= 
rank.abs(); From 4b46a9cc9a7e8cfd508ba26da251b6f254eec8d0 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Thu, 24 Aug 2023 05:05:05 +0000 Subject: [PATCH 56/99] Avoid repeated definition of same magic number --- include/cuco/detail/trie/bit_vector/bit_vector.cuh | 13 ++++++++----- .../cuco/detail/trie/bit_vector/bit_vector_ref.inl | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 9dfb78d93..381979ee6 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -160,8 +160,9 @@ class bit_vector { *@brief Struct to hold all storage refs needed by bitvector_ref */ struct device_storage_ref { - using size_type = size_type; ///< Size type - using slot_type = slot_type; ///< Slot type + using size_type = size_type; ///< Size type + using slot_type = slot_type; ///< Slot type + using bit_vector_type = bit_vector<>; ///< bit_vector_ref needs this to access words_per_block typename words_storage_type::ref_type words_ref_; ///< Words ref @@ -195,12 +196,14 @@ class bit_vector { */ size_type constexpr size() const noexcept { return n_bits_; } + static constexpr size_type words_per_block = 4; ///< Tradeoff between space efficiency and perf. + private: size_type n_bits_; ///< Number of bits bit_vector currently holds - static constexpr size_type bits_per_word = sizeof(slot_type) * 8; ///< Bits in a word - static constexpr size_type words_per_block = 4; ///< Tradeoff between space efficiency and perf. - static constexpr size_type bits_per_block = words_per_block * bits_per_word; + // These could be public if needed by other classes. 
Private for now + static constexpr size_type bits_per_word = sizeof(slot_type) * 8; ///< Bits in a word + static constexpr size_type bits_per_block = words_per_block * bits_per_word; ///< Trivial // Host-side structures std::vector words_; ///< Words vector that represents all bits diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl index 39c793642..3ba5221d9 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl @@ -19,7 +19,7 @@ class operator_impl> { using slot_type = typename StorageRef::slot_type; ///< Slot type static constexpr size_type bits_per_word = sizeof(slot_type) * 8; - static constexpr size_type words_per_block = 4; //< This should match the defintion in bit_vector + static constexpr size_type words_per_block = StorageRef::bit_vector_type::words_per_block; public: /** From 3e4a4130f5e5537b62a97b9e3a9ba1020cb8ce63 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 25 Aug 2023 08:01:04 +0000 Subject: [PATCH 57/99] Larger bitvector sizes in tests --- tests/bit_vector/rank_test.cu | 2 +- tests/bit_vector/select_test.cu | 2 +- tests/bit_vector/set_test.cu | 2 +- tests/bit_vector/size_test.cu | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu index 797c74b82..c271cf947 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/bit_vector/rank_test.cu @@ -42,7 +42,7 @@ TEST_CASE("Rank test", "") cuco::experimental::bit_vector bv; using size_type = cuco::experimental::bit_vector<>::size_type; - constexpr size_type num_elements{400}; + constexpr size_type num_elements{4000}; for (size_type i = 0; i < num_elements; i++) { bv.append(modulo_bitgen(i)); diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index af0e5fab7..6d90e3c11 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu 
@@ -53,7 +53,7 @@ TEST_CASE("Select test", "") cuco::experimental::bit_vector bv; using size_type = cuco::experimental::bit_vector<>::size_type; - constexpr size_type num_elements{400}; + constexpr size_type num_elements{4000}; size_type num_set = 0; for (size_type i = 0; i < num_elements; i++) { diff --git a/tests/bit_vector/set_test.cu b/tests/bit_vector/set_test.cu index 36665028c..46acb4c2e 100644 --- a/tests/bit_vector/set_test.cu +++ b/tests/bit_vector/set_test.cu @@ -26,7 +26,7 @@ TEST_CASE("Set test", "") cuco::experimental::bit_vector bv; using size_type = cuco::experimental::bit_vector<>::size_type; - size_type num_elements{400}; + constexpr size_type num_elements{400}; // Set odd bits on host for (size_t i = 0; i < num_elements; i++) { diff --git a/tests/bit_vector/size_test.cu b/tests/bit_vector/size_test.cu index 5fe516e99..5d128d418 100644 --- a/tests/bit_vector/size_test.cu +++ b/tests/bit_vector/size_test.cu @@ -22,7 +22,7 @@ TEST_CASE("Size computation", "") { cuco::experimental::bit_vector bv; using size_type = cuco::experimental::bit_vector<>::size_type; - size_type num_elements{400}; + constexpr size_type num_elements{400}; for (size_type i = 0; i < num_elements; i++) { bv.append(i % 2 == 0); // Alternate 0s and 1s pattern From f378626307ad75b3b6e2520b5c671d0c94f3c598 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 25 Aug 2023 07:53:34 +0000 Subject: [PATCH 58/99] Device kernels for rank and select generation --- .../detail/trie/bit_vector/bit_vector.cuh | 72 ++--- .../detail/trie/bit_vector/bit_vector.inl | 304 +++++++++++++----- 2 files changed, 248 insertions(+), 128 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 381979ee6..939faedce 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -48,10 +48,8 @@ struct rank { * @brief Sets base rank of current 256-bit interval * * @param 
abs Base rank - * - * @return */ - constexpr void set_abs(uint64_t abs) noexcept + __host__ __device__ void set_abs(uint64_t abs) noexcept { abs_hi_ = static_cast(abs >> 8); abs_lo_ = static_cast(abs); @@ -206,69 +204,57 @@ class bit_vector { static constexpr size_type bits_per_block = words_per_block * bits_per_word; ///< Trivial // Host-side structures - std::vector words_; ///< Words vector that represents all bits - std::vector ranks_; ///< Holds the rank values for every 256-th bit (4-th word) - std::vector ranks0_; ///< Same as ranks_ but for `0` bits - std::vector selects_; ///< Holds indices of (0, 256, 512...)th `1` bit in ranks_ - std::vector selects0_; ///< Same as selects_, but for `0` bits + std::vector words_; ///< Words vector that represents all bits // Device-side structures - allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage - std::unique_ptr aow_words_; + thrust::device_vector d_words_; ///< Device words vector + thrust::device_vector ranks_; ///< Rank values for every 256-th bit (4-th word) + thrust::device_vector ranks0_; ///< Same as ranks_ but for `0` bits + thrust::device_vector selects_; ///< Block indices of (0, 256, 512...)th `1` bit + thrust::device_vector selects0_; ///< Same as selects_, but for `0` bits + allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage + std::unique_ptr aow_words_; ///< Array of window storage structure std::unique_ptr aow_ranks_; std::unique_ptr aow_ranks0_; - std::unique_ptr aow_selects_; std::unique_ptr aow_selects0_; /** - * @brief Constructs device-side structures and clears host-side structures + * @brief Populates rank and select indexes on device * - * Takes a snapshot of bitvector and creates a device-side copy + * @param ranks Output array of ranks + * @param selects Output array of selects + * @param flip_bits If true, negate bits to construct indexes for `0` bits */ - void move_to_device() noexcept; + void 
build_ranks_and_selects(thrust::device_vector& ranks, + thrust::device_vector& selects, + bool flip_bits); /** - * @brief Creates a new window structure on device and initializes it with contents of host array + * @brief Creates a new window structure on device and initializes it from a device array * - * @tparam T Type of host array elements + * @tparam T Type of device array elements + * @tparam storage_type Storage type * * @param aow pointer to destination (device window structure) - * @param host_array host array whose contents are used to intialize aow + * @param device_array device array whose contents are used to intialize aow */ template - void copy_host_array_to_aow(std::unique_ptr* aow, - std::vector& host_array) noexcept; - - /** - * @brief Populates rank and select indexes on host - * - * @param words Aarray of words with all bits - * @param ranks Output array of ranks - * @param selects Output array of selects - * @param flip_bits If true, negate bits to construct indexes for `0` bits - */ - void build_ranks_and_selects(const std::vector& words, - std::vector& ranks, - std::vector& selects, - bool flip_bits) noexcept; + void copy_device_array_to_aow(std::unique_ptr* aow, + thrust::device_vector& device_array) noexcept; /** - * @brief Add an entry to selects index that points to bits in a given word + * @brief Helper function to calculate grid size for simple kernels * - * Entry will be added only when bitcount in current word pushes total bitcount beyond a - * 'bits_per_block' boundary + * @param num_elements Elements being processed by kernel * - * @param word_id Index of current word - * @param word Current word - * @param count_in Running count of set bits in all previous words - * @param selects Selects index + * @return grid size */ - void add_selects_entry(size_type word_id, - slot_type word, - size_type count, - std::vector& selects) noexcept; + size_type constexpr default_grid_size(size_type num_elements) const noexcept + { + return 
(num_elements - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE) + 1; + } }; } // namespace experimental diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 377246126..22448987d 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -15,6 +15,9 @@ * limitations under the License. */ +#include +#include + namespace cuco { namespace experimental { @@ -34,7 +37,7 @@ void bit_vector::append(bool bit) noexcept { if (n_bits_ % bits_per_block == 0) { size_type new_n_bits = n_bits_ + bits_per_block; // Extend storage by one block - size_type new_n_words = new_n_bits / words_per_block; + size_type new_n_words = new_n_bits / bits_per_word; words_.resize(new_n_words); } @@ -72,10 +75,8 @@ void bit_vector::get(KeyIt keys_begin, auto const num_keys = cuco::detail::distance(keys_begin, keys_end); if (num_keys == 0) { return; } - auto const grid_size = - (num_keys - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE) + 1; - - auto ref_ = this->ref(cuco::experimental::bv_read); + auto grid_size = default_grid_size(num_keys); + auto ref_ = this->ref(cuco::experimental::bv_read); bitvector_get_kernel<<>>( ref_, keys_begin, outputs_begin, num_keys); @@ -91,19 +92,30 @@ void bit_vector::set(KeyIt keys_begin, auto const num_keys = cuco::detail::distance(keys_begin, keys_end); if (num_keys == 0) { return; } - auto const grid_size = - (num_keys - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE) + 1; - - auto ref_ = this->ref(cuco::experimental::bv_set); + auto grid_size = default_grid_size(num_keys); + auto ref_ = this->ref(cuco::experimental::bv_set); bitvector_set_kernel<<>>( ref_, keys_begin, vals_begin, num_keys); } -template +/* + * @brief Gather bits of a range of keys + * + * @tparam BitvectorRef Bitvector reference type + * @tparam KeyIt Device-accessible iterator to input keys + * 
@tparam ValueIt Device-accessible iterator to values + * @tparam size_type Size type + * + * @param ref Bitvector ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template __global__ void bitvector_get_kernel(BitvectorRef ref, KeyIt keys, - OutputIt outputs, + ValueIt outputs, size_type num_keys) { uint32_t const loop_stride = gridDim.x * blockDim.x; @@ -115,6 +127,19 @@ __global__ void bitvector_get_kernel(BitvectorRef ref, } } +/* + * @brief Set bits of a range of keys + * + * @tparam BitvectorRef Bitvector reference type + * @tparam KeyIt Device-accessible iterator to input keys + * @tparam ValueIt Device-accessible iterator to values + * @tparam size_type Size type + * + * @param ref Bitvector ref + * @param keys Begin iterator to input keys + * @param values Begin iterator to input values + * @param num_keys Number of input keys + */ template __global__ void bitvector_set_kernel(BitvectorRef ref, KeyIt keys, @@ -130,93 +155,197 @@ __global__ void bitvector_set_kernel(BitvectorRef ref, } } -template -void bit_vector::build() noexcept +/* + * @brief Computes number of set or not-set bits in each word + * + * @tparam slot_type Word type + * @tparam size_type Size type + * + * @param words Input array of words + * @param bit_counts Output array of per-word bit counts + * @param num_words Number of words + * @param flip_bits Boolean to request negation of words before counting bits + */ +template +__global__ void bit_counts_kernel(const slot_type* words, + size_type* bit_counts, + size_type num_words, + bool flip_bits) { - build_ranks_and_selects(words_, ranks_, selects_, false); // 1-bits - build_ranks_and_selects(words_, ranks0_, selects0_, true); // 0-bits - move_to_device(); -} + size_type word_id = blockDim.x * blockIdx.x + threadIdx.x; + size_type stride = gridDim.x * blockDim.x; -template -void bit_vector::move_to_device() noexcept -{ - copy_host_array_to_aow(&aow_words_, 
words_); - copy_host_array_to_aow(&aow_ranks_, ranks_); - copy_host_array_to_aow(&aow_selects_, selects_); - copy_host_array_to_aow(&aow_ranks0_, ranks0_); - copy_host_array_to_aow(&aow_selects0_, selects0_); + while (word_id < num_words) { + auto word = words[word_id]; + bit_counts[word_id] = cuda::std::popcount(flip_bits ? ~word : word); + word_id += stride; + } } -template -void bit_vector::build_ranks_and_selects(const std::vector& words, - std::vector& ranks, - std::vector& selects, - bool flip_bits) noexcept -{ - size_type n_blocks = words.size() / words_per_block; - ranks.resize(n_blocks + 1); - - size_type count = 0; - for (size_type block_id = 0; block_id < n_blocks; ++block_id) { - ranks[block_id].set_abs(count); +/* + * @brief Compute rank values at block size intervals. + * + * ranks[i] = Number of set bits in [0, i) range + * This kernel transforms prefix sum array of per-word bit counts + * into base-delta encoding style of `rank` struct. + * Since prefix sum is available, there are no dependencies across blocks. - for (size_type block_offset = 0; block_offset < words_per_block; ++block_offset) { - if (block_offset != 0) { // Compute deltas - ranks[block_id].rels_[block_offset - 1] = count - ranks[block_id].abs(); - } + * @tparam size_type Size type + * + * @param prefix_bit_counts Prefix sum array of per-word bit counts + * @param ranks Output array of ranks + * @param num_words Length of input array + * @param num_blocks Length of ouput array + * @param words_per_block Number of words in each block + */ +template +__global__ void encode_ranks_from_prefix_bit_counts(const size_type* prefix_bit_counts, + rank* ranks, + size_type num_words, + size_type num_blocks, + size_type words_per_block) +{ + size_type rank_id = blockDim.x * blockIdx.x + threadIdx.x; + size_type stride = gridDim.x * blockDim.x; - size_type word_id = (block_id * words_per_block) + block_offset; - slot_type word = flip_bits ? 
~words[word_id] : words[word_id]; + while (rank_id < num_blocks) { + size_type word_id = rank_id * words_per_block; - size_type prev_count = count; - count += cuda::std::popcount(word); + // Set base value of rank + auto& rank = ranks[rank_id]; + rank.set_abs(prefix_bit_counts[word_id]); - if ((prev_count - 1) / bits_per_block != (count - 1) / bits_per_block) { - add_selects_entry(word_id, word, prev_count, selects); + if (rank_id < num_blocks - 1) { + // For each subsequent word in this block, compute deltas from base + for (size_type block_offset = 0; block_offset < words_per_block - 1; block_offset++) { + auto delta = prefix_bit_counts[word_id + block_offset + 1] - prefix_bit_counts[word_id]; + rank.rels_[block_offset] = delta; } } + rank_id += stride; } - - ranks.back().set_abs(count); - selects.push_back(n_blocks); } -template -void bit_vector::add_selects_entry(size_type word_id, - slot_type word, - size_type count, - std::vector& selects) noexcept +/* + * @brief Compute select values at block size intervals. + * + * selects[i] = Position of (i+ 1)th set bit + * This kernel check for blocks where prefix sum crosses a multiple of `bits_per_block`. 
+ * Such blocks are marked in the output boolean array + * + * @tparam size_type Size type + * + * @param prefix_bit_counts Prefix sum array of per-word bit counts + * @param selects_markers Ouput array indicating whether a block has selects entry or not + * @param num_blocks Length of ouput array + * @param words_per_block Number of words in each block + * @param bits_per_block Number of bits in each block + */ +template +__global__ void mark_blocks_with_select_entries(const size_type* prefix_bit_counts, + bool* select_markers, + size_type num_blocks, + size_type words_per_block, + size_type bits_per_block) { - while (word != 0) { - size_type pos = cuda::std::countr_zero(word); + size_type block_id = blockDim.x * blockIdx.x + threadIdx.x; + size_type stride = gridDim.x * blockDim.x; + + while (block_id < num_blocks) { + if (block_id == 0) { // Block 0 always has a selects entry + select_markers[block_id] = 1; + block_id += stride; + continue; + } + + select_markers[block_id] = 0; // Always clear marker first + size_type word_id = block_id * words_per_block; + size_type prev_count = prefix_bit_counts[word_id]; + + for (size_t block_offset = 1; block_offset <= words_per_block; block_offset++) { + size_type count = prefix_bit_counts[word_id + block_offset]; - if (count % bits_per_block == 0) { - selects.push_back((word_id * bits_per_word + pos) / bits_per_block); - break; + // Selects entry is added when cumulative bitcount crosses a multiple of bits_per_block + if ((prev_count - 1) / bits_per_block != (count - 1) / bits_per_block) { + select_markers[block_id] = 1; + break; + } + prev_count = count; } - word ^= 1UL << pos; - ++count; + block_id += stride; } } template -template -void bit_vector::copy_host_array_to_aow(std::unique_ptr* aow, - std::vector& host_array) noexcept +void bit_vector::build_ranks_and_selects(thrust::device_vector& ranks, + thrust::device_vector& selects, + bool flip_bits) { - uint64_t num_elements = host_array.size(); - *aow = 
std::make_unique(extent{num_elements + 1}, allocator_); - - if (num_elements > 0) { - // Move host array to device memory - thrust::device_vector device_array = host_array; - host_array.clear(); + size_type num_words = (n_bits_ - 1) / bits_per_word + 1; + // Round up num_words to a block + if (num_words % words_per_block) { num_words += words_per_block - (num_words % words_per_block); } + + // Step 1. Compute prefix sum of per-word bit counts + + // Population counts for each word + // Sized to have one extra entry for subsequent prefix sum + thrust::device_vector bit_counts(num_words + 1); + auto grid_size = default_grid_size(num_words); + bit_counts_kernel<<>>( + thrust::raw_pointer_cast(d_words_.data()), + thrust::raw_pointer_cast(bit_counts.data()), + num_words, + flip_bits); + + thrust::exclusive_scan(thrust::device, bit_counts.begin(), bit_counts.end(), bit_counts.begin()); + + // Step 2. Compute ranks + size_type num_blocks = (num_words - 1) / words_per_block + 2; + ranks.resize(num_blocks); + + grid_size = default_grid_size(num_blocks); + encode_ranks_from_prefix_bit_counts<<>>( + thrust::raw_pointer_cast(bit_counts.data()), + thrust::raw_pointer_cast(ranks.data()), + num_words, + num_blocks, + words_per_block); + + // Step 3. 
Compute selects + thrust::device_vector select_markers(num_blocks); + mark_blocks_with_select_entries<<>>( + thrust::raw_pointer_cast(bit_counts.data()), + thrust::raw_pointer_cast(select_markers.data()), + num_blocks, + words_per_block, + bits_per_block); + + size_type num_selects = + thrust::reduce(thrust::device, select_markers.begin(), select_markers.end()); + selects.resize(num_selects); + + // Generate indices of non-zeros in select_markers + thrust::copy_if(thrust::device, + thrust::make_counting_iterator(0lu), + thrust::make_counting_iterator(num_blocks), + select_markers.begin(), + selects.begin(), + thrust::identity()); +} - // Copy device array to window structure - initialize_aow(*aow, device_array, num_elements); - } +template +void bit_vector::build() noexcept +{ + d_words_ = words_; + build_ranks_and_selects(ranks_, selects_, false); // 1-bits + build_ranks_and_selects(ranks0_, selects0_, true); // 0-bits + + copy_device_array_to_aow(&aow_words_, d_words_); + copy_device_array_to_aow(&aow_ranks_, ranks_); + copy_device_array_to_aow(&aow_selects_, selects_); + copy_device_array_to_aow(&aow_ranks0_, ranks0_); + copy_device_array_to_aow(&aow_selects0_, selects0_); } // Copies device array to window structure @@ -233,18 +362,23 @@ __global__ void copy_to_window(WindowT* windows, cuco::detail::index_type n, T* } } -template -void initialize_aow(std::unique_ptr& storage, - thrust::device_vector& device_array, - uint64_t num_elements) +template +template +void bit_vector::copy_device_array_to_aow( + std::unique_ptr* aow, thrust::device_vector& device_array) noexcept { - auto constexpr stride = 4; - auto const grid_size = (num_elements + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (stride * detail::CUCO_DEFAULT_BLOCK_SIZE); + size_type num_elements = device_array.size(); + *aow = std::make_unique(extent{num_elements + 1}, allocator_); - auto device_ptr = thrust::raw_pointer_cast(device_array.data()); - copy_to_window<<>>( - storage->data(), 
num_elements, device_ptr); + if (num_elements > 0) { + auto constexpr stride = 4; + auto grid_size = (num_elements + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / + (stride * detail::CUCO_DEFAULT_BLOCK_SIZE); + + auto device_ptr = thrust::raw_pointer_cast(device_array.data()); + copy_to_window<<>>( + (*aow)->data(), num_elements, device_ptr); + } } template From d195766d7231665b4c14f8672426cc77439559d5 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Sat, 26 Aug 2023 22:49:06 +0000 Subject: [PATCH 59/99] Misc fixes in bitvector build --- include/cuco/detail/trie/bit_vector/bit_vector.inl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 22448987d..6b9b38cd0 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -243,7 +243,7 @@ __global__ void encode_ranks_from_prefix_bit_counts(const size_type* prefix_bit_ */ template __global__ void mark_blocks_with_select_entries(const size_type* prefix_bit_counts, - bool* select_markers, + size_type* select_markers, size_type num_blocks, size_type words_per_block, size_type bits_per_block) @@ -282,6 +282,7 @@ void bit_vector::build_ranks_and_selects(thrust::device_vector& thrust::device_vector& selects, bool flip_bits) { + if (n_bits_ == 0) { return; } size_type num_words = (n_bits_ - 1) / bits_per_word + 1; // Round up num_words to a block if (num_words % words_per_block) { num_words += words_per_block - (num_words % words_per_block); } @@ -313,7 +314,7 @@ void bit_vector::build_ranks_and_selects(thrust::device_vector& words_per_block); // Step 3. 
Compute selects - thrust::device_vector select_markers(num_blocks); + thrust::device_vector select_markers(num_blocks); mark_blocks_with_select_entries<<>>( thrust::raw_pointer_cast(bit_counts.data()), thrust::raw_pointer_cast(select_markers.data()), @@ -338,6 +339,7 @@ template void bit_vector::build() noexcept { d_words_ = words_; + words_.clear(); build_ranks_and_selects(ranks_, selects_, false); // 1-bits build_ranks_and_selects(ranks0_, selects0_, true); // 0-bits @@ -379,6 +381,7 @@ void bit_vector::copy_device_array_to_aow( copy_to_window<<>>( (*aow)->data(), num_elements, device_ptr); } + device_array.clear(); } template From aaa2261e262fa30dc3b30646d45bccb62411db7c Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Sun, 27 Aug 2023 01:33:11 +0000 Subject: [PATCH 60/99] Move constructor --- include/cuco/detail/trie/bit_vector/bit_vector.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 939faedce..58159c098 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -76,6 +76,7 @@ class bit_vector { * @param allocator Allocator for internal storage */ bit_vector(Allocator const& allocator = Allocator{}); + bit_vector(cuco::experimental::bit_vector&& other) = default; ~bit_vector(); /** From d237263a9f5aa4f189a13d4688c421fded1fe9ff Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Sun, 27 Aug 2023 05:49:26 +0000 Subject: [PATCH 61/99] Bulk API for rank and select operations --- .../detail/trie/bit_vector/bit_vector.cuh | 36 +++++++- .../detail/trie/bit_vector/bit_vector.inl | 92 +++++++++++++++++++ tests/bit_vector/rank_test.cu | 31 +++---- tests/bit_vector/select_test.cu | 26 ++---- 4 files changed, 149 insertions(+), 36 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 58159c098..e9847825f 
100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -76,7 +76,7 @@ class bit_vector { * @param allocator Allocator for internal storage */ bit_vector(Allocator const& allocator = Allocator{}); - bit_vector(cuco::experimental::bit_vector&& other) = default; + bit_vector(cuco::experimental::bit_vector&&) = default; ///< Move constructor ~bit_vector(); /** @@ -128,6 +128,40 @@ class bit_vector { OutputIt outputs_begin, cuda_stream_ref stream = {}) const noexcept; + /** + * @brief Bulk rank operation + * + * @tparam KeyIt Device-accessible iterator to keys + * @tparam OutputIt Device-accessible iterator to output ranks + * + * @param keys_begin Begin iterator to keys list whose ranks are queried + * @param keys_end End iterator to keys list + * @param outputs_begin Begin iterator to outputs ranks list + * @param stream Stream to execute ranks kernel + */ + template + void ranks(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) const noexcept; + + /** + * @brief Bulk select operation + * + * @tparam KeyIt Device-accessible iterator to keys + * @tparam OutputIt Device-accessible iterator to outputs + * + * @param keys_begin Begin iterator to keys list whose select values are queried + * @param keys_end End iterator to keys list + * @param outputs_begin Begin iterator to outputs selects list + * @param stream Stream to execute selects kernel + */ + template + void selects(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) const noexcept; + /** * @brief Bulk set operation * diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 6b9b38cd0..0f053b248 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -82,6 +82,42 @@ void bit_vector::get(KeyIt keys_begin, ref_, keys_begin, 
outputs_begin, num_keys); } +template +template +void bit_vector::ranks(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept + +{ + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto grid_size = default_grid_size(num_keys); + auto ref_ = this->ref(cuco::experimental::bv_read); + + bitvector_rank_kernel<<>>( + ref_, keys_begin, outputs_begin, num_keys); +} + +template +template +void bit_vector::selects(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept + +{ + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto grid_size = default_grid_size(num_keys); + auto ref_ = this->ref(cuco::experimental::bv_read); + + bitvector_select_kernel<<>>( + ref_, keys_begin, outputs_begin, num_keys); +} + template template void bit_vector::set(KeyIt keys_begin, @@ -127,6 +163,62 @@ __global__ void bitvector_get_kernel(BitvectorRef ref, } } +/* + * @brief Gather rank values for a range of keys + * + * @tparam BitvectorRef Bitvector reference type + * @tparam KeyIt Device-accessible iterator to input keys + * @tparam ValueIt Device-accessible iterator to values + * @tparam size_type Size type + * + * @param ref Bitvector ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitvector_rank_kernel(BitvectorRef ref, + KeyIt keys, + ValueIt outputs, + size_type num_keys) +{ + uint32_t const loop_stride = gridDim.x * blockDim.x; + uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; + + while (key_id < num_keys) { + outputs[key_id] = ref.rank(keys[key_id]); + key_id += loop_stride; + } +} + +/* + * @brief Gather select values for a range of keys + * + * @tparam BitvectorRef Bitvector reference type + * @tparam KeyIt Device-accessible iterator to input keys + * @tparam 
ValueIt Device-accessible iterator to values + * @tparam size_type Size type + * + * @param ref Bitvector ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitvector_select_kernel(BitvectorRef ref, + KeyIt keys, + ValueIt outputs, + size_type num_keys) +{ + uint32_t const loop_stride = gridDim.x * blockDim.x; + uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; + + while (key_id < num_keys) { + outputs[key_id] = ref.select(keys[key_id]); + key_id += loop_stride; + } +} + /* * @brief Set bits of a range of keys * diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu index c271cf947..cdaadeb02 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/bit_vector/rank_test.cu @@ -19,22 +19,11 @@ #include #include -#include #include +#include #include -template -__global__ void rank_kernel(BitVectorRef ref, size_type num_elements, OutputIt output) -{ - size_t index = blockIdx.x * blockDim.x + threadIdx.x; - size_t stride = gridDim.x * blockDim.x; - while (index < num_elements) { - output[index] = ref.rank(index); - index += stride; - } -} - extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu TEST_CASE("Rank test", "") @@ -49,15 +38,19 @@ TEST_CASE("Rank test", "") } bv.build(); - thrust::device_vector rank_result_device(num_elements); - auto ref = bv.ref(cuco::experimental::bv_read); - rank_kernel<<<1, 1024>>>(ref, num_elements, rank_result_device.data()); + thrust::device_vector keys(num_elements); + thrust::sequence(keys.begin(), keys.end(), 0); + + thrust::device_vector d_ranks(num_elements); + + bv.ranks(keys.begin(), keys.end(), d_ranks.begin()); + + thrust::host_vector h_ranks = d_ranks; - thrust::host_vector rank_result = rank_result_device; - size_type cur_rank = 0; - size_type num_matches = 0; + size_type cur_rank = 0; + size_type num_matches = 0; for (size_type i = 0; i < num_elements; i++) { - num_matches += 
cur_rank == rank_result[i]; + num_matches += cur_rank == h_ranks[i]; if (modulo_bitgen(i)) { cur_rank++; } } REQUIRE(num_matches == num_elements); diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index 6d90e3c11..e4ea23bd9 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -19,22 +19,11 @@ #include #include -#include #include +#include #include -template -__global__ void select_kernel(BitVectorRef ref, size_type num_elements, OutputIt output) -{ - size_t index = blockIdx.x * blockDim.x + threadIdx.x; - size_t stride = gridDim.x * blockDim.x; - while (index < num_elements) { - output[index] = ref.select(index); - index += stride; - } -} - template __global__ void select0_kernel(BitVectorRef ref, size_type num_elements, OutputIt output) { @@ -65,9 +54,14 @@ TEST_CASE("Select test", "") // Check select { - thrust::device_vector device_result(num_set); - select_kernel<<<1, 1024>>>(ref, num_set, device_result.data()); - thrust::host_vector host_result = device_result; + thrust::device_vector keys(num_set); + thrust::sequence(keys.begin(), keys.end(), 0); + + thrust::device_vector d_selects(num_set); + + bv.selects(keys.begin(), keys.end(), d_selects.begin()); + + thrust::host_vector h_selects = d_selects; size_type num_matches = 0; size_type cur_set_pos = -1lu; @@ -76,7 +70,7 @@ TEST_CASE("Select test", "") cur_set_pos++; } while (cur_set_pos < num_elements and !modulo_bitgen(cur_set_pos)); - num_matches += cur_set_pos == host_result[i]; + num_matches += cur_set_pos == h_selects[i]; } REQUIRE(num_matches == num_set); } From 07aa8130b0e3d085c105d91d0a2a5cebbd0ee08e Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 29 Aug 2023 00:51:05 +0000 Subject: [PATCH 62/99] Remove bulk set operation --- .../detail/trie/bit_vector/bit_vector.cuh | 17 ------ .../detail/trie/bit_vector/bit_vector.inl | 45 ---------------- .../detail/trie/bit_vector/bit_vector_ref.inl | 30 ----------- include/cuco/operator.hpp | 6 
--- tests/CMakeLists.txt | 1 - tests/bit_vector/set_test.cu | 52 ------------------- 6 files changed, 151 deletions(-) delete mode 100644 tests/bit_vector/set_test.cu diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index e9847825f..04334f274 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -162,23 +162,6 @@ class bit_vector { OutputIt outputs_begin, cuda_stream_ref stream = {}) const noexcept; - /** - * @brief Bulk set operation - * - * @tparam KeyIt Device-accessible iterator to keys - * @tparam ValueIt Device-accessible iterator to values - * - * @param keys_begin Begin iterator to keys that need to modified - * @param keys_end End iterator to keys - * @param vals_begin Begin iterator to new bit values - * @param stream Stream to execute set kernel - */ - template - void set(KeyIt keys_begin, - KeyIt keys_end, - ValueIt vals_begin, - cuda_stream_ref stream = {}) const noexcept; - using allocator_type = Allocator; ///< Allocator type using slot_type = uint64_t; ///< Slot type diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 0f053b248..50e50c517 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -118,23 +118,6 @@ void bit_vector::selects(KeyIt keys_begin, ref_, keys_begin, outputs_begin, num_keys); } -template -template -void bit_vector::set(KeyIt keys_begin, - KeyIt keys_end, - ValueIt vals_begin, - cuda_stream_ref stream) const noexcept -{ - auto const num_keys = cuco::detail::distance(keys_begin, keys_end); - if (num_keys == 0) { return; } - - auto grid_size = default_grid_size(num_keys); - auto ref_ = this->ref(cuco::experimental::bv_set); - - bitvector_set_kernel<<>>( - ref_, keys_begin, vals_begin, num_keys); -} - /* * @brief Gather bits of a range of keys * @@ -219,34 
+202,6 @@ __global__ void bitvector_select_kernel(BitvectorRef ref, } } -/* - * @brief Set bits of a range of keys - * - * @tparam BitvectorRef Bitvector reference type - * @tparam KeyIt Device-accessible iterator to input keys - * @tparam ValueIt Device-accessible iterator to values - * @tparam size_type Size type - * - * @param ref Bitvector ref - * @param keys Begin iterator to input keys - * @param values Begin iterator to input values - * @param num_keys Number of input keys - */ -template -__global__ void bitvector_set_kernel(BitvectorRef ref, - KeyIt keys, - ValueIt values, - size_type num_keys) -{ - uint32_t const loop_stride = gridDim.x * blockDim.x; - uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; - - while (key_id < num_keys) { - ref.set(keys[key_id], values[key_id]); - key_id += loop_stride; - } -} - /* * @brief Computes number of set or not-set bits in each word * diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl index 3ba5221d9..c308a1917 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl @@ -212,36 +212,6 @@ class operator_impl> { } }; -template -class operator_impl> { - using ref_type = bit_vector_ref; ///< Bitvector ref type - using size_type = typename StorageRef::size_type; ///< Size type - using slot_type = typename StorageRef::slot_type; ///< Slot type - static constexpr size_type bits_per_word = sizeof(slot_type) * 8; - - public: - /** - * @brief Modify a single bit - * - * @param key Position of bit - * @param bit New value of bit - */ - __device__ void set(size_type key, bool bit) noexcept - { - ref_type& ref_ = static_cast(*this); - - size_type word_id = key / bits_per_word; - size_type bit_id = key % bits_per_word; - auto& word = ref_.storage_.words_ref_[word_id][0]; - - if (bit) { - word |= 1UL << bit_id; - } else { - word &= ~(1UL << bit_id); - } - } -}; - } // namespace detail 
} // namespace experimental } // namespace cuco diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp index 4beb57615..f9165d3bf 100644 --- a/include/cuco/operator.hpp +++ b/include/cuco/operator.hpp @@ -51,12 +51,6 @@ struct find_tag { struct bv_read_tag { } inline constexpr bv_read; -/** - * @brief `bv_set` operator tag - */ -struct bv_set_tag { -} inline constexpr bv_set; - } // namespace op } // namespace experimental } // namespace cuco diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7d910565b..64fe713ac 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -101,7 +101,6 @@ ConfigureTest(STATIC_MULTIMAP_TEST ConfigureTest(BIT_VECTOR_TEST bit_vector/find_next_set_test.cu bit_vector/get_test.cu - bit_vector/set_test.cu bit_vector/rank_test.cu bit_vector/select_test.cu bit_vector/size_test.cu) diff --git a/tests/bit_vector/set_test.cu b/tests/bit_vector/set_test.cu deleted file mode 100644 index 46acb4c2e..000000000 --- a/tests/bit_vector/set_test.cu +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include -#include -#include -#include -#include -#include - -TEST_CASE("Set test", "") -{ - cuco::experimental::bit_vector bv; - - using size_type = cuco::experimental::bit_vector<>::size_type; - constexpr size_type num_elements{400}; - - // Set odd bits on host - for (size_t i = 0; i < num_elements; i++) { - bv.append(i % 2 == 1); - } - bv.build(); - - // Set all bits on device - thrust::device_vector keys(num_elements); - thrust::sequence(keys.begin(), keys.end(), 0); - - thrust::device_vector vals(num_elements); - thrust::fill(vals.begin(), vals.end(), 1); - - bv.set(keys.begin(), keys.end(), vals.begin()); - - // Check that all bits are set - thrust::device_vector get_outputs(num_elements); - bv.get(keys.begin(), keys.end(), get_outputs.begin()); - - // size_type num_set = thrust::reduce(thrust::device, get_outputs.begin(), get_outputs.end(), 0); - // REQUIRE(num_set == num_elements); -} From fb07de9da75195b98f4dd84cdd7c5f8fbfa95ec2 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 29 Aug 2023 01:11:48 +0000 Subject: [PATCH 63/99] Remove aow_storage structures Directly use thrust::device_vector in device-ref code --- .../detail/trie/bit_vector/bit_vector.cuh | 74 +++--------- .../detail/trie/bit_vector/bit_vector.inl | 113 +++++------------- .../detail/trie/bit_vector/bit_vector_ref.inl | 34 +++--- tests/bit_vector/find_next_set_test.cu | 2 +- tests/bit_vector/get_test.cu | 2 +- tests/bit_vector/rank_test.cu | 2 +- tests/bit_vector/select_test.cu | 2 +- tests/bit_vector/size_test.cu | 2 +- 8 files changed, 67 insertions(+), 164 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 04334f274..4be681399 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -19,7 +19,6 @@ #include #include -#include #include @@ -64,20 +63,15 @@ struct rank { * new operations close to constant time. 
* Bitvector construction happens on host, after which the structures are moved to device. * All subsequent read-only operations access device structures only. - * - * @tparam Allocator Type of allocator used for device storage */ -template > class bit_vector { public: /** * @brief Constructs an empty bitvector - * - * @param allocator Allocator for internal storage */ - bit_vector(Allocator const& allocator = Allocator{}); - bit_vector(cuco::experimental::bit_vector&&) = default; ///< Move constructor - ~bit_vector(); + inline bit_vector(); + bit_vector(cuco::experimental::bit_vector&&) = default; ///< Move constructor + inline ~bit_vector(); /** * @brief adds a new bit at the end @@ -86,7 +80,7 @@ class bit_vector { * * @param bit Boolean value of new bit to be added */ - void append(bool bit) noexcept; + inline void append(bool bit) noexcept; using size_type = std::size_t; ///< size type to specify bit index /** @@ -95,21 +89,19 @@ class bit_vector { * @param index position of bit to be modified * @param bit new value of bit */ - void set(size_type index, bool bit) noexcept; + inline void set(size_type index, bool bit) noexcept; /** * @brief Sets last bit to specified value * * @param bit new value of last bit */ - void set_last(bool bit) noexcept; + inline void set_last(bool bit) noexcept; /** * @brief Builds indexes for rank and select - * - * Also creates device-side snapshot */ - void build() noexcept; + inline void build() noexcept; /** * @brief Bulk get operation @@ -162,31 +154,21 @@ class bit_vector { OutputIt outputs_begin, cuda_stream_ref stream = {}) const noexcept; - using allocator_type = Allocator; ///< Allocator type - using slot_type = uint64_t; ///< Slot type - - using words_storage_type = - aow_storage, allocator_type>; ///< storage type for words - using ranks_storage_type = - aow_storage, allocator_type>; ///< storage type for ranks - using selects_storage_type = - aow_storage, allocator_type>; ///< storage type for selects + using slot_type = 
uint64_t; ///< Slot type /** *@brief Struct to hold all storage refs needed by bitvector_ref */ struct device_storage_ref { - using size_type = size_type; ///< Size type - using slot_type = slot_type; ///< Slot type - using bit_vector_type = bit_vector<>; ///< bit_vector_ref needs this to access words_per_block + using bit_vector_type = bit_vector; ///< bit_vector_ref needs this to access words_per_block - typename words_storage_type::ref_type words_ref_; ///< Words ref + const slot_type* words_ref_; ///< Words ref - typename ranks_storage_type::ref_type ranks_ref_; ///< Ranks refs - typename selects_storage_type::ref_type selects_ref_; ///< Selects refs + const rank* ranks_ref_; ///< Ranks refs + const size_type* selects_ref_; ///< Selects refs - typename ranks_storage_type::ref_type ranks0_ref_; ///< Ranks refs for 0 bits - typename selects_storage_type::ref_type selects0_ref_; ///< Selects refs 0 bits + const rank* ranks0_ref_; ///< Ranks refs for 0 bits + const size_type* selects0_ref_; ///< Selects refs 0 bits }; template @@ -203,7 +185,7 @@ class bit_vector { * @return Device ref of the current `bit_vector` object */ template - [[nodiscard]] auto ref(Operators... ops) const noexcept; + [[nodiscard]] ref_type ref(Operators... 
ops) const noexcept; /** * @brief Get the number of bits bit_vector holds @@ -231,13 +213,6 @@ class bit_vector { thrust::device_vector selects_; ///< Block indices of (0, 256, 512...)th `1` bit thrust::device_vector selects0_; ///< Same as selects_, but for `0` bits - allocator_type allocator_; ///< Allocator used to (de)allocate temporary storage - std::unique_ptr aow_words_; ///< Array of window storage structure - std::unique_ptr aow_ranks_; - std::unique_ptr aow_ranks0_; - std::unique_ptr aow_selects_; - std::unique_ptr aow_selects0_; - /** * @brief Populates rank and select indexes on device * @@ -245,22 +220,9 @@ class bit_vector { * @param selects Output array of selects * @param flip_bits If true, negate bits to construct indexes for `0` bits */ - void build_ranks_and_selects(thrust::device_vector& ranks, - thrust::device_vector& selects, - bool flip_bits); - - /** - * @brief Creates a new window structure on device and initializes it from a device array - * - * @tparam T Type of device array elements - * @tparam storage_type Storage type - * - * @param aow pointer to destination (device window structure) - * @param device_array device array whose contents are used to intialize aow - */ - template - void copy_device_array_to_aow(std::unique_ptr* aow, - thrust::device_vector& device_array) noexcept; + inline void build_ranks_and_selects(thrust::device_vector& ranks, + thrust::device_vector& selects, + bool flip_bits); /** * @brief Helper function to calculate grid size for simple kernels diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 50e50c517..95ae931c1 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -21,19 +21,11 @@ namespace cuco { namespace experimental { -template -bit_vector::bit_vector(Allocator const& allocator) - : words_{}, ranks_{}, ranks0_{}, selects_{}, selects0_{}, n_bits_{0}, 
allocator_{allocator} -{ -} +bit_vector::bit_vector() : words_{}, ranks_{}, ranks0_{}, selects_{}, selects0_{}, n_bits_{0} {} -template -bit_vector::~bit_vector() -{ -} +bit_vector::~bit_vector() {} -template -void bit_vector::append(bool bit) noexcept +void bit_vector::append(bool bit) noexcept { if (n_bits_ % bits_per_block == 0) { size_type new_n_bits = n_bits_ + bits_per_block; // Extend storage by one block @@ -45,8 +37,7 @@ void bit_vector::append(bool bit) noexcept ++n_bits_; } -template -void bit_vector::set(size_type index, bool bit) noexcept +void bit_vector::set(size_type index, bool bit) noexcept { size_type word_id = index / bits_per_word; size_type bit_id = index % bits_per_word; @@ -58,18 +49,13 @@ void bit_vector::set(size_type index, bool bit) noexcept } } -template -void bit_vector::set_last(bool bit) noexcept -{ - set(n_bits_ - 1, bit); -} +void bit_vector::set_last(bool bit) noexcept { set(n_bits_ - 1, bit); } -template template -void bit_vector::get(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +void bit_vector::get(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -82,12 +68,11 @@ void bit_vector::get(KeyIt keys_begin, ref_, keys_begin, outputs_begin, num_keys); } -template template -void bit_vector::ranks(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +void bit_vector::ranks(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -100,12 +85,11 @@ void bit_vector::ranks(KeyIt keys_begin, ref_, keys_begin, outputs_begin, num_keys); } -template template -void bit_vector::selects(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +void 
bit_vector::selects(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -324,10 +308,9 @@ __global__ void mark_blocks_with_select_entries(const size_type* prefix_bit_coun } } -template -void bit_vector::build_ranks_and_selects(thrust::device_vector& ranks, - thrust::device_vector& selects, - bool flip_bits) +void bit_vector::build_ranks_and_selects(thrust::device_vector& ranks, + thrust::device_vector& selects, + bool flip_bits) { if (n_bits_ == 0) { return; } size_type num_words = (n_bits_ - 1) / bits_per_word + 1; @@ -382,65 +365,23 @@ void bit_vector::build_ranks_and_selects(thrust::device_vector& thrust::identity()); } -template -void bit_vector::build() noexcept +void bit_vector::build() noexcept { d_words_ = words_; words_.clear(); build_ranks_and_selects(ranks_, selects_, false); // 1-bits build_ranks_and_selects(ranks0_, selects0_, true); // 0-bits - - copy_device_array_to_aow(&aow_words_, d_words_); - copy_device_array_to_aow(&aow_ranks_, ranks_); - copy_device_array_to_aow(&aow_selects_, selects_); - copy_device_array_to_aow(&aow_ranks0_, ranks0_); - copy_device_array_to_aow(&aow_selects0_, selects0_); -} - -// Copies device array to window structure -template -__global__ void copy_to_window(WindowT* windows, cuco::detail::index_type n, T* values) -{ - cuco::detail::index_type const loop_stride = gridDim.x * blockDim.x; - cuco::detail::index_type idx = blockDim.x * blockIdx.x + threadIdx.x; - - while (idx < n) { - auto& window_slots = *(windows + idx); - window_slots[0] = values[idx]; - idx += loop_stride; - } -} - -template -template -void bit_vector::copy_device_array_to_aow( - std::unique_ptr* aow, thrust::device_vector& device_array) noexcept -{ - size_type num_elements = device_array.size(); - *aow = std::make_unique(extent{num_elements + 1}, allocator_); - - if (num_elements > 0) { - auto constexpr stride = 4; - auto grid_size 
= (num_elements + stride * detail::CUCO_DEFAULT_BLOCK_SIZE - 1) / - (stride * detail::CUCO_DEFAULT_BLOCK_SIZE); - - auto device_ptr = thrust::raw_pointer_cast(device_array.data()); - copy_to_window<<>>( - (*aow)->data(), num_elements, device_ptr); - } - device_array.clear(); } -template template -auto bit_vector::ref(Operators...) const noexcept +bit_vector::ref_type bit_vector::ref(Operators...) const noexcept { static_assert(sizeof...(Operators), "No operators specified"); - return ref_type{device_storage_ref{aow_words_->ref(), - aow_ranks_->ref(), - aow_selects_->ref(), - aow_ranks0_->ref(), - aow_selects0_->ref()}}; + return ref_type{device_storage_ref{thrust::raw_pointer_cast(d_words_.data()), + thrust::raw_pointer_cast(ranks_.data()), + thrust::raw_pointer_cast(selects_.data()), + thrust::raw_pointer_cast(ranks0_.data()), + thrust::raw_pointer_cast(selects0_.data())}}; } } // namespace experimental diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl index c308a1917..51042ba58 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl @@ -14,9 +14,9 @@ namespace detail { template class operator_impl> { - using ref_type = bit_vector_ref; ///< Bitvector ref type - using size_type = typename StorageRef::size_type; ///< Size type - using slot_type = typename StorageRef::slot_type; ///< Slot type + using ref_type = bit_vector_ref; ///< Bitvector ref type + using size_type = typename StorageRef::bit_vector_type::size_type; ///< Size type + using slot_type = typename StorageRef::bit_vector_type::slot_type; ///< Slot type static constexpr size_type bits_per_word = sizeof(slot_type) * 8; static constexpr size_type words_per_block = StorageRef::bit_vector_type::words_per_block; @@ -32,7 +32,7 @@ class operator_impl> { [[nodiscard]] __device__ bool get(size_type key) const noexcept { auto const& ref_ = static_cast(*this); - return 
(ref_.storage_.words_ref_[key / bits_per_word][0] >> (key % bits_per_word)) & 1UL; + return (ref_.storage_.words_ref_[key / bits_per_word] >> (key % bits_per_word)) & 1UL; } /** @@ -45,7 +45,7 @@ class operator_impl> { [[nodiscard]] __device__ slot_type get_word(size_type word_id) const noexcept { auto const& ref_ = static_cast(*this); - return ref_.storage_.words_ref_[word_id][0]; + return ref_.storage_.words_ref_[word_id]; } /** @@ -60,10 +60,10 @@ class operator_impl> { auto const& ref_ = static_cast(*this); size_type word_id = key / bits_per_word; size_type bit_id = key % bits_per_word; - slot_type word = ref_.storage_.words_ref_[word_id][0]; + slot_type word = ref_.storage_.words_ref_[word_id]; word &= ~(0lu) << bit_id; while (word == 0) { - word = ref_.storage_.words_ref_[++word_id][0]; + word = ref_.storage_.words_ref_[++word_id]; } return word_id * bits_per_word + __ffsll(word) - 1; // cuda intrinsic } @@ -84,12 +84,12 @@ class operator_impl> { size_type rank_id = word_id / words_per_block; size_type rel_id = word_id % words_per_block; - auto rank = ref_.storage_.ranks_ref_[rank_id][0]; + auto rank = ref_.storage_.ranks_ref_[rank_id]; size_type n = rank.abs(); if (rel_id != 0) { n += rank.rels_[rel_id - 1]; } - n += cuda::std::popcount(ref_.storage_.words_ref_[word_id][0] & ((1UL << bit_id) - 1)); + n += cuda::std::popcount(ref_.storage_.words_ref_[word_id] & ((1UL << bit_id) - 1)); return n; } @@ -106,12 +106,12 @@ class operator_impl> { auto const& storage_ = static_cast(*this).storage_; auto rank_id = get_initial_rank_estimate(count, storage_.selects_ref_, storage_.ranks_ref_); - auto rank = storage_.ranks_ref_[rank_id][0]; + auto rank = storage_.ranks_ref_[rank_id]; size_type word_id = rank_id * words_per_block; word_id += subtract_rank_from_count(count, rank); - return word_id * bits_per_word + select_bit_in_word(count, storage_.words_ref_[word_id][0]); + return word_id * bits_per_word + select_bit_in_word(count, storage_.words_ref_[word_id]); } /** @@ 
-126,12 +126,12 @@ class operator_impl> { auto const& storage_ = static_cast(*this).storage_; auto rank_id = get_initial_rank_estimate(count, storage_.selects0_ref_, storage_.ranks0_ref_); - auto rank = storage_.ranks0_ref_[rank_id][0]; + auto rank = storage_.ranks0_ref_[rank_id]; size_type word_id = rank_id * words_per_block; word_id += subtract_rank_from_count(count, rank); - return word_id * bits_per_word + select_bit_in_word(count, ~(storage_.words_ref_[word_id][0])); + return word_id * bits_per_word + select_bit_in_word(count, ~(storage_.words_ref_[word_id])); } private: @@ -150,17 +150,17 @@ class operator_impl> { const RanksRef& ranks) const noexcept { size_type block_id = count / (bits_per_word * words_per_block); - size_type begin = selects[block_id][0]; - size_type end = selects[block_id + 1][0] + 1UL; + size_type begin = selects[block_id]; + size_type end = selects[block_id + 1] + 1UL; if (begin + 10 >= end) { // Linear search - while (count >= ranks[begin + 1][0].abs()) { + while (count >= ranks[begin + 1].abs()) { ++begin; } } else { // Binary search while (begin + 1 < end) { size_type middle = (begin + end) / 2; - if (count < ranks[middle][0].abs()) { + if (count < ranks[middle].abs()) { end = middle; } else { begin = middle; diff --git a/tests/bit_vector/find_next_set_test.cu b/tests/bit_vector/find_next_set_test.cu index 6f7bbebf3..7b2c838c7 100644 --- a/tests/bit_vector/find_next_set_test.cu +++ b/tests/bit_vector/find_next_set_test.cu @@ -41,7 +41,7 @@ TEST_CASE("Find next set test", "") { cuco::experimental::bit_vector bv; - using size_type = cuco::experimental::bit_vector<>::size_type; + using size_type = cuco::experimental::bit_vector::size_type; constexpr size_type num_elements{400}; for (size_type i = 0; i < num_elements; i++) { diff --git a/tests/bit_vector/get_test.cu b/tests/bit_vector/get_test.cu index e97640767..26c3b94cf 100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/bit_vector/get_test.cu @@ -38,7 +38,7 @@ TEST_CASE("Get test", 
"") { cuco::experimental::bit_vector bv; - using size_type = cuco::experimental::bit_vector<>::size_type; + using size_type = cuco::experimental::bit_vector::size_type; constexpr size_type num_elements{400}; size_type num_set_ref = 0; diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu index cdaadeb02..7a4eb8e30 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/bit_vector/rank_test.cu @@ -30,7 +30,7 @@ TEST_CASE("Rank test", "") { cuco::experimental::bit_vector bv; - using size_type = cuco::experimental::bit_vector<>::size_type; + using size_type = cuco::experimental::bit_vector::size_type; constexpr size_type num_elements{4000}; for (size_type i = 0; i < num_elements; i++) { diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index e4ea23bd9..8553b0d57 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -41,7 +41,7 @@ TEST_CASE("Select test", "") { cuco::experimental::bit_vector bv; - using size_type = cuco::experimental::bit_vector<>::size_type; + using size_type = cuco::experimental::bit_vector::size_type; constexpr size_type num_elements{4000}; size_type num_set = 0; diff --git a/tests/bit_vector/size_test.cu b/tests/bit_vector/size_test.cu index 5d128d418..59961ffba 100644 --- a/tests/bit_vector/size_test.cu +++ b/tests/bit_vector/size_test.cu @@ -21,7 +21,7 @@ TEST_CASE("Size computation", "") { cuco::experimental::bit_vector bv; - using size_type = cuco::experimental::bit_vector<>::size_type; + using size_type = cuco::experimental::bit_vector::size_type; constexpr size_type num_elements{400}; for (size_type i = 0; i < num_elements; i++) { From 017fd1c3722011becda63cbc78d8e01fd3ebfe58 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 29 Aug 2023 02:14:35 +0000 Subject: [PATCH 64/99] Grow bitvector on device --- .../detail/trie/bit_vector/bit_vector.cuh | 8 ++------ .../detail/trie/bit_vector/bit_vector.inl | 20 ++++++------------- 2 files changed, 8 insertions(+), 
20 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 4be681399..5bae39ace 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -197,17 +197,13 @@ class bit_vector { static constexpr size_type words_per_block = 4; ///< Tradeoff between space efficiency and perf. private: - size_type n_bits_; ///< Number of bits bit_vector currently holds - // These could be public if needed by other classes. Private for now static constexpr size_type bits_per_word = sizeof(slot_type) * 8; ///< Bits in a word static constexpr size_type bits_per_block = words_per_block * bits_per_word; ///< Trivial - // Host-side structures - std::vector words_; ///< Words vector that represents all bits + size_type n_bits_; ///< Number of bits bit_vector currently holds - // Device-side structures - thrust::device_vector d_words_; ///< Device words vector + thrust::device_vector words_; ///< Words vector that represents all bits thrust::device_vector ranks_; ///< Rank values for every 256-th bit (4-th word) thrust::device_vector ranks0_; ///< Same as ranks_ but for `0` bits thrust::device_vector selects_; ///< Block indices of (0, 256, 512...)th `1` bit diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 95ae931c1..72d16dc69 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -21,18 +21,16 @@ namespace cuco { namespace experimental { -bit_vector::bit_vector() : words_{}, ranks_{}, ranks0_{}, selects_{}, selects0_{}, n_bits_{0} {} +bit_vector::bit_vector() : n_bits_{0}, words_{}, ranks_{}, ranks0_{}, selects_{}, selects0_{} {} bit_vector::~bit_vector() {} void bit_vector::append(bool bit) noexcept { if (n_bits_ % bits_per_block == 0) { - size_type new_n_bits = n_bits_ + bits_per_block; // Extend storage by 
one block - size_type new_n_words = new_n_bits / bits_per_word; - - words_.resize(new_n_words); + words_.resize(words_.size() + words_per_block); // Extend storage by one block } + set(n_bits_, bit); ++n_bits_; } @@ -41,7 +39,6 @@ void bit_vector::set(size_type index, bool bit) noexcept { size_type word_id = index / bits_per_word; size_type bit_id = index % bits_per_word; - if (bit) { words_[word_id] |= 1UL << bit_id; } else { @@ -313,18 +310,15 @@ void bit_vector::build_ranks_and_selects(thrust::device_vector& ranks, bool flip_bits) { if (n_bits_ == 0) { return; } - size_type num_words = (n_bits_ - 1) / bits_per_word + 1; - // Round up num_words to a block - if (num_words % words_per_block) { num_words += words_per_block - (num_words % words_per_block); } // Step 1. Compute prefix sum of per-word bit counts - // Population counts for each word // Sized to have one extra entry for subsequent prefix sum + size_type num_words = words_.size(); thrust::device_vector bit_counts(num_words + 1); auto grid_size = default_grid_size(num_words); bit_counts_kernel<<>>( - thrust::raw_pointer_cast(d_words_.data()), + thrust::raw_pointer_cast(words_.data()), thrust::raw_pointer_cast(bit_counts.data()), num_words, flip_bits); @@ -367,8 +361,6 @@ void bit_vector::build_ranks_and_selects(thrust::device_vector& ranks, void bit_vector::build() noexcept { - d_words_ = words_; - words_.clear(); build_ranks_and_selects(ranks_, selects_, false); // 1-bits build_ranks_and_selects(ranks0_, selects0_, true); // 0-bits } @@ -377,7 +369,7 @@ template bit_vector::ref_type bit_vector::ref(Operators...) 
const noexcept { static_assert(sizeof...(Operators), "No operators specified"); - return ref_type{device_storage_ref{thrust::raw_pointer_cast(d_words_.data()), + return ref_type{device_storage_ref{thrust::raw_pointer_cast(words_.data()), thrust::raw_pointer_cast(ranks_.data()), thrust::raw_pointer_cast(selects_.data()), thrust::raw_pointer_cast(ranks0_.data()), From 74366843095b6d3e836a97d4454379d67b09a95e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 29 Aug 2023 09:18:00 -0700 Subject: [PATCH 65/99] Move bit_vector to detail namespace --- include/cuco/detail/trie/bit_vector/bit_vector.cuh | 4 +++- include/cuco/detail/trie/bit_vector/bit_vector.inl | 2 ++ tests/bit_vector/find_next_set_test.cu | 4 ++-- tests/bit_vector/get_test.cu | 4 ++-- tests/bit_vector/rank_test.cu | 4 ++-- tests/bit_vector/select_test.cu | 4 ++-- tests/bit_vector/size_test.cu | 4 ++-- 7 files changed, 15 insertions(+), 11 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 5bae39ace..33a8bac08 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -24,6 +24,7 @@ namespace cuco { namespace experimental { +namespace detail { /** * @brief Struct to store ranks of bits at 256-bit intervals @@ -70,7 +71,7 @@ class bit_vector { * @brief Constructs an empty bitvector */ inline bit_vector(); - bit_vector(cuco::experimental::bit_vector&&) = default; ///< Move constructor + bit_vector(bit_vector&&) = default; ///< Move constructor inline ~bit_vector(); /** @@ -233,6 +234,7 @@ class bit_vector { } }; +} // namespace detail } // namespace experimental } // namespace cuco diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 72d16dc69..c087f0f9c 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -20,6 +20,7 
@@ namespace cuco { namespace experimental { +namespace detail { bit_vector::bit_vector() : n_bits_{0}, words_{}, ranks_{}, ranks0_{}, selects_{}, selects0_{} {} @@ -376,5 +377,6 @@ bit_vector::ref_type bit_vector::ref(Operators...) const noexcept thrust::raw_pointer_cast(selects0_.data())}}; } +} // namespace detail } // namespace experimental } // namespace cuco diff --git a/tests/bit_vector/find_next_set_test.cu b/tests/bit_vector/find_next_set_test.cu index 7b2c838c7..5fdd05f62 100644 --- a/tests/bit_vector/find_next_set_test.cu +++ b/tests/bit_vector/find_next_set_test.cu @@ -39,9 +39,9 @@ extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu TEST_CASE("Find next set test", "") { - cuco::experimental::bit_vector bv; + cuco::experimental::detail::bit_vector bv; - using size_type = cuco::experimental::bit_vector::size_type; + using size_type = cuco::experimental::detail::bit_vector::size_type; constexpr size_type num_elements{400}; for (size_type i = 0; i < num_elements; i++) { diff --git a/tests/bit_vector/get_test.cu b/tests/bit_vector/get_test.cu index 26c3b94cf..bd12143fa 100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/bit_vector/get_test.cu @@ -36,9 +36,9 @@ bool modulo_bitgen(uint64_t i) { return i % 7 == 0; } TEST_CASE("Get test", "") { - cuco::experimental::bit_vector bv; + cuco::experimental::detail::bit_vector bv; - using size_type = cuco::experimental::bit_vector::size_type; + using size_type = cuco::experimental::detail::bit_vector::size_type; constexpr size_type num_elements{400}; size_type num_set_ref = 0; diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu index 7a4eb8e30..d852da536 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/bit_vector/rank_test.cu @@ -28,9 +28,9 @@ extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu TEST_CASE("Rank test", "") { - cuco::experimental::bit_vector bv; + cuco::experimental::detail::bit_vector bv; - using size_type = 
cuco::experimental::bit_vector::size_type; + using size_type = cuco::experimental::detail::bit_vector::size_type; constexpr size_type num_elements{4000}; for (size_type i = 0; i < num_elements; i++) { diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index 8553b0d57..d3e303fb7 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -39,9 +39,9 @@ extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu TEST_CASE("Select test", "") { - cuco::experimental::bit_vector bv; + cuco::experimental::detail::bit_vector bv; - using size_type = cuco::experimental::bit_vector::size_type; + using size_type = cuco::experimental::detail::bit_vector::size_type; constexpr size_type num_elements{4000}; size_type num_set = 0; diff --git a/tests/bit_vector/size_test.cu b/tests/bit_vector/size_test.cu index 59961ffba..e86bade44 100644 --- a/tests/bit_vector/size_test.cu +++ b/tests/bit_vector/size_test.cu @@ -20,8 +20,8 @@ TEST_CASE("Size computation", "") { - cuco::experimental::bit_vector bv; - using size_type = cuco::experimental::bit_vector::size_type; + cuco::experimental::detail::bit_vector bv; + using size_type = cuco::experimental::detail::bit_vector::size_type; constexpr size_type num_elements{400}; for (size_type i = 0; i < num_elements; i++) { From 30209fb89203136941f2e181c203251bd1ce6f56 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 29 Aug 2023 09:34:16 -0700 Subject: [PATCH 66/99] Add missing headers --- include/cuco/detail/open_addressing_ref_impl.cuh | 1 + include/cuco/static_map.cuh | 1 + 2 files changed, 2 insertions(+) diff --git a/include/cuco/detail/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing_ref_impl.cuh index 99187cc51..23f118d6b 100644 --- a/include/cuco/detail/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing_ref_impl.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include diff --git a/include/cuco/static_map.cuh 
b/include/cuco/static_map.cuh index 2df5b2a10..65644ccff 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -16,6 +16,7 @@ #pragma once +#include #include #include #include From 15033a37661f7d0b5651d29265688cb90bee7f56 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 29 Aug 2023 15:07:33 -0700 Subject: [PATCH 67/99] Clean up type aliases and static constexpr --- .../cuco/detail/trie/bit_vector/bit_vector.cuh | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 33a8bac08..22751874d 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -22,6 +22,8 @@ #include +#include + namespace cuco { namespace experimental { namespace detail { @@ -67,6 +69,13 @@ struct rank { */ class bit_vector { public: + using size_type = std::size_t; ///< size type to specify bit index + using slot_type = uint64_t; ///< Slot type + + static constexpr size_type words_per_block = 4; ///< Tradeoff between space efficiency and perf. 
+ static constexpr size_type bits_per_word = sizeof(slot_type) * CHAR_BIT; ///< Bits in a word + static constexpr size_type bits_per_block = words_per_block * bits_per_word; ///< Trivial + /** * @brief Constructs an empty bitvector */ @@ -83,7 +92,6 @@ class bit_vector { */ inline void append(bool bit) noexcept; - using size_type = std::size_t; ///< size type to specify bit index /** * @brief Modifies a single bit * @@ -155,8 +163,6 @@ class bit_vector { OutputIt outputs_begin, cuda_stream_ref stream = {}) const noexcept; - using slot_type = uint64_t; ///< Slot type - /** *@brief Struct to hold all storage refs needed by bitvector_ref */ @@ -195,13 +201,7 @@ class bit_vector { */ size_type constexpr size() const noexcept { return n_bits_; } - static constexpr size_type words_per_block = 4; ///< Tradeoff between space efficiency and perf. - private: - // These could be public if needed by other classes. Private for now - static constexpr size_type bits_per_word = sizeof(slot_type) * 8; ///< Bits in a word - static constexpr size_type bits_per_block = words_per_block * bits_per_word; ///< Trivial - size_type n_bits_; ///< Number of bits bit_vector currently holds thrust::device_vector words_; ///< Words vector that represents all bits From d9914bf4de7277dcd0da9410f01921855aa26728 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 29 Aug 2023 15:08:29 -0700 Subject: [PATCH 68/99] Add missing headers --- include/cuco/detail/trie/bit_vector/bit_vector.cuh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 22751874d..431b1c709 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -22,7 +22,10 @@ #include +#include + #include +#include namespace cuco { namespace experimental { From 7290190b147236ca94c8f79969495dc0c3c9285c Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 29 Aug 2023 
15:10:49 -0700 Subject: [PATCH 69/99] Add missing headers --- include/cuco/detail/trie/bit_vector/bit_vector.cuh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 431b1c709..c2c79f8c5 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -17,8 +17,10 @@ #pragma once +#include #include -#include +#include +#include #include From 9fdbc1769b041597643a754a862745366fa69db4 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 29 Aug 2023 15:28:13 -0700 Subject: [PATCH 70/99] Add allocator template parameter --- .../detail/trie/bit_vector/bit_vector.cuh | 42 ++++++++--- .../detail/trie/bit_vector/bit_vector.inl | 71 +++++++++++++------ tests/bit_vector/find_next_set_test.cu | 2 +- tests/bit_vector/get_test.cu | 2 +- tests/bit_vector/rank_test.cu | 2 +- tests/bit_vector/select_test.cu | 2 +- tests/bit_vector/size_test.cu | 2 +- 7 files changed, 85 insertions(+), 38 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index c2c79f8c5..a0389b0eb 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -71,11 +72,17 @@ struct rank { * new operations close to constant time. * Bitvector construction happens on host, after which the structures are moved to device. * All subsequent read-only operations access device structures only. 
+ * + * @tparam Allocator Type of allocator used for device storage */ +template > class bit_vector { public: using size_type = std::size_t; ///< size type to specify bit index using slot_type = uint64_t; ///< Slot type + using allocator_type = + typename std::allocator_traits::rebind_alloc; ///< Type of the allocator + ///< to (de)allocate words static constexpr size_type words_per_block = 4; ///< Tradeoff between space efficiency and perf. static constexpr size_type bits_per_word = sizeof(slot_type) * CHAR_BIT; ///< Bits in a word @@ -83,8 +90,10 @@ class bit_vector { /** * @brief Constructs an empty bitvector + * + * @param allocator Allocator used for allocating device storage */ - inline bit_vector(); + inline bit_vector(Allocator const& allocator = Allocator{}); bit_vector(bit_vector&&) = default; ///< Move constructor inline ~bit_vector(); @@ -207,13 +216,24 @@ class bit_vector { size_type constexpr size() const noexcept { return n_bits_; } private: - size_type n_bits_; ///< Number of bits bit_vector currently holds - - thrust::device_vector words_; ///< Words vector that represents all bits - thrust::device_vector ranks_; ///< Rank values for every 256-th bit (4-th word) - thrust::device_vector ranks0_; ///< Same as ranks_ but for `0` bits - thrust::device_vector selects_; ///< Block indices of (0, 256, 512...)th `1` bit - thrust::device_vector selects0_; ///< Same as selects_, but for `0` bits + using rank_allocator_type = + typename std::allocator_traits::rebind_alloc; ///< Type of the allocator to + ///< (de)allocate ranks + using size_allocator_type = typename std::allocator_traits::rebind_alloc< + size_type>; ///< Type of the allocator to (de)allocate indices + + allocator_type allocator_; ///< Words allocator + size_type n_bits_; ///< Number of bits bit_vector currently holds + + thrust::device_vector + words_; ///< Words vector that represents all bits + thrust::device_vector + ranks_; ///< Rank values for every 256-th bit (4-th word) + 
thrust::device_vector ranks0_; ///< Same as ranks_ but for `0` bits + thrust::device_vector + selects_; ///< Block indices of (0, 256, 512...)th `1` bit + thrust::device_vector + selects0_; ///< Same as selects_, but for `0` bits /** * @brief Populates rank and select indexes on device @@ -222,9 +242,9 @@ class bit_vector { * @param selects Output array of selects * @param flip_bits If true, negate bits to construct indexes for `0` bits */ - inline void build_ranks_and_selects(thrust::device_vector& ranks, - thrust::device_vector& selects, - bool flip_bits); + void build_ranks_and_selects(thrust::device_vector& ranks, + thrust::device_vector& selects, + bool flip_bits) noexcept; /** * @brief Helper function to calculate grid size for simple kernels diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index c087f0f9c..2d2ddbc53 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -22,11 +22,25 @@ namespace cuco { namespace experimental { namespace detail { -bit_vector::bit_vector() : n_bits_{0}, words_{}, ranks_{}, ranks0_{}, selects_{}, selects0_{} {} +template +bit_vector::bit_vector(Allocator const& allocator) + : allocator_{allocator}, + n_bits_{0}, + words_{allocator}, + ranks_{allocator}, + ranks0_{allocator}, + selects_{allocator}, + selects0_{allocator} +{ +} -bit_vector::~bit_vector() {} +template +bit_vector::~bit_vector() +{ +} -void bit_vector::append(bool bit) noexcept +template +void bit_vector::append(bool bit) noexcept { if (n_bits_ % bits_per_block == 0) { words_.resize(words_.size() + words_per_block); // Extend storage by one block @@ -36,7 +50,8 @@ void bit_vector::append(bool bit) noexcept ++n_bits_; } -void bit_vector::set(size_type index, bool bit) noexcept +template +void bit_vector::set(size_type index, bool bit) noexcept { size_type word_id = index / bits_per_word; size_type bit_id = index % bits_per_word; 
@@ -47,13 +62,18 @@ void bit_vector::set(size_type index, bool bit) noexcept } } -void bit_vector::set_last(bool bit) noexcept { set(n_bits_ - 1, bit); } +template +void bit_vector::set_last(bool bit) noexcept +{ + set(n_bits_ - 1, bit); +} +template template -void bit_vector::get(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +void bit_vector::get(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -66,11 +86,12 @@ void bit_vector::get(KeyIt keys_begin, ref_, keys_begin, outputs_begin, num_keys); } +template template -void bit_vector::ranks(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +void bit_vector::ranks(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -83,11 +104,12 @@ void bit_vector::ranks(KeyIt keys_begin, ref_, keys_begin, outputs_begin, num_keys); } +template template -void bit_vector::selects(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +void bit_vector::selects(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -306,9 +328,11 @@ __global__ void mark_blocks_with_select_entries(const size_type* prefix_bit_coun } } -void bit_vector::build_ranks_and_selects(thrust::device_vector& ranks, - thrust::device_vector& selects, - bool flip_bits) +template +void bit_vector::build_ranks_and_selects( + thrust::device_vector& ranks, + thrust::device_vector& selects, + bool flip_bits) noexcept { if (n_bits_ == 0) { return; } @@ -360,14 +384,17 @@ void bit_vector::build_ranks_and_selects(thrust::device_vector& ranks, 
thrust::identity()); } -void bit_vector::build() noexcept +template +void bit_vector::build() noexcept { build_ranks_and_selects(ranks_, selects_, false); // 1-bits build_ranks_and_selects(ranks0_, selects0_, true); // 0-bits } +template template -bit_vector::ref_type bit_vector::ref(Operators...) const noexcept +bit_vector::ref_type bit_vector::ref( + Operators...) const noexcept { static_assert(sizeof...(Operators), "No operators specified"); return ref_type{device_storage_ref{thrust::raw_pointer_cast(words_.data()), diff --git a/tests/bit_vector/find_next_set_test.cu b/tests/bit_vector/find_next_set_test.cu index 5fdd05f62..852827319 100644 --- a/tests/bit_vector/find_next_set_test.cu +++ b/tests/bit_vector/find_next_set_test.cu @@ -41,7 +41,7 @@ TEST_CASE("Find next set test", "") { cuco::experimental::detail::bit_vector bv; - using size_type = cuco::experimental::detail::bit_vector::size_type; + using size_type = std::size_t; constexpr size_type num_elements{400}; for (size_type i = 0; i < num_elements; i++) { diff --git a/tests/bit_vector/get_test.cu b/tests/bit_vector/get_test.cu index bd12143fa..576c90e53 100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/bit_vector/get_test.cu @@ -38,7 +38,7 @@ TEST_CASE("Get test", "") { cuco::experimental::detail::bit_vector bv; - using size_type = cuco::experimental::detail::bit_vector::size_type; + using size_type = std::size_t; constexpr size_type num_elements{400}; size_type num_set_ref = 0; diff --git a/tests/bit_vector/rank_test.cu b/tests/bit_vector/rank_test.cu index d852da536..e33e5d04c 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/bit_vector/rank_test.cu @@ -30,7 +30,7 @@ TEST_CASE("Rank test", "") { cuco::experimental::detail::bit_vector bv; - using size_type = cuco::experimental::detail::bit_vector::size_type; + using size_type = std::size_t; constexpr size_type num_elements{4000}; for (size_type i = 0; i < num_elements; i++) { diff --git a/tests/bit_vector/select_test.cu 
b/tests/bit_vector/select_test.cu index d3e303fb7..93867a1b6 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -41,7 +41,7 @@ TEST_CASE("Select test", "") { cuco::experimental::detail::bit_vector bv; - using size_type = cuco::experimental::detail::bit_vector::size_type; + using size_type = std::size_t; constexpr size_type num_elements{4000}; size_type num_set = 0; diff --git a/tests/bit_vector/size_test.cu b/tests/bit_vector/size_test.cu index e86bade44..2b4204635 100644 --- a/tests/bit_vector/size_test.cu +++ b/tests/bit_vector/size_test.cu @@ -21,7 +21,7 @@ TEST_CASE("Size computation", "") { cuco::experimental::detail::bit_vector bv; - using size_type = cuco::experimental::detail::bit_vector::size_type; + using size_type = std::size_t; constexpr size_type num_elements{400}; for (size_type i = 0; i < num_elements; i++) { From 82d1e2640872ae8b9a3480daeeedd24ee85cefc7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 29 Aug 2023 15:30:34 -0700 Subject: [PATCH 71/99] Clean up docs --- .../detail/trie/bit_vector/bit_vector.cuh | 33 +++++++++---------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index a0389b0eb..44c72bb2e 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -80,9 +80,8 @@ class bit_vector { public: using size_type = std::size_t; ///< size type to specify bit index using slot_type = uint64_t; ///< Slot type - using allocator_type = - typename std::allocator_traits::rebind_alloc; ///< Type of the allocator - ///< to (de)allocate words + /// Type of the allocator to (de)allocate words + using allocator_type = typename std::allocator_traits::rebind_alloc; static constexpr size_type words_per_block = 4; ///< Tradeoff between space efficiency and perf. 
static constexpr size_type bits_per_word = sizeof(slot_type) * CHAR_BIT; ///< Bits in a word @@ -216,24 +215,24 @@ class bit_vector { size_type constexpr size() const noexcept { return n_bits_; } private: - using rank_allocator_type = - typename std::allocator_traits::rebind_alloc; ///< Type of the allocator to - ///< (de)allocate ranks - using size_allocator_type = typename std::allocator_traits::rebind_alloc< - size_type>; ///< Type of the allocator to (de)allocate indices + /// Type of the allocator to (de)allocate ranks + using rank_allocator_type = typename std::allocator_traits::rebind_alloc; + /// Type of the allocator to (de)allocate indices + using size_allocator_type = typename std::allocator_traits::rebind_alloc; allocator_type allocator_; ///< Words allocator size_type n_bits_; ///< Number of bits bit_vector currently holds - thrust::device_vector - words_; ///< Words vector that represents all bits - thrust::device_vector - ranks_; ///< Rank values for every 256-th bit (4-th word) - thrust::device_vector ranks0_; ///< Same as ranks_ but for `0` bits - thrust::device_vector - selects_; ///< Block indices of (0, 256, 512...)th `1` bit - thrust::device_vector - selects0_; ///< Same as selects_, but for `0` bits + /// Words vector that represents all bits + thrust::device_vector words_; + /// Rank values for every 256-th bit (4-th word) + thrust::device_vector ranks_; + /// Same as ranks_ but for `0` bits + thrust::device_vector ranks0_; + /// Block indices of (0, 256, 512...)th `1` bit + thrust::device_vector selects_; + /// Same as selects_, but for `0` bits + thrust::device_vector selects0_; /** * @brief Populates rank and select indexes on device From 793cf28acb9623e2e9b6b5ec6b9532dc23891a18 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 29 Aug 2023 15:40:40 -0700 Subject: [PATCH 72/99] Move kernels to a separate file --- .../detail/trie/bit_vector/bit_vector.inl | 213 +--------------- .../cuco/detail/trie/bit_vector/kernels.cuh | 232 
++++++++++++++++++ 2 files changed, 236 insertions(+), 209 deletions(-) create mode 100644 include/cuco/detail/trie/bit_vector/kernels.cuh diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 2d2ddbc53..2253abe7e 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -15,8 +15,10 @@ * limitations under the License. */ +#include + +#include #include -#include namespace cuco { namespace experimental { @@ -46,8 +48,7 @@ void bit_vector::append(bool bit) noexcept words_.resize(words_.size() + words_per_block); // Extend storage by one block } - set(n_bits_, bit); - ++n_bits_; + set(n_bits_++, bit); } template @@ -122,212 +123,6 @@ void bit_vector::selects(KeyIt keys_begin, ref_, keys_begin, outputs_begin, num_keys); } -/* - * @brief Gather bits of a range of keys - * - * @tparam BitvectorRef Bitvector reference type - * @tparam KeyIt Device-accessible iterator to input keys - * @tparam ValueIt Device-accessible iterator to values - * @tparam size_type Size type - * - * @param ref Bitvector ref - * @param keys Begin iterator to keys - * @param outputs Begin iterator to outputs - * @param num_keys Number of input keys - */ -template -__global__ void bitvector_get_kernel(BitvectorRef ref, - KeyIt keys, - ValueIt outputs, - size_type num_keys) -{ - uint32_t const loop_stride = gridDim.x * blockDim.x; - uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; - - while (key_id < num_keys) { - outputs[key_id] = ref.get(keys[key_id]); - key_id += loop_stride; - } -} - -/* - * @brief Gather rank values for a range of keys - * - * @tparam BitvectorRef Bitvector reference type - * @tparam KeyIt Device-accessible iterator to input keys - * @tparam ValueIt Device-accessible iterator to values - * @tparam size_type Size type - * - * @param ref Bitvector ref - * @param keys Begin iterator to keys - * @param outputs Begin iterator to outputs - * 
@param num_keys Number of input keys - */ -template -__global__ void bitvector_rank_kernel(BitvectorRef ref, - KeyIt keys, - ValueIt outputs, - size_type num_keys) -{ - uint32_t const loop_stride = gridDim.x * blockDim.x; - uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; - - while (key_id < num_keys) { - outputs[key_id] = ref.rank(keys[key_id]); - key_id += loop_stride; - } -} - -/* - * @brief Gather select values for a range of keys - * - * @tparam BitvectorRef Bitvector reference type - * @tparam KeyIt Device-accessible iterator to input keys - * @tparam ValueIt Device-accessible iterator to values - * @tparam size_type Size type - * - * @param ref Bitvector ref - * @param keys Begin iterator to keys - * @param outputs Begin iterator to outputs - * @param num_keys Number of input keys - */ -template -__global__ void bitvector_select_kernel(BitvectorRef ref, - KeyIt keys, - ValueIt outputs, - size_type num_keys) -{ - uint32_t const loop_stride = gridDim.x * blockDim.x; - uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; - - while (key_id < num_keys) { - outputs[key_id] = ref.select(keys[key_id]); - key_id += loop_stride; - } -} - -/* - * @brief Computes number of set or not-set bits in each word - * - * @tparam slot_type Word type - * @tparam size_type Size type - * - * @param words Input array of words - * @param bit_counts Output array of per-word bit counts - * @param num_words Number of words - * @param flip_bits Boolean to request negation of words before counting bits - */ -template -__global__ void bit_counts_kernel(const slot_type* words, - size_type* bit_counts, - size_type num_words, - bool flip_bits) -{ - size_type word_id = blockDim.x * blockIdx.x + threadIdx.x; - size_type stride = gridDim.x * blockDim.x; - - while (word_id < num_words) { - auto word = words[word_id]; - bit_counts[word_id] = cuda::std::popcount(flip_bits ? ~word : word); - word_id += stride; - } -} - -/* - * @brief Compute rank values at block size intervals. 
- * - * ranks[i] = Number of set bits in [0, i) range - * This kernel transforms prefix sum array of per-word bit counts - * into base-delta encoding style of `rank` struct. - * Since prefix sum is available, there are no dependencies across blocks. - - * @tparam size_type Size type - * - * @param prefix_bit_counts Prefix sum array of per-word bit counts - * @param ranks Output array of ranks - * @param num_words Length of input array - * @param num_blocks Length of ouput array - * @param words_per_block Number of words in each block - */ -template -__global__ void encode_ranks_from_prefix_bit_counts(const size_type* prefix_bit_counts, - rank* ranks, - size_type num_words, - size_type num_blocks, - size_type words_per_block) -{ - size_type rank_id = blockDim.x * blockIdx.x + threadIdx.x; - size_type stride = gridDim.x * blockDim.x; - - while (rank_id < num_blocks) { - size_type word_id = rank_id * words_per_block; - - // Set base value of rank - auto& rank = ranks[rank_id]; - rank.set_abs(prefix_bit_counts[word_id]); - - if (rank_id < num_blocks - 1) { - // For each subsequent word in this block, compute deltas from base - for (size_type block_offset = 0; block_offset < words_per_block - 1; block_offset++) { - auto delta = prefix_bit_counts[word_id + block_offset + 1] - prefix_bit_counts[word_id]; - rank.rels_[block_offset] = delta; - } - } - rank_id += stride; - } -} - -/* - * @brief Compute select values at block size intervals. - * - * selects[i] = Position of (i+ 1)th set bit - * This kernel check for blocks where prefix sum crosses a multiple of `bits_per_block`. 
- * Such blocks are marked in the output boolean array - * - * @tparam size_type Size type - * - * @param prefix_bit_counts Prefix sum array of per-word bit counts - * @param selects_markers Ouput array indicating whether a block has selects entry or not - * @param num_blocks Length of ouput array - * @param words_per_block Number of words in each block - * @param bits_per_block Number of bits in each block - */ -template -__global__ void mark_blocks_with_select_entries(const size_type* prefix_bit_counts, - size_type* select_markers, - size_type num_blocks, - size_type words_per_block, - size_type bits_per_block) -{ - size_type block_id = blockDim.x * blockIdx.x + threadIdx.x; - size_type stride = gridDim.x * blockDim.x; - - while (block_id < num_blocks) { - if (block_id == 0) { // Block 0 always has a selects entry - select_markers[block_id] = 1; - block_id += stride; - continue; - } - - select_markers[block_id] = 0; // Always clear marker first - size_type word_id = block_id * words_per_block; - size_type prev_count = prefix_bit_counts[word_id]; - - for (size_t block_offset = 1; block_offset <= words_per_block; block_offset++) { - size_type count = prefix_bit_counts[word_id + block_offset]; - - // Selects entry is added when cumulative bitcount crosses a multiple of bits_per_block - if ((prev_count - 1) / bits_per_block != (count - 1) / bits_per_block) { - select_markers[block_id] = 1; - break; - } - prev_count = count; - } - - block_id += stride; - } -} - template void bit_vector::build_ranks_and_selects( thrust::device_vector& ranks, diff --git a/include/cuco/detail/trie/bit_vector/kernels.cuh b/include/cuco/detail/trie/bit_vector/kernels.cuh new file mode 100644 index 000000000..ecb7994e4 --- /dev/null +++ b/include/cuco/detail/trie/bit_vector/kernels.cuh @@ -0,0 +1,232 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cuco { +namespace experimental { +namespace detail { + +/* + * @brief Gather bits of a range of keys + * + * @tparam BitvectorRef Bitvector reference type + * @tparam KeyIt Device-accessible iterator to input keys + * @tparam ValueIt Device-accessible iterator to values + * @tparam size_type Size type + * + * @param ref Bitvector ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitvector_get_kernel(BitvectorRef ref, + KeyIt keys, + ValueIt outputs, + size_type num_keys) +{ + uint32_t const loop_stride = gridDim.x * blockDim.x; + uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; + + while (key_id < num_keys) { + outputs[key_id] = ref.get(keys[key_id]); + key_id += loop_stride; + } +} + +/* + * @brief Gather rank values for a range of keys + * + * @tparam BitvectorRef Bitvector reference type + * @tparam KeyIt Device-accessible iterator to input keys + * @tparam ValueIt Device-accessible iterator to values + * @tparam size_type Size type + * + * @param ref Bitvector ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitvector_rank_kernel(BitvectorRef ref, + KeyIt keys, + ValueIt outputs, + size_type num_keys) +{ + 
uint32_t const loop_stride = gridDim.x * blockDim.x; + uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; + + while (key_id < num_keys) { + outputs[key_id] = ref.rank(keys[key_id]); + key_id += loop_stride; + } +} + +/* + * @brief Gather select values for a range of keys + * + * @tparam BitvectorRef Bitvector reference type + * @tparam KeyIt Device-accessible iterator to input keys + * @tparam ValueIt Device-accessible iterator to values + * @tparam size_type Size type + * + * @param ref Bitvector ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitvector_select_kernel(BitvectorRef ref, + KeyIt keys, + ValueIt outputs, + size_type num_keys) +{ + uint32_t const loop_stride = gridDim.x * blockDim.x; + uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; + + while (key_id < num_keys) { + outputs[key_id] = ref.select(keys[key_id]); + key_id += loop_stride; + } +} + +/* + * @brief Computes number of set or not-set bits in each word + * + * @tparam slot_type Word type + * @tparam size_type Size type + * + * @param words Input array of words + * @param bit_counts Output array of per-word bit counts + * @param num_words Number of words + * @param flip_bits Boolean to request negation of words before counting bits + */ +template +__global__ void bit_counts_kernel(const slot_type* words, + size_type* bit_counts, + size_type num_words, + bool flip_bits) +{ + size_type word_id = blockDim.x * blockIdx.x + threadIdx.x; + size_type stride = gridDim.x * blockDim.x; + + while (word_id < num_words) { + auto word = words[word_id]; + bit_counts[word_id] = cuda::std::popcount(flip_bits ? ~word : word); + word_id += stride; + } +} + +/* + * @brief Compute rank values at block size intervals. + * + * ranks[i] = Number of set bits in [0, i) range + * This kernel transforms prefix sum array of per-word bit counts + * into base-delta encoding style of `rank` struct. 
+ * Since prefix sum is available, there are no dependencies across blocks. + + * @tparam size_type Size type + * + * @param prefix_bit_counts Prefix sum array of per-word bit counts + * @param ranks Output array of ranks + * @param num_words Length of input array + * @param num_blocks Length of ouput array + * @param words_per_block Number of words in each block + */ +template +__global__ void encode_ranks_from_prefix_bit_counts(const size_type* prefix_bit_counts, + rank* ranks, + size_type num_words, + size_type num_blocks, + size_type words_per_block) +{ + size_type rank_id = blockDim.x * blockIdx.x + threadIdx.x; + size_type stride = gridDim.x * blockDim.x; + + while (rank_id < num_blocks) { + size_type word_id = rank_id * words_per_block; + + // Set base value of rank + auto& rank = ranks[rank_id]; + rank.set_abs(prefix_bit_counts[word_id]); + + if (rank_id < num_blocks - 1) { + // For each subsequent word in this block, compute deltas from base + for (size_type block_offset = 0; block_offset < words_per_block - 1; block_offset++) { + auto delta = prefix_bit_counts[word_id + block_offset + 1] - prefix_bit_counts[word_id]; + rank.rels_[block_offset] = delta; + } + } + rank_id += stride; + } +} + +/* + * @brief Compute select values at block size intervals. + * + * selects[i] = Position of (i+ 1)th set bit + * This kernel check for blocks where prefix sum crosses a multiple of `bits_per_block`. 
+ * Such blocks are marked in the output boolean array + * + * @tparam size_type Size type + * + * @param prefix_bit_counts Prefix sum array of per-word bit counts + * @param selects_markers Ouput array indicating whether a block has selects entry or not + * @param num_blocks Length of ouput array + * @param words_per_block Number of words in each block + * @param bits_per_block Number of bits in each block + */ +template +__global__ void mark_blocks_with_select_entries(const size_type* prefix_bit_counts, + size_type* select_markers, + size_type num_blocks, + size_type words_per_block, + size_type bits_per_block) +{ + size_type block_id = blockDim.x * blockIdx.x + threadIdx.x; + size_type stride = gridDim.x * blockDim.x; + + while (block_id < num_blocks) { + if (block_id == 0) { // Block 0 always has a selects entry + select_markers[block_id] = 1; + block_id += stride; + continue; + } + + select_markers[block_id] = 0; // Always clear marker first + size_type word_id = block_id * words_per_block; + size_type prev_count = prefix_bit_counts[word_id]; + + for (size_t block_offset = 1; block_offset <= words_per_block; block_offset++) { + size_type count = prefix_bit_counts[word_id + block_offset]; + + // Selects entry is added when cumulative bitcount crosses a multiple of bits_per_block + if ((prev_count - 1) / bits_per_block != (count - 1) / bits_per_block) { + select_markers[block_id] = 1; + break; + } + prev_count = count; + } + + block_id += stride; + } +} + +} // namespace detail +} // namespace experimental +} // namespace cuco From 0e29dbc6f20ebd05dbbaf90fa8e719a0339164bc Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 29 Aug 2023 16:16:47 -0700 Subject: [PATCH 73/99] Make bit_vector_ref a nested type --- .../detail/trie/bit_vector/bit_vector.cuh | 221 ++++++++++++++++-- .../detail/trie/bit_vector/bit_vector.inl | 24 +- .../detail/trie/bit_vector/bit_vector_ref.cuh | 39 ---- .../detail/trie/bit_vector/bit_vector_ref.inl | 217 ----------------- 
tests/bit_vector/find_next_set_test.cu | 2 +- tests/bit_vector/get_test.cu | 2 +- tests/bit_vector/select_test.cu | 2 +- 7 files changed, 217 insertions(+), 290 deletions(-) delete mode 100644 include/cuco/detail/trie/bit_vector/bit_vector_ref.cuh delete mode 100644 include/cuco/detail/trie/bit_vector/bit_vector_ref.inl diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 44c72bb2e..8ee2a6755 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -18,7 +18,6 @@ #pragma once #include -#include #include #include @@ -26,6 +25,7 @@ #include #include +#include #include #include @@ -179,10 +179,8 @@ class bit_vector { /** *@brief Struct to hold all storage refs needed by bitvector_ref */ - struct device_storage_ref { - using bit_vector_type = bit_vector; ///< bit_vector_ref needs this to access words_per_block - - const slot_type* words_ref_; ///< Words ref + struct storage_ref_type { + const slot_type* words_ref_; ///< Words refs const rank* ranks_ref_; ///< Ranks refs const size_type* selects_ref_; ///< Selects refs @@ -191,24 +189,215 @@ class bit_vector { const size_type* selects0_ref_; ///< Selects refs 0 bits }; - template - using ref_type = - bit_vector_ref; ///< Non-owning container ref type + /** + * @brief Device non-owning reference type of bit_vector + */ + class reference { + public: + /** + * @brief Constructs bit_vector_ref. 
+ * + * @param storage Struct with non-owning refs to bitvector slot storages + */ + __host__ __device__ explicit constexpr reference(storage_ref_type storage) noexcept + : storage_{storage} + { + } + + /** + * @brief Access value of a single bit + * + * @param key Position of bit + * + * @return Value of bit at position specified by key + */ + [[nodiscard]] __device__ bool get(size_type key) const noexcept + { + return (storage_.words_ref_[key / bits_per_word] >> (key % bits_per_word)) & 1UL; + } + + /** + * @brief Access a single word of internal storage + * + * @param word_id Index of word + * + * @return Word at position specified by index + */ + [[nodiscard]] __device__ slot_type get_word(size_type word_id) const noexcept + { + return storage_.words_ref_[word_id]; + } + + /** + * @brief Find position of first set bit starting from a given position (inclusive) + * + * @param key Position of starting bit + * + * @return Index of next set bit + */ + [[nodiscard]] __device__ size_type find_next_set(size_type key) const noexcept + { + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; + slot_type word = storage_.words_ref_[word_id]; + word &= ~(0lu) << bit_id; + while (word == 0) { + word = storage_.words_ref_[++word_id]; + } + return word_id * bits_per_word + __ffsll(word) - 1; // cuda intrinsic + } + + /** + * @brief Find number of set bits (rank) in all positions before the input position (exclusive) + * + * @param key Input bit position + * + * @return Rank of input position + */ + [[nodiscard]] __device__ size_type rank(size_type key) const noexcept + { + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; + size_type rank_id = word_id / words_per_block; + size_type rel_id = word_id % words_per_block; + + auto rank = storage_.ranks_ref_[rank_id]; + size_type n = rank.abs(); + + if (rel_id != 0) { n += rank.rels_[rel_id - 1]; } + + n += cuda::std::popcount(storage_.words_ref_[word_id] & ((1UL << 
bit_id) - 1)); + + return n; + } + + /** + * @brief Find position of Nth set (1) bit counting from start of bitvector + * + * @param count Input N + * + * @return Position of Nth set bit + */ + [[nodiscard]] __device__ size_type select(size_type count) const noexcept + { + auto rank_id = get_initial_rank_estimate(count, storage_.selects_ref_, storage_.ranks_ref_); + auto rank = storage_.ranks_ref_[rank_id]; + + size_type word_id = rank_id * words_per_block; + word_id += subtract_rank_from_count(count, rank); + + return word_id * bits_per_word + select_bit_in_word(count, storage_.words_ref_[word_id]); + } + + /** + * @brief Find position of Nth not-set (0) bit counting from start of bitvector + * + * @param count Input N + * + * @return Position of Nth not-set bit + */ + [[nodiscard]] __device__ size_type select0(size_type count) const noexcept + { + auto rank_id = get_initial_rank_estimate(count, storage_.selects0_ref_, storage_.ranks0_ref_); + auto rank = storage_.ranks0_ref_[rank_id]; + + size_type word_id = rank_id * words_per_block; + word_id += subtract_rank_from_count(count, rank); + + return word_id * bits_per_word + select_bit_in_word(count, ~(storage_.words_ref_[word_id])); + } + + private: + /** + * @brief Helper function for select operation that computes an initial rank estimate + * + * @param count Input count for which select operation is being performed + * @param selects Selects array + * @param ranks Ranks array + * + * @return index in ranks which corresponds to highest rank less than count (least upper bound) + */ + template + [[nodiscard]] __device__ size_type get_initial_rank_estimate( + size_type count, const SelectsRef& selects, const RanksRef& ranks) const noexcept + { + size_type block_id = count / (bits_per_word * words_per_block); + size_type begin = selects[block_id]; + size_type end = selects[block_id + 1] + 1UL; + + if (begin + 10 >= end) { // Linear search + while (count >= ranks[begin + 1].abs()) { + ++begin; + } + } else { // Binary 
search + while (begin + 1 < end) { + size_type middle = (begin + end) / 2; + if (count < ranks[middle].abs()) { + end = middle; + } else { + begin = middle; + } + } + } + return begin; + } + + /** + * @brief Subtract rank estimate from input count and return an increment to word_id + * + * @tparam Rank type + * + * @param count Input count that will be updated + * @param rank Initial rank estimate for count + * + * @return Increment to word_id based on rank values + */ + template + [[nodiscard]] __device__ size_type subtract_rank_from_count(size_type& count, + Rank rank) const noexcept + { + count -= rank.abs(); + + bool a0 = count >= rank.rels_[0]; + bool a1 = count >= rank.rels_[1]; + bool a2 = count >= rank.rels_[2]; + size_type inc = a0 + a1 + a2; + + count -= (inc > 0) * rank.rels_[inc - (inc > 0)]; + + return inc; + } + + /** + * @brief Find position of Nth set bit in a 64-bit word + * + * @param N Input count + * + * @return Position of Nth set bit + */ + [[nodiscard]] __device__ size_type select_bit_in_word(size_type N, + slot_type word) const noexcept + { + for (size_type pos = 0; pos < N; pos++) { + word &= word - 1; + } + return __ffsll(word & -word) - 1; // cuda intrinsic + } + + storage_ref_type storage_; ///< Non-owning storage + }; + + using ref_type = reference; ///< Non-owning container ref type /** - * @brief Get device ref with operators. - * - * @tparam Operators Set of `cuco::op` to be provided by the ref - * - * @param ops List of operators, e.g., `cuco::bv_read` + * @brief Gets non-owning device ref of the current object * * @return Device ref of the current `bit_vector` object */ - template - [[nodiscard]] ref_type ref(Operators... 
ops) const noexcept; + [[nodiscard]] ref_type ref() const noexcept; /** - * @brief Get the number of bits bit_vector holds + * @brief Gets the number of bits bit_vector holds * * @return Number of bits bit_vector holds */ diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 2253abe7e..6f5ef7906 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -81,10 +81,9 @@ void bit_vector::get(KeyIt keys_begin, if (num_keys == 0) { return; } auto grid_size = default_grid_size(num_keys); - auto ref_ = this->ref(cuco::experimental::bv_read); bitvector_get_kernel<<>>( - ref_, keys_begin, outputs_begin, num_keys); + ref(), keys_begin, outputs_begin, num_keys); } template @@ -99,10 +98,9 @@ void bit_vector::ranks(KeyIt keys_begin, if (num_keys == 0) { return; } auto grid_size = default_grid_size(num_keys); - auto ref_ = this->ref(cuco::experimental::bv_read); bitvector_rank_kernel<<>>( - ref_, keys_begin, outputs_begin, num_keys); + ref(), keys_begin, outputs_begin, num_keys); } template @@ -117,10 +115,9 @@ void bit_vector::selects(KeyIt keys_begin, if (num_keys == 0) { return; } auto grid_size = default_grid_size(num_keys); - auto ref_ = this->ref(cuco::experimental::bv_read); bitvector_select_kernel<<>>( - ref_, keys_begin, outputs_begin, num_keys); + ref(), keys_begin, outputs_begin, num_keys); } template @@ -187,16 +184,13 @@ void bit_vector::build() noexcept } template -template -bit_vector::ref_type bit_vector::ref( - Operators...) 
const noexcept +bit_vector::ref_type bit_vector::ref() const noexcept { - static_assert(sizeof...(Operators), "No operators specified"); - return ref_type{device_storage_ref{thrust::raw_pointer_cast(words_.data()), - thrust::raw_pointer_cast(ranks_.data()), - thrust::raw_pointer_cast(selects_.data()), - thrust::raw_pointer_cast(ranks0_.data()), - thrust::raw_pointer_cast(selects0_.data())}}; + return ref_type{storage_ref_type{thrust::raw_pointer_cast(words_.data()), + thrust::raw_pointer_cast(ranks_.data()), + thrust::raw_pointer_cast(selects_.data()), + thrust::raw_pointer_cast(ranks0_.data()), + thrust::raw_pointer_cast(selects0_.data())}}; } } // namespace detail diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.cuh b/include/cuco/detail/trie/bit_vector/bit_vector_ref.cuh deleted file mode 100644 index 5bd0e4499..000000000 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.cuh +++ /dev/null @@ -1,39 +0,0 @@ -#pragma once - -#include - -namespace cuco { -namespace experimental { - -/** - * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary - * operations defined in `include/cuco/operator.hpp` - * - * @tparam StorageRef Storage ref type - * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` - */ -template -class bit_vector_ref - : public detail::operator_impl>... { - public: - using storage_ref_type = StorageRef; ///< Type of storage ref - - /** - * @brief Constructs bit_vector_ref. 
- * - * @param storage Struct with non-owning refs to bitvector slot storages - */ - __host__ __device__ explicit constexpr bit_vector_ref(storage_ref_type storage) noexcept; - - private: - storage_ref_type storage_; - - // Mixins need to be friends with this class in order to access private members - template - friend class detail::operator_impl; -}; - -} // namespace experimental -} // namespace cuco - -#include diff --git a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl b/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl deleted file mode 100644 index 51042ba58..000000000 --- a/include/cuco/detail/trie/bit_vector/bit_vector_ref.inl +++ /dev/null @@ -1,217 +0,0 @@ -#include - -namespace cuco { -namespace experimental { - -template -__host__ __device__ constexpr bit_vector_ref::bit_vector_ref( - StorageRef storage) noexcept - : storage_{storage} -{ -} - -namespace detail { - -template -class operator_impl> { - using ref_type = bit_vector_ref; ///< Bitvector ref type - using size_type = typename StorageRef::bit_vector_type::size_type; ///< Size type - using slot_type = typename StorageRef::bit_vector_type::slot_type; ///< Slot type - - static constexpr size_type bits_per_word = sizeof(slot_type) * 8; - static constexpr size_type words_per_block = StorageRef::bit_vector_type::words_per_block; - - public: - /** - * @brief Access value of a single bit - * - * @param key Position of bit - * - * @return Value of bit at position specified by key - */ - [[nodiscard]] __device__ bool get(size_type key) const noexcept - { - auto const& ref_ = static_cast(*this); - return (ref_.storage_.words_ref_[key / bits_per_word] >> (key % bits_per_word)) & 1UL; - } - - /** - * @brief Access a single word of internal storage - * - * @param word_id Index of word - * - * @return Word at position specified by index - */ - [[nodiscard]] __device__ slot_type get_word(size_type word_id) const noexcept - { - auto const& ref_ = static_cast(*this); - return 
ref_.storage_.words_ref_[word_id]; - } - - /** - * @brief Find position of first set bit starting from a given position (inclusive) - * - * @param key Position of starting bit - * - * @return Index of next set bit - */ - [[nodiscard]] __device__ size_type find_next_set(size_type key) const noexcept - { - auto const& ref_ = static_cast(*this); - size_type word_id = key / bits_per_word; - size_type bit_id = key % bits_per_word; - slot_type word = ref_.storage_.words_ref_[word_id]; - word &= ~(0lu) << bit_id; - while (word == 0) { - word = ref_.storage_.words_ref_[++word_id]; - } - return word_id * bits_per_word + __ffsll(word) - 1; // cuda intrinsic - } - - /** - * @brief Find number of set bits (rank) in all positions before the input position (exclusive) - * - * @param key Input bit position - * - * @return Rank of input position - */ - [[nodiscard]] __device__ size_type rank(size_type key) const noexcept - { - auto const& ref_ = static_cast(*this); - - size_type word_id = key / bits_per_word; - size_type bit_id = key % bits_per_word; - size_type rank_id = word_id / words_per_block; - size_type rel_id = word_id % words_per_block; - - auto rank = ref_.storage_.ranks_ref_[rank_id]; - size_type n = rank.abs(); - - if (rel_id != 0) { n += rank.rels_[rel_id - 1]; } - - n += cuda::std::popcount(ref_.storage_.words_ref_[word_id] & ((1UL << bit_id) - 1)); - - return n; - } - - /** - * @brief Find position of Nth set (1) bit counting from start of bitvector - * - * @param count Input N - * - * @return Position of Nth set bit - */ - [[nodiscard]] __device__ size_type select(size_type count) const noexcept - { - auto const& storage_ = static_cast(*this).storage_; - - auto rank_id = get_initial_rank_estimate(count, storage_.selects_ref_, storage_.ranks_ref_); - auto rank = storage_.ranks_ref_[rank_id]; - - size_type word_id = rank_id * words_per_block; - word_id += subtract_rank_from_count(count, rank); - - return word_id * bits_per_word + select_bit_in_word(count, 
storage_.words_ref_[word_id]); - } - - /** - * @brief Find position of Nth not-set (0) bit counting from start of bitvector - * - * @param count Input N - * - * @return Position of Nth not-set bit - */ - [[nodiscard]] __device__ size_type select0(size_type count) const noexcept - { - auto const& storage_ = static_cast(*this).storage_; - - auto rank_id = get_initial_rank_estimate(count, storage_.selects0_ref_, storage_.ranks0_ref_); - auto rank = storage_.ranks0_ref_[rank_id]; - - size_type word_id = rank_id * words_per_block; - word_id += subtract_rank_from_count(count, rank); - - return word_id * bits_per_word + select_bit_in_word(count, ~(storage_.words_ref_[word_id])); - } - - private: - /** - * @brief Helper function for select operation that computes an initial rank estimate - * - * @param count Input count for which select operation is being performed - * @param selects Selects array - * @param ranks Ranks array - * - * @return index in ranks which corresponds to highest rank less than count (least upper bound) - */ - template - [[nodiscard]] __device__ size_type get_initial_rank_estimate(size_type count, - const SelectsRef& selects, - const RanksRef& ranks) const noexcept - { - size_type block_id = count / (bits_per_word * words_per_block); - size_type begin = selects[block_id]; - size_type end = selects[block_id + 1] + 1UL; - - if (begin + 10 >= end) { // Linear search - while (count >= ranks[begin + 1].abs()) { - ++begin; - } - } else { // Binary search - while (begin + 1 < end) { - size_type middle = (begin + end) / 2; - if (count < ranks[middle].abs()) { - end = middle; - } else { - begin = middle; - } - } - } - return begin; - } - - /** - * @brief Subtract rank estimate from input count and return an increment to word_id - * - * @tparam Rank type - * - * @param count Input count that will be updated - * @param rank Initial rank estimate for count - * - * @return Increment to word_id based on rank values - */ - template - [[nodiscard]] __device__ 
size_type subtract_rank_from_count(size_type& count, - Rank rank) const noexcept - { - count -= rank.abs(); - - bool a0 = count >= rank.rels_[0]; - bool a1 = count >= rank.rels_[1]; - bool a2 = count >= rank.rels_[2]; - size_type inc = a0 + a1 + a2; - - count -= (inc > 0) * rank.rels_[inc - (inc > 0)]; - - return inc; - } - - /** - * @brief Find position of Nth set bit in a 64-bit word - * - * @param N Input count - * - * @return Position of Nth set bit - */ - [[nodiscard]] __device__ size_type select_bit_in_word(size_type N, slot_type word) const noexcept - { - for (size_type pos = 0; pos < N; pos++) { - word &= word - 1; - } - return __ffsll(word & -word) - 1; // cuda intrinsic - } -}; - -} // namespace detail -} // namespace experimental -} // namespace cuco diff --git a/tests/bit_vector/find_next_set_test.cu b/tests/bit_vector/find_next_set_test.cu index 852827319..04672a2a1 100644 --- a/tests/bit_vector/find_next_set_test.cu +++ b/tests/bit_vector/find_next_set_test.cu @@ -50,7 +50,7 @@ TEST_CASE("Find next set test", "") bv.build(); thrust::device_vector device_result(num_elements); - auto ref = bv.ref(cuco::experimental::bv_read); + auto ref = bv.ref(); find_next_set_kernel<<<1, 1024>>>(ref, num_elements, device_result.data()); thrust::host_vector host_result = device_result; diff --git a/tests/bit_vector/get_test.cu b/tests/bit_vector/get_test.cu index 576c90e53..c9df39f83 100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/bit_vector/get_test.cu @@ -49,7 +49,7 @@ TEST_CASE("Get test", "") bv.build(); // Device-ref test - auto ref = bv.ref(cuco::experimental::bv_read); + auto ref = bv.ref(); thrust::device_vector get_result(num_elements); get_kernel<<<1, 1024>>>(ref, num_elements, get_result.data()); diff --git a/tests/bit_vector/select_test.cu b/tests/bit_vector/select_test.cu index 93867a1b6..f8009b853 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/bit_vector/select_test.cu @@ -50,7 +50,7 @@ TEST_CASE("Select test", "") num_set += 
modulo_bitgen(i); } bv.build(); - auto ref = bv.ref(cuco::experimental::bv_read); + auto ref = bv.ref(); // Check select { From 2314801f9aa527f374007fd8bb757bf65182567d Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 29 Aug 2023 16:17:29 -0700 Subject: [PATCH 74/99] Remove bv read operator tag --- include/cuco/operator.hpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp index f9165d3bf..b7629ae4c 100644 --- a/include/cuco/operator.hpp +++ b/include/cuco/operator.hpp @@ -45,12 +45,6 @@ struct contains_tag { struct find_tag { } inline constexpr find; -/** - * @brief `bv_read` operator tag - */ -struct bv_read_tag { -} inline constexpr bv_read; - } // namespace op } // namespace experimental } // namespace cuco From 0348dafbc76985ce88932fed481279540b14fd84 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 29 Aug 2023 16:49:17 -0700 Subject: [PATCH 75/99] Move implementation details to inl file --- .../detail/trie/bit_vector/bit_vector.cuh | 122 ++------------ .../detail/trie/bit_vector/bit_vector.inl | 152 ++++++++++++++++++ .../cuco/detail/trie/bit_vector/kernels.cuh | 2 + 3 files changed, 167 insertions(+), 109 deletions(-) diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/bit_vector/bit_vector.cuh index 8ee2a6755..376a193d1 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/bit_vector/bit_vector.cuh @@ -18,14 +18,11 @@ #pragma once #include -#include -#include #include #include #include -#include #include #include @@ -199,10 +196,7 @@ class bit_vector {  *  * @param storage Struct with non-owning refs to bitvector slot storages  */ - __host__ __device__ explicit constexpr reference(storage_ref_type storage) noexcept - : storage_{storage} - { - } + __host__ __device__ explicit constexpr reference(storage_ref_type storage) noexcept; /** * @brief Access value of a single bit @@ -211,10 +205,7 @@ class bit_vector 
{ * * @return Value of bit at position specified by key */ - [[nodiscard]] __device__ bool get(size_type key) const noexcept - { - return (storage_.words_ref_[key / bits_per_word] >> (key % bits_per_word)) & 1UL; - } + [[nodiscard]] __device__ bool get(size_type key) const noexcept; /** * @brief Access a single word of internal storage @@ -223,10 +214,7 @@ class bit_vector { * * @return Word at position specified by index */ - [[nodiscard]] __device__ slot_type get_word(size_type word_id) const noexcept - { - return storage_.words_ref_[word_id]; - } + [[nodiscard]] __device__ slot_type get_word(size_type word_id) const noexcept; /** * @brief Find position of first set bit starting from a given position (inclusive) @@ -235,17 +223,7 @@ class bit_vector { * * @return Index of next set bit */ - [[nodiscard]] __device__ size_type find_next_set(size_type key) const noexcept - { - size_type word_id = key / bits_per_word; - size_type bit_id = key % bits_per_word; - slot_type word = storage_.words_ref_[word_id]; - word &= ~(0lu) << bit_id; - while (word == 0) { - word = storage_.words_ref_[++word_id]; - } - return word_id * bits_per_word + __ffsll(word) - 1; // cuda intrinsic - } + [[nodiscard]] __device__ size_type find_next_set(size_type key) const noexcept; /** * @brief Find number of set bits (rank) in all positions before the input position (exclusive) @@ -254,22 +232,7 @@ class bit_vector { * * @return Rank of input position */ - [[nodiscard]] __device__ size_type rank(size_type key) const noexcept - { - size_type word_id = key / bits_per_word; - size_type bit_id = key % bits_per_word; - size_type rank_id = word_id / words_per_block; - size_type rel_id = word_id % words_per_block; - - auto rank = storage_.ranks_ref_[rank_id]; - size_type n = rank.abs(); - - if (rel_id != 0) { n += rank.rels_[rel_id - 1]; } - - n += cuda::std::popcount(storage_.words_ref_[word_id] & ((1UL << bit_id) - 1)); - - return n; - } + [[nodiscard]] __device__ size_type rank(size_type key) 
const noexcept; /** * @brief Find position of Nth set (1) bit counting from start of bitvector @@ -278,16 +241,7 @@ class bit_vector { * * @return Position of Nth set bit */ - [[nodiscard]] __device__ size_type select(size_type count) const noexcept - { - auto rank_id = get_initial_rank_estimate(count, storage_.selects_ref_, storage_.ranks_ref_); - auto rank = storage_.ranks_ref_[rank_id]; - - size_type word_id = rank_id * words_per_block; - word_id += subtract_rank_from_count(count, rank); - - return word_id * bits_per_word + select_bit_in_word(count, storage_.words_ref_[word_id]); - } + [[nodiscard]] __device__ size_type select(size_type count) const noexcept; /** * @brief Find position of Nth not-set (0) bit counting from start of bitvector @@ -296,16 +250,7 @@ class bit_vector { * * @return Position of Nth not-set bit */ - [[nodiscard]] __device__ size_type select0(size_type count) const noexcept - { - auto rank_id = get_initial_rank_estimate(count, storage_.selects0_ref_, storage_.ranks0_ref_); - auto rank = storage_.ranks0_ref_[rank_id]; - - size_type word_id = rank_id * words_per_block; - word_id += subtract_rank_from_count(count, rank); - - return word_id * bits_per_word + select_bit_in_word(count, ~(storage_.words_ref_[word_id])); - } + [[nodiscard]] __device__ size_type select0(size_type count) const noexcept; private: /** @@ -319,28 +264,7 @@ class bit_vector { */ template [[nodiscard]] __device__ size_type get_initial_rank_estimate( - size_type count, const SelectsRef& selects, const RanksRef& ranks) const noexcept - { - size_type block_id = count / (bits_per_word * words_per_block); - size_type begin = selects[block_id]; - size_type end = selects[block_id + 1] + 1UL; - - if (begin + 10 >= end) { // Linear search - while (count >= ranks[begin + 1].abs()) { - ++begin; - } - } else { // Binary search - while (begin + 1 < end) { - size_type middle = (begin + end) / 2; - if (count < ranks[middle].abs()) { - end = middle; - } else { - begin = middle; - } - } 
- } - return begin; - } + size_type count, const SelectsRef& selects, const RanksRef& ranks) const noexcept; /** * @brief Subtract rank estimate from input count and return an increment to word_id @@ -354,19 +278,7 @@ class bit_vector { */ template [[nodiscard]] __device__ size_type subtract_rank_from_count(size_type& count, - Rank rank) const noexcept - { - count -= rank.abs(); - - bool a0 = count >= rank.rels_[0]; - bool a1 = count >= rank.rels_[1]; - bool a2 = count >= rank.rels_[2]; - size_type inc = a0 + a1 + a2; - - count -= (inc > 0) * rank.rels_[inc - (inc > 0)]; - - return inc; - } + Rank rank) const noexcept; /** * @brief Find position of Nth set bit in a 64-bit word @@ -376,13 +288,7 @@ class bit_vector { * @return Position of Nth set bit */ [[nodiscard]] __device__ size_type select_bit_in_word(size_type N, - slot_type word) const noexcept - { - for (size_type pos = 0; pos < N; pos++) { - word &= word - 1; - } - return __ffsll(word & -word) - 1; // cuda intrinsic - } + slot_type word) const noexcept; storage_ref_type storage_; ///< Non-owning storage }; @@ -401,7 +307,7 @@ class bit_vector { * * @return Number of bits bit_vector holds */ - size_type constexpr size() const noexcept { return n_bits_; } + [[nodiscard]] constexpr size_type size() const noexcept; private: /// Type of the allocator to (de)allocate ranks @@ -441,10 +347,8 @@ class bit_vector { * * @return grid size */ - size_type constexpr default_grid_size(size_type num_elements) const noexcept - { - return (num_elements - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE) + 1; - } + // TODO: to be moved to the CUDA utility header + size_type constexpr default_grid_size(size_type num_elements) const noexcept; }; } // namespace detail diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/bit_vector/bit_vector.inl index 6f5ef7906..743dade4a 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ 
b/include/cuco/detail/trie/bit_vector/bit_vector.inl @@ -16,10 +16,14 @@ */ #include +#include +#include #include #include +#include + namespace cuco { namespace experimental { namespace detail { @@ -193,6 +197,154 @@ bit_vector::ref_type bit_vector::ref() const noexcept thrust::raw_pointer_cast(selects0_.data())}}; } +template +constexpr bit_vector::size_type bit_vector::size() const noexcept +{ + return n_bits_; +} + +template +constexpr bit_vector::size_type bit_vector::default_grid_size( + size_type num_elements) const noexcept +{ + return (num_elements - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE) + 1; +} + +// Device reference implementations +template +__host__ __device__ constexpr bit_vector::reference::reference( + storage_ref_type storage) noexcept + : storage_{storage} +{ +} + +template +__device__ bool bit_vector::reference::get(size_type key) const noexcept +{ + return (storage_.words_ref_[key / bits_per_word] >> (key % bits_per_word)) & 1UL; +} + +template +__device__ typename bit_vector::slot_type bit_vector::reference::get_word( + size_type word_id) const noexcept +{ + return storage_.words_ref_[word_id]; +} + +template +__device__ typename bit_vector::size_type +bit_vector::reference::find_next_set(size_type key) const noexcept +{ + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; + slot_type word = storage_.words_ref_[word_id]; + word &= ~(0lu) << bit_id; + while (word == 0) { + word = storage_.words_ref_[++word_id]; + } + return word_id * bits_per_word + __ffsll(word) - 1; // cuda intrinsic +} + +template +__device__ typename bit_vector::size_type bit_vector::reference::rank( + size_type key) const noexcept +{ + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; + size_type rank_id = word_id / words_per_block; + size_type rel_id = word_id % words_per_block; + + auto rank = storage_.ranks_ref_[rank_id]; + size_type n = rank.abs(); + + if (rel_id != 0) { n += 
rank.rels_[rel_id - 1]; } + + n += cuda::std::popcount(storage_.words_ref_[word_id] & ((1UL << bit_id) - 1)); + + return n; +} + +template +__device__ typename bit_vector::size_type bit_vector::reference::select( + size_type count) const noexcept +{ + auto rank_id = get_initial_rank_estimate(count, storage_.selects_ref_, storage_.ranks_ref_); + auto rank = storage_.ranks_ref_[rank_id]; + + size_type word_id = rank_id * words_per_block; + word_id += subtract_rank_from_count(count, rank); + + return word_id * bits_per_word + select_bit_in_word(count, storage_.words_ref_[word_id]); +} + +template +__device__ typename bit_vector::size_type bit_vector::reference::select0( + size_type count) const noexcept +{ + auto rank_id = get_initial_rank_estimate(count, storage_.selects0_ref_, storage_.ranks0_ref_); + auto rank = storage_.ranks0_ref_[rank_id]; + + size_type word_id = rank_id * words_per_block; + word_id += subtract_rank_from_count(count, rank); + + return word_id * bits_per_word + select_bit_in_word(count, ~(storage_.words_ref_[word_id])); +} + +template +template +__device__ typename bit_vector::size_type +bit_vector::reference::get_initial_rank_estimate(size_type count, + SelectsRef const& selects, + RanksRef const& ranks) const noexcept +{ + size_type block_id = count / (bits_per_word * words_per_block); + size_type begin = selects[block_id]; + size_type end = selects[block_id + 1] + 1UL; + + if (begin + 10 >= end) { // Linear search + while (count >= ranks[begin + 1].abs()) { + ++begin; + } + } else { // Binary search + while (begin + 1 < end) { + size_type middle = (begin + end) / 2; + if (count < ranks[middle].abs()) { + end = middle; + } else { + begin = middle; + } + } + } + return begin; +} + +template +template +__device__ typename bit_vector::size_type +bit_vector::reference::subtract_rank_from_count(size_type& count, + Rank rank) const noexcept +{ + count -= rank.abs(); + + bool a0 = count >= rank.rels_[0]; + bool a1 = count >= rank.rels_[1]; + bool a2 = 
count >= rank.rels_[2]; + size_type inc = a0 + a1 + a2; + + count -= (inc > 0) * rank.rels_[inc - (inc > 0)]; + + return inc; +} + +template +__device__ typename bit_vector::size_type +bit_vector::reference::select_bit_in_word(size_type N, slot_type word) const noexcept +{ + for (size_type pos = 0; pos < N; pos++) { + word &= word - 1; + } + return __ffsll(word & -word) - 1; // cuda intrinsic +} } // namespace detail } // namespace experimental } // namespace cuco diff --git a/include/cuco/detail/trie/bit_vector/kernels.cuh b/include/cuco/detail/trie/bit_vector/kernels.cuh index ecb7994e4..7b0027df2 100644 --- a/include/cuco/detail/trie/bit_vector/kernels.cuh +++ b/include/cuco/detail/trie/bit_vector/kernels.cuh @@ -17,6 +17,8 @@ #pragma once +#include + namespace cuco { namespace experimental { namespace detail { From f804b89b97e94c0a867335b8222003ab34e14e58 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 29 Aug 2023 16:54:41 -0700 Subject: [PATCH 76/99] Rename bit_vector as dynamic_bitset --- .../dynamic_bitset.cuh} | 22 ++--- .../dynamic_bitset.inl} | 85 +++++++++---------- .../kernels.cuh | 0 tests/CMakeLists.txt | 14 +-- .../find_next_set_test.cu | 4 +- .../get_test.cu | 4 +- .../rank_test.cu | 4 +- .../select_test.cu | 4 +- .../size_test.cu | 4 +- 9 files changed, 70 insertions(+), 71 deletions(-) rename include/cuco/detail/trie/{bit_vector/bit_vector.cuh => dynamic_bitset/dynamic_bitset.cuh} (94%) rename include/cuco/detail/trie/{bit_vector/bit_vector.inl => dynamic_bitset/dynamic_bitset.inl} (76%) rename include/cuco/detail/trie/{bit_vector => dynamic_bitset}/kernels.cuh (100%) rename tests/{bit_vector => dynamic_bitset}/find_next_set_test.cu (95%) rename tests/{bit_vector => dynamic_bitset}/get_test.cu (95%) rename tests/{bit_vector => dynamic_bitset}/rank_test.cu (93%) rename tests/{bit_vector => dynamic_bitset}/select_test.cu (96%) rename tests/{bit_vector => dynamic_bitset}/size_test.cu (89%) diff --git 
a/include/cuco/detail/trie/bit_vector/bit_vector.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh similarity index 94% rename from include/cuco/detail/trie/bit_vector/bit_vector.cuh rename to include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 376a193d1..039714bbf 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -73,7 +73,7 @@ struct rank { * @tparam Allocator Type of allocator used for device storage */ template > -class bit_vector { +class dynamic_bitset { public: using size_type = std::size_t; ///< size type to specify bit index using slot_type = uint64_t; ///< Slot type @@ -89,9 +89,9 @@ class bit_vector { * * @param allocator Allocator used for allocating device storage */ - inline bit_vector(Allocator const& allocator = Allocator{}); - bit_vector(bit_vector&&) = default; ///< Move constructor - inline ~bit_vector(); + inline dynamic_bitset(Allocator const& allocator = Allocator{}); + dynamic_bitset(dynamic_bitset&&) = default; ///< Move constructor + inline ~dynamic_bitset(); /** * @brief adds a new bit at the end @@ -187,12 +187,12 @@ class bit_vector { }; /** - * @brief Device non-owning reference type of bit_vector + * @brief Device non-owning reference type of dynamic_bitset */ class reference { public: /** - * @brief Constructs bit_vector_ref. + * @brief Constructs dynamic_bitset_ref.  
*  * @param storage Struct with non-owning refs to bitvector slot storages  */ @@ -298,14 +298,14 @@ class bit_vector { /** * @brief Gets non-owning device ref of the current object * - * @return Device ref of the current `bit_vector` object + * @return Device ref of the current `dynamic_bitset` object */ [[nodiscard]] ref_type ref() const noexcept; /** - * @brief Gets the number of bits bit_vector holds + * @brief Gets the number of bits dynamic_bitset holds * - * @return Number of bits bit_vector holds + * @return Number of bits dynamic_bitset holds */ [[nodiscard]] constexpr size_type size() const noexcept; @@ -316,7 +316,7 @@ class bit_vector { using size_allocator_type = typename std::allocator_traits::rebind_alloc; allocator_type allocator_; ///< Words allocator - size_type n_bits_; ///< Number of bits bit_vector currently holds + size_type n_bits_; ///< Number of bits dynamic_bitset currently holds /// Words vector that represents all bits thrust::device_vector words_; @@ -355,4 +355,4 @@ class bit_vector { } // namespace experimental } // namespace cuco -#include +#include diff --git a/include/cuco/detail/trie/bit_vector/bit_vector.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl similarity index 76% rename from include/cuco/detail/trie/bit_vector/bit_vector.inl rename to include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index 743dade4a..b437f37e4 100644 --- a/include/cuco/detail/trie/bit_vector/bit_vector.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -15,7 +15,7 @@ * limitations under the License. 
*/ -#include +#include #include #include @@ -29,7 +29,7 @@ namespace experimental { namespace detail { template -bit_vector::bit_vector(Allocator const& allocator) +dynamic_bitset::dynamic_bitset(Allocator const& allocator) : allocator_{allocator}, n_bits_{0}, words_{allocator}, @@ -41,12 +41,12 @@ bit_vector::bit_vector(Allocator const& allocator) } template -bit_vector::~bit_vector() +dynamic_bitset::~dynamic_bitset() { } template -void bit_vector::append(bool bit) noexcept +void dynamic_bitset::append(bool bit) noexcept { if (n_bits_ % bits_per_block == 0) { words_.resize(words_.size() + words_per_block); // Extend storage by one block @@ -56,7 +56,7 @@ void bit_vector::append(bool bit) noexcept } template -void bit_vector::set(size_type index, bool bit) noexcept +void dynamic_bitset::set(size_type index, bool bit) noexcept { size_type word_id = index / bits_per_word; size_type bit_id = index % bits_per_word; @@ -68,17 +68,17 @@ void bit_vector::set(size_type index, bool bit) noexcept } template -void bit_vector::set_last(bool bit) noexcept +void dynamic_bitset::set_last(bool bit) noexcept { set(n_bits_ - 1, bit); } template template -void bit_vector::get(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +void dynamic_bitset::get(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -92,10 +92,10 @@ void bit_vector::get(KeyIt keys_begin, template template -void bit_vector::ranks(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +void dynamic_bitset::ranks(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -109,10 +109,10 @@ void bit_vector::ranks(KeyIt keys_begin, template template -void bit_vector::selects(KeyIt 
keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +void dynamic_bitset::selects(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -125,7 +125,7 @@ void bit_vector::selects(KeyIt keys_begin, } template -void bit_vector::build_ranks_and_selects( +void dynamic_bitset::build_ranks_and_selects( thrust::device_vector& ranks, thrust::device_vector& selects, bool flip_bits) noexcept @@ -181,14 +181,14 @@ void bit_vector::build_ranks_and_selects( } template -void bit_vector::build() noexcept +void dynamic_bitset::build() noexcept { build_ranks_and_selects(ranks_, selects_, false); // 1-bits build_ranks_and_selects(ranks0_, selects0_, true); // 0-bits } template -bit_vector::ref_type bit_vector::ref() const noexcept +dynamic_bitset::ref_type dynamic_bitset::ref() const noexcept { return ref_type{storage_ref_type{thrust::raw_pointer_cast(words_.data()), thrust::raw_pointer_cast(ranks_.data()), @@ -198,13 +198,13 @@ bit_vector::ref_type bit_vector::ref() const noexcept } template -constexpr bit_vector::size_type bit_vector::size() const noexcept +constexpr dynamic_bitset::size_type dynamic_bitset::size() const noexcept { return n_bits_; } template -constexpr bit_vector::size_type bit_vector::default_grid_size( +constexpr dynamic_bitset::size_type dynamic_bitset::default_grid_size( size_type num_elements) const noexcept { return (num_elements - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE) + 1; @@ -212,28 +212,28 @@ constexpr bit_vector::size_type bit_vector::default_grid_s // Device reference implementations template -__host__ __device__ constexpr bit_vector::reference::reference( +__host__ __device__ constexpr dynamic_bitset::reference::reference( storage_ref_type storage) noexcept : storage_{storage} { } template -__device__ bool bit_vector::reference::get(size_type key) const 
noexcept +__device__ bool dynamic_bitset::reference::get(size_type key) const noexcept { return (storage_.words_ref_[key / bits_per_word] >> (key % bits_per_word)) & 1UL; } template -__device__ typename bit_vector::slot_type bit_vector::reference::get_word( - size_type word_id) const noexcept +__device__ typename dynamic_bitset::slot_type +dynamic_bitset::reference::get_word(size_type word_id) const noexcept { return storage_.words_ref_[word_id]; } template -__device__ typename bit_vector::size_type -bit_vector::reference::find_next_set(size_type key) const noexcept +__device__ typename dynamic_bitset::size_type +dynamic_bitset::reference::find_next_set(size_type key) const noexcept { size_type word_id = key / bits_per_word; size_type bit_id = key % bits_per_word; @@ -246,7 +246,7 @@ bit_vector::reference::find_next_set(size_type key) const noexcept } template -__device__ typename bit_vector::size_type bit_vector::reference::rank( +__device__ typename dynamic_bitset::size_type dynamic_bitset::reference::rank( size_type key) const noexcept { size_type word_id = key / bits_per_word; @@ -265,8 +265,8 @@ __device__ typename bit_vector::size_type bit_vector::refe } template -__device__ typename bit_vector::size_type bit_vector::reference::select( - size_type count) const noexcept +__device__ typename dynamic_bitset::size_type +dynamic_bitset::reference::select(size_type count) const noexcept { auto rank_id = get_initial_rank_estimate(count, storage_.selects_ref_, storage_.ranks_ref_); auto rank = storage_.ranks_ref_[rank_id]; @@ -278,8 +278,8 @@ __device__ typename bit_vector::size_type bit_vector::refe } template -__device__ typename bit_vector::size_type bit_vector::reference::select0( - size_type count) const noexcept +__device__ typename dynamic_bitset::size_type +dynamic_bitset::reference::select0(size_type count) const noexcept { auto rank_id = get_initial_rank_estimate(count, storage_.selects0_ref_, storage_.ranks0_ref_); auto rank = 
storage_.ranks0_ref_[rank_id]; @@ -292,10 +292,9 @@ __device__ typename bit_vector::size_type bit_vector::refe template template -__device__ typename bit_vector::size_type -bit_vector::reference::get_initial_rank_estimate(size_type count, - SelectsRef const& selects, - RanksRef const& ranks) const noexcept +__device__ typename dynamic_bitset::size_type +dynamic_bitset::reference::get_initial_rank_estimate( + size_type count, SelectsRef const& selects, RanksRef const& ranks) const noexcept { size_type block_id = count / (bits_per_word * words_per_block); size_type begin = selects[block_id]; @@ -320,9 +319,9 @@ bit_vector::reference::get_initial_rank_estimate(size_type count, template template -__device__ typename bit_vector::size_type -bit_vector::reference::subtract_rank_from_count(size_type& count, - Rank rank) const noexcept +__device__ typename dynamic_bitset::size_type +dynamic_bitset::reference::subtract_rank_from_count(size_type& count, + Rank rank) const noexcept { count -= rank.abs(); @@ -337,8 +336,8 @@ bit_vector::reference::subtract_rank_from_count(size_type& count, } template -__device__ typename bit_vector::size_type -bit_vector::reference::select_bit_in_word(size_type N, slot_type word) const noexcept +__device__ typename dynamic_bitset::size_type +dynamic_bitset::reference::select_bit_in_word(size_type N, slot_type word) const noexcept { for (size_type pos = 0; pos < N; pos++) { word &= word - 1; diff --git a/include/cuco/detail/trie/bit_vector/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh similarity index 100% rename from include/cuco/detail/trie/bit_vector/kernels.cuh rename to include/cuco/detail/trie/dynamic_bitset/kernels.cuh diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 64fe713ac..2b99515d0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -97,10 +97,10 @@ ConfigureTest(STATIC_MULTIMAP_TEST static_multimap/pair_function_test.cu) 
################################################################################################### -# - bit_vector tests ------------------------------------------------------------------------------ -ConfigureTest(BIT_VECTOR_TEST - bit_vector/find_next_set_test.cu - bit_vector/get_test.cu - bit_vector/rank_test.cu - bit_vector/select_test.cu - bit_vector/size_test.cu) +# - dynamic_bitset tests -------------------------------------------------------------------------- +ConfigureTest(DYNAMIC_BITSET_TEST + dynamic_bitset/find_next_set_test.cu + dynamic_bitset/get_test.cu + dynamic_bitset/rank_test.cu + dynamic_bitset/select_test.cu + dynamic_bitset/size_test.cu) diff --git a/tests/bit_vector/find_next_set_test.cu b/tests/dynamic_bitset/find_next_set_test.cu similarity index 95% rename from tests/bit_vector/find_next_set_test.cu rename to tests/dynamic_bitset/find_next_set_test.cu index 04672a2a1..6ae068361 100644 --- a/tests/bit_vector/find_next_set_test.cu +++ b/tests/dynamic_bitset/find_next_set_test.cu @@ -16,7 +16,7 @@ #include -#include +#include #include #include @@ -39,7 +39,7 @@ extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu TEST_CASE("Find next set test", "") { - cuco::experimental::detail::bit_vector bv; + cuco::experimental::detail::dynamic_bitset bv; using size_type = std::size_t; constexpr size_type num_elements{400}; diff --git a/tests/bit_vector/get_test.cu b/tests/dynamic_bitset/get_test.cu similarity index 95% rename from tests/bit_vector/get_test.cu rename to tests/dynamic_bitset/get_test.cu index c9df39f83..acbb70f7d 100644 --- a/tests/bit_vector/get_test.cu +++ b/tests/dynamic_bitset/get_test.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include @@ -36,7 +36,7 @@ bool modulo_bitgen(uint64_t i) { return i % 7 == 0; } TEST_CASE("Get test", "") { - cuco::experimental::detail::bit_vector bv; + cuco::experimental::detail::dynamic_bitset bv; using size_type = std::size_t; constexpr size_type 
num_elements{400}; diff --git a/tests/bit_vector/rank_test.cu b/tests/dynamic_bitset/rank_test.cu similarity index 93% rename from tests/bit_vector/rank_test.cu rename to tests/dynamic_bitset/rank_test.cu index e33e5d04c..e7359a32b 100644 --- a/tests/bit_vector/rank_test.cu +++ b/tests/dynamic_bitset/rank_test.cu @@ -16,7 +16,7 @@ #include -#include +#include #include #include @@ -28,7 +28,7 @@ extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu TEST_CASE("Rank test", "") { - cuco::experimental::detail::bit_vector bv; + cuco::experimental::detail::dynamic_bitset bv; using size_type = std::size_t; constexpr size_type num_elements{4000}; diff --git a/tests/bit_vector/select_test.cu b/tests/dynamic_bitset/select_test.cu similarity index 96% rename from tests/bit_vector/select_test.cu rename to tests/dynamic_bitset/select_test.cu index f8009b853..a5c96ac93 100644 --- a/tests/bit_vector/select_test.cu +++ b/tests/dynamic_bitset/select_test.cu @@ -16,7 +16,7 @@ #include -#include +#include #include #include @@ -39,7 +39,7 @@ extern bool modulo_bitgen(uint64_t i); // Defined in get_test.cu TEST_CASE("Select test", "") { - cuco::experimental::detail::bit_vector bv; + cuco::experimental::detail::dynamic_bitset bv; using size_type = std::size_t; constexpr size_type num_elements{4000}; diff --git a/tests/bit_vector/size_test.cu b/tests/dynamic_bitset/size_test.cu similarity index 89% rename from tests/bit_vector/size_test.cu rename to tests/dynamic_bitset/size_test.cu index 2b4204635..940050602 100644 --- a/tests/bit_vector/size_test.cu +++ b/tests/dynamic_bitset/size_test.cu @@ -14,13 +14,13 @@ * limitations under the License. 
*/ -#include +#include #include TEST_CASE("Size computation", "") { - cuco::experimental::detail::bit_vector bv; + cuco::experimental::detail::dynamic_bitset bv; using size_type = std::size_t; constexpr size_type num_elements{400}; From 0cf4bac8dc92c233892063f317191627b224dd32 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 29 Aug 2023 17:32:38 -0700 Subject: [PATCH 77/99] Cleanups: constexpr instead of inline, TODO, etc --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 69 ++++++++++--------- .../trie/dynamic_bitset/dynamic_bitset.inl | 60 ++++++++-------- 2 files changed, 63 insertions(+), 66 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 039714bbf..e9cdb9b87 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -44,7 +44,7 @@ struct rank { * * @return The base rank */ - __host__ __device__ uint64_t constexpr abs() const noexcept + __host__ __device__ constexpr uint64_t abs() const noexcept { return (static_cast(abs_hi_) << 8) | abs_lo_; } @@ -54,7 +54,7 @@ struct rank { * * @param abs Base rank */ - __host__ __device__ void set_abs(uint64_t abs) noexcept + __host__ __device__ constexpr void set_abs(uint64_t abs) noexcept { abs_hi_ = static_cast(abs >> 8); abs_lo_ = static_cast(abs); @@ -72,6 +72,7 @@ struct rank { * * @tparam Allocator Type of allocator used for device storage */ +// TODO: have to use device_malloc_allocator for now otherwise the container cannot grow template > class dynamic_bitset { public: @@ -89,9 +90,7 @@ class dynamic_bitset { * * @param allocator Allocator used for allocating device storage */ - inline dynamic_bitset(Allocator const& allocator = Allocator{}); - dynamic_bitset(dynamic_bitset&&) = default; ///< Move constructor - inline ~dynamic_bitset(); + constexpr dynamic_bitset(Allocator const& allocator = Allocator{}); /** * @brief adds a new 
bit at the end @@ -100,7 +99,7 @@ class dynamic_bitset { * * @param bit Boolean value of new bit to be added */ - inline void append(bool bit) noexcept; + constexpr void append(bool bit) noexcept; /** * @brief Modifies a single bit @@ -108,19 +107,19 @@ class dynamic_bitset { * @param index position of bit to be modified * @param bit new value of bit */ - inline void set(size_type index, bool bit) noexcept; + constexpr void set(size_type index, bool bit) noexcept; /** * @brief Sets last bit to specified value * * @param bit new value of last bit */ - inline void set_last(bool bit) noexcept; + constexpr void set_last(bool bit) noexcept; /** * @brief Builds indexes for rank and select */ - inline void build() noexcept; + constexpr void build() noexcept; /** * @brief Bulk get operation @@ -134,10 +133,10 @@ class dynamic_bitset { * @param stream Stream to execute get kernel */ template - void get(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream = {}) const noexcept; + constexpr void get(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) const noexcept; /** * @brief Bulk rank operation @@ -151,10 +150,10 @@ class dynamic_bitset { * @param stream Stream to execute ranks kernel */ template - void ranks(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream = {}) const noexcept; + constexpr void ranks(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) const noexcept; /** * @brief Bulk select operation @@ -168,14 +167,15 @@ class dynamic_bitset { * @param stream Stream to execute selects kernel */ template - void selects(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream = {}) const noexcept; + constexpr void selects(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) const noexcept; /** *@brief Struct to hold all storage refs needed by 
bitvector_ref */ + // TODO: this is not a real ref type, to be changed struct storage_ref_type { const slot_type* words_ref_; ///< Words refs @@ -205,7 +205,7 @@ class dynamic_bitset { * * @return Value of bit at position specified by key */ - [[nodiscard]] __device__ bool get(size_type key) const noexcept; + [[nodiscard]] __device__ constexpr bool get(size_type key) const noexcept; /** * @brief Access a single word of internal storage @@ -214,7 +214,7 @@ class dynamic_bitset { * * @return Word at position specified by index */ - [[nodiscard]] __device__ slot_type get_word(size_type word_id) const noexcept; + [[nodiscard]] __device__ constexpr slot_type get_word(size_type word_id) const noexcept; /** * @brief Find position of first set bit starting from a given position (inclusive) @@ -232,7 +232,7 @@ class dynamic_bitset { * * @return Rank of input position */ - [[nodiscard]] __device__ size_type rank(size_type key) const noexcept; + [[nodiscard]] __device__ constexpr size_type rank(size_type key) const noexcept; /** * @brief Find position of Nth set (1) bit counting from start of bitvector @@ -241,7 +241,7 @@ class dynamic_bitset { * * @return Position of Nth set bit */ - [[nodiscard]] __device__ size_type select(size_type count) const noexcept; + [[nodiscard]] __device__ constexpr size_type select(size_type count) const noexcept; /** * @brief Find position of Nth not-set (0) bit counting from start of bitvector @@ -250,7 +250,7 @@ class dynamic_bitset { * * @return Position of Nth not-set bit */ - [[nodiscard]] __device__ size_type select0(size_type count) const noexcept; + [[nodiscard]] __device__ constexpr size_type select0(size_type count) const noexcept; private: /** @@ -263,7 +263,7 @@ class dynamic_bitset { * @return index in ranks which corresponds to highest rank less than count (least upper bound) */ template - [[nodiscard]] __device__ size_type get_initial_rank_estimate( + [[nodiscard]] __device__ constexpr size_type get_initial_rank_estimate( 
size_type count, const SelectsRef& selects, const RanksRef& ranks) const noexcept; /** @@ -277,8 +277,8 @@ class dynamic_bitset { * @return Increment to word_id based on rank values */ template - [[nodiscard]] __device__ size_type subtract_rank_from_count(size_type& count, - Rank rank) const noexcept; + [[nodiscard]] __device__ constexpr size_type subtract_rank_from_count(size_type& count, + Rank rank) const noexcept; /** * @brief Find position of Nth set bit in a 64-bit word @@ -300,7 +300,7 @@ class dynamic_bitset { * * @return Device ref of the current `dynamic_bitset` object */ - [[nodiscard]] ref_type ref() const noexcept; + [[nodiscard]] constexpr ref_type ref() const noexcept; /** * @brief Gets the number of bits dynamic_bitset holds @@ -336,9 +336,10 @@ class dynamic_bitset { * @param selects Output array of selects * @param flip_bits If true, negate bits to construct indexes for `0` bits */ - void build_ranks_and_selects(thrust::device_vector& ranks, - thrust::device_vector& selects, - bool flip_bits) noexcept; + constexpr void build_ranks_and_selects( + thrust::device_vector& ranks, + thrust::device_vector& selects, + bool flip_bits) noexcept; /** * @brief Helper function to calculate grid size for simple kernels @@ -348,7 +349,7 @@ class dynamic_bitset { * @return grid size */ // TODO: to be moved to the CUDA utility header - size_type constexpr default_grid_size(size_type num_elements) const noexcept; + constexpr size_type default_grid_size(size_type num_elements) const noexcept; }; } // namespace detail diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index b437f37e4..ae748ea70 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -29,7 +29,7 @@ namespace experimental { namespace detail { template -dynamic_bitset::dynamic_bitset(Allocator const& allocator) +constexpr 
dynamic_bitset::dynamic_bitset(Allocator const& allocator) : allocator_{allocator}, n_bits_{0}, words_{allocator}, @@ -41,12 +41,7 @@ dynamic_bitset::dynamic_bitset(Allocator const& allocator) } template -dynamic_bitset::~dynamic_bitset() -{ -} - -template -void dynamic_bitset::append(bool bit) noexcept +constexpr void dynamic_bitset::append(bool bit) noexcept { if (n_bits_ % bits_per_block == 0) { words_.resize(words_.size() + words_per_block); // Extend storage by one block @@ -56,7 +51,7 @@ void dynamic_bitset::append(bool bit) noexcept } template -void dynamic_bitset::set(size_type index, bool bit) noexcept +constexpr void dynamic_bitset::set(size_type index, bool bit) noexcept { size_type word_id = index / bits_per_word; size_type bit_id = index % bits_per_word; @@ -68,17 +63,17 @@ void dynamic_bitset::set(size_type index, bool bit) noexcept } template -void dynamic_bitset::set_last(bool bit) noexcept +constexpr void dynamic_bitset::set_last(bool bit) noexcept { set(n_bits_ - 1, bit); } template template -void dynamic_bitset::get(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +constexpr void dynamic_bitset::get(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -92,10 +87,10 @@ void dynamic_bitset::get(KeyIt keys_begin, template template -void dynamic_bitset::ranks(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +constexpr void dynamic_bitset::ranks(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -109,10 +104,10 @@ void dynamic_bitset::ranks(KeyIt keys_begin, template template -void dynamic_bitset::selects(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const 
noexcept +constexpr void dynamic_bitset::selects(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -125,7 +120,7 @@ void dynamic_bitset::selects(KeyIt keys_begin, } template -void dynamic_bitset::build_ranks_and_selects( +constexpr void dynamic_bitset::build_ranks_and_selects( thrust::device_vector& ranks, thrust::device_vector& selects, bool flip_bits) noexcept @@ -181,14 +176,14 @@ void dynamic_bitset::build_ranks_and_selects( } template -void dynamic_bitset::build() noexcept +constexpr void dynamic_bitset::build() noexcept { build_ranks_and_selects(ranks_, selects_, false); // 1-bits build_ranks_and_selects(ranks0_, selects0_, true); // 0-bits } template -dynamic_bitset::ref_type dynamic_bitset::ref() const noexcept +constexpr dynamic_bitset::ref_type dynamic_bitset::ref() const noexcept { return ref_type{storage_ref_type{thrust::raw_pointer_cast(words_.data()), thrust::raw_pointer_cast(ranks_.data()), @@ -211,6 +206,7 @@ constexpr dynamic_bitset::size_type dynamic_bitset::defaul } // Device reference implementations + template __host__ __device__ constexpr dynamic_bitset::reference::reference( storage_ref_type storage) noexcept @@ -219,13 +215,13 @@ __host__ __device__ constexpr dynamic_bitset::reference::reference( } template -__device__ bool dynamic_bitset::reference::get(size_type key) const noexcept +__device__ constexpr bool dynamic_bitset::reference::get(size_type key) const noexcept { return (storage_.words_ref_[key / bits_per_word] >> (key % bits_per_word)) & 1UL; } template -__device__ typename dynamic_bitset::slot_type +__device__ constexpr typename dynamic_bitset::slot_type dynamic_bitset::reference::get_word(size_type word_id) const noexcept { return storage_.words_ref_[word_id]; @@ -246,8 +242,8 @@ dynamic_bitset::reference::find_next_set(size_type key) const noexcep } template -__device__ typename dynamic_bitset::size_type 
dynamic_bitset::reference::rank( - size_type key) const noexcept +__device__ constexpr typename dynamic_bitset::size_type +dynamic_bitset::reference::rank(size_type key) const noexcept { size_type word_id = key / bits_per_word; size_type bit_id = key % bits_per_word; @@ -265,7 +261,7 @@ __device__ typename dynamic_bitset::size_type dynamic_bitset -__device__ typename dynamic_bitset::size_type +__device__ constexpr typename dynamic_bitset::size_type dynamic_bitset::reference::select(size_type count) const noexcept { auto rank_id = get_initial_rank_estimate(count, storage_.selects_ref_, storage_.ranks_ref_); @@ -278,7 +274,7 @@ dynamic_bitset::reference::select(size_type count) const noexcept } template -__device__ typename dynamic_bitset::size_type +__device__ constexpr typename dynamic_bitset::size_type dynamic_bitset::reference::select0(size_type count) const noexcept { auto rank_id = get_initial_rank_estimate(count, storage_.selects0_ref_, storage_.ranks0_ref_); @@ -292,7 +288,7 @@ dynamic_bitset::reference::select0(size_type count) const noexcept template template -__device__ typename dynamic_bitset::size_type +__device__ constexpr typename dynamic_bitset::size_type dynamic_bitset::reference::get_initial_rank_estimate( size_type count, SelectsRef const& selects, RanksRef const& ranks) const noexcept { @@ -319,7 +315,7 @@ dynamic_bitset::reference::get_initial_rank_estimate( template template -__device__ typename dynamic_bitset::size_type +__device__ constexpr typename dynamic_bitset::size_type dynamic_bitset::reference::subtract_rank_from_count(size_type& count, Rank rank) const noexcept { From c7faed0992d792f37919b8749949d0d3e7cf88ee Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 30 Aug 2023 06:47:46 +0000 Subject: [PATCH 78/99] Change names of rank, select variables rank, select -> rank_true, select_true rank0, select0 -> rank_false, select_false --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 18 +++++----- .../trie/dynamic_bitset/dynamic_bitset.inl 
| 34 ++++++++++--------- tests/dynamic_bitset/select_test.cu | 8 ++--- 3 files changed, 31 insertions(+), 29 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index e9cdb9b87..722d92721 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -179,11 +179,11 @@ class dynamic_bitset { struct storage_ref_type { const slot_type* words_ref_; ///< Words refs - const rank* ranks_ref_; ///< Ranks refs - const size_type* selects_ref_; ///< Selects refs + const rank* ranks_true_ref_; ///< Ranks refs for 1 bits + const size_type* selects_true_ref_; ///< Selects refs for 1 bits - const rank* ranks0_ref_; ///< Ranks refs for 0 bits - const size_type* selects0_ref_; ///< Selects refs 0 bits + const rank* ranks_false_ref_; ///< Ranks refs for 0 bits + const size_type* selects_false_ref_; ///< Selects refs 0 bits }; /** @@ -250,7 +250,7 @@ class dynamic_bitset { * * @return Position of Nth not-set bit */ - [[nodiscard]] __device__ constexpr size_type select0(size_type count) const noexcept; + [[nodiscard]] __device__ constexpr size_type select_false(size_type count) const noexcept; private: /** @@ -321,13 +321,13 @@ class dynamic_bitset { /// Words vector that represents all bits thrust::device_vector words_; /// Rank values for every 256-th bit (4-th word) - thrust::device_vector ranks_; + thrust::device_vector ranks_true_; /// Same as ranks_ but for `0` bits - thrust::device_vector ranks0_; + thrust::device_vector ranks_false_; /// Block indices of (0, 256, 512...)th `1` bit - thrust::device_vector selects_; + thrust::device_vector selects_true_; /// Same as selects_, but for `0` bits - thrust::device_vector selects0_; + thrust::device_vector selects_false_; /** * @brief Populates rank and select indexes on device diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl 
b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index ae748ea70..f2041142a 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -33,10 +33,10 @@ constexpr dynamic_bitset::dynamic_bitset(Allocator const& allocator) : allocator_{allocator}, n_bits_{0}, words_{allocator}, - ranks_{allocator}, - ranks0_{allocator}, - selects_{allocator}, - selects0_{allocator} + ranks_true_{allocator}, + ranks_false_{allocator}, + selects_true_{allocator}, + selects_false_{allocator} { } @@ -178,18 +178,18 @@ constexpr void dynamic_bitset::build_ranks_and_selects( template constexpr void dynamic_bitset::build() noexcept { - build_ranks_and_selects(ranks_, selects_, false); // 1-bits - build_ranks_and_selects(ranks0_, selects0_, true); // 0-bits + build_ranks_and_selects(ranks_true_, selects_true_, false); // 1 bits + build_ranks_and_selects(ranks_false_, selects_false_, true); // 0 bits } template constexpr dynamic_bitset::ref_type dynamic_bitset::ref() const noexcept { return ref_type{storage_ref_type{thrust::raw_pointer_cast(words_.data()), - thrust::raw_pointer_cast(ranks_.data()), - thrust::raw_pointer_cast(selects_.data()), - thrust::raw_pointer_cast(ranks0_.data()), - thrust::raw_pointer_cast(selects0_.data())}}; + thrust::raw_pointer_cast(ranks_true_.data()), + thrust::raw_pointer_cast(selects_true_.data()), + thrust::raw_pointer_cast(ranks_false_.data()), + thrust::raw_pointer_cast(selects_false_.data())}}; } template @@ -250,7 +250,7 @@ dynamic_bitset::reference::rank(size_type key) const noexcept size_type rank_id = word_id / words_per_block; size_type rel_id = word_id % words_per_block; - auto rank = storage_.ranks_ref_[rank_id]; + auto rank = storage_.ranks_true_ref_[rank_id]; size_type n = rank.abs(); if (rel_id != 0) { n += rank.rels_[rel_id - 1]; } @@ -264,8 +264,9 @@ template __device__ constexpr typename dynamic_bitset::size_type 
dynamic_bitset::reference::select(size_type count) const noexcept { - auto rank_id = get_initial_rank_estimate(count, storage_.selects_ref_, storage_.ranks_ref_); - auto rank = storage_.ranks_ref_[rank_id]; + auto rank_id = + get_initial_rank_estimate(count, storage_.selects_true_ref_, storage_.ranks_true_ref_); + auto rank = storage_.ranks_true_ref_[rank_id]; size_type word_id = rank_id * words_per_block; word_id += subtract_rank_from_count(count, rank); @@ -275,10 +276,11 @@ dynamic_bitset::reference::select(size_type count) const noexcept template __device__ constexpr typename dynamic_bitset::size_type -dynamic_bitset::reference::select0(size_type count) const noexcept +dynamic_bitset::reference::select_false(size_type count) const noexcept { - auto rank_id = get_initial_rank_estimate(count, storage_.selects0_ref_, storage_.ranks0_ref_); - auto rank = storage_.ranks0_ref_[rank_id]; + auto rank_id = + get_initial_rank_estimate(count, storage_.selects_false_ref_, storage_.ranks_false_ref_); + auto rank = storage_.ranks_false_ref_[rank_id]; size_type word_id = rank_id * words_per_block; word_id += subtract_rank_from_count(count, rank); diff --git a/tests/dynamic_bitset/select_test.cu b/tests/dynamic_bitset/select_test.cu index a5c96ac93..17a6d0c02 100644 --- a/tests/dynamic_bitset/select_test.cu +++ b/tests/dynamic_bitset/select_test.cu @@ -25,12 +25,12 @@ #include template -__global__ void select0_kernel(BitVectorRef ref, size_type num_elements, OutputIt output) +__global__ void select_false_kernel(BitVectorRef ref, size_type num_elements, OutputIt output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; while (index < num_elements) { - output[index] = ref.select0(index); + output[index] = ref.select_false(index); index += stride; } } @@ -75,12 +75,12 @@ TEST_CASE("Select test", "") REQUIRE(num_matches == num_set); } - // Check select0 + // Check select_false { size_type num_not_set = num_elements - num_set; 
thrust::device_vector device_result(num_not_set); - select0_kernel<<<1, 1024>>>(ref, num_not_set, device_result.data()); + select_false_kernel<<<1, 1024>>>(ref, num_not_set, device_result.data()); thrust::host_vector host_result = device_result; size_type num_matches = 0; From a85b7b70eb22f7c1a03279c4a8674e8c949a6aba Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 1 Sep 2023 01:00:19 +0000 Subject: [PATCH 79/99] Rename members and methods of `rank` abs -> base rels -> offsets --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 23 +++++++++------- .../trie/dynamic_bitset/dynamic_bitset.inl | 26 +++++++++---------- .../detail/trie/dynamic_bitset/kernels.cuh | 4 +-- 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 722d92721..483d24546 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -33,31 +33,36 @@ namespace detail { /** * @brief Struct to store ranks of bits at 256-bit intervals + * + * This struct encodes a list of four rank values using base + offset format + * e.g. 
[1000, 1005, 1006, 1009] is stored as base = 1000, offsets = [5, 6, 9] + * base uses 40 bits, split between one uint32_t and one uint8_t + * each offset uses 8 bits */ struct rank { - uint32_t abs_hi_; ///< Upper 32 bits of base - uint8_t abs_lo_; ///< Lower 8 bits of base - cuda::std::array rels_; ///< Offsets for 64-bit sub-intervals + uint32_t base_hi_; ///< Upper 32 bits of base + uint8_t base_lo_; ///< Lower 8 bits of base + cuda::std::array offsets_; ///< Offsets for 64-bit sub-intervals, relative to base /** * @brief Gets base rank of current 256-bit interval * * @return The base rank */ - __host__ __device__ constexpr uint64_t abs() const noexcept + __host__ __device__ constexpr uint64_t base() const noexcept { - return (static_cast(abs_hi_) << 8) | abs_lo_; + return (static_cast(base_hi_) << CHAR_BIT) | base_lo_; } /** * @brief Sets base rank of current 256-bit interval * - * @param abs Base rank + * @param base Base rank */ - __host__ __device__ constexpr void set_abs(uint64_t abs) noexcept + __host__ __device__ constexpr void set_base(uint64_t base) noexcept { - abs_hi_ = static_cast(abs >> 8); - abs_lo_ = static_cast(abs); + base_hi_ = static_cast(base >> CHAR_BIT); + base_lo_ = static_cast(base); } }; diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index f2041142a..6abd3cc3f 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -245,15 +245,15 @@ template __device__ constexpr typename dynamic_bitset::size_type dynamic_bitset::reference::rank(size_type key) const noexcept { - size_type word_id = key / bits_per_word; - size_type bit_id = key % bits_per_word; - size_type rank_id = word_id / words_per_block; - size_type rel_id = word_id % words_per_block; + size_type word_id = key / bits_per_word; + size_type bit_id = key % bits_per_word; + size_type rank_id = word_id / words_per_block; + 
size_type offset_id = word_id % words_per_block; auto rank = storage_.ranks_true_ref_[rank_id]; - size_type n = rank.abs(); + size_type n = rank.base(); - if (rel_id != 0) { n += rank.rels_[rel_id - 1]; } + if (offset_id != 0) { n += rank.offsets_[offset_id - 1]; } n += cuda::std::popcount(storage_.words_ref_[word_id] & ((1UL << bit_id) - 1)); @@ -299,13 +299,13 @@ dynamic_bitset::reference::get_initial_rank_estimate( size_type end = selects[block_id + 1] + 1UL; if (begin + 10 >= end) { // Linear search - while (count >= ranks[begin + 1].abs()) { + while (count >= ranks[begin + 1].base()) { ++begin; } } else { // Binary search while (begin + 1 < end) { size_type middle = (begin + end) / 2; - if (count < ranks[middle].abs()) { + if (count < ranks[middle].base()) { end = middle; } else { begin = middle; @@ -321,14 +321,14 @@ __device__ constexpr typename dynamic_bitset::size_type dynamic_bitset::reference::subtract_rank_from_count(size_type& count, Rank rank) const noexcept { - count -= rank.abs(); + count -= rank.base(); - bool a0 = count >= rank.rels_[0]; - bool a1 = count >= rank.rels_[1]; - bool a2 = count >= rank.rels_[2]; + bool a0 = count >= rank.offsets_[0]; + bool a1 = count >= rank.offsets_[1]; + bool a2 = count >= rank.offsets_[2]; size_type inc = a0 + a1 + a2; - count -= (inc > 0) * rank.rels_[inc - (inc > 0)]; + count -= (inc > 0) * rank.offsets_[inc - (inc > 0)]; return inc; } diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh index 7b0027df2..183617465 100644 --- a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh @@ -165,13 +165,13 @@ __global__ void encode_ranks_from_prefix_bit_counts(const size_type* prefix_bit_ // Set base value of rank auto& rank = ranks[rank_id]; - rank.set_abs(prefix_bit_counts[word_id]); + rank.set_base(prefix_bit_counts[word_id]); if (rank_id < num_blocks - 1) { // For each subsequent word in this 
block, compute deltas from base for (size_type block_offset = 0; block_offset < words_per_block - 1; block_offset++) { auto delta = prefix_bit_counts[word_id + block_offset + 1] - prefix_bit_counts[word_id]; - rank.rels_[block_offset] = delta; + rank.offsets_[block_offset] = delta; } } rank_id += stride; From ec6bb1117ed9e7d5489763549a0de818651e6c03 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 1 Sep 2023 01:47:27 +0000 Subject: [PATCH 80/99] Rename bitvector to bitset --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 26 ++++++++------- .../trie/dynamic_bitset/dynamic_bitset.inl | 6 ++-- .../detail/trie/dynamic_bitset/kernels.cuh | 33 +++++++------------ tests/dynamic_bitset/find_next_set_test.cu | 4 +-- tests/dynamic_bitset/get_test.cu | 4 +-- tests/dynamic_bitset/select_test.cu | 4 +-- 6 files changed, 35 insertions(+), 42 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 483d24546..5407ece48 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -67,13 +67,15 @@ struct rank { }; /** - * @brief Bitvector class with rank and select index structures + * @brief Bitset class with rank and select index structures * - * In addition to standard bitvector get/set operations, this class provides + * In addition to standard bitset get/set operations, this class provides * rank and select operation API. It maintains index structures to make both these * new operations close to constant time. - * Bitvector construction happens on host, after which the structures are moved to device. - * All subsequent read-only operations access device structures only. + * + * Current limitations: + * - Stream controls are partially supported due to the use of `thrust::device_vector` as storage + * - Device ref doesn't support modifiers like `set`, `reset`, etc. 
* * @tparam Allocator Type of allocator used for device storage */ @@ -91,7 +93,7 @@ class dynamic_bitset { static constexpr size_type bits_per_block = words_per_block * bits_per_word; ///< Trivial /** - * @brief Constructs an empty bitvector + * @brief Constructs an empty bitset * * @param allocator Allocator used for allocating device storage */ @@ -178,7 +180,7 @@ class dynamic_bitset { cuda_stream_ref stream = {}) const noexcept; /** - *@brief Struct to hold all storage refs needed by bitvector_ref + *@brief Struct to hold all storage refs needed by reference */ // TODO: this is not a real ref type, to be changed struct storage_ref_type { @@ -197,10 +199,10 @@ class dynamic_bitset { class reference { public: /** - * @brief Constructs dynamic_bitset_ref. - * - * @param storage Struct with non-owning refs to bitvector slot storages - */ + * @brief Constructs a reference + * + * @param storage Struct with non-owning refs to bitset storage arrays + */ __host__ __device__ explicit constexpr reference(storage_ref_type storage) noexcept; /** @@ -240,7 +242,7 @@ class dynamic_bitset { [[nodiscard]] __device__ constexpr size_type rank(size_type key) const noexcept; /** - * @brief Find position of Nth set (1) bit counting from start of bitvector + * @brief Find position of Nth set (1) bit counting from start * * @param count Input N * @@ -249,7 +251,7 @@ class dynamic_bitset { [[nodiscard]] __device__ constexpr size_type select(size_type count) const noexcept; /** - * @brief Find position of Nth not-set (0) bit counting from start of bitvector + * @brief Find position of Nth not-set (0) bit counting from start * * @param count Input N * diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index 6abd3cc3f..35965dbed 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -81,7 +81,7 @@ constexpr void 
dynamic_bitset::get(KeyIt keys_begin, auto grid_size = default_grid_size(num_keys); - bitvector_get_kernel<<>>( + bitset_get_kernel<<>>( ref(), keys_begin, outputs_begin, num_keys); } @@ -98,7 +98,7 @@ constexpr void dynamic_bitset::ranks(KeyIt keys_begin, auto grid_size = default_grid_size(num_keys); - bitvector_rank_kernel<<>>( + bitset_rank_kernel<<>>( ref(), keys_begin, outputs_begin, num_keys); } @@ -115,7 +115,7 @@ constexpr void dynamic_bitset::selects(KeyIt keys_begin, auto grid_size = default_grid_size(num_keys); - bitvector_select_kernel<<>>( + bitset_select_kernel<<>>( ref(), keys_begin, outputs_begin, num_keys); } diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh index 183617465..7152f1275 100644 --- a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh @@ -26,21 +26,18 @@ namespace detail { /* * @brief Gather bits of a range of keys * - * @tparam BitvectorRef Bitvector reference type + * @tparam BitsetRef Bitset reference type * @tparam KeyIt Device-accessible iterator to input keys * @tparam ValueIt Device-accessible iterator to values * @tparam size_type Size type * - * @param ref Bitvector ref + * @param ref Bitset ref * @param keys Begin iterator to keys * @param outputs Begin iterator to outputs * @param num_keys Number of input keys */ -template -__global__ void bitvector_get_kernel(BitvectorRef ref, - KeyIt keys, - ValueIt outputs, - size_type num_keys) +template +__global__ void bitset_get_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, size_type num_keys) { uint32_t const loop_stride = gridDim.x * blockDim.x; uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; @@ -54,21 +51,18 @@ __global__ void bitvector_get_kernel(BitvectorRef ref, /* * @brief Gather rank values for a range of keys * - * @tparam BitvectorRef Bitvector reference type + * @tparam BitsetRef Bitset reference type * @tparam KeyIt Device-accessible 
iterator to input keys * @tparam ValueIt Device-accessible iterator to values * @tparam size_type Size type * - * @param ref Bitvector ref + * @param ref Bitset ref * @param keys Begin iterator to keys * @param outputs Begin iterator to outputs * @param num_keys Number of input keys */ -template -__global__ void bitvector_rank_kernel(BitvectorRef ref, - KeyIt keys, - ValueIt outputs, - size_type num_keys) +template +__global__ void bitset_rank_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, size_type num_keys) { uint32_t const loop_stride = gridDim.x * blockDim.x; uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; @@ -82,21 +76,18 @@ __global__ void bitvector_rank_kernel(BitvectorRef ref, /* * @brief Gather select values for a range of keys * - * @tparam BitvectorRef Bitvector reference type + * @tparam BitsetRef Bitset reference type * @tparam KeyIt Device-accessible iterator to input keys * @tparam ValueIt Device-accessible iterator to values * @tparam size_type Size type * - * @param ref Bitvector ref + * @param ref Bitset ref * @param keys Begin iterator to keys * @param outputs Begin iterator to outputs * @param num_keys Number of input keys */ -template -__global__ void bitvector_select_kernel(BitvectorRef ref, - KeyIt keys, - ValueIt outputs, - size_type num_keys) +template +__global__ void bitset_select_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, size_type num_keys) { uint32_t const loop_stride = gridDim.x * blockDim.x; uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; diff --git a/tests/dynamic_bitset/find_next_set_test.cu b/tests/dynamic_bitset/find_next_set_test.cu index 6ae068361..71a918b6c 100644 --- a/tests/dynamic_bitset/find_next_set_test.cu +++ b/tests/dynamic_bitset/find_next_set_test.cu @@ -24,8 +24,8 @@ #include -template -__global__ void find_next_set_kernel(BitVectorRef ref, size_type num_elements, OutputIt output) +template +__global__ void find_next_set_kernel(BitsetRef ref, size_type num_elements, OutputIt output) { 
size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; diff --git a/tests/dynamic_bitset/get_test.cu b/tests/dynamic_bitset/get_test.cu index acbb70f7d..446e2db3c 100644 --- a/tests/dynamic_bitset/get_test.cu +++ b/tests/dynamic_bitset/get_test.cu @@ -21,8 +21,8 @@ #include #include -template -__global__ void get_kernel(BitVectorRef ref, size_type num_elements, OutputIt output) +template +__global__ void get_kernel(BitsetRef ref, size_type num_elements, OutputIt output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; diff --git a/tests/dynamic_bitset/select_test.cu b/tests/dynamic_bitset/select_test.cu index 17a6d0c02..361dc5c79 100644 --- a/tests/dynamic_bitset/select_test.cu +++ b/tests/dynamic_bitset/select_test.cu @@ -24,8 +24,8 @@ #include -template -__global__ void select_false_kernel(BitVectorRef ref, size_type num_elements, OutputIt output) +template +__global__ void select_false_kernel(BitsetRef ref, size_type num_elements, OutputIt output) { size_t index = blockIdx.x * blockDim.x + threadIdx.x; size_t stride = gridDim.x * blockDim.x; From 59457e91c5eba5f93d45563c54336a9282f26b2b Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 1 Sep 2023 02:03:39 +0000 Subject: [PATCH 81/99] Remove `get_` prefixes in method names --- .../detail/trie/dynamic_bitset/dynamic_bitset.cuh | 4 ++-- .../detail/trie/dynamic_bitset/dynamic_bitset.inl | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 5407ece48..7b3f3e68c 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -221,7 +221,7 @@ class dynamic_bitset { * * @return Word at position specified by index */ - [[nodiscard]] __device__ constexpr slot_type get_word(size_type word_id) const 
noexcept; + [[nodiscard]] __device__ constexpr slot_type word(size_type word_id) const noexcept; /** * @brief Find position of first set bit starting from a given position (inclusive) @@ -270,7 +270,7 @@ class dynamic_bitset { * @return index in ranks which corresponds to highest rank less than count (least upper bound) */ template - [[nodiscard]] __device__ constexpr size_type get_initial_rank_estimate( + [[nodiscard]] __device__ constexpr size_type initial_rank_estimate( size_type count, const SelectsRef& selects, const RanksRef& ranks) const noexcept; /** diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index 35965dbed..ae265bcba 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -222,7 +222,7 @@ __device__ constexpr bool dynamic_bitset::reference::get(size_type ke template __device__ constexpr typename dynamic_bitset::slot_type -dynamic_bitset::reference::get_word(size_type word_id) const noexcept +dynamic_bitset::reference::word(size_type word_id) const noexcept { return storage_.words_ref_[word_id]; } @@ -264,9 +264,8 @@ template __device__ constexpr typename dynamic_bitset::size_type dynamic_bitset::reference::select(size_type count) const noexcept { - auto rank_id = - get_initial_rank_estimate(count, storage_.selects_true_ref_, storage_.ranks_true_ref_); - auto rank = storage_.ranks_true_ref_[rank_id]; + auto rank_id = initial_rank_estimate(count, storage_.selects_true_ref_, storage_.ranks_true_ref_); + auto rank = storage_.ranks_true_ref_[rank_id]; size_type word_id = rank_id * words_per_block; word_id += subtract_rank_from_count(count, rank); @@ -279,7 +278,7 @@ __device__ constexpr typename dynamic_bitset::size_type dynamic_bitset::reference::select_false(size_type count) const noexcept { auto rank_id = - get_initial_rank_estimate(count, storage_.selects_false_ref_, 
storage_.ranks_false_ref_); + initial_rank_estimate(count, storage_.selects_false_ref_, storage_.ranks_false_ref_); auto rank = storage_.ranks_false_ref_[rank_id]; size_type word_id = rank_id * words_per_block; @@ -291,8 +290,9 @@ dynamic_bitset::reference::select_false(size_type count) const noexce template template __device__ constexpr typename dynamic_bitset::size_type -dynamic_bitset::reference::get_initial_rank_estimate( - size_type count, SelectsRef const& selects, RanksRef const& ranks) const noexcept +dynamic_bitset::reference::initial_rank_estimate(size_type count, + SelectsRef const& selects, + RanksRef const& ranks) const noexcept { size_type block_id = count / (bits_per_word * words_per_block); size_type begin = selects[block_id]; From 5b86c0043f4feb7e7652304cea1fa5d3e190fda7 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 1 Sep 2023 03:31:15 +0000 Subject: [PATCH 82/99] Use rank_type This will allow us to rename ranks() to rank() --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 20 ++++++++++--------- .../trie/dynamic_bitset/dynamic_bitset.inl | 2 +- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 7b3f3e68c..3ae1a0f97 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -179,18 +179,20 @@ class dynamic_bitset { OutputIt outputs_begin, cuda_stream_ref stream = {}) const noexcept; + using rank_type = cuco::experimental::detail::rank; ///< Rank type + /** *@brief Struct to hold all storage refs needed by reference */ // TODO: this is not a real ref type, to be changed struct storage_ref_type { - const slot_type* words_ref_; ///< Words refs + const slot_type* words_ref_; ///< Words ref - const rank* ranks_true_ref_; ///< Ranks refs for 1 bits - const size_type* selects_true_ref_; ///< Selects refs for 1 bits + const 
rank_type* ranks_true_ref_; ///< Ranks ref for 1 bits + const size_type* selects_true_ref_; ///< Selects ref for 1 bits - const rank* ranks_false_ref_; ///< Ranks refs for 0 bits - const size_type* selects_false_ref_; ///< Selects refs 0 bits + const rank_type* ranks_false_ref_; ///< Ranks ref for 0 bits + const size_type* selects_false_ref_; ///< Selects ref 0 bits }; /** @@ -318,7 +320,7 @@ class dynamic_bitset { private: /// Type of the allocator to (de)allocate ranks - using rank_allocator_type = typename std::allocator_traits::rebind_alloc; + using rank_allocator_type = typename std::allocator_traits::rebind_alloc; /// Type of the allocator to (de)allocate indices using size_allocator_type = typename std::allocator_traits::rebind_alloc; @@ -328,9 +330,9 @@ class dynamic_bitset { /// Words vector that represents all bits thrust::device_vector words_; /// Rank values for every 256-th bit (4-th word) - thrust::device_vector ranks_true_; + thrust::device_vector ranks_true_; /// Same as ranks_ but for `0` bits - thrust::device_vector ranks_false_; + thrust::device_vector ranks_false_; /// Block indices of (0, 256, 512...)th `1` bit thrust::device_vector selects_true_; /// Same as selects_, but for `0` bits @@ -344,7 +346,7 @@ class dynamic_bitset { * @param flip_bits If true, negate bits to construct indexes for `0` bits */ constexpr void build_ranks_and_selects( - thrust::device_vector& ranks, + thrust::device_vector& ranks, thrust::device_vector& selects, bool flip_bits) noexcept; diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index ae265bcba..052b13a5a 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -121,7 +121,7 @@ constexpr void dynamic_bitset::selects(KeyIt keys_begin, template constexpr void dynamic_bitset::build_ranks_and_selects( - thrust::device_vector& ranks, + 
thrust::device_vector& ranks, thrust::device_vector& selects, bool flip_bits) noexcept { From c753f1082bf34b074023d6c80fe1e6ad59d6d075 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 1 Sep 2023 03:33:35 +0000 Subject: [PATCH 83/99] Rename bulk API methods ranks -> rank selects -> select --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 16 ++++++++-------- .../trie/dynamic_bitset/dynamic_bitset.inl | 16 ++++++++-------- tests/dynamic_bitset/rank_test.cu | 2 +- tests/dynamic_bitset/select_test.cu | 2 +- 4 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 3ae1a0f97..047e124e9 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -157,10 +157,10 @@ class dynamic_bitset { * @param stream Stream to execute ranks kernel */ template - constexpr void ranks(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream = {}) const noexcept; + constexpr void rank(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) const noexcept; /** * @brief Bulk select operation @@ -174,10 +174,10 @@ class dynamic_bitset { * @param stream Stream to execute selects kernel */ template - constexpr void selects(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream = {}) const noexcept; + constexpr void select(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) const noexcept; using rank_type = cuco::experimental::detail::rank; ///< Rank type diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index 052b13a5a..1caf0506b 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ 
b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -87,10 +87,10 @@ constexpr void dynamic_bitset::get(KeyIt keys_begin, template template -constexpr void dynamic_bitset::ranks(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +constexpr void dynamic_bitset::rank(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -104,10 +104,10 @@ constexpr void dynamic_bitset::ranks(KeyIt keys_begin, template template -constexpr void dynamic_bitset::selects(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +constexpr void dynamic_bitset::select(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); diff --git a/tests/dynamic_bitset/rank_test.cu b/tests/dynamic_bitset/rank_test.cu index e7359a32b..4a5e5c424 100644 --- a/tests/dynamic_bitset/rank_test.cu +++ b/tests/dynamic_bitset/rank_test.cu @@ -43,7 +43,7 @@ TEST_CASE("Rank test", "") thrust::device_vector d_ranks(num_elements); - bv.ranks(keys.begin(), keys.end(), d_ranks.begin()); + bv.rank(keys.begin(), keys.end(), d_ranks.begin()); thrust::host_vector h_ranks = d_ranks; diff --git a/tests/dynamic_bitset/select_test.cu b/tests/dynamic_bitset/select_test.cu index 361dc5c79..59aded87c 100644 --- a/tests/dynamic_bitset/select_test.cu +++ b/tests/dynamic_bitset/select_test.cu @@ -59,7 +59,7 @@ TEST_CASE("Select test", "") thrust::device_vector d_selects(num_set); - bv.selects(keys.begin(), keys.end(), d_selects.begin()); + bv.select(keys.begin(), keys.end(), d_selects.begin()); thrust::host_vector h_selects = d_selects; From b26f326b6929531b6517305031e8c816402fd05a Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 1 Sep 2023 04:26:44 +0000 Subject: [PATCH 84/99] Use 
cuco::detail::index_type in kernels --- .../detail/trie/dynamic_bitset/kernels.cuh | 32 ++++++++++--------- tests/dynamic_bitset/find_next_set_test.cu | 4 +-- tests/dynamic_bitset/get_test.cu | 4 +-- tests/dynamic_bitset/select_test.cu | 4 +-- 4 files changed, 23 insertions(+), 21 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh index 7152f1275..73a435781 100644 --- a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh @@ -17,6 +17,8 @@ #pragma once +#include + #include namespace cuco { @@ -39,12 +41,12 @@ namespace detail { template __global__ void bitset_get_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, size_type num_keys) { - uint32_t const loop_stride = gridDim.x * blockDim.x; - uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; + cuco::detail::index_type key_id = blockDim.x * blockIdx.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; while (key_id < num_keys) { outputs[key_id] = ref.get(keys[key_id]); - key_id += loop_stride; + key_id += stride; } } @@ -64,12 +66,12 @@ __global__ void bitset_get_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, si template __global__ void bitset_rank_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, size_type num_keys) { - uint32_t const loop_stride = gridDim.x * blockDim.x; - uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; + cuco::detail::index_type key_id = blockDim.x * blockIdx.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; while (key_id < num_keys) { outputs[key_id] = ref.rank(keys[key_id]); - key_id += loop_stride; + key_id += stride; } } @@ -89,12 +91,12 @@ __global__ void bitset_rank_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, s template __global__ void bitset_select_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, size_type num_keys) { - uint32_t const loop_stride = gridDim.x * blockDim.x; - 
uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; + cuco::detail::index_type key_id = blockDim.x * blockIdx.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; while (key_id < num_keys) { outputs[key_id] = ref.select(keys[key_id]); - key_id += loop_stride; + key_id += stride; } } @@ -115,8 +117,8 @@ __global__ void bit_counts_kernel(const slot_type* words, size_type num_words, bool flip_bits) { - size_type word_id = blockDim.x * blockIdx.x + threadIdx.x; - size_type stride = gridDim.x * blockDim.x; + cuco::detail::index_type word_id = blockDim.x * blockIdx.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; while (word_id < num_words) { auto word = words[word_id]; @@ -148,8 +150,8 @@ __global__ void encode_ranks_from_prefix_bit_counts(const size_type* prefix_bit_ size_type num_blocks, size_type words_per_block) { - size_type rank_id = blockDim.x * blockIdx.x + threadIdx.x; - size_type stride = gridDim.x * blockDim.x; + cuco::detail::index_type rank_id = blockDim.x * blockIdx.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; while (rank_id < num_blocks) { size_type word_id = rank_id * words_per_block; @@ -191,8 +193,8 @@ __global__ void mark_blocks_with_select_entries(const size_type* prefix_bit_coun size_type words_per_block, size_type bits_per_block) { - size_type block_id = blockDim.x * blockIdx.x + threadIdx.x; - size_type stride = gridDim.x * blockDim.x; + cuco::detail::index_type block_id = blockDim.x * blockIdx.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; while (block_id < num_blocks) { if (block_id == 0) { // Block 0 always has a selects entry diff --git a/tests/dynamic_bitset/find_next_set_test.cu b/tests/dynamic_bitset/find_next_set_test.cu index 71a918b6c..1d3190427 100644 --- a/tests/dynamic_bitset/find_next_set_test.cu +++ b/tests/dynamic_bitset/find_next_set_test.cu @@ -27,8 +27,8 @@ template __global__ void find_next_set_kernel(BitsetRef 
ref, size_type num_elements, OutputIt output) { - size_t index = blockIdx.x * blockDim.x + threadIdx.x; - size_t stride = gridDim.x * blockDim.x; + cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; while (index < num_elements) { output[index] = ref.find_next_set(index); index += stride; diff --git a/tests/dynamic_bitset/get_test.cu b/tests/dynamic_bitset/get_test.cu index 446e2db3c..6a8fbcfd3 100644 --- a/tests/dynamic_bitset/get_test.cu +++ b/tests/dynamic_bitset/get_test.cu @@ -24,8 +24,8 @@ template __global__ void get_kernel(BitsetRef ref, size_type num_elements, OutputIt output) { - size_t index = blockIdx.x * blockDim.x + threadIdx.x; - size_t stride = gridDim.x * blockDim.x; + cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; while (index < num_elements) { output[index] = ref.get(index); index += stride; diff --git a/tests/dynamic_bitset/select_test.cu b/tests/dynamic_bitset/select_test.cu index 59aded87c..f352a4a59 100644 --- a/tests/dynamic_bitset/select_test.cu +++ b/tests/dynamic_bitset/select_test.cu @@ -27,8 +27,8 @@ template __global__ void select_false_kernel(BitsetRef ref, size_type num_elements, OutputIt output) { - size_t index = blockIdx.x * blockDim.x + threadIdx.x; - size_t stride = gridDim.x * blockDim.x; + cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; + cuco::detail::index_type stride = gridDim.x * blockDim.x; while (index < num_elements) { output[index] = ref.select_false(index); index += stride; From f96468df7b03a5ad8dd40eba9463f74fceb09142 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 1 Sep 2023 04:47:10 +0000 Subject: [PATCH 85/99] Change some API to match boost dynamic_bitset --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 20 +++++++++---------- .../trie/dynamic_bitset/dynamic_bitset.inl | 14 ++++++------- 
.../detail/trie/dynamic_bitset/kernels.cuh | 6 +++--- tests/dynamic_bitset/find_next_set_test.cu | 6 +++--- tests/dynamic_bitset/get_test.cu | 16 +++++++-------- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 047e124e9..26f5a565e 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -69,7 +69,7 @@ struct rank { /** * @brief Bitset class with rank and select index structures * - * In addition to standard bitset get/set operations, this class provides + * In addition to standard bitset set/test operations, this class provides * rank and select operation API. It maintains index structures to make both these * new operations close to constant time. * @@ -129,21 +129,21 @@ class dynamic_bitset { constexpr void build() noexcept; /** - * @brief Bulk get operation + * @brief Bulk test operation * * @tparam KeyIt Device-accessible iterator to keys * @tparam OutputIt Device-accessible iterator to outputs * * @param keys_begin Begin iterator to keys list whose values are queried * @param keys_end End iterator to keys list - * @param outputs_begin Begin iterator to outputs of get operation - * @param stream Stream to execute get kernel + * @param outputs_begin Begin iterator to outputs of test operation + * @param stream Stream to execute test kernel */ template - constexpr void get(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream = {}) const noexcept; + constexpr void test(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) const noexcept; /** * @brief Bulk rank operation @@ -214,7 +214,7 @@ class dynamic_bitset { * * @return Value of bit at position specified by key */ - [[nodiscard]] __device__ constexpr bool get(size_type key) const noexcept; + [[nodiscard]] __device__ 
constexpr bool test(size_type key) const noexcept; /** * @brief Access a single word of internal storage @@ -232,7 +232,7 @@ class dynamic_bitset { * * @return Index of next set bit */ - [[nodiscard]] __device__ size_type find_next_set(size_type key) const noexcept; + [[nodiscard]] __device__ size_type find_next(size_type key) const noexcept; /** * @brief Find number of set bits (rank) in all positions before the input position (exclusive) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index 1caf0506b..7fb57cbc8 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -70,10 +70,10 @@ constexpr void dynamic_bitset::set_last(bool bit) noexcept template template -constexpr void dynamic_bitset::get(KeyIt keys_begin, - KeyIt keys_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +constexpr void dynamic_bitset::test(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); @@ -81,7 +81,7 @@ constexpr void dynamic_bitset::get(KeyIt keys_begin, auto grid_size = default_grid_size(num_keys); - bitset_get_kernel<<>>( + bitset_test_kernel<<>>( ref(), keys_begin, outputs_begin, num_keys); } @@ -215,7 +215,7 @@ __host__ __device__ constexpr dynamic_bitset::reference::reference( } template -__device__ constexpr bool dynamic_bitset::reference::get(size_type key) const noexcept +__device__ constexpr bool dynamic_bitset::reference::test(size_type key) const noexcept { return (storage_.words_ref_[key / bits_per_word] >> (key % bits_per_word)) & 1UL; } @@ -229,7 +229,7 @@ dynamic_bitset::reference::word(size_type word_id) const noexcept template __device__ typename dynamic_bitset::size_type -dynamic_bitset::reference::find_next_set(size_type key) const noexcept 
+dynamic_bitset::reference::find_next(size_type key) const noexcept { size_type word_id = key / bits_per_word; size_type bit_id = key % bits_per_word; diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh index 73a435781..963e61aa3 100644 --- a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh @@ -26,7 +26,7 @@ namespace experimental { namespace detail { /* - * @brief Gather bits of a range of keys + * @brief Test bits for a range of keys * * @tparam BitsetRef Bitset reference type * @tparam KeyIt Device-accessible iterator to input keys @@ -39,13 +39,13 @@ namespace detail { * @param num_keys Number of input keys */ template -__global__ void bitset_get_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, size_type num_keys) +__global__ void bitset_test_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, size_type num_keys) { cuco::detail::index_type key_id = blockDim.x * blockIdx.x + threadIdx.x; cuco::detail::index_type stride = gridDim.x * blockDim.x; while (key_id < num_keys) { - outputs[key_id] = ref.get(keys[key_id]); + outputs[key_id] = ref.test(keys[key_id]); key_id += stride; } } diff --git a/tests/dynamic_bitset/find_next_set_test.cu b/tests/dynamic_bitset/find_next_set_test.cu index 1d3190427..e36fa58ea 100644 --- a/tests/dynamic_bitset/find_next_set_test.cu +++ b/tests/dynamic_bitset/find_next_set_test.cu @@ -25,12 +25,12 @@ #include template -__global__ void find_next_set_kernel(BitsetRef ref, size_type num_elements, OutputIt output) +__global__ void find_next_kernel(BitsetRef ref, size_type num_elements, OutputIt output) { cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; cuco::detail::index_type stride = gridDim.x * blockDim.x; while (index < num_elements) { - output[index] = ref.find_next_set(index); + output[index] = ref.find_next(index); index += stride; } } @@ -51,7 +51,7 @@ TEST_CASE("Find next set 
test", "") thrust::device_vector device_result(num_elements); auto ref = bv.ref(); - find_next_set_kernel<<<1, 1024>>>(ref, num_elements, device_result.data()); + find_next_kernel<<<1, 1024>>>(ref, num_elements, device_result.data()); thrust::host_vector host_result = device_result; size_type num_matches = 0; diff --git a/tests/dynamic_bitset/get_test.cu b/tests/dynamic_bitset/get_test.cu index 6a8fbcfd3..e785bc65d 100644 --- a/tests/dynamic_bitset/get_test.cu +++ b/tests/dynamic_bitset/get_test.cu @@ -22,12 +22,12 @@ #include template -__global__ void get_kernel(BitsetRef ref, size_type num_elements, OutputIt output) +__global__ void test_kernel(BitsetRef ref, size_type num_elements, OutputIt output) { cuco::detail::index_type index = blockIdx.x * blockDim.x + threadIdx.x; cuco::detail::index_type stride = gridDim.x * blockDim.x; while (index < num_elements) { - output[index] = ref.get(index); + output[index] = ref.test(index); index += stride; } } @@ -50,19 +50,19 @@ TEST_CASE("Get test", "") // Device-ref test auto ref = bv.ref(); - thrust::device_vector get_result(num_elements); - get_kernel<<<1, 1024>>>(ref, num_elements, get_result.data()); + thrust::device_vector test_result(num_elements); + test_kernel<<<1, 1024>>>(ref, num_elements, test_result.data()); - size_type num_set = thrust::reduce(thrust::device, get_result.begin(), get_result.end(), 0); + size_type num_set = thrust::reduce(thrust::device, test_result.begin(), test_result.end(), 0); REQUIRE(num_set == num_set_ref); // Host-bulk test thrust::device_vector keys(num_elements); thrust::sequence(keys.begin(), keys.end(), 0); - thrust::fill(get_result.begin(), get_result.end(), 0); + thrust::fill(test_result.begin(), test_result.end(), 0); - bv.get(keys.begin(), keys.end(), get_result.begin()); + bv.test(keys.begin(), keys.end(), test_result.begin()); - num_set = thrust::reduce(thrust::device, get_result.begin(), get_result.end(), 0); + num_set = thrust::reduce(thrust::device, test_result.begin(), 
test_result.end(), 0); REQUIRE(num_set == num_set_ref); } From 9f452c029dc4ccb7923d051a8194eca8aed648d7 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 1 Sep 2023 04:49:01 +0000 Subject: [PATCH 86/99] Rename file to match previous API change --- tests/CMakeLists.txt | 2 +- .../dynamic_bitset/{find_next_set_test.cu => find_next_test.cu} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename tests/dynamic_bitset/{find_next_set_test.cu => find_next_test.cu} (100%) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2b99515d0..7e331773b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -99,7 +99,7 @@ ConfigureTest(STATIC_MULTIMAP_TEST ################################################################################################### # - dynamic_bitset tests -------------------------------------------------------------------------- ConfigureTest(DYNAMIC_BITSET_TEST - dynamic_bitset/find_next_set_test.cu + dynamic_bitset/find_next_test.cu dynamic_bitset/get_test.cu dynamic_bitset/rank_test.cu dynamic_bitset/select_test.cu diff --git a/tests/dynamic_bitset/find_next_set_test.cu b/tests/dynamic_bitset/find_next_test.cu similarity index 100% rename from tests/dynamic_bitset/find_next_set_test.cu rename to tests/dynamic_bitset/find_next_test.cu From 0f1db0fae4cdedfcf7605f996400c647123ea13b Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 1 Sep 2023 05:10:52 +0000 Subject: [PATCH 87/99] More API changes --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 22 +++++++++---------- .../trie/dynamic_bitset/dynamic_bitset.inl | 2 +- tests/dynamic_bitset/find_next_test.cu | 2 +- tests/dynamic_bitset/get_test.cu | 2 +- tests/dynamic_bitset/rank_test.cu | 2 +- tests/dynamic_bitset/select_test.cu | 2 +- tests/dynamic_bitset/size_test.cu | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 
26f5a565e..56a3a8dda 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -100,28 +100,28 @@ class dynamic_bitset { constexpr dynamic_bitset(Allocator const& allocator = Allocator{}); /** - * @brief adds a new bit at the end + * @brief Appends the given element `value` to the end of the bitset * - * Grows internal storage if needed + * This API may involve data reallocation if the current storage is exhausted. * - * @param bit Boolean value of new bit to be added + * @param value Boolean value of the new bit to be added */ - constexpr void append(bool bit) noexcept; + constexpr void push_back(bool value) noexcept; /** - * @brief Modifies a single bit + * @brief Sets the target bit indexed by `index` to a specified `value`. * - * @param index position of bit to be modified - * @param bit new value of bit + * @param index Position of bit to be modified + * @param value New value of the target bit */ - constexpr void set(size_type index, bool bit) noexcept; + constexpr void set(size_type index, bool value) noexcept; /** - * @brief Sets last bit to specified value + * @brief Sets the last bit to a specified value * - * @param bit new value of last bit + * @param value New value of the last bit */ - constexpr void set_last(bool bit) noexcept; + constexpr void set_last(bool value) noexcept; /** * @brief Builds indexes for rank and select diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index 7fb57cbc8..5aa17eb0e 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -41,7 +41,7 @@ constexpr dynamic_bitset::dynamic_bitset(Allocator const& allocator) } template -constexpr void dynamic_bitset::append(bool bit) noexcept +constexpr void dynamic_bitset::push_back(bool bit) noexcept { if (n_bits_ % bits_per_block == 0) { 
words_.resize(words_.size() + words_per_block); // Extend storage by one block diff --git a/tests/dynamic_bitset/find_next_test.cu b/tests/dynamic_bitset/find_next_test.cu index e36fa58ea..4b4ebd8c5 100644 --- a/tests/dynamic_bitset/find_next_test.cu +++ b/tests/dynamic_bitset/find_next_test.cu @@ -45,7 +45,7 @@ TEST_CASE("Find next set test", "") constexpr size_type num_elements{400}; for (size_type i = 0; i < num_elements; i++) { - bv.append(modulo_bitgen(i)); + bv.push_back(modulo_bitgen(i)); } bv.build(); diff --git a/tests/dynamic_bitset/get_test.cu b/tests/dynamic_bitset/get_test.cu index e785bc65d..68582a9e6 100644 --- a/tests/dynamic_bitset/get_test.cu +++ b/tests/dynamic_bitset/get_test.cu @@ -43,7 +43,7 @@ TEST_CASE("Get test", "") size_type num_set_ref = 0; for (size_type i = 0; i < num_elements; i++) { - bv.append(modulo_bitgen(i)); + bv.push_back(modulo_bitgen(i)); num_set_ref += modulo_bitgen(i); } bv.build(); diff --git a/tests/dynamic_bitset/rank_test.cu b/tests/dynamic_bitset/rank_test.cu index 4a5e5c424..15b73abdc 100644 --- a/tests/dynamic_bitset/rank_test.cu +++ b/tests/dynamic_bitset/rank_test.cu @@ -34,7 +34,7 @@ TEST_CASE("Rank test", "") constexpr size_type num_elements{4000}; for (size_type i = 0; i < num_elements; i++) { - bv.append(modulo_bitgen(i)); + bv.push_back(modulo_bitgen(i)); } bv.build(); diff --git a/tests/dynamic_bitset/select_test.cu b/tests/dynamic_bitset/select_test.cu index f352a4a59..fd4553229 100644 --- a/tests/dynamic_bitset/select_test.cu +++ b/tests/dynamic_bitset/select_test.cu @@ -46,7 +46,7 @@ TEST_CASE("Select test", "") size_type num_set = 0; for (size_type i = 0; i < num_elements; i++) { - bv.append(modulo_bitgen(i)); + bv.push_back(modulo_bitgen(i)); num_set += modulo_bitgen(i); } bv.build(); diff --git a/tests/dynamic_bitset/size_test.cu b/tests/dynamic_bitset/size_test.cu index 940050602..4b238cf26 100644 --- a/tests/dynamic_bitset/size_test.cu +++ b/tests/dynamic_bitset/size_test.cu @@ -25,7 +25,7 @@ 
TEST_CASE("Size computation", "") constexpr size_type num_elements{400}; for (size_type i = 0; i < num_elements; i++) { - bv.append(i % 2 == 0); // Alternate 0s and 1s pattern + bv.push_back(i % 2 == 0); // Alternate 0s and 1s pattern } bv.build(); From 63ed55224354aacccde9054fc3ab698e91186b22 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 1 Sep 2023 05:52:06 +0000 Subject: [PATCH 88/99] Rename slot_type to word_type --- .../detail/trie/dynamic_bitset/dynamic_bitset.cuh | 14 +++++++------- .../detail/trie/dynamic_bitset/dynamic_bitset.inl | 6 +++--- .../cuco/detail/trie/dynamic_bitset/kernels.cuh | 6 +++--- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 56a3a8dda..13a85b8bf 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -84,12 +84,12 @@ template > class dynamic_bitset { public: using size_type = std::size_t; ///< size type to specify bit index - using slot_type = uint64_t; ///< Slot type + using word_type = uint64_t; ///< word type /// Type of the allocator to (de)allocate words - using allocator_type = typename std::allocator_traits::rebind_alloc; + using allocator_type = typename std::allocator_traits::rebind_alloc; static constexpr size_type words_per_block = 4; ///< Tradeoff between space efficiency and perf. 
- static constexpr size_type bits_per_word = sizeof(slot_type) * CHAR_BIT; ///< Bits in a word + static constexpr size_type bits_per_word = sizeof(word_type) * CHAR_BIT; ///< Bits in a word static constexpr size_type bits_per_block = words_per_block * bits_per_word; ///< Trivial /** @@ -186,7 +186,7 @@ class dynamic_bitset { */ // TODO: this is not a real ref type, to be changed struct storage_ref_type { - const slot_type* words_ref_; ///< Words ref + const word_type* words_ref_; ///< Words ref const rank_type* ranks_true_ref_; ///< Ranks ref for 1 bits const size_type* selects_true_ref_; ///< Selects ref for 1 bits @@ -223,7 +223,7 @@ class dynamic_bitset { * * @return Word at position specified by index */ - [[nodiscard]] __device__ constexpr slot_type word(size_type word_id) const noexcept; + [[nodiscard]] __device__ constexpr word_type word(size_type word_id) const noexcept; /** * @brief Find position of first set bit starting from a given position (inclusive) @@ -297,7 +297,7 @@ class dynamic_bitset { * @return Position of Nth set bit */ [[nodiscard]] __device__ size_type select_bit_in_word(size_type N, - slot_type word) const noexcept; + word_type word) const noexcept; storage_ref_type storage_; ///< Non-owning storage }; @@ -328,7 +328,7 @@ class dynamic_bitset { size_type n_bits_; ///< Number of bits dynamic_bitset currently holds /// Words vector that represents all bits - thrust::device_vector words_; + thrust::device_vector words_; /// Rank values for every 256-th bit (4-th word) thrust::device_vector ranks_true_; /// Same as ranks_ but for `0` bits diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index 5aa17eb0e..83bdd8e05 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -221,7 +221,7 @@ __device__ constexpr bool dynamic_bitset::reference::test(size_type k } template -__device__ 
constexpr typename dynamic_bitset::slot_type +__device__ constexpr typename dynamic_bitset::word_type dynamic_bitset::reference::word(size_type word_id) const noexcept { return storage_.words_ref_[word_id]; @@ -233,7 +233,7 @@ dynamic_bitset::reference::find_next(size_type key) const noexcept { size_type word_id = key / bits_per_word; size_type bit_id = key % bits_per_word; - slot_type word = storage_.words_ref_[word_id]; + word_type word = storage_.words_ref_[word_id]; word &= ~(0lu) << bit_id; while (word == 0) { word = storage_.words_ref_[++word_id]; @@ -335,7 +335,7 @@ dynamic_bitset::reference::subtract_rank_from_count(size_type& count, template __device__ typename dynamic_bitset::size_type -dynamic_bitset::reference::select_bit_in_word(size_type N, slot_type word) const noexcept +dynamic_bitset::reference::select_bit_in_word(size_type N, word_type word) const noexcept { for (size_type pos = 0; pos < N; pos++) { word &= word - 1; diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh index 963e61aa3..63785ca39 100644 --- a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh @@ -103,7 +103,7 @@ __global__ void bitset_select_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, /* * @brief Computes number of set or not-set bits in each word * - * @tparam slot_type Word type + * @tparam word_type Word type * @tparam size_type Size type * * @param words Input array of words @@ -111,8 +111,8 @@ __global__ void bitset_select_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, * @param num_words Number of words * @param flip_bits Boolean to request negation of words before counting bits */ -template -__global__ void bit_counts_kernel(const slot_type* words, +template +__global__ void bit_counts_kernel(const word_type* words, size_type* bit_counts, size_type num_words, bool flip_bits) From 81b8e907e0acc19a5eabfaeeb927dd39db103739 Mon Sep 17 00:00:00 
2001 From: Anurag Mukkara Date: Fri, 1 Sep 2023 06:11:32 +0000 Subject: [PATCH 89/99] Specify iterator's value_type in doxygen comments --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 13a85b8bf..351ea6faf 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -131,8 +131,10 @@ class dynamic_bitset { /** * @brief Bulk test operation * - * @tparam KeyIt Device-accessible iterator to keys - * @tparam OutputIt Device-accessible iterator to outputs + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean + * type * * @param keys_begin Begin iterator to keys list whose values are queried * @param keys_end End iterator to keys list @@ -148,8 +150,10 @@ class dynamic_bitset { /** * @brief Bulk rank operation * - * @tparam KeyIt Device-accessible iterator to keys - * @tparam OutputIt Device-accessible iterator to output ranks + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` * * @param keys_begin Begin iterator to keys list whose ranks are queried * @param keys_end End iterator to keys list @@ -165,8 +169,10 @@ class dynamic_bitset { /** * @brief Bulk select operation * - * @tparam KeyIt Device-accessible iterator to keys - * @tparam OutputIt Device-accessible iterator to outputs + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be 
constructed from bitset's + * `size_type` * * @param keys_begin Begin iterator to keys list whose select values are queried * @param keys_end End iterator to keys list From b2c88ded9df8bfbd45a98be9f88c3c9e51bcaffc Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 1 Sep 2023 06:38:19 +0000 Subject: [PATCH 90/99] Comments --- .../cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 351ea6faf..92c40c481 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -129,7 +129,8 @@ class dynamic_bitset { constexpr void build() noexcept; /** - * @brief Bulk test operation + * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the + * boolean value at position `keys_begin[i]` to `output_begin[i]`. * * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's * `size_type` @@ -148,7 +149,8 @@ class dynamic_bitset { cuda_stream_ref stream = {}) const noexcept; /** - * @brief Bulk rank operation + * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores total + * count of `1` bits preceeding (but not including) position `keys_begin[i]` to `output_begin[i]`. * * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's * `size_type` @@ -167,7 +169,8 @@ class dynamic_bitset { cuda_stream_ref stream = {}) const noexcept; /** - * @brief Bulk select operation + * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the + * position of `keys_begin[i]`th `1` bit to `output_begin[i]`. 
* * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's * `size_type` From 37ebd0c3c652f22328eb654d1b8ccf4e73d08d62 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 6 Sep 2023 10:23:20 -0700 Subject: [PATCH 91/99] Use detail CUDA utilities to determine grid size --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 10 ------ .../trie/dynamic_bitset/dynamic_bitset.inl | 33 +++++++------------ 2 files changed, 12 insertions(+), 31 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 92c40c481..edc4ac3f6 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -358,16 +358,6 @@ class dynamic_bitset { thrust::device_vector& ranks, thrust::device_vector& selects, bool flip_bits) noexcept; - - /** - * @brief Helper function to calculate grid size for simple kernels - * - * @param num_elements Elements being processed by kernel - * - * @return grid size - */ - // TODO: to be moved to the CUDA utility header - constexpr size_type default_grid_size(size_type num_elements) const noexcept; }; } // namespace detail diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index 83bdd8e05..4342c2fa4 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -16,8 +16,7 @@ */ #include -#include -#include +#include #include #include @@ -79,9 +78,9 @@ constexpr void dynamic_bitset::test(KeyIt keys_begin, auto const num_keys = cuco::detail::distance(keys_begin, keys_end); if (num_keys == 0) { return; } - auto grid_size = default_grid_size(num_keys); + auto const grid_size = cuco::detail::grid_size(num_keys); - bitset_test_kernel<<>>( + bitset_test_kernel<<>>( ref(), keys_begin, outputs_begin, num_keys); } @@ 
-91,14 +90,13 @@ constexpr void dynamic_bitset::rank(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, cuda_stream_ref stream) const noexcept - { auto const num_keys = cuco::detail::distance(keys_begin, keys_end); if (num_keys == 0) { return; } - auto grid_size = default_grid_size(num_keys); + auto const grid_size = cuco::detail::grid_size(num_keys); - bitset_rank_kernel<<>>( + bitset_rank_kernel<<>>( ref(), keys_begin, outputs_begin, num_keys); } @@ -113,9 +111,9 @@ constexpr void dynamic_bitset::select(KeyIt keys_begin, auto const num_keys = cuco::detail::distance(keys_begin, keys_end); if (num_keys == 0) { return; } - auto grid_size = default_grid_size(num_keys); + auto const grid_size = cuco::detail::grid_size(num_keys); - bitset_select_kernel<<>>( + bitset_select_kernel<<>>( ref(), keys_begin, outputs_begin, num_keys); } @@ -132,8 +130,8 @@ constexpr void dynamic_bitset::build_ranks_and_selects( // Sized to have one extra entry for subsequent prefix sum size_type num_words = words_.size(); thrust::device_vector bit_counts(num_words + 1); - auto grid_size = default_grid_size(num_words); - bit_counts_kernel<<>>( + auto grid_size = cuco::detail::grid_size(num_words); + bit_counts_kernel<<>>( thrust::raw_pointer_cast(words_.data()), thrust::raw_pointer_cast(bit_counts.data()), num_words, @@ -145,8 +143,8 @@ constexpr void dynamic_bitset::build_ranks_and_selects( size_type num_blocks = (num_words - 1) / words_per_block + 2; ranks.resize(num_blocks); - grid_size = default_grid_size(num_blocks); - encode_ranks_from_prefix_bit_counts<<>>( + grid_size = cuco::detail::grid_size(num_blocks); + encode_ranks_from_prefix_bit_counts<<>>( thrust::raw_pointer_cast(bit_counts.data()), thrust::raw_pointer_cast(ranks.data()), num_words, @@ -155,7 +153,7 @@ constexpr void dynamic_bitset::build_ranks_and_selects( // Step 3. 
Compute selects thrust::device_vector select_markers(num_blocks); - mark_blocks_with_select_entries<<>>( + mark_blocks_with_select_entries<<>>( thrust::raw_pointer_cast(bit_counts.data()), thrust::raw_pointer_cast(select_markers.data()), num_blocks, @@ -198,13 +196,6 @@ constexpr dynamic_bitset::size_type dynamic_bitset::size() return n_bits_; } -template -constexpr dynamic_bitset::size_type dynamic_bitset::default_grid_size( - size_type num_elements) const noexcept -{ - return (num_elements - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE) + 1; -} - // Device reference implementations template From a629730842d08d41696b673f7031a3480a8b7465 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 6 Sep 2023 17:32:50 +0000 Subject: [PATCH 92/99] Make build() a private member Query methods will check and perform build() --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 21 ++++++++++--------- .../trie/dynamic_bitset/dynamic_bitset.inl | 18 +++++++++++----- tests/dynamic_bitset/find_next_test.cu | 1 - tests/dynamic_bitset/get_test.cu | 19 +++++++++-------- tests/dynamic_bitset/rank_test.cu | 1 - tests/dynamic_bitset/select_test.cu | 3 +-- tests/dynamic_bitset/size_test.cu | 1 - 7 files changed, 35 insertions(+), 29 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 92c40c481..c502d730c 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -123,11 +123,6 @@ class dynamic_bitset { */ constexpr void set_last(bool value) noexcept; - /** - * @brief Builds indexes for rank and select - */ - constexpr void build() noexcept; - /** * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the * boolean value at position `keys_begin[i]` to `output_begin[i]`. 
@@ -146,7 +141,7 @@ class dynamic_bitset { constexpr void test(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream = {}) const noexcept; + cuda_stream_ref stream = {}) noexcept; /** * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores total @@ -166,7 +161,7 @@ class dynamic_bitset { constexpr void rank(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream = {}) const noexcept; + cuda_stream_ref stream = {}) noexcept; /** * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the @@ -186,7 +181,7 @@ class dynamic_bitset { constexpr void select(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream = {}) const noexcept; + cuda_stream_ref stream = {}) noexcept; using rank_type = cuco::experimental::detail::rank; ///< Rank type @@ -335,6 +330,7 @@ class dynamic_bitset { allocator_type allocator_; ///< Words allocator size_type n_bits_; ///< Number of bits dynamic_bitset currently holds + bool is_built_; ///< Flag indicating whether the rank and select indices are built or not /// Words vector that represents all bits thrust::device_vector words_; @@ -348,11 +344,16 @@ class dynamic_bitset { thrust::device_vector selects_false_; /** - * @brief Populates rank and select indexes on device + * @brief Builds indexes for rank and select + */ + constexpr void build() noexcept; + + /** + * @brief Populates rank and select indexes for true or false bits * * @param ranks Output array of ranks * @param selects Output array of selects - * @param flip_bits If true, negate bits to construct indexes for `0` bits + * @param flip_bits If true, negate bits to construct indexes for false bits */ constexpr void build_ranks_and_selects( thrust::device_vector& ranks, diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index 83bdd8e05..105cdcb31 100644 --- 
a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -32,6 +32,7 @@ template constexpr dynamic_bitset::dynamic_bitset(Allocator const& allocator) : allocator_{allocator}, n_bits_{0}, + is_built_{false}, words_{allocator}, ranks_true_{allocator}, ranks_false_{allocator}, @@ -53,6 +54,7 @@ constexpr void dynamic_bitset::push_back(bool bit) noexcept template constexpr void dynamic_bitset::set(size_type index, bool bit) noexcept { + is_built_ = false; size_type word_id = index / bits_per_word; size_type bit_id = index % bits_per_word; if (bit) { @@ -73,9 +75,10 @@ template constexpr void dynamic_bitset::test(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept + cuda_stream_ref stream) noexcept { + build(); auto const num_keys = cuco::detail::distance(keys_begin, keys_end); if (num_keys == 0) { return; } @@ -90,9 +93,10 @@ template constexpr void dynamic_bitset::rank(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept + cuda_stream_ref stream) noexcept { + build(); auto const num_keys = cuco::detail::distance(keys_begin, keys_end); if (num_keys == 0) { return; } @@ -107,9 +111,10 @@ template constexpr void dynamic_bitset::select(KeyIt keys_begin, KeyIt keys_end, OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept + cuda_stream_ref stream) noexcept { + build(); auto const num_keys = cuco::detail::distance(keys_begin, keys_end); if (num_keys == 0) { return; } @@ -178,8 +183,11 @@ constexpr void dynamic_bitset::build_ranks_and_selects( template constexpr void dynamic_bitset::build() noexcept { - build_ranks_and_selects(ranks_true_, selects_true_, false); // 1 bits - build_ranks_and_selects(ranks_false_, selects_false_, true); // 0 bits + if (not is_built_) { + build_ranks_and_selects(ranks_true_, selects_true_, false); // 1 bits + build_ranks_and_selects(ranks_false_, selects_false_, true); // 
0 bits + is_built_ = true; + } } template diff --git a/tests/dynamic_bitset/find_next_test.cu b/tests/dynamic_bitset/find_next_test.cu index 4b4ebd8c5..97ba366ea 100644 --- a/tests/dynamic_bitset/find_next_test.cu +++ b/tests/dynamic_bitset/find_next_test.cu @@ -47,7 +47,6 @@ TEST_CASE("Find next set test", "") for (size_type i = 0; i < num_elements; i++) { bv.push_back(modulo_bitgen(i)); } - bv.build(); thrust::device_vector device_result(num_elements); auto ref = bv.ref(); diff --git a/tests/dynamic_bitset/get_test.cu b/tests/dynamic_bitset/get_test.cu index 68582a9e6..10f81a116 100644 --- a/tests/dynamic_bitset/get_test.cu +++ b/tests/dynamic_bitset/get_test.cu @@ -46,23 +46,24 @@ TEST_CASE("Get test", "") bv.push_back(modulo_bitgen(i)); num_set_ref += modulo_bitgen(i); } - bv.build(); - - // Device-ref test - auto ref = bv.ref(); - thrust::device_vector test_result(num_elements); - test_kernel<<<1, 1024>>>(ref, num_elements, test_result.data()); - - size_type num_set = thrust::reduce(thrust::device, test_result.begin(), test_result.end(), 0); - REQUIRE(num_set == num_set_ref); // Host-bulk test thrust::device_vector keys(num_elements); thrust::sequence(keys.begin(), keys.end(), 0); + + thrust::device_vector test_result(num_elements); thrust::fill(test_result.begin(), test_result.end(), 0); bv.test(keys.begin(), keys.end(), test_result.begin()); + size_type num_set = thrust::reduce(thrust::device, test_result.begin(), test_result.end(), 0); + REQUIRE(num_set == num_set_ref); + + // Device-ref test + auto ref = bv.ref(); + thrust::fill(test_result.begin(), test_result.end(), 0); + test_kernel<<<1, 1024>>>(ref, num_elements, test_result.data()); + num_set = thrust::reduce(thrust::device, test_result.begin(), test_result.end(), 0); REQUIRE(num_set == num_set_ref); } diff --git a/tests/dynamic_bitset/rank_test.cu b/tests/dynamic_bitset/rank_test.cu index 15b73abdc..3b4d17cca 100644 --- a/tests/dynamic_bitset/rank_test.cu +++ b/tests/dynamic_bitset/rank_test.cu @@ 
-36,7 +36,6 @@ TEST_CASE("Rank test", "") for (size_type i = 0; i < num_elements; i++) { bv.push_back(modulo_bitgen(i)); } - bv.build(); thrust::device_vector keys(num_elements); thrust::sequence(keys.begin(), keys.end(), 0); diff --git a/tests/dynamic_bitset/select_test.cu b/tests/dynamic_bitset/select_test.cu index fd4553229..3dc0d74da 100644 --- a/tests/dynamic_bitset/select_test.cu +++ b/tests/dynamic_bitset/select_test.cu @@ -49,8 +49,6 @@ TEST_CASE("Select test", "") bv.push_back(modulo_bitgen(i)); num_set += modulo_bitgen(i); } - bv.build(); - auto ref = bv.ref(); // Check select { @@ -79,6 +77,7 @@ TEST_CASE("Select test", "") { size_type num_not_set = num_elements - num_set; + auto ref = bv.ref(); thrust::device_vector device_result(num_not_set); select_false_kernel<<<1, 1024>>>(ref, num_not_set, device_result.data()); thrust::host_vector host_result = device_result; diff --git a/tests/dynamic_bitset/size_test.cu b/tests/dynamic_bitset/size_test.cu index 4b238cf26..611159dc3 100644 --- a/tests/dynamic_bitset/size_test.cu +++ b/tests/dynamic_bitset/size_test.cu @@ -27,7 +27,6 @@ TEST_CASE("Size computation", "") for (size_type i = 0; i < num_elements; i++) { bv.push_back(i % 2 == 0); // Alternate 0s and 1s pattern } - bv.build(); auto size = bv.size(); REQUIRE(size == num_elements); From 9a3018decbca383474dcafa6417e5668e994a823 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 8 Sep 2023 11:02:44 -0700 Subject: [PATCH 93/99] Minor doc updates --- .../detail/trie/dynamic_bitset/dynamic_bitset.cuh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index fb3cc23a5..9d9259b06 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -32,7 +32,7 @@ namespace experimental { namespace detail { /** - * @brief Struct to store ranks of 
bits at 256-bit intervals + * @brief Struct to store ranks of bits at 256-bit intervals (or blocks) * * This struct encodes a list of four rank values using base + offset format * e.g. [1000, 1005, 1006, 1009] is stored as base = 1000, offsets = [5, 6, 9] @@ -88,9 +88,12 @@ class dynamic_bitset { /// Type of the allocator to (de)allocate words using allocator_type = typename std::allocator_traits::rebind_alloc; - static constexpr size_type words_per_block = 4; ///< Tradeoff between space efficiency and perf. - static constexpr size_type bits_per_word = sizeof(word_type) * CHAR_BIT; ///< Bits in a word - static constexpr size_type bits_per_block = words_per_block * bits_per_word; ///< Trivial + /// Number of bits per block. Note this is a tradeoff between space efficiency and perf. + static constexpr size_type words_per_block = 4; + /// Number of bits in a word + static constexpr size_type bits_per_word = sizeof(word_type) * CHAR_BIT; + /// Number of bits in a block + static constexpr size_type bits_per_block = words_per_block * bits_per_word; /** * @brief Constructs an empty bitset From e1527ec221ab74dd0bd2dcdfa7f8071a9d4edf07 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 8 Sep 2023 11:18:17 -0700 Subject: [PATCH 94/99] Clean up kernels with cuda utilities --- .../trie/dynamic_bitset/dynamic_bitset.inl | 1 + .../detail/trie/dynamic_bitset/kernels.cuh | 99 ++++++++++--------- 2 files changed, 54 insertions(+), 46 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index abfc80f85..27e00e8cf 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -17,6 +17,7 @@ #include #include +#include #include #include diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh index 63785ca39..5d79fd0d8 100644 --- 
a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh @@ -17,7 +17,8 @@ #pragma once -#include +#include +#include #include @@ -31,18 +32,20 @@ namespace detail { * @tparam BitsetRef Bitset reference type * @tparam KeyIt Device-accessible iterator to input keys * @tparam ValueIt Device-accessible iterator to values - * @tparam size_type Size type * * @param ref Bitset ref * @param keys Begin iterator to keys * @param outputs Begin iterator to outputs * @param num_keys Number of input keys */ -template -__global__ void bitset_test_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, size_type num_keys) +template +__global__ void bitset_test_kernel(BitsetRef ref, + KeyIt keys, + ValueIt outputs, + cuco::detail::index_type num_keys) { - cuco::detail::index_type key_id = blockDim.x * blockIdx.x + threadIdx.x; - cuco::detail::index_type stride = gridDim.x * blockDim.x; + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); while (key_id < num_keys) { outputs[key_id] = ref.test(keys[key_id]); @@ -56,18 +59,20 @@ __global__ void bitset_test_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, s * @tparam BitsetRef Bitset reference type * @tparam KeyIt Device-accessible iterator to input keys * @tparam ValueIt Device-accessible iterator to values - * @tparam size_type Size type * * @param ref Bitset ref * @param keys Begin iterator to keys * @param outputs Begin iterator to outputs * @param num_keys Number of input keys */ -template -__global__ void bitset_rank_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, size_type num_keys) +template +__global__ void bitset_rank_kernel(BitsetRef ref, + KeyIt keys, + ValueIt outputs, + cuco::detail::index_type num_keys) { - cuco::detail::index_type key_id = blockDim.x * blockIdx.x + threadIdx.x; - cuco::detail::index_type stride = gridDim.x * blockDim.x; + auto key_id = cuco::detail::global_thread_id(); + auto const stride = 
cuco::detail::grid_stride(); while (key_id < num_keys) { outputs[key_id] = ref.rank(keys[key_id]); @@ -81,18 +86,20 @@ __global__ void bitset_rank_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, s * @tparam BitsetRef Bitset reference type * @tparam KeyIt Device-accessible iterator to input keys * @tparam ValueIt Device-accessible iterator to values - * @tparam size_type Size type * * @param ref Bitset ref * @param keys Begin iterator to keys * @param outputs Begin iterator to outputs * @param num_keys Number of input keys */ -template -__global__ void bitset_select_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, size_type num_keys) +template +__global__ void bitset_select_kernel(BitsetRef ref, + KeyIt keys, + ValueIt outputs, + cuco::detail::index_type num_keys) { - cuco::detail::index_type key_id = blockDim.x * blockIdx.x + threadIdx.x; - cuco::detail::index_type stride = gridDim.x * blockDim.x; + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); while (key_id < num_keys) { outputs[key_id] = ref.select(keys[key_id]); @@ -103,22 +110,22 @@ __global__ void bitset_select_kernel(BitsetRef ref, KeyIt keys, ValueIt outputs, /* * @brief Computes number of set or not-set bits in each word * - * @tparam word_type Word type - * @tparam size_type Size type + * @tparam WordType Word type + * @tparam SizeType Size type * * @param words Input array of words * @param bit_counts Output array of per-word bit counts * @param num_words Number of words * @param flip_bits Boolean to request negation of words before counting bits */ -template -__global__ void bit_counts_kernel(const word_type* words, - size_type* bit_counts, - size_type num_words, +template +__global__ void bit_counts_kernel(WordType const* words, + SizeType* bit_counts, + cuco::detail::index_type num_words, bool flip_bits) { - cuco::detail::index_type word_id = blockDim.x * blockIdx.x + threadIdx.x; - cuco::detail::index_type stride = gridDim.x * blockDim.x; + 
auto word_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); while (word_id < num_words) { auto word = words[word_id]; @@ -135,7 +142,7 @@ __global__ void bit_counts_kernel(const word_type* words, * into base-delta encoding style of `rank` struct. * Since prefix sum is available, there are no dependencies across blocks. - * @tparam size_type Size type + * @tparam SizeType Size type * * @param prefix_bit_counts Prefix sum array of per-word bit counts * @param ranks Output array of ranks @@ -143,18 +150,18 @@ __global__ void bit_counts_kernel(const word_type* words, * @param num_blocks Length of ouput array * @param words_per_block Number of words in each block */ -template -__global__ void encode_ranks_from_prefix_bit_counts(const size_type* prefix_bit_counts, +template +__global__ void encode_ranks_from_prefix_bit_counts(const SizeType* prefix_bit_counts, rank* ranks, - size_type num_words, - size_type num_blocks, - size_type words_per_block) + SizeType num_words, + SizeType num_blocks, + SizeType words_per_block) { - cuco::detail::index_type rank_id = blockDim.x * blockIdx.x + threadIdx.x; - cuco::detail::index_type stride = gridDim.x * blockDim.x; + auto rank_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); while (rank_id < num_blocks) { - size_type word_id = rank_id * words_per_block; + SizeType word_id = rank_id * words_per_block; // Set base value of rank auto& rank = ranks[rank_id]; @@ -162,7 +169,7 @@ __global__ void encode_ranks_from_prefix_bit_counts(const size_type* prefix_bit_ if (rank_id < num_blocks - 1) { // For each subsequent word in this block, compute deltas from base - for (size_type block_offset = 0; block_offset < words_per_block - 1; block_offset++) { + for (SizeType block_offset = 0; block_offset < words_per_block - 1; block_offset++) { auto delta = prefix_bit_counts[word_id + block_offset + 1] - prefix_bit_counts[word_id]; rank.offsets_[block_offset] = delta; } 
@@ -178,7 +185,7 @@ __global__ void encode_ranks_from_prefix_bit_counts(const size_type* prefix_bit_ * This kernel check for blocks where prefix sum crosses a multiple of `bits_per_block`. * Such blocks are marked in the output boolean array * - * @tparam size_type Size type + * @tparam SizeType Size type * * @param prefix_bit_counts Prefix sum array of per-word bit counts * @param selects_markers Ouput array indicating whether a block has selects entry or not @@ -186,15 +193,15 @@ __global__ void encode_ranks_from_prefix_bit_counts(const size_type* prefix_bit_ * @param words_per_block Number of words in each block * @param bits_per_block Number of bits in each block */ -template -__global__ void mark_blocks_with_select_entries(const size_type* prefix_bit_counts, - size_type* select_markers, - size_type num_blocks, - size_type words_per_block, - size_type bits_per_block) +template +__global__ void mark_blocks_with_select_entries(SizeType const* prefix_bit_counts, + SizeType* select_markers, + SizeType num_blocks, + SizeType words_per_block, + SizeType bits_per_block) { - cuco::detail::index_type block_id = blockDim.x * blockIdx.x + threadIdx.x; - cuco::detail::index_type stride = gridDim.x * blockDim.x; + auto block_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); while (block_id < num_blocks) { if (block_id == 0) { // Block 0 always has a selects entry @@ -204,11 +211,11 @@ __global__ void mark_blocks_with_select_entries(const size_type* prefix_bit_coun } select_markers[block_id] = 0; // Always clear marker first - size_type word_id = block_id * words_per_block; - size_type prev_count = prefix_bit_counts[word_id]; + SizeType word_id = block_id * words_per_block; + SizeType prev_count = prefix_bit_counts[word_id]; for (size_t block_offset = 1; block_offset <= words_per_block; block_offset++) { - size_type count = prefix_bit_counts[word_id + block_offset]; + SizeType count = prefix_bit_counts[word_id + block_offset]; // 
Selects entry is added when cumulative bitcount crosses a multiple of bits_per_block if ((prev_count - 1) / bits_per_block != (count - 1) / bits_per_block) { From 4d0d78ab006c032ff48343b9c3a0efc1877798b3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 8 Sep 2023 11:22:05 -0700 Subject: [PATCH 95/99] Minor style cleanup --- include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index 27e00e8cf..6813fd86a 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -234,7 +234,7 @@ dynamic_bitset::reference::find_next(size_type key) const noexcept size_type word_id = key / bits_per_word; size_type bit_id = key % bits_per_word; word_type word = storage_.words_ref_[word_id]; - word &= ~(0lu) << bit_id; + word &= ~(0UL) << bit_id; while (word == 0) { word = storage_.words_ref_[++word_id]; } From fceb5f955bc81a4c39a20efede546f0cb8aa232e Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 8 Sep 2023 11:32:35 -0700 Subject: [PATCH 96/99] Cleanups: renaming + update docs --- .../detail/trie/dynamic_bitset/kernels.cuh | 30 +++++++++++-------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh index 5d79fd0d8..c92ab60b2 100644 --- a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh @@ -30,18 +30,20 @@ namespace detail { * @brief Test bits for a range of keys * * @tparam BitsetRef Bitset reference type - * @tparam KeyIt Device-accessible iterator to input keys - * @tparam ValueIt Device-accessible iterator to values + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * 
@tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean + * type * * @param ref Bitset ref * @param keys Begin iterator to keys * @param outputs Begin iterator to outputs * @param num_keys Number of input keys */ -template +template __global__ void bitset_test_kernel(BitsetRef ref, KeyIt keys, - ValueIt outputs, + OutputIt outputs, cuco::detail::index_type num_keys) { auto key_id = cuco::detail::global_thread_id(); @@ -57,18 +59,20 @@ __global__ void bitset_test_kernel(BitsetRef ref, * @brief Gather rank values for a range of keys * * @tparam BitsetRef Bitset reference type - * @tparam KeyIt Device-accessible iterator to input keys - * @tparam ValueIt Device-accessible iterator to values + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` * * @param ref Bitset ref * @param keys Begin iterator to keys * @param outputs Begin iterator to outputs * @param num_keys Number of input keys */ -template +template __global__ void bitset_rank_kernel(BitsetRef ref, KeyIt keys, - ValueIt outputs, + OutputIt outputs, cuco::detail::index_type num_keys) { auto key_id = cuco::detail::global_thread_id(); @@ -84,18 +88,20 @@ __global__ void bitset_rank_kernel(BitsetRef ref, * @brief Gather select values for a range of keys * * @tparam BitsetRef Bitset reference type - * @tparam KeyIt Device-accessible iterator to input keys - * @tparam ValueIt Device-accessible iterator to values + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` * * @param ref Bitset ref * @param keys Begin iterator to keys * @param outputs Begin iterator to outputs * @param num_keys Number of input keys */ -template +template __global__ 
void bitset_select_kernel(BitsetRef ref, KeyIt keys, - ValueIt outputs, + OutputIt outputs, cuco::detail::index_type num_keys) { auto key_id = cuco::detail::global_thread_id(); From adab866f0f47f377efb7efd1744e0c718cff20a7 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 8 Sep 2023 11:42:07 -0700 Subject: [PATCH 97/99] Consistently use the same allocator for intermediate vars --- include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index 6813fd86a..e5f1a246b 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -135,7 +135,7 @@ constexpr void dynamic_bitset::build_ranks_and_selects( // Population counts for each word // Sized to have one extra entry for subsequent prefix sum size_type num_words = words_.size(); - thrust::device_vector bit_counts(num_words + 1); + thrust::device_vector bit_counts(num_words + 1, this->allocator_); auto grid_size = cuco::detail::grid_size(num_words); bit_counts_kernel<<>>( thrust::raw_pointer_cast(words_.data()), @@ -158,7 +158,8 @@ constexpr void dynamic_bitset::build_ranks_and_selects( words_per_block); // Step 3. 
Compute selects - thrust::device_vector select_markers(num_blocks); + thrust::device_vector select_markers(num_blocks, + this->allocator_); mark_blocks_with_select_entries<<>>( thrust::raw_pointer_cast(bit_counts.data()), thrust::raw_pointer_cast(select_markers.data()), From 2b8851e9e34eaccea3086cbdb0f7ab8c5df4dd50 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 8 Sep 2023 14:57:42 -0700 Subject: [PATCH 98/99] Make build process exposed to CUDA stream --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 8 +- .../trie/dynamic_bitset/dynamic_bitset.inl | 111 ++++++++++++++---- 2 files changed, 91 insertions(+), 28 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 9d9259b06..8383669fc 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -348,8 +348,10 @@ class dynamic_bitset { /** * @brief Builds indexes for rank and select + * + * @param stream Stream to execute kernels */ - constexpr void build() noexcept; + constexpr void build(cuda_stream_ref stream = {}) noexcept; /** * @brief Populates rank and select indexes for true or false bits @@ -357,11 +359,13 @@ class dynamic_bitset { * @param ranks Output array of ranks * @param selects Output array of selects * @param flip_bits If true, negate bits to construct indexes for false bits + * @param stream Stream to execute kernels */ constexpr void build_ranks_and_selects( thrust::device_vector& ranks, thrust::device_vector& selects, - bool flip_bits) noexcept; + bool flip_bits, + cuda_stream_ref stream = {}); }; } // namespace detail diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index e5f1a246b..d3b7bbb5f 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -20,7 
+20,10 @@ #include #include -#include +#include + +#include +#include #include @@ -127,31 +130,50 @@ template constexpr void dynamic_bitset::build_ranks_and_selects( thrust::device_vector& ranks, thrust::device_vector& selects, - bool flip_bits) noexcept + bool flip_bits, + cuda_stream_ref stream) { if (n_bits_ == 0) { return; } // Step 1. Compute prefix sum of per-word bit counts // Population counts for each word + size_type const num_words = words_.size(); // Sized to have one extra entry for subsequent prefix sum - size_type num_words = words_.size(); + auto const bit_counts_size = num_words + 1; + thrust::device_vector bit_counts(num_words + 1, this->allocator_); + auto const bit_counts_begin = thrust::raw_pointer_cast(bit_counts.data()); + auto grid_size = cuco::detail::grid_size(num_words); - bit_counts_kernel<<>>( - thrust::raw_pointer_cast(words_.data()), - thrust::raw_pointer_cast(bit_counts.data()), - num_words, - flip_bits); + bit_counts_kernel<<>>( + thrust::raw_pointer_cast(words_.data()), bit_counts_begin, num_words, flip_bits); + + std::size_t temp_storage_bytes = 0; + using temp_allocator_type = typename std::allocator_traits::rebind_alloc; + auto temp_allocator = temp_allocator_type{this->allocator_}; - thrust::exclusive_scan(thrust::device, bit_counts.begin(), bit_counts.end(), bit_counts.begin()); + CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum( + nullptr, temp_storage_bytes, bit_counts_begin, bit_counts_begin, bit_counts_size, stream)); + + // Allocate temporary storage + auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum(thrust::raw_pointer_cast(d_temp_storage), + temp_storage_bytes, + bit_counts_begin, + bit_counts_begin, + bit_counts_size, + stream)); + + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); // Step 2. 
Compute ranks - size_type num_blocks = (num_words - 1) / words_per_block + 2; + auto const num_blocks = (num_words - 1) / words_per_block + 2; ranks.resize(num_blocks); grid_size = cuco::detail::grid_size(num_blocks); - encode_ranks_from_prefix_bit_counts<<>>( - thrust::raw_pointer_cast(bit_counts.data()), + encode_ranks_from_prefix_bit_counts<<>>( + bit_counts_begin, thrust::raw_pointer_cast(ranks.data()), num_words, num_blocks, @@ -160,32 +182,69 @@ constexpr void dynamic_bitset::build_ranks_and_selects( // Step 3. Compute selects thrust::device_vector select_markers(num_blocks, this->allocator_); - mark_blocks_with_select_entries<<>>( - thrust::raw_pointer_cast(bit_counts.data()), + mark_blocks_with_select_entries<<>>( + bit_counts_begin, thrust::raw_pointer_cast(select_markers.data()), num_blocks, words_per_block, bits_per_block); - size_type num_selects = - thrust::reduce(thrust::device, select_markers.begin(), select_markers.end()); + auto d_sum = reinterpret_cast(thrust::raw_pointer_cast( + std::allocator_traits::allocate(temp_allocator, sizeof(size_type)))); + CUCO_CUDA_TRY(cub::DeviceReduce::Sum(nullptr, + temp_storage_bytes, + thrust::raw_pointer_cast(select_markers.data()), + d_sum, + num_blocks, + stream)); + + d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceReduce::Sum(thrust::raw_pointer_cast(d_temp_storage), + temp_storage_bytes, + thrust::raw_pointer_cast(select_markers.data()), + d_sum, + num_blocks, + stream)); + + size_type num_selects{}; + CUCO_CUDA_TRY( + cudaMemcpyAsync(&num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream)); + stream.synchronize(); + std::allocator_traits::deallocate( + temp_allocator, thrust::device_ptr{reinterpret_cast(d_sum)}, sizeof(size_type)); + selects.resize(num_selects); - // Generate indices of non-zeros in select_markers - thrust::copy_if(thrust::device, - thrust::make_counting_iterator(0lu), - thrust::make_counting_iterator(num_blocks), - 
select_markers.begin(), - selects.begin(), - thrust::identity()); + CUCO_CUDA_TRY(cub::DeviceSelect::Flagged(nullptr, + temp_storage_bytes, + thrust::make_counting_iterator(0UL), + thrust::raw_pointer_cast(select_markers.data()), + thrust::raw_pointer_cast(selects.data()), + thrust::make_discard_iterator(), + num_blocks, + stream)); + + d_temp_storage = temp_allocator.allocate(temp_storage_bytes); + + CUCO_CUDA_TRY(cub::DeviceSelect::Flagged(thrust::raw_pointer_cast(d_temp_storage), + temp_storage_bytes, + thrust::make_counting_iterator(0UL), + thrust::raw_pointer_cast(select_markers.data()), + thrust::raw_pointer_cast(selects.data()), + thrust::discard_iterator(), + num_blocks, + stream)); + + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); } template -constexpr void dynamic_bitset::build() noexcept +constexpr void dynamic_bitset::build(cuda_stream_ref stream) noexcept { if (not is_built_) { - build_ranks_and_selects(ranks_true_, selects_true_, false); // 1 bits - build_ranks_and_selects(ranks_false_, selects_false_, true); // 0 bits + build_ranks_and_selects(ranks_true_, selects_true_, false, stream); // 1 bits + build_ranks_and_selects(ranks_false_, selects_false_, true, stream); // 0 bits is_built_ = true; } } From 8cf54b8e5c62797f3c595ad65358683a671374e5 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 8 Sep 2023 15:07:20 -0700 Subject: [PATCH 99/99] Cleanups + deallocate before return --- .../trie/dynamic_bitset/dynamic_bitset.inl | 31 +++++++++---------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index d3b7bbb5f..d56ef9d7c 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -182,27 +182,21 @@ constexpr void dynamic_bitset::build_ranks_and_selects( // Step 3. 
Compute selects thrust::device_vector select_markers(num_blocks, this->allocator_); + auto const select_markers_begin = thrust::raw_pointer_cast(select_markers.data()); + mark_blocks_with_select_entries<<>>( - bit_counts_begin, - thrust::raw_pointer_cast(select_markers.data()), - num_blocks, - words_per_block, - bits_per_block); + bit_counts_begin, select_markers_begin, num_blocks, words_per_block, bits_per_block); auto d_sum = reinterpret_cast(thrust::raw_pointer_cast( std::allocator_traits::allocate(temp_allocator, sizeof(size_type)))); - CUCO_CUDA_TRY(cub::DeviceReduce::Sum(nullptr, - temp_storage_bytes, - thrust::raw_pointer_cast(select_markers.data()), - d_sum, - num_blocks, - stream)); + CUCO_CUDA_TRY(cub::DeviceReduce::Sum( + nullptr, temp_storage_bytes, select_markers_begin, d_sum, num_blocks, stream)); d_temp_storage = temp_allocator.allocate(temp_storage_bytes); CUCO_CUDA_TRY(cub::DeviceReduce::Sum(thrust::raw_pointer_cast(d_temp_storage), temp_storage_bytes, - thrust::raw_pointer_cast(select_markers.data()), + select_markers_begin, d_sum, num_blocks, stream)); @@ -213,14 +207,17 @@ constexpr void dynamic_bitset::build_ranks_and_selects( stream.synchronize(); std::allocator_traits::deallocate( temp_allocator, thrust::device_ptr{reinterpret_cast(d_sum)}, sizeof(size_type)); + temp_allocator.deallocate(d_temp_storage, temp_storage_bytes); selects.resize(num_selects); + auto const select_begin = thrust::raw_pointer_cast(selects.data()); + CUCO_CUDA_TRY(cub::DeviceSelect::Flagged(nullptr, temp_storage_bytes, thrust::make_counting_iterator(0UL), - thrust::raw_pointer_cast(select_markers.data()), - thrust::raw_pointer_cast(selects.data()), + select_markers_begin, + select_begin, thrust::make_discard_iterator(), num_blocks, stream)); @@ -230,9 +227,9 @@ constexpr void dynamic_bitset::build_ranks_and_selects( CUCO_CUDA_TRY(cub::DeviceSelect::Flagged(thrust::raw_pointer_cast(d_temp_storage), temp_storage_bytes, thrust::make_counting_iterator(0UL), - 
thrust::raw_pointer_cast(select_markers.data()), - thrust::raw_pointer_cast(selects.data()), - thrust::discard_iterator(), + select_markers_begin, + select_begin, + thrust::make_discard_iterator(), num_blocks, stream));