From 9e97c6708fdc3c7f52420992184b1bfdbd2c515f Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 6 Feb 2025 18:39:04 -0800 Subject: [PATCH 1/8] Eliminate IO from bloom_filter::add benchmark --- benchmarks/bloom_filter/add_bench.cu | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/benchmarks/bloom_filter/add_bench.cu b/benchmarks/bloom_filter/add_bench.cu index 8b502d0d5..5d3d54b05 100644 --- a/benchmarks/bloom_filter/add_bench.cu +++ b/benchmarks/bloom_filter/add_bench.cu @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include @@ -61,10 +61,7 @@ void bloom_filter_add(nvbench::state& state, (filter_size_mb * 1024 * 1024) / (sizeof(typename filter_type::word_type) * filter_type::words_per_block); - thrust::device_vector keys(num_keys); - - key_generator gen; - gen.generate(dist_from_state(state), keys.begin(), keys.end()); + thrust::counting_iterator keys(0); state.add_element_count(num_keys); @@ -79,7 +76,7 @@ void bloom_filter_add(nvbench::state& state, add_fpr_summary(state, filter); state.exec([&](nvbench::launch& launch) { - filter.add_async(keys.begin(), keys.end(), {launch.get_stream()}); + filter.add_async(keys, keys + num_keys, {launch.get_stream()}); }); } @@ -106,10 +103,7 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list // configurations } - thrust::device_vector keys(num_keys); - - key_generator gen; - gen.generate(dist_from_state(state), keys.begin(), keys.end()); + thrust::counting_iterator keys(0); state.add_element_count(num_keys); @@ -124,7 +118,7 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list add_fpr_summary(state, filter); state.exec([&](nvbench::launch& launch) { - filter.add_async(keys.begin(), keys.end(), {launch.get_stream()}); + filter.add_async(keys, keys + num_keys, {launch.get_stream()}); }); } From e2bb17923a220bcf93c892c52d50ea34747b52b7 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 7 Feb 2025 15:43:28 -0800 Subject: [PATCH 2/8] Don't read benchmark input data from gmem --- benchmarks/bloom_filter/add_bench.cu | 9 +------- benchmarks/bloom_filter/contains_bench.cu | 28 +++++++---------------- 2 files changed, 9 insertions(+), 28 deletions(-) diff --git a/benchmarks/bloom_filter/add_bench.cu b/benchmarks/bloom_filter/add_bench.cu index 5d3d54b05..72322bc21 100644 --- a/benchmarks/bloom_filter/add_bench.cu +++ b/benchmarks/bloom_filter/add_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,7 +21,6 @@ #include #include -#include #include @@ -68,10 +67,7 @@ void bloom_filter_add(nvbench::state& state, filter_type filter{num_sub_filters, {}, {static_cast(pattern_bits)}}; state.collect_dram_throughput(); - state.collect_l1_hit_rates(); state.collect_l2_hit_rates(); - state.collect_loads_efficiency(); - state.collect_stores_efficiency(); add_fpr_summary(state, filter); @@ -110,10 +106,7 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list filter_type filter{num_sub_filters}; state.collect_dram_throughput(); - state.collect_l1_hit_rates(); state.collect_l2_hit_rates(); - state.collect_loads_efficiency(); - state.collect_stores_efficiency(); add_fpr_summary(state, filter); diff --git a/benchmarks/bloom_filter/contains_bench.cu b/benchmarks/bloom_filter/contains_bench.cu index 3d2ed1e54..1eae4e13f 100644 --- a/benchmarks/bloom_filter/contains_bench.cu +++ b/benchmarks/bloom_filter/contains_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -21,12 +21,12 @@ #include #include -#include #include #include #include +#include #include @@ -63,28 +63,22 @@ void bloom_filter_contains( (filter_size_mb * 1024 * 1024) / (sizeof(typename filter_type::word_type) * filter_type::words_per_block); - thrust::device_vector keys(num_keys); + thrust::counting_iterator keys(0); thrust::device_vector result(num_keys, false); - key_generator gen; - gen.generate(dist_from_state(state), keys.begin(), keys.end()); - state.add_element_count(num_keys); filter_type filter{num_sub_filters, {}, {static_cast(pattern_bits)}}; state.collect_dram_throughput(); - state.collect_l1_hit_rates(); state.collect_l2_hit_rates(); - state.collect_loads_efficiency(); - state.collect_stores_efficiency(); add_fpr_summary(state, filter); - filter.add(keys.begin(), keys.end()); + filter.add(keys, keys + num_keys); state.exec([&](nvbench::launch& launch) { - filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); + filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()}); }); } @@ -113,28 +107,22 @@ void arrow_bloom_filter_contains(nvbench::state& state, nvbench::type_list keys(num_keys); + thrust::counting_iterator keys(0); thrust::device_vector result(num_keys, false); - key_generator gen; - gen.generate(dist_from_state(state), keys.begin(), keys.end()); - state.add_element_count(num_keys); filter_type filter{num_sub_filters}; state.collect_dram_throughput(); - state.collect_l1_hit_rates(); state.collect_l2_hit_rates(); - state.collect_loads_efficiency(); - state.collect_stores_efficiency(); add_fpr_summary(state, filter); - filter.add(keys.begin(), keys.end()); + filter.add(keys, keys + num_keys); state.exec([&](nvbench::launch& launch) { - filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()}); + filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()}); }); } From 15ecc9363d3cb9c30c7e531871db939d6d47c909 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 11 Feb 2025 16:47:35 -0800 Subject: [PATCH 3/8] Increase benchmark input to reduce noise in measurements --- benchmarks/bloom_filter/defaults.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/bloom_filter/defaults.hpp b/benchmarks/bloom_filter/defaults.hpp index 67f3cf6ff..8ca9d711c 100644 --- a/benchmarks/bloom_filter/defaults.hpp +++ b/benchmarks/bloom_filter/defaults.hpp @@ -30,7 +30,7 @@ using BF_KEY = nvbench::int64_t; using BF_HASH = cuco::xxhash_64; using BF_WORD = nvbench::uint32_t; -static constexpr auto BF_N = 400'000'000; +static constexpr auto BF_N = 1'000'000'000; static constexpr auto BF_SIZE_MB = 2'000; static constexpr auto BF_WORDS_PER_BLOCK = 8; From 8e3caf9d99adf207ece96dd703a9a179a2ab3a00 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 12 Feb 2025 07:18:21 -0800 Subject: [PATCH 4/8] Rename hash_value_type -> hash_result_type --- .../detail/bloom_filter/arrow_filter_policy.cuh | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh index bfe97cfaf..c7b7b2fb0 100644 --- a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh +++ b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -83,10 +83,10 @@ namespace cuco::detail { template class XXHash64> class arrow_filter_policy { public: - using hasher = XXHash64; ///< 64-bit XXHash hasher for Arrow bloom filter policy - using word_type = std::uint32_t; ///< uint32_t for Arrow bloom filter policy - using key_type = Key; ///< Hash function input type - using hash_value_type = std::uint64_t; ///< hash function output type + using hasher = XXHash64; ///< 64-bit XXHash hasher for Arrow bloom filter policy + using word_type = std::uint32_t; ///< uint32_t for Arrow bloom filter policy + using key_type = Key; ///< Hash function input type + using hash_result_type = std::uint64_t; ///< hash function output type static constexpr uint32_t bits_set_per_block = 8; ///< hardcoded bits set per Arrow filter block static constexpr uint32_t words_per_block = 8; ///< hardcoded words per Arrow filter block @@ -133,7 +133,7 @@ class arrow_filter_policy { * * @return The hash value of the key */ - __device__ constexpr hash_value_type hash(key_type const& key) const { return hash_(key); } + __device__ constexpr hash_result_type hash(key_type const& key) const { return hash_(key); } /** * @brief Determines the filter block a key is added into. @@ -150,7 +150,7 @@ class arrow_filter_policy { * @return The block index for the given key's hash value */ template - __device__ constexpr auto block_index(hash_value_type hash, Extent num_blocks) const + __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const { constexpr auto hash_bits = cuda::std::numeric_limits::digits; // TODO: assert if num_blocks > max_filter_blocks @@ -168,7 +168,7 @@ class arrow_filter_policy { * * @return The bit pattern for the word/segment in the filter block */ - __device__ constexpr word_type word_pattern(hash_value_type hash, std::uint32_t word_index) const + __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const { // SALT array to calculate bit indexes for the current word auto constexpr salt = SALT(); From a4b91cb4d5524adab4465a2bec34ed583454d37e Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 12 Feb 2025 07:22:43 -0800 Subject: [PATCH 5/8] Eliminate lmem access during salt lookup --- .../bloom_filter/arrow_filter_policy.cuh | 42 +++++++++++-------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh index c7b7b2fb0..2f17fa726 100644 --- a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh +++ b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh @@ -99,21 +99,6 @@ class arrow_filter_policy { (max_arrow_filter_bytes / bytes_per_filter_block); ///< Max sub-filter blocks allowed in Arrow bloom filter - private: - // Arrow's block-based bloom filter algorithm needs these eight odd SALT values to calculate - // eight indexes of bit to set, one bit in each 32-bit (uint32_t) word. - __device__ static constexpr cuda::std::array SALT() - { - return {0x47b6137bU, - 0x44974d91U, - 0x8824ad5bU, - 0xa2b7289dU, - 0x705495c7U, - 0x2df1424bU, - 0x9efc4947U, - 0x5c6bfb31U}; - } - public: /** * @brief Constructs the `arrow_filter_policy` object. @@ -170,10 +155,31 @@ class arrow_filter_policy { */ __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const { - // SALT array to calculate bit indexes for the current word - auto constexpr salt = SALT(); word_type const key = static_cast(hash); - return word_type{1} << ((key * salt[word_index]) >> 27); + std::uint32_t salt; + + // Basically a switch (word_index) { case 0-7 ... } + // First split: 0..3 versus 4..7. + if (word_index < 4) { + // For indices 0..3, further split into 0..1 and 2..3. + if (word_index < 2) { + // word_index is 0 or 1. + salt = (word_index == 0) ? 0x47b6137bU : 0x44974d91U; + } else { + // word_index is 2 or 3. + salt = (word_index == 2) ? 0x8824ad5bU : 0xa2b7289dU; + } + } else { + // For indices 4..7, further split into 4..5 and 6..7. + if (word_index < 6) { + // word_index is 4 or 5. + salt = (word_index == 4) ? 0x705495c7U : 0x2df1424bU; + } else { + // word_index is 6 or 7. + salt = (word_index == 6) ? 0x9efc4947U : 0x5c6bfb31U; + } + } + return word_type{1} << ((key * salt) >> 27); } private: From a444f561555f3f53dd4ee69d6c9f65d964dd2a2f Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 12 Feb 2025 07:27:49 -0800 Subject: [PATCH 6/8] Avoid error handling through __trap --- .../default_filter_policy_impl.cuh | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh b/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh index ae2331b44..e69ae917d 100644 --- a/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh +++ b/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -50,32 +50,32 @@ class default_filter_policy_impl { remainder_bits_{pattern_bits_ % words_per_block}, hash_{hash} { - // This ensures each word in the block has at least one bit set; otherwise we would never use - // some of the words - constexpr uint32_t min_pattern_bits = words_per_block; - - // The maximum number of bits to be set for a key is capped by the total number of bits in the - // filter block - constexpr uint32_t max_pattern_bits = word_bits * words_per_block; - - constexpr uint32_t hash_bits = cuda::std::numeric_limits::digits; - constexpr uint32_t max_pattern_bits_from_hash = hash_bits / bit_index_width; - NV_DISPATCH_TARGET( NV_IS_HOST, - (CUCO_EXPECTS( - pattern_bits <= max_pattern_bits_from_hash, - "`hash_result_type` too narrow to generate the requested number of `pattern_bits`"); - CUCO_EXPECTS(pattern_bits_ >= min_pattern_bits, - "`pattern_bits` must be at least `words_per_block`"); - CUCO_EXPECTS( - pattern_bits_ <= max_pattern_bits, - "`pattern_bits` must be less than the total number of bits in a filter block");), - NV_IS_DEVICE, - (if (pattern_bits_ > max_pattern_bits_from_hash or pattern_bits_ < min_pattern_bits or - pattern_bits_ > max_pattern_bits) { + ( // This ensures each word in the block has at least one bit set; otherwise we would never + // use some of the words + constexpr uint32_t min_pattern_bits = words_per_block; + + // The maximum number of bits to be set for a key is capped by the total number of bits in + // the filter block + constexpr uint32_t max_pattern_bits = word_bits * words_per_block; + + constexpr uint32_t hash_bits = cuda::std::numeric_limits::digits; + constexpr uint32_t max_pattern_bits_from_hash = hash_bits / bit_index_width; + CUCO_EXPECTS( + pattern_bits <= max_pattern_bits_from_hash, + "`hash_result_type` too narrow to generate the requested number of `pattern_bits`"); + CUCO_EXPECTS(pattern_bits_ >= min_pattern_bits, + "`pattern_bits` must be at least `words_per_block`"); + CUCO_EXPECTS(pattern_bits_ <= max_pattern_bits, + "`pattern_bits` must be less than the total number of bits in a filter " + "block");) + /*, + NV_IS_DEVICE, + (if (pattern_bits_ > max_pattern_bits_from_hash or pattern_bits_ < min_pattern_bits or + pattern_bits_ > max_pattern_bits) { __trap(); // TODO this kills the kernel and corrupts the CUDA context. Not ideal. - })) + })*/) } __device__ constexpr hash_result_type hash(hash_argument_type const& key) const @@ -98,8 +98,8 @@ class default_filter_policy_impl { hash >>= bits_so_far * bit_index_width; - word_type word = 0; - int32_t bits_per_word = min_bits_per_word_ + (word_index < remainder_bits_ ? 1 : 0); + word_type word = 0; + int32_t const bits_per_word = min_bits_per_word_ + (word_index < remainder_bits_ ? 1 : 0); for (int32_t bit = 0; bit < bits_per_word; ++bit) { word |= word_type{1} << (hash & bit_index_mask); From c784355d7017af67a9df057f1d283a21abf3f832 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 12 Feb 2025 07:36:54 -0800 Subject: [PATCH 7/8] Add device bulk add --- include/cuco/bloom_filter_ref.cuh | 18 ++- .../detail/bloom_filter/bloom_filter_impl.cuh | 136 +++++++++++++++--- .../detail/bloom_filter/bloom_filter_ref.inl | 11 +- include/cuco/detail/bloom_filter/kernels.cuh | 24 +++- 4 files changed, 169 insertions(+), 20 deletions(-) diff --git a/include/cuco/bloom_filter_ref.cuh b/include/cuco/bloom_filter_ref.cuh index ee65c52bb..2f3dcfa2b 100644 --- a/include/cuco/bloom_filter_ref.cuh +++ b/include/cuco/bloom_filter_ref.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -134,6 +134,22 @@ class bloom_filter_ref { template __device__ void add(CG const& group, ProbeKey const& key); + /** + * @brief Device function that adds all keys in the range `[first, last)` to the filter. + * + * @note Best performance is achieved if the size of the CG is larger than or equal to + * `words_per_block`. + * + * @tparam CG Cooperative Group type + * @tparam InputIt Device-accessible random access input key iterator + * + * @param group The Cooperative Group this operation is executed with + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + */ + template + __device__ void add(CG const& group, InputIt first, InputIt last); + /** * @brief Adds all keys in the range `[first, last)` to the filter. * diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh index 2669dd41e..b4a38846c 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh +++ b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,6 +35,8 @@ #include #include +#include + #include namespace cuco::detail { @@ -121,14 +123,28 @@ class bloom_filter_impl { __device__ void add(ProbeKey const& key) { auto const hash_value = policy_.hash(key); - auto const idx = policy_.block_index(hash_value, num_blocks_); + this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_)); + } + template + __device__ void add(InputIt first, InputIt last) + { + auto const num_keys = cuco::detail::distance(first, last); + for (decltype(num_keys) i = 0; i < num_keys; ++i) { + auto const hash_value = policy_.hash(*(first + i)); + this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_)); + } + } + + template + __device__ void add_impl(HashValue const& hash_value, BlockIndex block_index) + { #pragma unroll words_per_block for (uint32_t i = 0; i < words_per_block; ++i) { auto const word = policy_.word_pattern(hash_value, i); if (word != 0) { - auto atom_word = - cuda::atomic_ref{*(words_ + (idx * words_per_block + i))}; + auto atom_word = cuda::atomic_ref{ + *(words_ + (block_index * words_per_block + i))}; atom_word.fetch_or(word, cuda::memory_order_relaxed); } } @@ -139,24 +155,97 @@ class bloom_filter_impl { { constexpr auto num_threads = tile_size_v; constexpr auto optimal_num_threads = add_optimal_cg_size(); - constexpr auto words_per_thread = words_per_block / optimal_num_threads; + constexpr auto worker_num_threads = + (num_threads < optimal_num_threads) ? num_threads : optimal_num_threads; // If single thread is optimal, use scalar add - if constexpr (num_threads == 1 or optimal_num_threads == 1) { + if constexpr (worker_num_threads == 1) { this->add(key); } else { - auto const rank = group.thread_rank(); - auto const hash_value = policy_.hash(key); - auto const idx = policy_.block_index(hash_value, num_blocks_); + this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_)); + } + } -#pragma unroll - for (uint32_t i = rank; i < optimal_num_threads; i += num_threads) { - auto const word = policy_.word_pattern(hash_value, rank); + template + __device__ void add(CG const& group, InputIt first, InputIt last) + { + namespace cg = cooperative_groups; - auto atom_word = - cuda::atomic_ref{*(words_ + (idx * words_per_block + rank))}; - atom_word.fetch_or(word, cuda::memory_order_relaxed); + constexpr auto num_threads = tile_size_v; + constexpr auto optimal_num_threads = add_optimal_cg_size(); + constexpr auto worker_num_threads = + (num_threads < optimal_num_threads) ? num_threads : optimal_num_threads; + + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const rank = group.thread_rank(); + + // If single thread is optimal, use scalar add + if constexpr (worker_num_threads == 1) { + for (auto i = rank; i < num_keys; i += num_threads) { + typename std::iterator_traits::value_type const& insert_element{*(first + i)}; + this->add(insert_element); + } + } else if constexpr (num_threads == worker_num_threads) { // given CG is optimal CG + typename policy_type::hash_result_type hash_value; + size_type block_index; + + auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads); + for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) { + if (i + rank < num_keys) { + typename std::iterator_traits::value_type const& insert_element{ + *(first + i + rank)}; + hash_value = policy_.hash(insert_element); + block_index = policy_.block_index(hash_value, num_blocks_); + } + + for (uint32_t j = 0; (j < num_threads) and (i + j < num_keys); ++j) { + this->add_impl(group, group.shfl(hash_value, j), group.shfl(block_index, j)); + } + } + } else { // subdivide given CG into multiple optimal CGs + typename policy_type::hash_result_type hash_value; + size_type block_index; + + auto const worker_group = cg::tiled_partition(group); + auto const worker_offset = worker_num_threads * worker_group.meta_group_rank(); + + auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads); + + for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) { + if (i + rank < num_keys) { + typename std::iterator_traits::value_type const& key{*(first + i + rank)}; + hash_value = policy_.hash(key); + block_index = policy_.block_index(hash_value, num_blocks_); + } + + for (uint32_t j = 0; (j < worker_num_threads) and (i + worker_offset + j < num_keys); ++j) { + this->add_impl( + worker_group, worker_group.shfl(hash_value, j), worker_group.shfl(block_index, j)); + } + } + } + } + + template + __device__ void add_impl(CG const& group, HashValue const& hash_value, BlockIndex block_index) + { + constexpr auto num_threads = tile_size_v; + + auto const rank = group.thread_rank(); + + if constexpr (num_threads == words_per_block) { + auto atom_word = cuda::atomic_ref{ + *(words_ + (block_index * words_per_block + rank))}; + atom_word.fetch_or(policy_.word_pattern(hash_value, rank), cuda::memory_order_relaxed); + } else { +#pragma unroll + for (auto i = rank; i < words_per_block; i += num_threads) { + auto atom_word = cuda::atomic_ref{ + *(words_ + (block_index * words_per_block + i))}; + atom_word.fetch_or(policy_.word_pattern(hash_value, i), cuda::memory_order_relaxed); } } } @@ -181,8 +270,21 @@ class bloom_filter_impl { [*this] __device__(key_type const key) mutable { this->add(key); }, stream.get())); } else { - auto const always_true = thrust::constant_iterator{true}; - this->add_if_async(first, last, always_true, thrust::identity{}, stream); + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto constexpr cg_size = add_optimal_cg_size(); + auto constexpr block_size = cuco::detail::default_block_size(); + void const* kernel = reinterpret_cast( + detail::bloom_filter_ns::add); + auto const grid_size = cuco::detail::max_occupancy_grid_size(block_size, kernel) * 1.5; + + detail::bloom_filter_ns::add + <<>>(first, num_keys, *this); + + // fallback method + // auto const always_true = thrust::constant_iterator{true}; + // this->add_if_async(first, last, always_true, thrust::identity{}, stream); } } diff --git a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl index ee99396db..96d2c0573 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl +++ b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -72,6 +72,15 @@ __device__ void bloom_filter_ref::add(CG const& grou impl_.add(group, key); } +template +template +__device__ void bloom_filter_ref::add(CG const& group, + InputIt first, + InputIt last) +{ + impl_.add(group, first, last); +} + template template __host__ constexpr void bloom_filter_ref::add(InputIt first, diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh index b0ef7b684..9e04b73c4 100644 --- a/include/cuco/detail/bloom_filter/kernels.cuh +++ b/include/cuco/detail/bloom_filter/kernels.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,28 @@ namespace cuco::detail::bloom_filter_ns { CUCO_SUPPRESS_KERNEL_WARNINGS +template +CUCO_KERNEL __launch_bounds__(BlockSize) void add(InputIt first, + cuco::detail::index_type n, + Ref ref) +{ + namespace cg = cooperative_groups; + + constexpr auto tile_size = cuco::detail::warp_size(); + + auto const tile_idx = cuco::detail::global_thread_id() / tile_size; + auto const n_tiles = gridDim.x * BlockSize / tile_size; + auto const items_per_tile = cuco::detail::int_div_ceil(n, n_tiles); + + auto const tile_start = tile_idx * items_per_tile; + if (tile_start >= n) { return; } + auto const tile_stop = (tile_start + items_per_tile < n) ? tile_start + items_per_tile : n; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + + ref.add(tile, first + tile_start, first + tile_stop); +} + template Date: Wed, 12 Feb 2025 07:38:45 -0800 Subject: [PATCH 8/8] Add device bulk contains --- include/cuco/bloom_filter_ref.cuh | 24 +++- .../detail/bloom_filter/bloom_filter_impl.cuh | 135 +++++++++++++++--- .../detail/bloom_filter/bloom_filter_ref.inl | 10 ++ include/cuco/detail/bloom_filter/kernels.cuh | 25 ++++ 4 files changed, 171 insertions(+), 23 deletions(-) diff --git a/include/cuco/bloom_filter_ref.cuh b/include/cuco/bloom_filter_ref.cuh index 2f3dcfa2b..7cab63cdf 100644 --- a/include/cuco/bloom_filter_ref.cuh +++ b/include/cuco/bloom_filter_ref.cuh @@ -257,10 +257,26 @@ class bloom_filter_ref { template [[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const; - // TODO - // template - // __device__ void contains(CG const& group, InputIt first, InputIt last, OutputIt output_begin) - // const; + /** + * @brief Device function that tests keys in the range `[first, last)` are present in filter. + * + * @note Best performance is achieved if the size of the CG is larger than or equal to + * `(words_per_block * sizeof(word_type)) / 32`. + * + * @tparam CG Cooperative Group type + * @tparam InputIt Device-accessible random access input key iterator + * @tparam OutputIt Device-accessible output iterator assignable from `bool` + * + * @param group The Cooperative Group this operation is executed with + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param output_begin Beginning of the sequence of booleans for the presence of each key + */ + template + __device__ void contains(CG const& group, + InputIt first, + InputIt last, + OutputIt output_begin) const; /** * @brief Tests all keys in the range `[first, last)` if their fingerprints are present in the diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh index b4a38846c..44b1890d7 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh +++ b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh @@ -343,30 +343,112 @@ class bloom_filter_impl { if constexpr (num_threads == 1 or optimal_num_threads == 1) { return this->contains(key); } else { - auto const rank = group.thread_rank(); - auto const hash_value = policy_.hash(key); - bool success = true; + auto const hash_value = policy_.hash(key); + auto const block_index = policy_.block_index(hash_value, num_blocks_); -#pragma unroll - for (uint32_t i = rank; i < optimal_num_threads; i += num_threads) { - auto const thread_offset = i * words_per_thread; - auto const stored_pattern = this->vec_load_words( - policy_.block_index(hash_value, num_blocks_) * words_per_block + thread_offset); -#pragma unroll words_per_thread - for (uint32_t j = 0; j < words_per_thread; ++j) { - auto const expected_pattern = policy_.word_pattern(hash_value, thread_offset + j); - if ((stored_pattern[j] & expected_pattern) != expected_pattern) { success = false; } + return this->contains_impl(group, hash_value, block_index); + } + } + + template + __device__ void contains(CG const& group, + InputIt first, + InputIt last, + OutputIt output_begin) const + { + namespace cg = cooperative_groups; + + constexpr auto num_threads = tile_size_v; + constexpr auto optimal_num_threads = contains_optimal_cg_size(); + constexpr auto worker_num_threads = + (num_threads < optimal_num_threads) ? num_threads : optimal_num_threads; + constexpr auto words_per_thread = words_per_block / worker_num_threads; + + auto const num_keys = cuco::detail::distance(first, last); + if (num_keys == 0) { return; } + + auto const rank = group.thread_rank(); + + // If single thread is optimal, use scalar contains + if constexpr (worker_num_threads == 1) { + for (auto i = rank; i < num_keys; i += num_threads) { + typename std::iterator_traits::value_type const& insert_element{*(first + i)}; + *(output_begin + i) = this->contains(insert_element); + } + } else if constexpr (num_threads == worker_num_threads) { // given CG is optimal CG + typename policy_type::hash_result_type hash_value; + size_type block_index; + + auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads); + for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) { + if (i + rank < num_keys) { + typename std::iterator_traits::value_type const& insert_element{ + *(first + i + rank)}; + hash_value = policy_.hash(insert_element); + block_index = policy_.block_index(hash_value, num_blocks_); + } + + for (uint32_t j = 0; (j < num_threads) and (i + j < num_keys); ++j) { + bool const success = + this->contains_impl(group, group.shfl(hash_value, j), group.shfl(block_index, j)); + if (group.thread_rank() == 0) { *(output_begin + i + j) = success; } } } + } else { // subdivide given CG into multiple optimal CGs + typename policy_type::hash_result_type hash_value; + size_type block_index; + + auto const worker_group = cg::tiled_partition(group); + auto const worker_offset = worker_num_threads * worker_group.meta_group_rank(); + + auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads); + + for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) { + if (i + rank < num_keys) { + typename std::iterator_traits::value_type const& key{*(first + i + rank)}; + hash_value = policy_.hash(key); + block_index = policy_.block_index(hash_value, num_blocks_); + } - return group.all(success); + for (uint32_t j = 0; (j < worker_num_threads) and (i + worker_offset + j < num_keys); ++j) { + bool const success = this->contains_impl( + worker_group, worker_group.shfl(hash_value, j), worker_group.shfl(block_index, j)); + if (group.thread_rank() == 0) { *(output_begin + i + j) = success; } + } + } } } - // TODO - // template - // __device__ void contains(CG const& group, InputIt first, InputIt last, OutputIt output_begin) - // const; + template + __device__ bool contains_impl(CG const& group, + HashValue const& hash_value, + BlockIndex block_index) const + { + constexpr auto num_threads = tile_size_v; + constexpr auto optimal_num_threads = contains_optimal_cg_size(); + constexpr auto words_per_thread = words_per_block / optimal_num_threads; + + auto const rank = group.thread_rank(); + bool success = true; + +#pragma unroll + for (uint32_t i = rank; i < optimal_num_threads; i += num_threads) { + auto const thread_offset = i * words_per_thread; + auto const stored_pattern = + this->vec_load_words(block_index * words_per_block + thread_offset); +#pragma unroll words_per_thread + for (uint32_t j = 0; j < words_per_thread; ++j) { + auto const expected_pattern = policy_.word_pattern(hash_value, thread_offset + j); + if ((stored_pattern[j] & expected_pattern) != expected_pattern) { + success = false; + break; + } + } + if (not success) { break; } + } + + return group.all(success); + } template __host__ constexpr void contains(InputIt first, @@ -384,6 +466,20 @@ class bloom_filter_impl { OutputIt output_begin, cuda::stream_ref stream) const noexcept { + // TODO perfoms worse than fallback + // auto const num_keys = cuco::detail::distance(first, last); + // if (num_keys == 0) { return; } + + // auto constexpr cg_size = contains_optimal_cg_size(); + // auto constexpr block_size = cuco::detail::default_block_size(); + // void const* kernel = reinterpret_cast(detail::bloom_filter_ns::contains); auto const grid_size = + // cuco::detail::max_occupancy_grid_size(block_size, kernel) * 2.5; + + // detail::bloom_filter_ns::contains + // <<>>(first, num_keys, output_begin, *this); + + // fallback method auto const always_true = thrust::constant_iterator{true}; this->contains_if_async(first, last, always_true, thrust::identity{}, output_begin, stream); } @@ -413,8 +509,9 @@ class bloom_filter_impl { auto constexpr cg_size = contains_optimal_cg_size(); auto constexpr block_size = cuco::detail::default_block_size(); - auto const grid_size = - cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size); + // TODO stride = 1 is optimal for arrow policy while stride = 16-32 is optimal for default + // policy + auto const grid_size = cuco::detail::grid_size(num_keys, cg_size, 2, block_size); detail::bloom_filter_ns::contains_if_n <<>>( diff --git a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl index 96d2c0573..c9fe32ea0 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl +++ b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl @@ -130,6 +130,16 @@ template return impl_.contains(group, key); } +template +template +__device__ void bloom_filter_ref::contains(CG const& group, + InputIt first, + InputIt last, + OutputIt output_begin) const +{ + impl_.contains(group, first, last, output_begin); +} + template template __host__ constexpr void bloom_filter_ref::contains( diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh index 9e04b73c4..08adb814f 100644 --- a/include/cuco/detail/bloom_filter/kernels.cuh +++ b/include/cuco/detail/bloom_filter/kernels.cuh @@ -73,6 +73,31 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add_if_n( } } +template +CUCO_KERNEL __launch_bounds__(BlockSize) void contains(InputIt first, + cuco::detail::index_type n, + OutputIt out, + Ref ref) +{ + namespace cg = cooperative_groups; + + // TODO use shmem + async copies for output + + constexpr auto tile_size = cuco::detail::warp_size(); + + auto const tile_idx = cuco::detail::global_thread_id() / tile_size; + auto const n_tiles = gridDim.x * BlockSize / tile_size; + auto const items_per_tile = cuco::detail::int_div_ceil(n, n_tiles); + + auto const tile_start = tile_idx * items_per_tile; + if (tile_start >= n) { return; } + auto const tile_stop = (tile_start + items_per_tile < n) ? tile_start + items_per_tile : n; + + auto const tile = cg::tiled_partition(cg::this_thread_block()); + + ref.contains(tile, first + tile_start, first + tile_stop, out + tile_start); +} + template