From 9e97c6708fdc3c7f52420992184b1bfdbd2c515f Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 6 Feb 2025 18:39:04 -0800
Subject: [PATCH 1/8] Eliminate IO from bloom_filter::add benchmark

---
 benchmarks/bloom_filter/add_bench.cu | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)
diff --git a/benchmarks/bloom_filter/add_bench.cu b/benchmarks/bloom_filter/add_bench.cu
index 8b502d0d5..5d3d54b05 100644
--- a/benchmarks/bloom_filter/add_bench.cu
+++ b/benchmarks/bloom_filter/add_bench.cu
@@ -26,7 +26,7 @@
 #include <nvbench/nvbench.cuh>
 
 #include <cuda/std/limits>
-#include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
 
 #include <cstdint>
 #include <exception>
@@ -61,10 +61,7 @@ void bloom_filter_add(nvbench::state& state,
     (filter_size_mb * 1024 * 1024) /
     (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
 
-  thrust::device_vector<Key> keys(num_keys);
-
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+  thrust::counting_iterator<Key> keys(0);
 
   state.add_element_count(num_keys);
 
@@ -79,7 +76,7 @@ void bloom_filter_add(nvbench::state& state,
   add_fpr_summary(state, filter);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
+    filter.add_async(keys, keys + num_keys, {launch.get_stream()});
   });
 }
 
@@ -106,10 +103,7 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Dist>
                                                                                  // configurations
   }
 
-  thrust::device_vector<Key> keys(num_keys);
-
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+  thrust::counting_iterator<Key> keys(0);
 
   state.add_element_count(num_keys);
 
@@ -124,7 +118,7 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Dist>
   add_fpr_summary(state, filter);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.add_async(keys.begin(), keys.end(), {launch.get_stream()});
+    filter.add_async(keys, keys + num_keys, {launch.get_stream()});
   });
 }
 

From e2bb17923a220bcf93c892c52d50ea34747b52b7 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 7 Feb 2025 15:43:28 -0800
Subject: [PATCH 2/8] Don't read benchmark input data from gmem

---
 benchmarks/bloom_filter/add_bench.cu      |  9 +-------
 benchmarks/bloom_filter/contains_bench.cu | 28 +++++++----------------
 2 files changed, 9 insertions(+), 28 deletions(-)

diff --git a/benchmarks/bloom_filter/add_bench.cu b/benchmarks/bloom_filter/add_bench.cu
index 5d3d54b05..72322bc21 100644
--- a/benchmarks/bloom_filter/add_bench.cu
+++ b/benchmarks/bloom_filter/add_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,6 @@
 #include <benchmark_utils.hpp>
 
 #include <cuco/bloom_filter.cuh>
-#include <cuco/utility/key_generator.cuh>
 
 #include <nvbench/nvbench.cuh>
 
@@ -68,10 +67,7 @@ void bloom_filter_add(nvbench::state& state,
   filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
@@ -110,10 +106,7 @@ void arrow_bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Dist>
   filter_type filter{num_sub_filters};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
diff --git a/benchmarks/bloom_filter/contains_bench.cu b/benchmarks/bloom_filter/contains_bench.cu
index 3d2ed1e54..1eae4e13f 100644
--- a/benchmarks/bloom_filter/contains_bench.cu
+++ b/benchmarks/bloom_filter/contains_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,12 +21,12 @@
 #include <benchmark_utils.hpp>
 
 #include <cuco/bloom_filter.cuh>
-#include <cuco/utility/key_generator.cuh>
 
 #include <nvbench/nvbench.cuh>
 
 #include <cuda/std/limits>
 #include <thrust/device_vector.h>
+#include <thrust/iterator/counting_iterator.h>
 
 #include <exception>
 
@@ -63,28 +63,22 @@ void bloom_filter_contains(
     (filter_size_mb * 1024 * 1024) /
     (sizeof(typename filter_type::word_type) * filter_type::words_per_block);
 
-  thrust::device_vector<Key> keys(num_keys);
+  thrust::counting_iterator<Key> keys(0);
   thrust::device_vector<bool> result(num_keys, false);
 
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
-
   state.add_element_count(num_keys);
 
   filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
-  filter.add(keys.begin(), keys.end());
+  filter.add(keys, keys + num_keys);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
+    filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()});
   });
 }
 
@@ -113,28 +107,22 @@ void arrow_bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key,
                                                                                  // configurations
   }
 
-  thrust::device_vector<Key> keys(num_keys);
+  thrust::counting_iterator<Key> keys(0);
   thrust::device_vector<bool> result(num_keys, false);
 
-  key_generator gen;
-  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
-
   state.add_element_count(num_keys);
 
   filter_type filter{num_sub_filters};
 
   state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
 
   add_fpr_summary(state, filter);
 
-  filter.add(keys.begin(), keys.end());
+  filter.add(keys, keys + num_keys);
 
   state.exec([&](nvbench::launch& launch) {
-    filter.contains_async(keys.begin(), keys.end(), result.begin(), {launch.get_stream()});
+    filter.contains_async(keys, keys + num_keys, result.begin(), {launch.get_stream()});
   });
 }
 

From 15ecc9363d3cb9c30c7e531871db939d6d47c909 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 11 Feb 2025 16:47:35 -0800
Subject: [PATCH 3/8] Increase benchmark input to reduce noise in measurements

---
 benchmarks/bloom_filter/defaults.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/bloom_filter/defaults.hpp b/benchmarks/bloom_filter/defaults.hpp
index 67f3cf6ff..8ca9d711c 100644
--- a/benchmarks/bloom_filter/defaults.hpp
+++ b/benchmarks/bloom_filter/defaults.hpp
@@ -30,7 +30,7 @@ using BF_KEY  = nvbench::int64_t;
 using BF_HASH = cuco::xxhash_64<char>;
 using BF_WORD = nvbench::uint32_t;
 
-static constexpr auto BF_N               = 400'000'000;
+static constexpr auto BF_N               = 1'000'000'000;
 static constexpr auto BF_SIZE_MB         = 2'000;
 static constexpr auto BF_WORDS_PER_BLOCK = 8;
 

From 8e3caf9d99adf207ece96dd703a9a179a2ab3a00 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 12 Feb 2025 07:18:21 -0800
Subject: [PATCH 4/8] Rename hash_value_type -> hash_result_type

---
 .../detail/bloom_filter/arrow_filter_policy.cuh  | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
index bfe97cfaf..c7b7b2fb0 100644
--- a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
+++ b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -83,10 +83,10 @@ namespace cuco::detail {
 template <class Key, template <typename> class XXHash64>
 class arrow_filter_policy {
  public:
-  using hasher          = XXHash64<Key>;  ///< 64-bit XXHash hasher for Arrow bloom filter policy
-  using word_type       = std::uint32_t;  ///< uint32_t for Arrow bloom filter policy
-  using key_type        = Key;            ///< Hash function input type
-  using hash_value_type = std::uint64_t;  ///< hash function output type
+  using hasher           = XXHash64<Key>;  ///< 64-bit XXHash hasher for Arrow bloom filter policy
+  using word_type        = std::uint32_t;  ///< uint32_t for Arrow bloom filter policy
+  using key_type         = Key;            ///< Hash function input type
+  using hash_result_type = std::uint64_t;  ///< hash function output type
 
   static constexpr uint32_t bits_set_per_block = 8;  ///< hardcoded bits set per Arrow filter block
   static constexpr uint32_t words_per_block    = 8;  ///< hardcoded words per Arrow filter block
@@ -133,7 +133,7 @@ class arrow_filter_policy {
    *
    * @return The hash value of the key
    */
-  __device__ constexpr hash_value_type hash(key_type const& key) const { return hash_(key); }
+  __device__ constexpr hash_result_type hash(key_type const& key) const { return hash_(key); }
 
   /**
    * @brief Determines the filter block a key is added into.
@@ -150,7 +150,7 @@ class arrow_filter_policy {
    * @return The block index for the given key's hash value
    */
   template <class Extent>
-  __device__ constexpr auto block_index(hash_value_type hash, Extent num_blocks) const
+  __device__ constexpr auto block_index(hash_result_type hash, Extent num_blocks) const
   {
     constexpr auto hash_bits = cuda::std::numeric_limits<word_type>::digits;
     // TODO: assert if num_blocks > max_filter_blocks
@@ -168,7 +168,7 @@ class arrow_filter_policy {
    *
    * @return The bit pattern for the word/segment in the filter block
    */
-  __device__ constexpr word_type word_pattern(hash_value_type hash, std::uint32_t word_index) const
+  __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const
   {
     // SALT array to calculate bit indexes for the current word
     auto constexpr salt = SALT();

From a4b91cb4d5524adab4465a2bec34ed583454d37e Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 12 Feb 2025 07:22:43 -0800
Subject: [PATCH 5/8] Eliminate lmem access during salt lookup

---
 .../bloom_filter/arrow_filter_policy.cuh      | 42 +++++++++++--------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
index c7b7b2fb0..2f17fa726 100644
--- a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
+++ b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
@@ -99,21 +99,6 @@ class arrow_filter_policy {
     (max_arrow_filter_bytes /
      bytes_per_filter_block);  ///< Max sub-filter blocks allowed in Arrow bloom filter
 
- private:
-  // Arrow's block-based bloom filter algorithm needs these eight odd SALT values to calculate
-  // eight indexes of bit to set, one bit in each 32-bit (uint32_t) word.
-  __device__ static constexpr cuda::std::array<std::uint32_t, 8> SALT()
-  {
-    return {0x47b6137bU,
-            0x44974d91U,
-            0x8824ad5bU,
-            0xa2b7289dU,
-            0x705495c7U,
-            0x2df1424bU,
-            0x9efc4947U,
-            0x5c6bfb31U};
-  }
-
  public:
   /**
    * @brief Constructs the `arrow_filter_policy` object.
@@ -170,10 +155,31 @@ class arrow_filter_policy {
    */
   __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const
   {
-    // SALT array to calculate bit indexes for the current word
-    auto constexpr salt = SALT();
     word_type const key = static_cast<word_type>(hash);
-    return word_type{1} << ((key * salt[word_index]) >> 27);
+    std::uint32_t salt;
+
+    // Basically a switch (word_index) { case 0-7 ... }
+    // First split: 0..3 versus 4..7.
+    if (word_index < 4) {
+      // For indices 0..3, further split into 0..1 and 2..3.
+      if (word_index < 2) {
+        // word_index is 0 or 1.
+        salt = (word_index == 0) ? 0x47b6137bU : 0x44974d91U;
+      } else {
+        // word_index is 2 or 3.
+        salt = (word_index == 2) ? 0x8824ad5bU : 0xa2b7289dU;
+      }
+    } else {
+      // For indices 4..7, further split into 4..5 and 6..7.
+      if (word_index < 6) {
+        // word_index is 4 or 5.
+        salt = (word_index == 4) ? 0x705495c7U : 0x2df1424bU;
+      } else {
+        // word_index is 6 or 7.
+        salt = (word_index == 6) ? 0x9efc4947U : 0x5c6bfb31U;
+      }
+    }
+    return word_type{1} << ((key * salt) >> 27);
   }
 
  private:

From a444f561555f3f53dd4ee69d6c9f65d964dd2a2f Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 12 Feb 2025 07:27:49 -0800
Subject: [PATCH 6/8] Avoid error handling through __trap

---
 .../default_filter_policy_impl.cuh            | 52 +++++++++----------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh b/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh
index ae2331b44..e69ae917d 100644
--- a/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh
+++ b/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -50,32 +50,32 @@ class default_filter_policy_impl {
       remainder_bits_{pattern_bits_ % words_per_block},
       hash_{hash}
   {
-    // This ensures each word in the block has at least one bit set; otherwise we would never use
-    // some of the words
-    constexpr uint32_t min_pattern_bits = words_per_block;
-
-    // The maximum number of bits to be set for a key is capped by the total number of bits in the
-    // filter block
-    constexpr uint32_t max_pattern_bits = word_bits * words_per_block;
-
-    constexpr uint32_t hash_bits = cuda::std::numeric_limits<hash_result_type>::digits;
-    constexpr uint32_t max_pattern_bits_from_hash = hash_bits / bit_index_width;
-
     NV_DISPATCH_TARGET(
       NV_IS_HOST,
-      (CUCO_EXPECTS(
-         pattern_bits <= max_pattern_bits_from_hash,
-         "`hash_result_type` too narrow to generate the requested number of `pattern_bits`");
-       CUCO_EXPECTS(pattern_bits_ >= min_pattern_bits,
-                    "`pattern_bits` must be at least `words_per_block`");
-       CUCO_EXPECTS(
-         pattern_bits_ <= max_pattern_bits,
-         "`pattern_bits` must be less than the total number of bits in a filter block");),
-      NV_IS_DEVICE,
-      (if (pattern_bits_ > max_pattern_bits_from_hash or pattern_bits_ < min_pattern_bits or
-           pattern_bits_ > max_pattern_bits) {
+      (  // This ensures each word in the block has at least one bit set; otherwise we would never
+         // use some of the words
+        constexpr uint32_t min_pattern_bits = words_per_block;
+
+        // The maximum number of bits to be set for a key is capped by the total number of bits in
+        // the filter block
+        constexpr uint32_t max_pattern_bits = word_bits * words_per_block;
+
+        constexpr uint32_t hash_bits = cuda::std::numeric_limits<hash_result_type>::digits;
+        constexpr uint32_t max_pattern_bits_from_hash = hash_bits / bit_index_width;
+        CUCO_EXPECTS(
+          pattern_bits <= max_pattern_bits_from_hash,
+          "`hash_result_type` too narrow to generate the requested number of `pattern_bits`");
+        CUCO_EXPECTS(pattern_bits_ >= min_pattern_bits,
+                     "`pattern_bits` must be at least `words_per_block`");
+        CUCO_EXPECTS(pattern_bits_ <= max_pattern_bits,
+                     "`pattern_bits` must be less than the total number of bits in a filter "
+                     "block");)
+        /*,
+        NV_IS_DEVICE,
+        (if (pattern_bits_ > max_pattern_bits_from_hash or pattern_bits_ < min_pattern_bits or
+        pattern_bits_ > max_pattern_bits) {
         __trap();  // TODO this kills the kernel and corrupts the CUDA context. Not ideal.
-      }))
+        })*/)
   }
 
   __device__ constexpr hash_result_type hash(hash_argument_type const& key) const
@@ -98,8 +98,8 @@ class default_filter_policy_impl {
 
     hash >>= bits_so_far * bit_index_width;
 
-    word_type word        = 0;
-    int32_t bits_per_word = min_bits_per_word_ + (word_index < remainder_bits_ ? 1 : 0);
+    word_type word              = 0;
+    int32_t const bits_per_word = min_bits_per_word_ + (word_index < remainder_bits_ ? 1 : 0);
 
     for (int32_t bit = 0; bit < bits_per_word; ++bit) {
       word |= word_type{1} << (hash & bit_index_mask);

From c784355d7017af67a9df057f1d283a21abf3f832 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 12 Feb 2025 07:36:54 -0800
Subject: [PATCH 7/8] Add device bulk add

---
 include/cuco/bloom_filter_ref.cuh             |  18 ++-
 .../detail/bloom_filter/bloom_filter_impl.cuh | 136 +++++++++++++++---
 .../detail/bloom_filter/bloom_filter_ref.inl  |  11 +-
 include/cuco/detail/bloom_filter/kernels.cuh  |  24 +++-
 4 files changed, 169 insertions(+), 20 deletions(-)

diff --git a/include/cuco/bloom_filter_ref.cuh b/include/cuco/bloom_filter_ref.cuh
index ee65c52bb..2f3dcfa2b 100644
--- a/include/cuco/bloom_filter_ref.cuh
+++ b/include/cuco/bloom_filter_ref.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -134,6 +134,22 @@ class bloom_filter_ref {
   template <class CG, class ProbeKey>
   __device__ void add(CG const& group, ProbeKey const& key);
 
+  /**
+   * @brief Device function that adds all keys in the range `[first, last)` to the filter.
+   *
+   * @note Best performance is achieved if the size of the CG is larger than or equal to
+   * `words_per_block`.
+   *
+   * @tparam CG Cooperative Group type
+   * @tparam InputIt Device-accessible random access input key iterator
+   *
+   * @param group The Cooperative Group this operation is executed with
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   */
+  template <class CG, class InputIt>
+  __device__ void add(CG const& group, InputIt first, InputIt last);
+
   /**
    * @brief Adds all keys in the range `[first, last)` to the filter.
    *
diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
index 2669dd41e..b4a38846c 100644
--- a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
+++ b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -35,6 +35,8 @@
 #include <thrust/functional.h>
 #include <thrust/iterator/constant_iterator.h>
 
+#include <cooperative_groups.h>
+
 #include <cstdint>
 
 namespace cuco::detail {
@@ -121,14 +123,28 @@ class bloom_filter_impl {
   __device__ void add(ProbeKey const& key)
   {
     auto const hash_value = policy_.hash(key);
-    auto const idx        = policy_.block_index(hash_value, num_blocks_);
+    this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_));
+  }
 
+  template <class InputIt>
+  __device__ void add(InputIt first, InputIt last)
+  {
+    auto const num_keys = cuco::detail::distance(first, last);
+    for (decltype(num_keys) i = 0; i < num_keys; ++i) {
+      auto const hash_value = policy_.hash(*(first + i));
+      this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_));
+    }
+  }
+
+  template <class HashValue, class BlockIndex>
+  __device__ void add_impl(HashValue const& hash_value, BlockIndex block_index)
+  {
 #pragma unroll words_per_block
     for (uint32_t i = 0; i < words_per_block; ++i) {
       auto const word = policy_.word_pattern(hash_value, i);
       if (word != 0) {
-        auto atom_word =
-          cuda::atomic_ref<word_type, thread_scope>{*(words_ + (idx * words_per_block + i))};
+        auto atom_word = cuda::atomic_ref<word_type, thread_scope>{
+          *(words_ + (block_index * words_per_block + i))};
         atom_word.fetch_or(word, cuda::memory_order_relaxed);
       }
     }
@@ -139,24 +155,97 @@ class bloom_filter_impl {
   {
     constexpr auto num_threads         = tile_size_v<CG>;
     constexpr auto optimal_num_threads = add_optimal_cg_size();
-    constexpr auto words_per_thread    = words_per_block / optimal_num_threads;
+    constexpr auto worker_num_threads =
+      (num_threads < optimal_num_threads) ? num_threads : optimal_num_threads;
 
     // If single thread is optimal, use scalar add
-    if constexpr (num_threads == 1 or optimal_num_threads == 1) {
+    if constexpr (worker_num_threads == 1) {
       this->add(key);
     } else {
-      auto const rank = group.thread_rank();
-
       auto const hash_value = policy_.hash(key);
-      auto const idx        = policy_.block_index(hash_value, num_blocks_);
+      this->add_impl(hash_value, policy_.block_index(hash_value, num_blocks_));
+    }
+  }
 
-#pragma unroll
-      for (uint32_t i = rank; i < optimal_num_threads; i += num_threads) {
-        auto const word = policy_.word_pattern(hash_value, rank);
+  template <class CG, class InputIt>
+  __device__ void add(CG const& group, InputIt first, InputIt last)
+  {
+    namespace cg = cooperative_groups;
 
-        auto atom_word =
-          cuda::atomic_ref<word_type, thread_scope>{*(words_ + (idx * words_per_block + rank))};
-        atom_word.fetch_or(word, cuda::memory_order_relaxed);
+    constexpr auto num_threads         = tile_size_v<CG>;
+    constexpr auto optimal_num_threads = add_optimal_cg_size();
+    constexpr auto worker_num_threads =
+      (num_threads < optimal_num_threads) ? num_threads : optimal_num_threads;
+
+    auto const num_keys = cuco::detail::distance(first, last);
+    if (num_keys == 0) { return; }
+
+    auto const rank = group.thread_rank();
+
+    // If single thread is optimal, use scalar add
+    if constexpr (worker_num_threads == 1) {
+      for (auto i = rank; i < num_keys; i += num_threads) {
+        typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + i)};
+        this->add(insert_element);
+      }
+    } else if constexpr (num_threads == worker_num_threads) {  // given CG is optimal CG
+      typename policy_type::hash_result_type hash_value;
+      size_type block_index;
+
+      auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads);
+      for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) {
+        if (i + rank < num_keys) {
+          typename std::iterator_traits<InputIt>::value_type const& insert_element{
+            *(first + i + rank)};
+          hash_value  = policy_.hash(insert_element);
+          block_index = policy_.block_index(hash_value, num_blocks_);
+        }
+
+        for (uint32_t j = 0; (j < num_threads) and (i + j < num_keys); ++j) {
+          this->add_impl(group, group.shfl(hash_value, j), group.shfl(block_index, j));
+        }
+      }
+    } else {  // subdivide given CG into multiple optimal CGs
+      typename policy_type::hash_result_type hash_value;
+      size_type block_index;
+
+      auto const worker_group  = cg::tiled_partition<worker_num_threads>(group);
+      auto const worker_offset = worker_num_threads * worker_group.meta_group_rank();
+
+      auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads);
+
+      for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) {
+        if (i + rank < num_keys) {
+          typename std::iterator_traits<InputIt>::value_type const& key{*(first + i + rank)};
+          hash_value  = policy_.hash(key);
+          block_index = policy_.block_index(hash_value, num_blocks_);
+        }
+
+        for (uint32_t j = 0; (j < worker_num_threads) and (i + worker_offset + j < num_keys); ++j) {
+          this->add_impl(
+            worker_group, worker_group.shfl(hash_value, j), worker_group.shfl(block_index, j));
+        }
+      }
+    }
+  }
+
+  template <class CG, class HashValue, class BlockIndex>
+  __device__ void add_impl(CG const& group, HashValue const& hash_value, BlockIndex block_index)
+  {
+    constexpr auto num_threads = tile_size_v<CG>;
+
+    auto const rank = group.thread_rank();
+
+    if constexpr (num_threads == words_per_block) {
+      auto atom_word = cuda::atomic_ref<word_type, thread_scope>{
+        *(words_ + (block_index * words_per_block + rank))};
+      atom_word.fetch_or(policy_.word_pattern(hash_value, rank), cuda::memory_order_relaxed);
+    } else {
+#pragma unroll
+      for (auto i = rank; i < words_per_block; i += num_threads) {
+        auto atom_word = cuda::atomic_ref<word_type, thread_scope>{
+          *(words_ + (block_index * words_per_block + i))};
+        atom_word.fetch_or(policy_.word_pattern(hash_value, i), cuda::memory_order_relaxed);
       }
     }
   }
@@ -181,8 +270,21 @@ class bloom_filter_impl {
         [*this] __device__(key_type const key) mutable { this->add(key); },
         stream.get()));
     } else {
-      auto const always_true = thrust::constant_iterator<bool>{true};
-      this->add_if_async(first, last, always_true, thrust::identity{}, stream);
+      auto const num_keys = cuco::detail::distance(first, last);
+      if (num_keys == 0) { return; }
+
+      auto constexpr cg_size    = add_optimal_cg_size();
+      auto constexpr block_size = cuco::detail::default_block_size();
+      void const* kernel        = reinterpret_cast<void const*>(
+        detail::bloom_filter_ns::add<cg_size, block_size, InputIt, bloom_filter_impl>);
+      auto const grid_size = cuco::detail::max_occupancy_grid_size(block_size, kernel) * 1.5;
+
+      detail::bloom_filter_ns::add<cg_size, block_size>
+        <<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, *this);
+
+      // fallback method
+      // auto const always_true = thrust::constant_iterator<bool>{true};
+      // this->add_if_async(first, last, always_true, thrust::identity{}, stream);
     }
   }
 
diff --git a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl
index ee99396db..96d2c0573 100644
--- a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl
+++ b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -72,6 +72,15 @@ __device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG const& grou
   impl_.add(group, key);
 }
 
+template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
+template <class CG, class InputIt>
+__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG const& group,
+                                                                  InputIt first,
+                                                                  InputIt last)
+{
+  impl_.add(group, first, last);
+}
+
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
 template <class InputIt>
 __host__ constexpr void bloom_filter_ref<Key, Extent, Scope, Policy>::add(InputIt first,
diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh
index b0ef7b684..9e04b73c4 100644
--- a/include/cuco/detail/bloom_filter/kernels.cuh
+++ b/include/cuco/detail/bloom_filter/kernels.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,6 +26,28 @@ namespace cuco::detail::bloom_filter_ns {
 
 CUCO_SUPPRESS_KERNEL_WARNINGS
 
+template <int32_t CGSize, int32_t BlockSize, class InputIt, class Ref>
+CUCO_KERNEL __launch_bounds__(BlockSize) void add(InputIt first,
+                                                  cuco::detail::index_type n,
+                                                  Ref ref)
+{
+  namespace cg = cooperative_groups;
+
+  constexpr auto tile_size = cuco::detail::warp_size();
+
+  auto const tile_idx       = cuco::detail::global_thread_id() / tile_size;
+  auto const n_tiles        = gridDim.x * BlockSize / tile_size;
+  auto const items_per_tile = cuco::detail::int_div_ceil(n, n_tiles);
+
+  auto const tile_start = tile_idx * items_per_tile;
+  if (tile_start >= n) { return; }
+  auto const tile_stop = (tile_start + items_per_tile < n) ? tile_start + items_per_tile : n;
+
+  auto const tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
+
+  ref.add(tile, first + tile_start, first + tile_stop);
+}
+
 template <int32_t CGSize,
           int32_t BlockSize,
           class InputIt,

From bcb9dcb35f298d1389fcbaa9d8ede048a0f2b99a Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 12 Feb 2025 07:38:45 -0800
Subject: [PATCH 8/8] Add device bulk contains

---
 include/cuco/bloom_filter_ref.cuh             |  24 +++-
 .../detail/bloom_filter/bloom_filter_impl.cuh | 135 +++++++++++++++---
 .../detail/bloom_filter/bloom_filter_ref.inl  |  10 ++
 include/cuco/detail/bloom_filter/kernels.cuh  |  25 ++++
 4 files changed, 171 insertions(+), 23 deletions(-)

diff --git a/include/cuco/bloom_filter_ref.cuh b/include/cuco/bloom_filter_ref.cuh
index 2f3dcfa2b..7cab63cdf 100644
--- a/include/cuco/bloom_filter_ref.cuh
+++ b/include/cuco/bloom_filter_ref.cuh
@@ -257,10 +257,26 @@ class bloom_filter_ref {
   template <class CG, class ProbeKey>
   [[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const;
 
-  // TODO
-  // template <class CG, class InputIt, class OutputIt>
-  // __device__ void contains(CG const& group, InputIt first, InputIt last, OutputIt output_begin)
-  // const;
+  /**
+   * @brief Device function that tests keys in the range `[first, last)` are present in filter.
+   *
+   * @note Best performance is achieved if the size of the CG is larger than or equal to
+   * `(words_per_block * sizeof(word_type)) / 32`.
+   *
+   * @tparam CG Cooperative Group type
+   * @tparam InputIt Device-accessible random access input key iterator
+   * @tparam OutputIt Device-accessible output iterator assignable from `bool`
+   *
+   * @param group The Cooperative Group this operation is executed with
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   * @param output_begin Beginning of the sequence of booleans for the presence of each key
+   */
+  template <class CG, class InputIt, class OutputIt>
+  __device__ void contains(CG const& group,
+                           InputIt first,
+                           InputIt last,
+                           OutputIt output_begin) const;
 
   /**
    * @brief Tests all keys in the range `[first, last)` if their fingerprints are present in the
diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
index b4a38846c..44b1890d7 100644
--- a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
+++ b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
@@ -343,30 +343,112 @@ class bloom_filter_impl {
     if constexpr (num_threads == 1 or optimal_num_threads == 1) {
       return this->contains(key);
     } else {
-      auto const rank       = group.thread_rank();
-      auto const hash_value = policy_.hash(key);
-      bool success          = true;
+      auto const hash_value  = policy_.hash(key);
+      auto const block_index = policy_.block_index(hash_value, num_blocks_);
 
-#pragma unroll
-      for (uint32_t i = rank; i < optimal_num_threads; i += num_threads) {
-        auto const thread_offset  = i * words_per_thread;
-        auto const stored_pattern = this->vec_load_words<words_per_thread>(
-          policy_.block_index(hash_value, num_blocks_) * words_per_block + thread_offset);
-#pragma unroll words_per_thread
-        for (uint32_t j = 0; j < words_per_thread; ++j) {
-          auto const expected_pattern = policy_.word_pattern(hash_value, thread_offset + j);
-          if ((stored_pattern[j] & expected_pattern) != expected_pattern) { success = false; }
+      return this->contains_impl(group, hash_value, block_index);
+    }
+  }
+
+  template <class CG, class InputIt, class OutputIt>
+  __device__ void contains(CG const& group,
+                           InputIt first,
+                           InputIt last,
+                           OutputIt output_begin) const
+  {
+    namespace cg = cooperative_groups;
+
+    constexpr auto num_threads         = tile_size_v<CG>;
+    constexpr auto optimal_num_threads = contains_optimal_cg_size();
+    constexpr auto worker_num_threads =
+      (num_threads < optimal_num_threads) ? num_threads : optimal_num_threads;
+    constexpr auto words_per_thread = words_per_block / worker_num_threads;
+
+    auto const num_keys = cuco::detail::distance(first, last);
+    if (num_keys == 0) { return; }
+
+    auto const rank = group.thread_rank();
+
+    // If single thread is optimal, use scalar contains
+    if constexpr (worker_num_threads == 1) {
+      for (auto i = rank; i < num_keys; i += num_threads) {
+        typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + i)};
+        *(output_begin + i) = this->contains(insert_element);
+      }
+    } else if constexpr (num_threads == worker_num_threads) {  // given CG is optimal CG
+      typename policy_type::hash_result_type hash_value;
+      size_type block_index;
+
+      auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads);
+      for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) {
+        if (i + rank < num_keys) {
+          typename std::iterator_traits<InputIt>::value_type const& insert_element{
+            *(first + i + rank)};
+          hash_value  = policy_.hash(insert_element);
+          block_index = policy_.block_index(hash_value, num_blocks_);
+        }
+
+        for (uint32_t j = 0; (j < num_threads) and (i + j < num_keys); ++j) {
+          bool const success =
+            this->contains_impl(group, group.shfl(hash_value, j), group.shfl(block_index, j));
+          if (group.thread_rank() == 0) { *(output_begin + i + j) = success; }
         }
       }
+    } else {  // subdivide given CG into multiple optimal CGs
+      typename policy_type::hash_result_type hash_value;
+      size_type block_index;
+
+      auto const worker_group  = cg::tiled_partition<worker_num_threads>(group);
+      auto const worker_offset = worker_num_threads * worker_group.meta_group_rank();
+
+      auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads);
+
+      for (size_type i = 0; (i / num_threads) < group_iters; i += num_threads) {
+        if (i + rank < num_keys) {
+          typename std::iterator_traits<InputIt>::value_type const& key{*(first + i + rank)};
+          hash_value  = policy_.hash(key);
+          block_index = policy_.block_index(hash_value, num_blocks_);
+        }
 
-      return group.all(success);
+        for (uint32_t j = 0; (j < worker_num_threads) and (i + worker_offset + j < num_keys); ++j) {
+          bool const success = this->contains_impl(
+            worker_group, worker_group.shfl(hash_value, j), worker_group.shfl(block_index, j));
+          if (group.thread_rank() == 0) { *(output_begin + i + j) = success; }
+        }
+      }
     }
   }
 
-  // TODO
-  // template <class CG, class InputIt, class OutputIt>
-  // __device__ void contains(CG const& group, InputIt first, InputIt last, OutputIt output_begin)
-  // const;
+  template <class CG, class HashValue, class BlockIndex>
+  __device__ bool contains_impl(CG const& group,
+                                HashValue const& hash_value,
+                                BlockIndex block_index) const
+  {
+    constexpr auto num_threads         = tile_size_v<CG>;
+    constexpr auto optimal_num_threads = contains_optimal_cg_size();
+    constexpr auto words_per_thread    = words_per_block / optimal_num_threads;
+
+    auto const rank = group.thread_rank();
+    bool success    = true;
+
+#pragma unroll
+    for (uint32_t i = rank; i < optimal_num_threads; i += num_threads) {
+      auto const thread_offset = i * words_per_thread;
+      auto const stored_pattern =
+        this->vec_load_words<words_per_thread>(block_index * words_per_block + thread_offset);
+#pragma unroll words_per_thread
+      for (uint32_t j = 0; j < words_per_thread; ++j) {
+        auto const expected_pattern = policy_.word_pattern(hash_value, thread_offset + j);
+        if ((stored_pattern[j] & expected_pattern) != expected_pattern) {
+          success = false;
+          break;
+        }
+      }
+      if (not success) { break; }
+    }
+
+    return group.all(success);
+  }
 
   template <class InputIt, class OutputIt>
   __host__ constexpr void contains(InputIt first,
@@ -384,6 +466,20 @@ class bloom_filter_impl {
                                          OutputIt output_begin,
                                          cuda::stream_ref stream) const noexcept
   {
+    // TODO perfoms worse than fallback
+    // auto const num_keys = cuco::detail::distance(first, last);
+    // if (num_keys == 0) { return; }
+
+    // auto constexpr cg_size    = contains_optimal_cg_size();
+    // auto constexpr block_size = cuco::detail::default_block_size();
+    // void const* kernel = reinterpret_cast<void const*>(detail::bloom_filter_ns::contains<cg_size,
+    // block_size, InputIt, OutputIt, bloom_filter_impl>); auto const grid_size =
+    // cuco::detail::max_occupancy_grid_size(block_size, kernel) * 2.5;
+
+    // detail::bloom_filter_ns::contains<cg_size, block_size>
+    //   <<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, output_begin, *this);
+
+    // fallback method
     auto const always_true = thrust::constant_iterator<bool>{true};
     this->contains_if_async(first, last, always_true, thrust::identity{}, output_begin, stream);
   }
@@ -413,8 +509,9 @@ class bloom_filter_impl {
 
     auto constexpr cg_size    = contains_optimal_cg_size();
     auto constexpr block_size = cuco::detail::default_block_size();
-    auto const grid_size =
-      cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size);
+    // TODO stride = 1 is optimal for arrow policy while stride = 16-32 is optimal for default
+    // policy
+    auto const grid_size = cuco::detail::grid_size(num_keys, cg_size, 2, block_size);
 
     detail::bloom_filter_ns::contains_if_n<cg_size, block_size>
       <<<grid_size, block_size, 0, stream.get()>>>(
diff --git a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl
index 96d2c0573..c9fe32ea0 100644
--- a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl
+++ b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl
@@ -130,6 +130,16 @@ template <class CG, class ProbeKey>
   return impl_.contains(group, key);
 }
 
+template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
+template <class CG, class InputIt, class OutputIt>
+__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::contains(CG const& group,
+                                                                       InputIt first,
+                                                                       InputIt last,
+                                                                       OutputIt output_begin) const
+{
+  impl_.contains(group, first, last, output_begin);
+}
+
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
 template <class InputIt, class OutputIt>
 __host__ constexpr void bloom_filter_ref<Key, Extent, Scope, Policy>::contains(
diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh
index 9e04b73c4..08adb814f 100644
--- a/include/cuco/detail/bloom_filter/kernels.cuh
+++ b/include/cuco/detail/bloom_filter/kernels.cuh
@@ -73,6 +73,31 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add_if_n(
   }
 }
 
+template <int32_t CGSize, int32_t BlockSize, class InputIt, class OutputIt, class Ref>
+CUCO_KERNEL __launch_bounds__(BlockSize) void contains(InputIt first,
+                                                       cuco::detail::index_type n,
+                                                       OutputIt out,
+                                                       Ref ref)
+{
+  namespace cg = cooperative_groups;
+
+  // TODO use shmem + async copies for output
+
+  constexpr auto tile_size = cuco::detail::warp_size();
+
+  auto const tile_idx       = cuco::detail::global_thread_id() / tile_size;
+  auto const n_tiles        = gridDim.x * BlockSize / tile_size;
+  auto const items_per_tile = cuco::detail::int_div_ceil(n, n_tiles);
+
+  auto const tile_start = tile_idx * items_per_tile;
+  if (tile_start >= n) { return; }
+  auto const tile_stop = (tile_start + items_per_tile < n) ? tile_start + items_per_tile : n;
+
+  auto const tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
+
+  ref.contains(tile, first + tile_start, first + tile_stop, out + tile_start);
+}
+
 template <int32_t CGSize,
           int32_t BlockSize,
           class InputIt,