From 84fd0b169f52a9c516964fb4544ffb78eba61b74 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Sat, 14 Oct 2023 00:26:10 +0000
Subject: [PATCH 01/16] WIP add insert_or_apply

---
 include/cuco/detail/static_map/kernels.cuh    |  39 ++++
 include/cuco/detail/static_map/static_map.inl |  38 ++++
 .../cuco/detail/static_map/static_map_ref.inl | 210 ++++++++++++++++++
 include/cuco/operator.hpp                     |  16 ++
 include/cuco/static_map.cuh                   |  11 +
 tests/CMakeLists.txt                          |   1 +
 tests/static_map/insert_or_apply_test.cu      | 104 +++++++++
 7 files changed, 419 insertions(+)
 create mode 100644 tests/static_map/insert_or_apply_test.cu
diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index f9171ef77..b65e59451 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -30,6 +30,7 @@ namespace experimental {
 namespace static_map_ns {
 namespace detail {
 
+// TODO user insert_or_assign internally
 /**
  * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to
  * `k` already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`.
@@ -67,6 +68,44 @@ __global__ void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref
   }
 }
 
+// TODO docs
+/**
+ * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to
+ * `k` already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`.
+ * If the key does not exist, inserts the pair as if by insert.
+ *
+ * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element
+ * is inserted.
+ *
+ * @tparam CGSize Number of threads in each CG
+ * @tparam BlockSize Number of threads in each block
+ * @tparam InputIt Device accessible input iterator whose `value_type` is
+ * convertible to the `value_type` of the data structure
+ * @tparam Ref Type of non-owning device ref allowing access to storage
+ *
+ * @param first Beginning of the sequence of input elements
+ * @param n Number of input elements
+ * @param ref Non-owning container device ref used to access the slot storage
+ */
+template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Op, typename Ref>
+__global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op, Ref ref)
+{
+  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
+  auto idx               = cuco::detail::global_thread_id() / CGSize;
+
+  while (idx < n) {
+    typename std::iterator_traits<InputIt>::value_type const& insert_pair = *(first + idx);
+    if constexpr (CGSize == 1) {
+      ref.insert_or_apply(insert_pair, op);
+    } else {
+      auto const tile =
+        cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
+      ref.insert_or_apply(tile, insert_pair, op);
+    }
+    idx += loop_stride;
+  }
+}
+
 /**
  * @brief Finds the equivalent map elements of all keys in the range `[first, first + n)`.
  *
diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
index 9249d4fa1..80d7135e9 100644
--- a/include/cuco/detail/static_map/static_map.inl
+++ b/include/cuco/detail/static_map/static_map.inl
@@ -241,6 +241,44 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
       first, num, ref(op::insert_or_assign));
 }
 
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+template <typename InputIt, typename Op>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept
+{
+  return this->insert_or_apply_async(first, last, op, stream);
+  stream.synchronize();
+}
+
+template <class Key,
+          class T,
+          class Extent,
+          cuda::thread_scope Scope,
+          class KeyEqual,
+          class ProbingScheme,
+          class Allocator,
+          class Storage>
+template <typename InputIt, typename Op>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  insert_or_apply_async(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept
+{
+  auto const num = cuco::detail::distance(first, last);
+  if (num == 0) { return; }
+
+  auto const grid_size = cuco::detail::grid_size(num, cg_size);
+
+  static_map_ns::detail::insert_or_apply<cg_size, cuco::detail::default_block_size()>
+    <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+      first, num, op, ref(op::insert_or_apply));
+}
+
 template <class Key,
           class T,
           class Extent,
diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index f27f21e76..526876cb4 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -212,6 +212,7 @@ class operator_impl<
   }
 };
 
+// TODO use insert_or_apply internally
 template <typename Key,
           typename T,
           cuda::thread_scope Scope,
@@ -392,6 +393,215 @@ class operator_impl<
   }
 };
 
+template <typename Key,
+          typename T,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename StorageRef,
+          typename... Operators>
+class operator_impl<
+  op::insert_or_apply_tag,
+  static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>> {
+  using base_type = static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef>;
+  using ref_type = static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>;
+  using key_type = typename base_type::key_type;
+  using value_type = typename base_type::value_type;
+
+  static constexpr auto cg_size     = base_type::cg_size;
+  static constexpr auto window_size = base_type::window_size;
+
+  static_assert(sizeof(T) == 4 or sizeof(T) == 8,
+                "sizeof(mapped_type) must be either 4 bytes or 8 bytes.");
+
+ public:
+  // TODO docs
+  /**
+   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v`
+   * to the mapped_type corresponding to the key `k`.
+   *
+   * @tparam Value Input type which is implicitly convertible to 'value_type'
+   *
+   * @param value The element to insert
+   */
+  template <typename Value, typename Op>
+  __device__ void insert_or_apply(Value const& value, Op op) noexcept
+  {
+    static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
+
+    ref_type& ref_       = static_cast<ref_type&>(*this);
+    auto const key       = thrust::get<0>(value);
+    auto& probing_scheme = ref_.impl_.probing_scheme();
+    auto storage_ref     = ref_.impl_.storage_ref();
+    auto probing_iter    = probing_scheme(key, storage_ref.window_extent());
+
+    while (true) {
+      auto const window_slots = storage_ref[*probing_iter];
+
+      for (auto& slot_content : window_slots) {
+        auto const eq_res = ref_.impl_.predicate()(slot_content.first, key);
+
+        // If the key is already in the container, update the payload and return
+        if (eq_res == detail::equal_result::EQUAL) {
+          auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
+          op(((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second,
+             static_cast<T>(thrust::get<1>(value)));
+          return;
+        }
+        if (eq_res == detail::equal_result::EMPTY or
+            cuco::detail::bitwise_compare(slot_content.first, ref_.impl_.erased_key_sentinel())) {
+          auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
+          if (attempt_insert_or_apply(
+                (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)) {
+            return;
+          }
+        }
+      }
+      ++probing_iter;
+    }
+  }
+
+  template <typename Value>
+  __device__ void insert_or_apply(Value const& value,
+                                  cuco::experimental::op::reduce::sum_tag) noexcept
+  {
+    auto& ref_ = static_cast<ref_type&>(*this);
+    ref_.insert_or_apply(value, [](T& slot, T const& payload) {
+      cuda::atomic_ref<T, Scope> slot_ref{slot};
+      slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
+    });
+  }
+
+  // TODO docs
+  /**
+   * @brief Inserts an element.
+   *
+   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v`
+   * to the mapped_type corresponding to the key `k`.
+   *
+   * @tparam Value Input type which is implicitly convertible to 'value_type'
+   *
+   * @param group The Cooperative Group used to perform group insert
+   * @param value The element to insert
+   */
+  template <typename Value, typename Op>
+  __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+                                  Value const& value,
+                                  Op op) noexcept
+  {
+    ref_type& ref_ = static_cast<ref_type&>(*this);
+
+    auto const key       = value.first;
+    auto& probing_scheme = ref_.impl_.probing_scheme();
+    auto storage_ref     = ref_.impl_.storage_ref();
+    auto probing_iter    = probing_scheme(group, key, storage_ref.window_extent());
+
+    while (true) {
+      auto const window_slots = storage_ref[*probing_iter];
+
+      auto const [state, intra_window_index] = [&]() {
+        for (auto i = 0; i < window_size; ++i) {
+          switch (ref_.impl_.predicate()(window_slots[i].first, key)) {
+            case detail::equal_result::EMPTY:
+              return detail::window_probing_results{detail::equal_result::EMPTY, i};
+            case detail::equal_result::EQUAL:
+              return detail::window_probing_results{detail::equal_result::EQUAL, i};
+            default: {
+              if (cuco::detail::bitwise_compare(window_slots[i].first,
+                                                ref_.impl_.erased_key_sentinel())) {
+                return window_probing_results{detail::equal_result::ERASED, i};
+              } else {
+                continue;
+              }
+            }
+          }
+        }
+        // returns dummy index `-1` for UNEQUAL
+        return detail::window_probing_results{detail::equal_result::UNEQUAL, -1};
+      }();
+
+      auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL);
+      if (group_contains_equal) {
+        auto const src_lane = __ffs(group_contains_equal) - 1;
+        if (group.thread_rank() == src_lane) {
+          op(&((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second,
+             value.second);
+        }
+        group.sync();
+        return;
+      }
+
+      auto const group_contains_available =
+        group.ballot(state == detail::equal_result::EMPTY or state == detail::equal_result::ERASED);
+      if (group_contains_available) {
+        auto const src_lane = __ffs(group_contains_available) - 1;
+        auto const status =
+          (group.thread_rank() == src_lane)
+            ? attempt_insert_or_apply(
+                (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)
+            : false;
+
+        // Exit if inserted or assigned
+        if (group.shfl(status, src_lane)) { return; }
+      } else {
+        ++probing_iter;
+      }
+    }
+  }
+
+  template <typename Value>
+  __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+                                  Value const& value,
+                                  cuco::experimental::op::reduce::sum_tag) noexcept
+  {
+    auto& ref_ = static_cast<ref_type&>(*this);
+    ref_.insert_or_apply(group, value, [](T& slot, T const& payload) {
+      cuda::atomic_ref<T, Scope> slot_ref{slot};
+      slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
+    });
+  }
+
+ private:
+  // TODO docs
+  /**
+   * @brief Attempts to insert an element into a slot or update the matching payload with the given
+   * element
+   *
+   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v`
+   * to the mapped_type corresponding to the key `k`.
+   *
+   * @tparam Value Input type which is implicitly convertible to 'value_type'
+   *
+   * @param group The Cooperative Group used to perform group insert
+   * @param value The element to insert
+   *
+   * @return Returns `true` if the given `value` is inserted or `value` has a match in the map.
+   */
+  template <typename Value, typename Op>
+  __device__ constexpr bool attempt_insert_or_apply(value_type* slot,
+                                                    Value const& value,
+                                                    Op op) noexcept
+  {
+    ref_type& ref_          = static_cast<ref_type&>(*this);
+    auto const expected_key = ref_.impl_.empty_slot_sentinel().first;
+
+    auto old_key = ref_.impl_.compare_and_swap(
+      &slot->first, expected_key, static_cast<key_type>(thrust::get<0>(value)));
+    auto* old_key_ptr = reinterpret_cast<key_type*>(&old_key);
+
+    // if key success or key was already present in the map
+    if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key) or
+        (ref_.impl_.predicate().equal_to(*old_key_ptr,
+                                         thrust::get<0>(thrust::raw_reference_cast(value))) ==
+         detail::equal_result::EQUAL)) {
+      // Update payload
+      op(slot->second, static_cast<T>(thrust::get<1>(value)));
+      return true;
+    }
+    return false;
+  }
+};
+
 template <typename Key,
           typename T,
           cuda::thread_scope Scope,
diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp
index 72f4646f9..167ffade3 100644
--- a/include/cuco/operator.hpp
+++ b/include/cuco/operator.hpp
@@ -39,6 +39,12 @@ struct insert_and_find_tag {
 struct insert_or_assign_tag {
 } inline constexpr insert_or_assign;
 
+/**
+ * @brief `insert_or_apply` operator tag
+ */
+struct insert_or_apply_tag {
+} inline constexpr insert_or_apply;
+
 /**
  * @brief `erase` operator tag
  */
@@ -57,6 +63,16 @@ struct contains_tag {
 struct find_tag {
 } inline constexpr find;
 
+namespace reduce {
+
+/**
+ * @brief `sum` reduction operator tag
+ */
+struct sum_tag {
+} inline constexpr sum;
+
+}  // namespace reduce
+
 }  // namespace op
 }  // namespace experimental
 }  // namespace cuco
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index 56fbabe23..864ba8f06 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -399,6 +399,17 @@ class static_map {
   template <typename InputIt>
   void insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept;
 
+  // TODO docs
+  template <typename InputIt, typename Op>
+  void insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream = {}) noexcept;
+
+  // TODO docs
+  template <typename InputIt, typename Op>
+  void insert_or_apply_async(InputIt first,
+                             InputIt last,
+                             Op op,
+                             cuda_stream_ref stream = {}) noexcept;
+
   /**
    * @brief Erases keys in the range `[first, last)`.
    *
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 916e0ea42..c44de4037 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -77,6 +77,7 @@ ConfigureTest(STATIC_MAP_TEST
     static_map/heterogeneous_lookup_test.cu
     static_map/insert_and_find_test.cu
     static_map/insert_or_assign_test.cu
+    static_map/insert_or_apply_test.cu
     static_map/key_sentinel_test.cu
     static_map/shared_memory_test.cu
     static_map/stream_test.cu
diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
new file mode 100644
index 000000000..127cfa6ba
--- /dev/null
+++ b/tests/static_map/insert_or_apply_test.cu
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <utils.hpp>
+
+#include <cuco/static_map.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/tuple.h>
+
+#include <cstdint>
+
+#include <iostream>
+
+#include <catch2/catch_template_test_macros.hpp>
+
+using size_type = std::size_t;
+
+template <typename Map>
+__inline__ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys)
+{
+  REQUIRE((num_keys % num_unique_keys) == 0);
+
+  using key_type    = typename Map::key_type;
+  using mapped_type = typename Map::mapped_type;
+
+  auto keys_begin = thrust::make_transform_iterator(
+    thrust::counting_iterator<key_type>(0),
+    [num_unique_keys] __host__ __device__(key_type const& x) -> key_type {
+      return x % num_unique_keys;
+    });
+
+  auto values_begin = thrust::make_constant_iterator<mapped_type>(1);
+
+  auto pairs_begin = thrust::make_zip_iterator(thrust::make_tuple(keys_begin, values_begin));
+
+  map.insert_or_apply(pairs_begin, pairs_begin + num_keys, cuco::experimental::op::reduce::sum);
+
+  REQUIRE(map.size() == num_unique_keys);
+
+  thrust::device_vector<key_type> d_keys(num_unique_keys);
+  thrust::device_vector<mapped_type> d_values(num_unique_keys);
+  map.retrieve_all(d_keys.begin(), d_values.begin());
+
+  // TODO remove
+  for (int i = 0; i < num_unique_keys; ++i) {
+    std::cout << d_keys[i] << " " << d_values[i] << std::endl;
+  }
+
+  REQUIRE(cuco::test::equal(d_values.begin(),
+                            d_values.end(),
+                            thrust::make_constant_iterator<mapped_type>(num_keys / num_unique_keys),
+                            thrust::equal_to<mapped_type>{}));
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "Insert or apply",
+  "",
+  ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize),
+   Key,
+   Value,
+   Probe,
+   CGSize),
+  (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1))
+{
+  constexpr size_type num_keys{10};
+  constexpr size_type num_unique_keys{10};
+
+  using probe =
+    std::conditional_t<Probe == cuco::test::probe_sequence::linear_probing,
+                       cuco::experimental::linear_probing<CGSize, cuco::murmurhash3_32<Key>>,
+                       cuco::experimental::double_hashing<CGSize,
+                                                          cuco::murmurhash3_32<Key>,
+                                                          cuco::murmurhash3_32<Key>>>;
+
+  auto map = cuco::experimental::static_map<Key,
+                                            Value,
+                                            cuco::experimental::extent<size_type>,
+                                            cuda::thread_scope_device,
+                                            thrust::equal_to<Key>,
+                                            probe,
+                                            cuco::cuda_allocator<std::byte>,
+                                            cuco::experimental::storage<2>>{
+    num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
+
+  test_insert_or_apply(map, num_keys, num_unique_keys);
+}
\ No newline at end of file

From 0d76d1e43f6ac4be5a0634e8c5f6f45604b3a3fa Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 16 Oct 2023 16:43:41 +0000
Subject: [PATCH 02/16] Fix unit test

---
 .../cuco/detail/static_map/static_map_ref.inl |  6 ++--
 tests/static_map/insert_or_apply_test.cu      | 28 +++++++++++++------
 2 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index 526876cb4..378d1d05e 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -491,7 +491,7 @@ class operator_impl<
   {
     ref_type& ref_ = static_cast<ref_type&>(*this);
 
-    auto const key       = value.first;
+    auto const key       = thrust::get<0>(thrust::raw_reference_cast(value));
     auto& probing_scheme = ref_.impl_.probing_scheme();
     auto storage_ref     = ref_.impl_.storage_ref();
     auto probing_iter    = probing_scheme(group, key, storage_ref.window_extent());
@@ -524,8 +524,8 @@ class operator_impl<
       if (group_contains_equal) {
         auto const src_lane = __ffs(group_contains_equal) - 1;
         if (group.thread_rank() == src_lane) {
-          op(&((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second,
-             value.second);
+          op(((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second,
+             static_cast<T>(thrust::get<1>(value)));
         }
         group.sync();
         return;
diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
index 127cfa6ba..a7cbff438 100644
--- a/tests/static_map/insert_or_apply_test.cu
+++ b/tests/static_map/insert_or_apply_test.cu
@@ -59,11 +59,6 @@ __inline__ void test_insert_or_apply(Map& map, size_type num_keys, size_type num
   thrust::device_vector<mapped_type> d_values(num_unique_keys);
   map.retrieve_all(d_keys.begin(), d_values.begin());
 
-  // TODO remove
-  for (int i = 0; i < num_unique_keys; ++i) {
-    std::cout << d_keys[i] << " " << d_values[i] << std::endl;
-  }
-
   REQUIRE(cuco::test::equal(d_values.begin(),
                             d_values.end(),
                             thrust::make_constant_iterator<mapped_type>(num_keys / num_unique_keys),
@@ -78,10 +73,25 @@ TEMPLATE_TEST_CASE_SIG(
    Value,
    Probe,
    CGSize),
-  (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1))
+  (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2))
 {
-  constexpr size_type num_keys{10};
-  constexpr size_type num_unique_keys{10};
+  constexpr size_type num_keys{400};
+  constexpr size_type num_unique_keys{100};
 
   using probe =
     std::conditional_t<Probe == cuco::test::probe_sequence::linear_probing,
@@ -98,7 +108,7 @@ TEMPLATE_TEST_CASE_SIG(
                                             probe,
                                             cuco::cuda_allocator<std::byte>,
                                             cuco::experimental::storage<2>>{
-    num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
+    num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
 
   test_insert_or_apply(map, num_keys, num_unique_keys);
 }
\ No newline at end of file

From 54e2697d326f12c56ff7d9c678c76e0fe2fa048e Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 16 Oct 2023 17:07:08 +0000
Subject: [PATCH 03/16] Add benchmark

---
 benchmarks/CMakeLists.txt                     |  3 +-
 .../static_map/insert_or_apply_bench.cu       | 93 +++++++++++++++++++
 2 files changed, 95 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/hash_table/static_map/insert_or_apply_bench.cu

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 6b03cb98c..00c5a46f3 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -62,7 +62,8 @@ ConfigureBench(STATIC_MAP_BENCH
   hash_table/static_map/insert_bench.cu
   hash_table/static_map/find_bench.cu
   hash_table/static_map/contains_bench.cu
-  hash_table/static_map/erase_bench.cu)
+  hash_table/static_map/erase_bench.cu
+  hash_table/static_map/insert_or_apply_bench.cu)
 
 ###################################################################################################
 # - static_multimap benchmarks --------------------------------------------------------------------
diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
new file mode 100644
index 000000000..725f7c15e
--- /dev/null
+++ b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <defaults.hpp>
+#include <utils.hpp>
+
+#include <cuco/static_map.cuh>
+#include <cuco/utility/key_generator.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/transform.h>
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::static_map::insert_or_apply` performance
+ */
+template <typename Key, typename Value, typename Dist>
+std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_apply(
+  nvbench::state& state, nvbench::type_list<Key, Value, Dist>)
+{
+  using pair_type = cuco::pair<Key, Value>;
+
+  auto const num_keys     = state.get_int64_or_default("NumInputs", defaults::N);
+  auto const occupancy    = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY);
+  auto const multiplicity = state.get_int64_or_default("Multiplicity", defaults::MULTIPLICITY);
+
+  std::size_t const size = cuco::detail::int_div_ceil(num_keys, multiplicity) / occupancy;
+
+  thrust::device_vector<Key> keys(num_keys);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  thrust::device_vector<pair_type> pairs(num_keys);
+  thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) {
+    return pair_type(key, static_cast<Value>(key));
+  });
+
+  state.add_element_count(num_keys);
+
+  cuco::experimental::static_map map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+
+  state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
+    map.clear_async({launch.get_stream()});
+
+    timer.start();
+    map.insert_or_apply_async(
+      pairs.begin(), pairs.end(), cuco::experimental::op::reduce::sum, {launch.get_stream()});
+    timer.stop();
+  });
+}
+
+template <typename Key, typename Value, typename Dist>
+std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_insert_or_apply(
+  nvbench::state& state, nvbench::type_list<Key, Value, Dist>)
+{
+  state.skip("Key should be the same type as Value.");
+}
+
+NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      defaults::VALUE_TYPE_RANGE,
+                                      nvbench::type_list<distribution::uniform>))
+  .set_name("static_map_insert_or_apply_uniform_multiplicity")
+  .set_type_axes_names({"Key", "Value", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE);
+
+NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      defaults::VALUE_TYPE_RANGE,
+                                      nvbench::type_list<distribution::uniform>))
+  .set_name("static_set_insert_or_apply_uniform_occupancy")
+  .set_type_axes_names({"Key", "Value", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);
\ No newline at end of file

From 2561965ffc308a6f31f0f31892e1c4f9f32e0303 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Mon, 24 Jun 2024 17:15:48 +0000
Subject: [PATCH 04/16] Update insert_or_apply to latest dev

---
 .../static_map/insert_or_apply_bench.cu       | 10 +--
 .../cuco/detail/static_map/static_map_ref.inl | 56 +++++++---------
 tests/static_map/insert_or_apply_test.cu      | 64 +++++++++----------
 3 files changed, 58 insertions(+), 72 deletions(-)

diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
index 725f7c15e..e670bd75d 100644
--- a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
+++ b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
-#include <defaults.hpp>
-#include <utils.hpp>
+#include <benchmark_defaults.hpp>
+#include <benchmark_utils.hpp>
 
 #include <cuco/static_map.cuh>
-#include <cuco/utility/key_generator.hpp>
+#include <cuco/utility/key_generator.cuh>
 
 #include <nvbench/nvbench.cuh>
 
@@ -55,14 +55,14 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_appl
 
   state.add_element_count(num_keys);
 
-  cuco::experimental::static_map map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+  cuco::static_map map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
 
   state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
     map.clear_async({launch.get_stream()});
 
     timer.start();
     map.insert_or_apply_async(
-      pairs.begin(), pairs.end(), cuco::experimental::op::reduce::sum, {launch.get_stream()});
+      pairs.begin(), pairs.end(), cuco::op::reduce::sum, {launch.get_stream()});
     timer.stop();
   });
 }
diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index 014ce4dbd..a0fe659b2 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cuco/detail/equal_wrapper.cuh>
 #include <cuco/operator.hpp>
 
 #include <cuda/atomic>
@@ -586,8 +587,10 @@ class operator_impl<
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
 
-    ref_type& ref_       = static_cast<ref_type&>(*this);
-    auto const key       = thrust::get<0>(value);
+    ref_type& ref_ = static_cast<ref_type&>(*this);
+
+    auto const val       = ref_.impl_.heterogeneous_value(value);
+    auto const key       = ref_.impl_.extract_key(val);
     auto& probing_scheme = ref_.impl_.probing_scheme();
     auto storage_ref     = ref_.impl_.storage_ref();
     auto probing_iter    = probing_scheme(key, storage_ref.window_extent());
@@ -596,17 +599,17 @@ class operator_impl<
       auto const window_slots = storage_ref[*probing_iter];
 
       for (auto& slot_content : window_slots) {
-        auto const eq_res = ref_.impl_.predicate()(slot_content.first, key);
+        auto const eq_res =
+          ref_.impl_.predicate_.operator()<is_insert::YES>(key, slot_content.first);
 
         // If the key is already in the container, update the payload and return
         if (eq_res == detail::equal_result::EQUAL) {
           auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
           op(((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second,
-             static_cast<T>(thrust::get<1>(value)));
+             val.second);
           return;
         }
-        if (eq_res == detail::equal_result::EMPTY or
-            cuco::detail::bitwise_compare(slot_content.first, ref_.impl_.erased_key_sentinel())) {
+        if (eq_res == detail::equal_result::AVAILABLE) {
           auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
           if (attempt_insert_or_apply(
                 (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)) {
@@ -619,8 +622,7 @@ class operator_impl<
   }
 
   template <typename Value>
-  __device__ void insert_or_apply(Value const& value,
-                                  cuco::experimental::op::reduce::sum_tag) noexcept
+  __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag) noexcept
   {
     auto& ref_ = static_cast<ref_type&>(*this);
     ref_.insert_or_apply(value, [](T& slot, T const& payload) {
@@ -648,7 +650,8 @@ class operator_impl<
   {
     ref_type& ref_ = static_cast<ref_type&>(*this);
 
-    auto const key       = thrust::get<0>(thrust::raw_reference_cast(value));
+    auto const val       = ref_.impl_.heterogeneous_value(value);
+    auto const key       = ref_.impl_.extract_key(val);
     auto& probing_scheme = ref_.impl_.probing_scheme();
     auto storage_ref     = ref_.impl_.storage_ref();
     auto probing_iter    = probing_scheme(group, key, storage_ref.window_extent());
@@ -657,24 +660,15 @@ class operator_impl<
       auto const window_slots = storage_ref[*probing_iter];
 
       auto const [state, intra_window_index] = [&]() {
+        auto res = detail::equal_result::UNEQUAL;
         for (auto i = 0; i < window_size; ++i) {
-          switch (ref_.impl_.predicate()(window_slots[i].first, key)) {
-            case detail::equal_result::EMPTY:
-              return detail::window_probing_results{detail::equal_result::EMPTY, i};
-            case detail::equal_result::EQUAL:
-              return detail::window_probing_results{detail::equal_result::EQUAL, i};
-            default: {
-              if (cuco::detail::bitwise_compare(window_slots[i].first,
-                                                ref_.impl_.erased_key_sentinel())) {
-                return window_probing_results{detail::equal_result::ERASED, i};
-              } else {
-                continue;
-              }
-            }
+          res = ref_.impl_.predicate_.operator()<is_insert::YES>(key, window_slots[i].first);
+          if (res != detail::equal_result::UNEQUAL) {
+            return detail::window_probing_results{res, i};
           }
         }
         // returns dummy index `-1` for UNEQUAL
-        return detail::window_probing_results{detail::equal_result::UNEQUAL, -1};
+        return detail::window_probing_results{res, -1};
       }();
 
       auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL);
@@ -682,14 +676,13 @@ class operator_impl<
         auto const src_lane = __ffs(group_contains_equal) - 1;
         if (group.thread_rank() == src_lane) {
           op(((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second,
-             static_cast<T>(thrust::get<1>(value)));
+             val.second);
         }
         group.sync();
         return;
       }
 
-      auto const group_contains_available =
-        group.ballot(state == detail::equal_result::EMPTY or state == detail::equal_result::ERASED);
+      auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE);
       if (group_contains_available) {
         auto const src_lane = __ffs(group_contains_available) - 1;
         auto const status =
@@ -709,7 +702,7 @@ class operator_impl<
   template <typename Value>
   __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
                                   Value const& value,
-                                  cuco::experimental::op::reduce::sum_tag) noexcept
+                                  cuco::op::reduce::sum_tag) noexcept
   {
     auto& ref_ = static_cast<ref_type&>(*this);
     ref_.insert_or_apply(group, value, [](T& slot, T const& payload) {
@@ -742,17 +735,16 @@ class operator_impl<
     ref_type& ref_          = static_cast<ref_type&>(*this);
     auto const expected_key = ref_.impl_.empty_slot_sentinel().first;
 
-    auto old_key = ref_.impl_.compare_and_swap(
-      &slot->first, expected_key, static_cast<key_type>(thrust::get<0>(value)));
+    auto old_key =
+      ref_.impl_.compare_and_swap(&slot->first, expected_key, static_cast<key_type>(value.first));
     auto* old_key_ptr = reinterpret_cast<key_type*>(&old_key);
 
     // if key success or key was already present in the map
     if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key) or
-        (ref_.impl_.predicate().equal_to(*old_key_ptr,
-                                         thrust::get<0>(thrust::raw_reference_cast(value))) ==
+        (ref_.impl_.predicate().equal_to(value.first, *old_key_ptr) ==
          detail::equal_result::EQUAL)) {
       // Update payload
-      op(slot->second, static_cast<T>(thrust::get<1>(value)));
+      op(slot->second, value.second);
       return true;
     }
     return false;
diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
index a7cbff438..eade154ea 100644
--- a/tests/static_map/insert_or_apply_test.cu
+++ b/tests/static_map/insert_or_apply_test.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <utils.hpp>
+#include <test_utils.hpp>
 
 #include <cuco/static_map.cuh>
 
@@ -25,12 +25,11 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/tuple.h>
 
-#include <cstdint>
+#include <catch2/catch_template_test_macros.hpp>
 
+#include <cstdint>
 #include <iostream>
 
-#include <catch2/catch_template_test_macros.hpp>
-
 using size_type = std::size_t;
 
 template <typename Map>
@@ -38,31 +37,28 @@ __inline__ void test_insert_or_apply(Map& map, size_type num_keys, size_type num
 {
   REQUIRE((num_keys % num_unique_keys) == 0);
 
-  using key_type    = typename Map::key_type;
-  using mapped_type = typename Map::mapped_type;
-
-  auto keys_begin = thrust::make_transform_iterator(
-    thrust::counting_iterator<key_type>(0),
-    [num_unique_keys] __host__ __device__(key_type const& x) -> key_type {
-      return x % num_unique_keys;
-    });
-
-  auto values_begin = thrust::make_constant_iterator<mapped_type>(1);
+  using Key   = typename Map::key_type;
+  using Value = typename Map::mapped_type;
 
-  auto pairs_begin = thrust::make_zip_iterator(thrust::make_tuple(keys_begin, values_begin));
+  // Insert pairs
+  auto pairs_begin = thrust::make_transform_iterator(
+    thrust::counting_iterator<size_type>(0),
+    cuda::proclaim_return_type<cuco::pair<Key, Value>>([num_unique_keys] __device__(auto i) {
+      return cuco::pair<Key, Value>{i % num_unique_keys, 1};
+    }));
 
-  map.insert_or_apply(pairs_begin, pairs_begin + num_keys, cuco::experimental::op::reduce::sum);
+  map.insert_or_apply(pairs_begin, pairs_begin + num_keys, cuco::op::reduce::sum);
 
   REQUIRE(map.size() == num_unique_keys);
 
-  thrust::device_vector<key_type> d_keys(num_unique_keys);
-  thrust::device_vector<mapped_type> d_values(num_unique_keys);
+  thrust::device_vector<Key> d_keys(num_unique_keys);
+  thrust::device_vector<Value> d_values(num_unique_keys);
   map.retrieve_all(d_keys.begin(), d_values.begin());
 
   REQUIRE(cuco::test::equal(d_values.begin(),
                             d_values.end(),
-                            thrust::make_constant_iterator<mapped_type>(num_keys / num_unique_keys),
-                            thrust::equal_to<mapped_type>{}));
+                            thrust::make_constant_iterator<Value>(num_keys / num_unique_keys),
+                            thrust::equal_to<Value>{}));
 }
 
 TEMPLATE_TEST_CASE_SIG(
@@ -93,21 +89,19 @@ TEMPLATE_TEST_CASE_SIG(
   constexpr size_type num_keys{400};
   constexpr size_type num_unique_keys{100};
 
-  using probe =
-    std::conditional_t<Probe == cuco::test::probe_sequence::linear_probing,
-                       cuco::experimental::linear_probing<CGSize, cuco::murmurhash3_32<Key>>,
-                       cuco::experimental::double_hashing<CGSize,
-                                                          cuco::murmurhash3_32<Key>,
-                                                          cuco::murmurhash3_32<Key>>>;
-
-  auto map = cuco::experimental::static_map<Key,
-                                            Value,
-                                            cuco::experimental::extent<size_type>,
-                                            cuda::thread_scope_device,
-                                            thrust::equal_to<Key>,
-                                            probe,
-                                            cuco::cuda_allocator<std::byte>,
-                                            cuco::experimental::storage<2>>{
+  using probe = std::conditional_t<
+    Probe == cuco::test::probe_sequence::linear_probing,
+    cuco::linear_probing<CGSize, cuco::murmurhash3_32<Key>>,
+    cuco::double_hashing<CGSize, cuco::murmurhash3_32<Key>, cuco::murmurhash3_32<Key>>>;
+
+  auto map = cuco::static_map<Key,
+                              Value,
+                              cuco::extent<size_type>,
+                              cuda::thread_scope_device,
+                              thrust::equal_to<Key>,
+                              probe,
+                              cuco::cuda_allocator<std::byte>,
+                              cuco::storage<2>>{
     num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
 
   test_insert_or_apply(map, num_keys, num_unique_keys);

From 5e94f0621ae0e8b9d4440fead3e7b3f87772f9eb Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Mon, 24 Jun 2024 23:57:45 +0000
Subject: [PATCH 05/16] Update insert_or_apply to accept Op of a specific
 signature

---
 .../cuco/detail/static_map/static_map_ref.inl | 71 ++++++++++++-------
 tests/static_map/insert_or_apply_test.cu      | 28 +++++++-
 2 files changed, 72 insertions(+), 27 deletions(-)

diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index a0fe659b2..678c481fb 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -573,20 +573,30 @@ class operator_impl<
                 "sizeof(mapped_type) must be either 4 bytes or 8 bytes.");
 
  public:
-  // TODO docs
   /**
-   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v`
-   * to the mapped_type corresponding to the key `k`.
+   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies
+   `Op`
+   * binary function to the mapped_type corresponding to the key `k` and the value `v`.
    *
    * @tparam Value Input type which is implicitly convertible to 'value_type'
-   *
+   * @tparam Op Callable type which is used as apply operation and can be
+   *   called with arguments as Op(cuda::atomic_ref<T, Scope>, T). Op strictly must
+   *   have this signature to atomically apply the operation.
+
    * @param value The element to insert
+   * @param op The callable object to perform binary operation between existing value at the slot
+   *  and the element to insert.
    */
+
   template <typename Value, typename Op>
   __device__ void insert_or_apply(Value const& value, Op op) noexcept
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
 
+    static_assert(
+      std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
+      "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref<T, Scope>, T)`");
+
     ref_type& ref_ = static_cast<ref_type&>(*this);
 
     auto const val       = ref_.impl_.heterogeneous_value(value);
@@ -605,8 +615,10 @@ class operator_impl<
         // If the key is already in the container, update the payload and return
         if (eq_res == detail::equal_result::EQUAL) {
           auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
-          op(((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second,
-             val.second);
+          op(
+            cuda::atomic_ref<T, Scope>{
+              ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second},
+            val.second);
           return;
         }
         if (eq_res == detail::equal_result::AVAILABLE) {
@@ -625,29 +637,35 @@ class operator_impl<
   __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag) noexcept
   {
     auto& ref_ = static_cast<ref_type&>(*this);
-    ref_.insert_or_apply(value, [](T& slot, T const& payload) {
-      cuda::atomic_ref<T, Scope> slot_ref{slot};
+    ref_.insert_or_apply(value, [](cuda::atomic_ref<T, Scope> slot_ref, T const& payload) {
       slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
     });
   }
 
-  // TODO docs
   /**
-   * @brief Inserts an element.
-   *
-   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v`
-   * to the mapped_type corresponding to the key `k`.
+   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies
+   * `Op` binary function to the mapped_type corresponding to the key `k` and the value `v`.
    *
    * @tparam Value Input type which is implicitly convertible to 'value_type'
+   * @tparam Op Callable type which is used as apply operation and can be
+   *   called with arguments as Op(cuda::atomic_ref<T, Scope>, T). Op strictly must
+   *   have this signature to atomically apply the operation.
    *
    * @param group The Cooperative Group used to perform group insert
    * @param value The element to insert
+   * @param op The callable object to perform binary operation between existing value at the slot
+   *  and the element to insert.
    */
+
   template <typename Value, typename Op>
   __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
                                   Value const& value,
                                   Op op) noexcept
   {
+    static_assert(
+      std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
+      "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref<T, Scope>, T)`");
+
     ref_type& ref_ = static_cast<ref_type&>(*this);
 
     auto const val       = ref_.impl_.heterogeneous_value(value);
@@ -675,8 +693,10 @@ class operator_impl<
       if (group_contains_equal) {
         auto const src_lane = __ffs(group_contains_equal) - 1;
         if (group.thread_rank() == src_lane) {
-          op(((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second,
-             val.second);
+          op(
+            cuda::atomic_ref<T, Scope>{
+              ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second},
+            val.second);
         }
         group.sync();
         return;
@@ -705,25 +725,24 @@ class operator_impl<
                                   cuco::op::reduce::sum_tag) noexcept
   {
     auto& ref_ = static_cast<ref_type&>(*this);
-    ref_.insert_or_apply(group, value, [](T& slot, T const& payload) {
-      cuda::atomic_ref<T, Scope> slot_ref{slot};
+    ref_.insert_or_apply(group, value, [](cuda::atomic_ref<T, Scope> slot_ref, T const& payload) {
       slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
     });
   }
 
  private:
-  // TODO docs
   /**
-   * @brief Attempts to insert an element into a slot or update the matching payload with the given
-   * element
-   *
-   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v`
-   * to the mapped_type corresponding to the key `k`.
+   * @brief Attempts to insert an element into a slot or update the matching payload by applying the
+   * binary operation on the payload and new value.
    *
    * @tparam Value Input type which is implicitly convertible to 'value_type'
-   *
-   * @param group The Cooperative Group used to perform group insert
+   * @tparam Op Callable type which is used as apply operation and called be
+   *   called with arguments as Op(cuda::atomic_ref<T, Scope>, T)
+
+   * @param slot value_type pointer to the slot to insert
    * @param value The element to insert
+   * @param op The callable object to perform binary operation between existing value at the slot
+   *  and element to insert.
    *
    * @return Returns `true` if the given `value` is inserted or `value` has a match in the map.
    */
@@ -744,7 +763,7 @@ class operator_impl<
         (ref_.impl_.predicate().equal_to(value.first, *old_key_ptr) ==
          detail::equal_result::EQUAL)) {
       // Update payload
-      op(slot->second, value.second);
+      op(cuda::atomic_ref<T, Scope>{slot->second}, value.second);
       return true;
     }
     return false;
diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
index eade154ea..39d7d6df8 100644
--- a/tests/static_map/insert_or_apply_test.cu
+++ b/tests/static_map/insert_or_apply_test.cu
@@ -18,6 +18,7 @@
 
 #include <cuco/static_map.cuh>
 
+#include <cuda/atomic>
 #include <thrust/device_vector.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -47,7 +48,12 @@ __inline__ void test_insert_or_apply(Map& map, size_type num_keys, size_type num
       return cuco::pair<Key, Value>{i % num_unique_keys, 1};
     }));
 
-  map.insert_or_apply(pairs_begin, pairs_begin + num_keys, cuco::op::reduce::sum);
+  map.insert_or_apply(
+    pairs_begin,
+    pairs_begin + num_keys,
+    [] __device__(cuda::atomic_ref<Value, Map::thread_scope> lhs, const Value& rhs) {
+      lhs.fetch_add(rhs, cuda::memory_order_relaxed);
+    });
 
   REQUIRE(map.size() == num_unique_keys);
 
@@ -105,4 +111,24 @@ TEMPLATE_TEST_CASE_SIG(
     num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
 
   test_insert_or_apply(map, num_keys, num_unique_keys);
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "Insert or apply all unique keys", "", ((typename Key)), (int32_t), (int64_t))
+{
+  using Value = Key;
+
+  constexpr size_type num_keys = 100;
+
+  auto map = cuco::static_map<Key,
+                              Value,
+                              cuco::extent<size_type>,
+                              cuda::thread_scope_device,
+                              thrust::equal_to<Key>,
+                              cuco::linear_probing<1, cuco::murmurhash3_32<Key>>,
+                              cuco::cuda_allocator<std::byte>,
+                              cuco::storage<2>>{
+    num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+
+  test_insert_or_apply(map, num_keys, num_keys);
 }
\ No newline at end of file

From e7bca30fc8a78cce2cfe65cb50e4920940febad2 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Tue, 25 Jun 2024 00:38:26 +0000
Subject: [PATCH 06/16] Clean up and add docs

---
 include/cuco/detail/static_map/kernels.cuh | 72 +++-------------------
 include/cuco/static_map.cuh                | 42 ++++++++++++-
 2 files changed, 47 insertions(+), 67 deletions(-)

diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index fee007344..ec15a66ed 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -68,23 +68,25 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_assign(InputIt first,
   }
 }
 
-// TODO docs
 /**
  * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to
- * `k` already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`.
- * If the key does not exist, inserts the pair as if by insert.
+ * `k` already exists in the container, then binary operation is applied using `op` callable object
+ * on the existing value at slot and the element to insert. If the key does not exist, inserts the
+ * pair as if by insert.
  *
- * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element
- * is inserted.
+ * @note Callable object to perform binary operation should be able to invoke as Op(cuda::atomic<T,
+ * Scope>, T>)
  *
  * @tparam CGSize Number of threads in each CG
  * @tparam BlockSize Number of threads in each block
  * @tparam InputIt Device accessible input iterator whose `value_type` is
  * convertible to the `value_type` of the data structure
+ * @tparam Op Callable type used to peform apply operation.
  * @tparam Ref Type of non-owning device ref allowing access to storage
  *
  * @param first Beginning of the sequence of input elements
  * @param n Number of input elements
+ * @param op callable object to perform apply operation.
  * @param ref Non-owning container device ref used to access the slot storage
  */
 template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Op, typename Ref>
@@ -106,64 +108,4 @@ __global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op
   }
 }
 
-/**
- * @brief Finds the equivalent map elements of all keys in the range `[first, first + n)`.
- *
- * @note If the key `*(first + i)` has a match in the container, copies the payload of its matched
- * element to `(output_begin + i)`. Else, copies the empty value sentinel. Uses the CUDA Cooperative
- * Groups API to leverage groups of multiple threads to find each key. This provides a significant
- * boost in throughput compared to the non Cooperative Group `find` at moderate to high load
- * factors.
- *
- * @tparam CGSize Number of threads in each CG
- * @tparam BlockSize The size of the thread block
- * @tparam InputIt Device accessible input iterator
- * @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type`
- * @tparam Ref Type of non-owning device ref allowing access to storage
- *
- * @param first Beginning of the sequence of keys
- * @param n Number of keys to query
- * @param output_begin Beginning of the sequence of matched payloads retrieved for each key
- * @param ref Non-owning map device ref used to access the slot storage
- */
-template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename OutputIt, typename Ref>
-__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref)
-{
-  namespace cg = cooperative_groups;
-
-  auto const block       = cg::this_thread_block();
-  auto const thread_idx  = block.thread_rank();
-  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
-  auto idx               = cuco::detail::global_thread_id() / CGSize;
-
-  __shared__ typename Ref::mapped_type output_buffer[BlockSize / CGSize];
-
-  while (idx - thread_idx < n) {  // the whole thread block falls into the same iteration
-    if (idx < n) {
-      typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
-      if constexpr (CGSize == 1) {
-        auto const found = ref.find(key);
-        /*
-         * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased
-         * sector stores from L2 to global memory. By writing results to shared memory and then
-         * synchronizing before writing back to global, we no longer rely on L1, preventing the
-         * increase in sector stores from L2 to global and improving performance.
-         */
-        output_buffer[thread_idx] =
-          found == ref.end() ? ref.empty_value_sentinel() : (*found).second;
-        block.sync();
-        *(output_begin + idx) = output_buffer[thread_idx];
-      } else {
-        auto const tile  = cg::tiled_partition<CGSize>(block);
-        auto const found = ref.find(tile, key);
-
-        if (tile.thread_rank() == 0) {
-          *(output_begin + idx) = found == ref.end() ? ref.empty_value_sentinel() : (*found).second;
-        }
-      }
-    }
-    idx += loop_stride;
-  }
-}
-
 }  // namespace cuco::static_map_ns::detail
\ No newline at end of file
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index 88a24c974..a3f274286 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -408,11 +408,49 @@ class static_map {
   template <typename InputIt>
   void insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept;
 
-  // TODO docs
+  /**
+   * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent
+   * to `k` already exists in the container, then binary operation is applied using `op` callable
+   * object on the existing value at slot and the element to insert. If the key does not exist,
+   * inserts the pair as if by insert.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `insert_or_apply_async`.
+   * @note Callable object to perform binary operation should be able to invoke as
+   *  Op(cuda::atomic<T, Scope>, T>)
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * static_map<K, V>::value_type></tt> is `true`
+   * @tparam Op Callable type used to peform apply operation.
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   * @param op callable object to perform apply operation.
+   * @param stream CUDA stream used for insert
+   */
   template <typename InputIt, typename Op>
   void insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream = {}) noexcept;
 
-  // TODO docs
+  /**
+   * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent
+   * to `k` already exists in the container, then binary operation is applied using `op` callable
+   * object on the existing value at slot and the element to insert. If the key does not exist,
+   * inserts the pair as if by insert.
+   *
+   * @note Callable object to perform binary operation should be able to invoke as
+   *  Op(cuda::atomic<T, Scope>, T>)
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * static_map<K, V>::value_type></tt> is `true`
+   * @tparam Op Callable type used to peform apply operation.
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last End of the sequence of keys
+   * @param op callable object to perform apply operation.
+   * @param stream CUDA stream used for insert
+   */
   template <typename InputIt, typename Op>
   void insert_or_apply_async(InputIt first,
                              InputIt last,

From df3cd87ca7c712747206a59cbb4093f38926d614 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Tue, 25 Jun 2024 03:10:15 +0000
Subject: [PATCH 07/16] doxygen fixes

---
 include/cuco/operator.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp
index d4dbf2c65..4aa466c17 100644
--- a/include/cuco/operator.hpp
+++ b/include/cuco/operator.hpp
@@ -42,7 +42,7 @@ struct insert_or_assign_tag {
  * @brief `insert_or_apply` operator tag
  */
 struct insert_or_apply_tag {
-} inline constexpr insert_or_apply;
+} inline constexpr insert_or_apply;  ///< `cuco::insert_or_apply` operator
 
 /**
  * @brief `erase` operator tag

From 01fa6c373d7894534a0757de4ab92ea505441085 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Tue, 25 Jun 2024 22:25:14 +0000
Subject: [PATCH 08/16] minor fixes based on review comments

---
 .../static_map/insert_or_apply_bench.cu       |  2 +-
 .../cuco/detail/static_map/static_map_ref.inl | 19 ++++++-------------
 include/cuco/static_map_ref.cuh               |  3 ++-
 tests/static_map/insert_or_apply_test.cu      |  8 ++++----
 4 files changed, 13 insertions(+), 19 deletions(-)

diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
index e670bd75d..186b548ea 100644
--- a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
+++ b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index 678c481fb..06bbd01de 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -375,7 +375,6 @@ class operator_impl<
   }
 };
 
-// TODO use insert_or_apply internally
 template <typename Key,
           typename T,
           cuda::thread_scope Scope,
@@ -395,9 +394,6 @@ class operator_impl<
   static constexpr auto cg_size     = base_type::cg_size;
   static constexpr auto window_size = base_type::window_size;
 
-  static_assert(sizeof(T) == 4 or sizeof(T) == 8,
-                "sizeof(mapped_type) must be either 4 bytes or 8 bytes.");
-
  public:
   /**
    * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v`
@@ -551,6 +547,7 @@ class operator_impl<
   }
 };
 
+// TODO use insert_or_apply internally
 template <typename Key,
           typename T,
           cuda::thread_scope Scope,
@@ -569,14 +566,10 @@ class operator_impl<
   static constexpr auto cg_size     = base_type::cg_size;
   static constexpr auto window_size = base_type::window_size;
 
-  static_assert(sizeof(T) == 4 or sizeof(T) == 8,
-                "sizeof(mapped_type) must be either 4 bytes or 8 bytes.");
-
  public:
   /**
    * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies
-   `Op`
-   * binary function to the mapped_type corresponding to the key `k` and the value `v`.
+   * `Op` binary function to the mapped_type corresponding to the key `k` and the value `v`.
    *
    * @tparam Value Input type which is implicitly convertible to 'value_type'
    * @tparam Op Callable type which is used as apply operation and can be
@@ -589,7 +582,7 @@ class operator_impl<
    */
 
   template <typename Value, typename Op>
-  __device__ void insert_or_apply(Value const& value, Op op) noexcept
+  __device__ void insert_or_apply(Value const& value, Op op)
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
 
@@ -634,7 +627,7 @@ class operator_impl<
   }
 
   template <typename Value>
-  __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag) noexcept
+  __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag)
   {
     auto& ref_ = static_cast<ref_type&>(*this);
     ref_.insert_or_apply(value, [](cuda::atomic_ref<T, Scope> slot_ref, T const& payload) {
@@ -660,7 +653,7 @@ class operator_impl<
   template <typename Value, typename Op>
   __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
                                   Value const& value,
-                                  Op op) noexcept
+                                  Op op)
   {
     static_assert(
       std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
@@ -722,7 +715,7 @@ class operator_impl<
   template <typename Value>
   __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
                                   Value const& value,
-                                  cuco::op::reduce::sum_tag) noexcept
+                                  cuco::op::reduce::sum_tag)
   {
     auto& ref_ = static_cast<ref_type&>(*this);
     ref_.insert_or_apply(group, value, [](cuda::atomic_ref<T, Scope> slot_ref, T const& payload) {
diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh
index 5c96af776..4006d84ae 100644
--- a/include/cuco/static_map_ref.cuh
+++ b/include/cuco/static_map_ref.cuh
@@ -74,7 +74,8 @@ class static_map_ref
   using impl_type = detail::
     open_addressing_ref_impl<Key, Scope, KeyEqual, ProbingScheme, StorageRef, allows_duplicates>;
 
-  static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes.");
+  static_assert(sizeof(T) == 4 or sizeof(T) == 8,
+                "sizeof(mapped_type) must be either 4 bytes or 8 bytes.");
 
   static_assert(
     cuco::is_bitwise_comparable_v<Key>,
diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
index 39d7d6df8..9f717474b 100644
--- a/tests/static_map/insert_or_apply_test.cu
+++ b/tests/static_map/insert_or_apply_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,7 +34,7 @@
 using size_type = std::size_t;
 
 template <typename Map>
-__inline__ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys)
+void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys)
 {
   REQUIRE((num_keys % num_unique_keys) == 0);
 
@@ -68,7 +68,7 @@ __inline__ void test_insert_or_apply(Map& map, size_type num_keys, size_type num
 }
 
 TEMPLATE_TEST_CASE_SIG(
-  "Insert or apply",
+  "static_map insert_or_apply tests",
   "",
   ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize),
    Key,
@@ -114,7 +114,7 @@ TEMPLATE_TEST_CASE_SIG(
 }
 
 TEMPLATE_TEST_CASE_SIG(
-  "Insert or apply all unique keys", "", ((typename Key)), (int32_t), (int64_t))
+  "static_map insert_or_apply all unique keys tests", "", ((typename Key)), (int32_t), (int64_t))
 {
   using Value = Key;
 

From 45adafc4b9abd75bce1d8ce80bc4cbcb5b2afc74 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Fri, 28 Jun 2024 03:13:29 +0000
Subject: [PATCH 09/16] Add identity element

---
 .../static_map/insert_or_apply_bench.cu       |  21 ++-
 include/cuco/detail/static_map/kernels.cuh    |  21 ++-
 include/cuco/detail/static_map/static_map.inl |  18 ++-
 .../cuco/detail/static_map/static_map_ref.inl | 138 +++++++++++++-----
 include/cuco/static_map.cuh                   |  12 +-
 tests/static_map/insert_or_apply_test.cu      |  60 ++++++--
 6 files changed, 202 insertions(+), 68 deletions(-)

diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
index 186b548ea..d103647a1 100644
--- a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
+++ b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
@@ -28,6 +28,8 @@
 using namespace cuco::benchmark;
 using namespace cuco::utility;
 
+const auto USE_IDENTITY       = std::vector<nvbench::int64_t>{0, 1};
+const auto MULTIPLICITY_RANGE = std::vector<nvbench::int64_t>{1, 2, 4, 8, 16, 32, 64, 128};
 /**
  * @brief A benchmark evaluating `cuco::static_map::insert_or_apply` performance
  */
@@ -40,9 +42,13 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_appl
   auto const num_keys     = state.get_int64_or_default("NumInputs", defaults::N);
   auto const occupancy    = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY);
   auto const multiplicity = state.get_int64_or_default("Multiplicity", defaults::MULTIPLICITY);
+  auto const use_identity = state.get_int64_or_default("UseIdentity", 1);
 
   std::size_t const size = cuco::detail::int_div_ceil(num_keys, multiplicity) / occupancy;
 
+  cuda::std::optional<Value> identity{};
+  if (use_identity) identity = 0;
+
   thrust::device_vector<Key> keys(num_keys);
 
   key_generator gen;
@@ -57,12 +63,17 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_appl
 
   cuco::static_map map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
 
+  using Map = decltype(map);
+
+  auto const op = [] __device__(cuda::atomic_ref<Value, Map::thread_scope> lhs, const Value& rhs) {
+    lhs.fetch_add(rhs, cuda::memory_order_relaxed);
+  };
+
   state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
     map.clear_async({launch.get_stream()});
 
     timer.start();
-    map.insert_or_apply_async(
-      pairs.begin(), pairs.end(), cuco::op::reduce::sum, {launch.get_stream()});
+    map.insert_or_apply_async(pairs.begin(), pairs.end(), op, identity, {launch.get_stream()});
     timer.stop();
   });
 }
@@ -81,7 +92,8 @@ NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
   .set_name("static_map_insert_or_apply_uniform_multiplicity")
   .set_type_axes_names({"Key", "Value", "Distribution"})
   .set_max_noise(defaults::MAX_NOISE)
-  .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE);
+  .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE)
+  .add_int64_axis("UseIdentity", USE_IDENTITY);
 
 NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
                     NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
@@ -90,4 +102,5 @@ NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
   .set_name("static_set_insert_or_apply_uniform_occupancy")
   .set_type_axes_names({"Key", "Value", "Distribution"})
   .set_max_noise(defaults::MAX_NOISE)
-  .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);
\ No newline at end of file
+  .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE)
+  .add_int64_axis("UseIdentity", USE_IDENTITY);
diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index ec15a66ed..3646e8c9c 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -20,6 +20,7 @@
 
 #include <cub/block/block_reduce.cuh>
 #include <cuda/atomic>
+#include <cuda/std/optional>
 
 #include <cooperative_groups.h>
 
@@ -82,15 +83,27 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_assign(InputIt first,
  * @tparam InputIt Device accessible input iterator whose `value_type` is
  * convertible to the `value_type` of the data structure
  * @tparam Op Callable type used to peform apply operation.
+ * @tparam T Type of optional idenitity element which is convertible
+ * to `value_type` of the data structure
  * @tparam Ref Type of non-owning device ref allowing access to storage
  *
  * @param first Beginning of the sequence of input elements
  * @param n Number of input elements
  * @param op callable object to perform apply operation.
+ * @param identity_element An optional Identity element of the binary operation
  * @param ref Non-owning container device ref used to access the slot storage
  */
-template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Op, typename Ref>
-__global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op, Ref ref)
+template <int32_t CGSize,
+          int32_t BlockSize,
+          typename InputIt,
+          typename Op,
+          typename T,
+          typename Ref>
+__global__ void insert_or_apply(InputIt first,
+                                cuco::detail::index_type n,
+                                Op op,
+                                cuda::std::optional<T> identity_element,
+                                Ref ref)
 {
   auto const loop_stride = cuco::detail::grid_stride() / CGSize;
   auto idx               = cuco::detail::global_thread_id() / CGSize;
@@ -98,11 +111,11 @@ __global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op
   while (idx < n) {
     typename std::iterator_traits<InputIt>::value_type const& insert_pair = *(first + idx);
     if constexpr (CGSize == 1) {
-      ref.insert_or_apply(insert_pair, op);
+      ref.insert_or_apply(insert_pair, op, identity_element);
     } else {
       auto const tile =
         cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
-      ref.insert_or_apply(tile, insert_pair, op);
+      ref.insert_or_apply(tile, insert_pair, op, identity_element);
     }
     idx += loop_stride;
   }
diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
index 94cb194f6..80c858e26 100644
--- a/include/cuco/detail/static_map/static_map.inl
+++ b/include/cuco/detail/static_map/static_map.inl
@@ -22,6 +22,8 @@
 #include <cuco/operator.hpp>
 #include <cuco/static_map_ref.cuh>
 
+#include <cuda/std/optional>
+
 #include <cstddef>
 
 namespace cuco {
@@ -253,9 +255,13 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename Op>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
-  insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept
+  insert_or_apply(InputIt first,
+                  InputIt last,
+                  Op op,
+                  cuda::std::optional<T> identity_element,
+                  cuda_stream_ref stream) noexcept
 {
-  return this->insert_or_apply_async(first, last, op, stream);
+  return this->insert_or_apply_async(first, last, op, identity_element, stream);
   stream.synchronize();
 }
 
@@ -269,7 +275,11 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename Op>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
-  insert_or_apply_async(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept
+  insert_or_apply_async(InputIt first,
+                        InputIt last,
+                        Op op,
+                        cuda::std::optional<T> identity_element,
+                        cuda_stream_ref stream) noexcept
 {
   auto const num = cuco::detail::distance(first, last);
   if (num == 0) { return; }
@@ -278,7 +288,7 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
 
   static_map_ns::detail::insert_or_apply<cg_size, cuco::detail::default_block_size()>
     <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
-      first, num, op, ref(op::insert_or_apply));
+      first, num, op, identity_element, ref(op::insert_or_apply));
 }
 
 template <class Key,
diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index 06bbd01de..259faf2a0 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -20,6 +20,7 @@
 #include <cuco/operator.hpp>
 
 #include <cuda/atomic>
+#include <cuda/std/optional>
 #include <thrust/tuple.h>
 
 #include <cooperative_groups.h>
@@ -579,10 +580,13 @@ class operator_impl<
    * @param value The element to insert
    * @param op The callable object to perform binary operation between existing value at the slot
    *  and the element to insert.
+   * @param identity_element An optional Identity element of the binary operation
    */
 
   template <typename Value, typename Op>
-  __device__ void insert_or_apply(Value const& value, Op op)
+  __device__ void insert_or_apply(Value const& value,
+                                  Op op,
+                                  cuda::std::optional<T> identity_element = {})
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
 
@@ -592,11 +596,20 @@ class operator_impl<
 
     ref_type& ref_ = static_cast<ref_type&>(*this);
 
-    auto const val       = ref_.impl_.heterogeneous_value(value);
-    auto const key       = ref_.impl_.extract_key(val);
-    auto& probing_scheme = ref_.impl_.probing_scheme();
-    auto storage_ref     = ref_.impl_.storage_ref();
-    auto probing_iter    = probing_scheme(key, storage_ref.window_extent());
+    auto const val         = ref_.impl_.heterogeneous_value(value);
+    auto const key         = ref_.impl_.extract_key(val);
+    auto& probing_scheme   = ref_.impl_.probing_scheme();
+    auto storage_ref       = ref_.impl_.storage_ref();
+    auto probing_iter      = probing_scheme(key, storage_ref.window_extent());
+    auto const empty_value = ref_.impl_.empty_slot_sentinel().second;
+
+    // optimize first insert when sentinel payload value equals identity element
+    auto const optimize_insert = [&]() {
+      if (identity_element.has_value()) {
+        if (identity_element.value() == empty_value) return true;
+      }
+      return false;
+    }();
 
     while (true) {
       auto const window_slots = storage_ref[*probing_iter];
@@ -604,21 +617,30 @@ class operator_impl<
       for (auto& slot_content : window_slots) {
         auto const eq_res =
           ref_.impl_.predicate_.operator()<is_insert::YES>(key, slot_content.first);
+        auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
+        auto slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index;
 
         // If the key is already in the container, update the payload and return
         if (eq_res == detail::equal_result::EQUAL) {
-          auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
-          op(
-            cuda::atomic_ref<T, Scope>{
-              ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second},
-            val.second);
+          op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
           return;
         }
         if (eq_res == detail::equal_result::AVAILABLE) {
-          auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
-          if (attempt_insert_or_apply(
-                (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)) {
-            return;
+          // if the sentinel value and identity_element are same, perform op
+          // and return, no need to wait on payload
+          if (optimize_insert) {
+            if (attempt_insert_or_apply(slot_ptr, val, op)) return;
+            continue;
+          }
+          // else, attempt stable insert
+          switch (ref_.impl_.attempt_insert_stable(slot_ptr, slot_content, val)) {
+            case insert_result::CONTINUE: continue;
+            case insert_result::SUCCESS: return;
+            case insert_result::DUPLICATE: {
+              ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
+              op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
+              return;
+            }
           }
         }
       }
@@ -627,12 +649,17 @@ class operator_impl<
   }
 
   template <typename Value>
-  __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag)
+  __device__ void insert_or_apply(Value const& value,
+                                  cuco::op::reduce::sum_tag,
+                                  cuda::std::optional<T> = {})
   {
     auto& ref_ = static_cast<ref_type&>(*this);
-    ref_.insert_or_apply(value, [](cuda::atomic_ref<T, Scope> slot_ref, T const& payload) {
-      slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
-    });
+    ref_.insert_or_apply(
+      value,
+      [](cuda::atomic_ref<T, Scope> slot_ref, T const& payload) {
+        slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
+      },
+      static_cast<T>(0));
   }
 
   /**
@@ -648,12 +675,14 @@ class operator_impl<
    * @param value The element to insert
    * @param op The callable object to perform binary operation between existing value at the slot
    *  and the element to insert.
+   * @param identity_element An optional Identity element of the binary operation
    */
 
   template <typename Value, typename Op>
   __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
                                   Value const& value,
-                                  Op op)
+                                  Op op,
+                                  cuda::std::optional<T> identity_element = {})
   {
     static_assert(
       std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
@@ -661,11 +690,19 @@ class operator_impl<
 
     ref_type& ref_ = static_cast<ref_type&>(*this);
 
-    auto const val       = ref_.impl_.heterogeneous_value(value);
-    auto const key       = ref_.impl_.extract_key(val);
-    auto& probing_scheme = ref_.impl_.probing_scheme();
-    auto storage_ref     = ref_.impl_.storage_ref();
-    auto probing_iter    = probing_scheme(group, key, storage_ref.window_extent());
+    auto const val         = ref_.impl_.heterogeneous_value(value);
+    auto const key         = ref_.impl_.extract_key(val);
+    auto& probing_scheme   = ref_.impl_.probing_scheme();
+    auto storage_ref       = ref_.impl_.storage_ref();
+    auto probing_iter      = probing_scheme(group, key, storage_ref.window_extent());
+    auto const empty_value = ref_.impl_.empty_slot_sentinel().second;
+
+    auto const optimize_insert = [&]() {
+      if (identity_element.has_value()) {
+        if (identity_element.value() == empty_value) return true;
+      }
+      return false;
+    }();
 
     while (true) {
       auto const window_slots = storage_ref[*probing_iter];
@@ -682,14 +719,13 @@ class operator_impl<
         return detail::window_probing_results{res, -1};
       }();
 
+      auto* slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index;
+
       auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL);
       if (group_contains_equal) {
         auto const src_lane = __ffs(group_contains_equal) - 1;
         if (group.thread_rank() == src_lane) {
-          op(
-            cuda::atomic_ref<T, Scope>{
-              ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second},
-            val.second);
+          op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
         }
         group.sync();
         return;
@@ -698,14 +734,31 @@ class operator_impl<
       auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE);
       if (group_contains_available) {
         auto const src_lane = __ffs(group_contains_available) - 1;
-        auto const status =
-          (group.thread_rank() == src_lane)
-            ? attempt_insert_or_apply(
-                (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)
-            : false;
 
-        // Exit if inserted or assigned
-        if (group.shfl(status, src_lane)) { return; }
+        if (optimize_insert) {
+          auto const status = (group.thread_rank() == src_lane)
+                                ? attempt_insert_or_apply(slot_ptr, value, op)
+                                : false;
+          if (group.shfl(status, src_lane)) { return; }
+          continue;
+        }
+        auto const status = [&, target_idx = intra_window_index]() {
+          if (group.thread_rank() != src_lane) { return insert_result::CONTINUE; }
+          return ref_.impl_.attempt_insert_stable(slot_ptr, window_slots[target_idx], val);
+        }();
+
+        switch (group.shfl(status, src_lane)) {
+          case insert_result::SUCCESS: return;
+          case insert_result::DUPLICATE: {
+            if (group.thread_rank() == src_lane) {
+              ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
+              op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
+            }
+            group.sync();
+            return;
+          }
+          default: continue;
+        }
       } else {
         ++probing_iter;
       }
@@ -715,12 +768,17 @@ class operator_impl<
   template <typename Value>
   __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
                                   Value const& value,
-                                  cuco::op::reduce::sum_tag)
+                                  cuco::op::reduce::sum_tag,
+                                  cuda::std::optional<T> = {})
   {
     auto& ref_ = static_cast<ref_type&>(*this);
-    ref_.insert_or_apply(group, value, [](cuda::atomic_ref<T, Scope> slot_ref, T const& payload) {
-      slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
-    });
+    ref_.insert_or_apply(
+      group,
+      value,
+      [](cuda::atomic_ref<T, Scope> slot_ref, T const& payload) {
+        slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
+      },
+      static_cast<T>(0));
   }
 
  private:
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index a3f274286..d0d6cd3a5 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -29,6 +29,7 @@
 #include <cuco/utility/traits.hpp>
 
 #include <cuda/std/atomic>
+#include <cuda/std/optional>
 #include <thrust/functional.h>
 
 #if defined(CUCO_HAS_CUDA_BARRIER)
@@ -427,10 +428,15 @@ class static_map {
    * @param first Beginning of the sequence of keys
    * @param last End of the sequence of keys
    * @param op callable object to perform apply operation.
+   * @param identity_element An optional Identity element of the binary operation
    * @param stream CUDA stream used for insert
    */
   template <typename InputIt, typename Op>
-  void insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream = {}) noexcept;
+  void insert_or_apply(InputIt first,
+                       InputIt last,
+                       Op op,
+                       cuda::std::optional<T> identity_element = {},
+                       cuda_stream_ref stream                  = {}) noexcept;
 
   /**
    * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent
@@ -449,13 +455,15 @@ class static_map {
    * @param first Beginning of the sequence of keys
    * @param last End of the sequence of keys
    * @param op callable object to perform apply operation.
+   * @param identity_element An optional Identity element of the binary operation
    * @param stream CUDA stream used for insert
    */
   template <typename InputIt, typename Op>
   void insert_or_apply_async(InputIt first,
                              InputIt last,
                              Op op,
-                             cuda_stream_ref stream = {}) noexcept;
+                             cuda::std::optional<T> identity_element = {},
+                             cuda_stream_ref stream                  = {}) noexcept;
 
   /**
    * @brief Erases keys in the range `[first, last)`.
diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
index 9f717474b..451cb3ce2 100644
--- a/tests/static_map/insert_or_apply_test.cu
+++ b/tests/static_map/insert_or_apply_test.cu
@@ -19,6 +19,7 @@
 #include <cuco/static_map.cuh>
 
 #include <cuda/atomic>
+#include <cuda/std/optional>
 #include <thrust/device_vector.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -34,7 +35,10 @@
 using size_type = std::size_t;
 
 template <typename Map>
-void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys)
+void test_insert_or_apply(Map& map,
+                          size_type num_keys,
+                          size_type num_unique_keys,
+                          bool use_identity)
 {
   REQUIRE((num_keys % num_unique_keys) == 0);
 
@@ -48,12 +52,17 @@ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_key
       return cuco::pair<Key, Value>{i % num_unique_keys, 1};
     }));
 
+  cuda::std::optional<Value> identity{};
+
+  if (use_identity) identity = 0;
+
   map.insert_or_apply(
     pairs_begin,
     pairs_begin + num_keys,
     [] __device__(cuda::atomic_ref<Value, Map::thread_scope> lhs, const Value& rhs) {
       lhs.fetch_add(rhs, cuda::memory_order_relaxed);
-    });
+    },
+    identity);
 
   REQUIRE(map.size() == num_unique_keys);
 
@@ -100,17 +109,40 @@ TEMPLATE_TEST_CASE_SIG(
     cuco::linear_probing<CGSize, cuco::murmurhash3_32<Key>>,
     cuco::double_hashing<CGSize, cuco::murmurhash3_32<Key>, cuco::murmurhash3_32<Key>>>;
 
-  auto map = cuco::static_map<Key,
-                              Value,
-                              cuco::extent<size_type>,
-                              cuda::thread_scope_device,
-                              thrust::equal_to<Key>,
-                              probe,
-                              cuco::cuda_allocator<std::byte>,
-                              cuco::storage<2>>{
-    num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
-
-  test_insert_or_apply(map, num_keys, num_unique_keys);
+  using map_type = cuco::static_map<Key,
+                                    Value,
+                                    cuco::extent<size_type>,
+                                    cuda::thread_scope_device,
+                                    thrust::equal_to<Key>,
+                                    probe,
+                                    cuco::cuda_allocator<std::byte>,
+                                    cuco::storage<2>>;
+
+  // test all four case of sentienel value and identity
+  // only first case i.e sentienel = 0, identity = true will use optimized_insert code path
+  SECTION("sentienel = 0, use_identity = true")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+    test_insert_or_apply(map, num_keys, num_unique_keys, true);
+  }
+
+  SECTION("sentienel = 0, use_identity = false")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+    test_insert_or_apply(map, num_keys, num_unique_keys, false);
+  }
+
+  SECTION("sentienel = -1, use_identity = true")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
+    test_insert_or_apply(map, num_keys, num_unique_keys, true);
+  }
+
+  SECTION("sentienel = -1, use_identity = false")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
+    test_insert_or_apply(map, num_keys, num_unique_keys, false);
+  }
 }
 
 TEMPLATE_TEST_CASE_SIG(
@@ -130,5 +162,5 @@ TEMPLATE_TEST_CASE_SIG(
                               cuco::storage<2>>{
     num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
 
-  test_insert_or_apply(map, num_keys, num_keys);
+  test_insert_or_apply(map, num_keys, num_keys, true);
 }
\ No newline at end of file

From e00abec08b16bdf3ab9e409506844f35a0d01afc Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Fri, 28 Jun 2024 16:59:16 +0000
Subject: [PATCH 10/16] Revert "Add identity element"

This reverts commit 45adafc4b9abd75bce1d8ce80bc4cbcb5b2afc74, Because there is
no performance improvement in adding identity element optimization.
---
 .../static_map/insert_or_apply_bench.cu       |  21 +--
 include/cuco/detail/static_map/kernels.cuh    |  21 +--
 include/cuco/detail/static_map/static_map.inl |  18 +--
 .../cuco/detail/static_map/static_map_ref.inl | 138 +++++-------------
 include/cuco/static_map.cuh                   |  12 +-
 tests/static_map/insert_or_apply_test.cu      |  60 ++------
 6 files changed, 68 insertions(+), 202 deletions(-)

diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
index d103647a1..186b548ea 100644
--- a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
+++ b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
@@ -28,8 +28,6 @@
 using namespace cuco::benchmark;
 using namespace cuco::utility;
 
-const auto USE_IDENTITY       = std::vector<nvbench::int64_t>{0, 1};
-const auto MULTIPLICITY_RANGE = std::vector<nvbench::int64_t>{1, 2, 4, 8, 16, 32, 64, 128};
 /**
  * @brief A benchmark evaluating `cuco::static_map::insert_or_apply` performance
  */
@@ -42,13 +40,9 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_appl
   auto const num_keys     = state.get_int64_or_default("NumInputs", defaults::N);
   auto const occupancy    = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY);
   auto const multiplicity = state.get_int64_or_default("Multiplicity", defaults::MULTIPLICITY);
-  auto const use_identity = state.get_int64_or_default("UseIdentity", 1);
 
   std::size_t const size = cuco::detail::int_div_ceil(num_keys, multiplicity) / occupancy;
 
-  cuda::std::optional<Value> identity{};
-  if (use_identity) identity = 0;
-
   thrust::device_vector<Key> keys(num_keys);
 
   key_generator gen;
@@ -63,17 +57,12 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_appl
 
   cuco::static_map map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
 
-  using Map = decltype(map);
-
-  auto const op = [] __device__(cuda::atomic_ref<Value, Map::thread_scope> lhs, const Value& rhs) {
-    lhs.fetch_add(rhs, cuda::memory_order_relaxed);
-  };
-
   state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
     map.clear_async({launch.get_stream()});
 
     timer.start();
-    map.insert_or_apply_async(pairs.begin(), pairs.end(), op, identity, {launch.get_stream()});
+    map.insert_or_apply_async(
+      pairs.begin(), pairs.end(), cuco::op::reduce::sum, {launch.get_stream()});
     timer.stop();
   });
 }
@@ -92,8 +81,7 @@ NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
   .set_name("static_map_insert_or_apply_uniform_multiplicity")
   .set_type_axes_names({"Key", "Value", "Distribution"})
   .set_max_noise(defaults::MAX_NOISE)
-  .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE)
-  .add_int64_axis("UseIdentity", USE_IDENTITY);
+  .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE);
 
 NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
                     NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
@@ -102,5 +90,4 @@ NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
   .set_name("static_set_insert_or_apply_uniform_occupancy")
   .set_type_axes_names({"Key", "Value", "Distribution"})
   .set_max_noise(defaults::MAX_NOISE)
-  .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE)
-  .add_int64_axis("UseIdentity", USE_IDENTITY);
+  .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);
\ No newline at end of file
diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index 3646e8c9c..ec15a66ed 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -20,7 +20,6 @@
 
 #include <cub/block/block_reduce.cuh>
 #include <cuda/atomic>
-#include <cuda/std/optional>
 
 #include <cooperative_groups.h>
 
@@ -83,27 +82,15 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_assign(InputIt first,
  * @tparam InputIt Device accessible input iterator whose `value_type` is
  * convertible to the `value_type` of the data structure
  * @tparam Op Callable type used to peform apply operation.
- * @tparam T Type of optional idenitity element which is convertible
- * to `value_type` of the data structure
  * @tparam Ref Type of non-owning device ref allowing access to storage
  *
  * @param first Beginning of the sequence of input elements
  * @param n Number of input elements
  * @param op callable object to perform apply operation.
- * @param identity_element An optional Identity element of the binary operation
  * @param ref Non-owning container device ref used to access the slot storage
  */
-template <int32_t CGSize,
-          int32_t BlockSize,
-          typename InputIt,
-          typename Op,
-          typename T,
-          typename Ref>
-__global__ void insert_or_apply(InputIt first,
-                                cuco::detail::index_type n,
-                                Op op,
-                                cuda::std::optional<T> identity_element,
-                                Ref ref)
+template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Op, typename Ref>
+__global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op, Ref ref)
 {
   auto const loop_stride = cuco::detail::grid_stride() / CGSize;
   auto idx               = cuco::detail::global_thread_id() / CGSize;
@@ -111,11 +98,11 @@ __global__ void insert_or_apply(InputIt first,
   while (idx < n) {
     typename std::iterator_traits<InputIt>::value_type const& insert_pair = *(first + idx);
     if constexpr (CGSize == 1) {
-      ref.insert_or_apply(insert_pair, op, identity_element);
+      ref.insert_or_apply(insert_pair, op);
     } else {
       auto const tile =
         cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
-      ref.insert_or_apply(tile, insert_pair, op, identity_element);
+      ref.insert_or_apply(tile, insert_pair, op);
     }
     idx += loop_stride;
   }
diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
index 80c858e26..94cb194f6 100644
--- a/include/cuco/detail/static_map/static_map.inl
+++ b/include/cuco/detail/static_map/static_map.inl
@@ -22,8 +22,6 @@
 #include <cuco/operator.hpp>
 #include <cuco/static_map_ref.cuh>
 
-#include <cuda/std/optional>
-
 #include <cstddef>
 
 namespace cuco {
@@ -255,13 +253,9 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename Op>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
-  insert_or_apply(InputIt first,
-                  InputIt last,
-                  Op op,
-                  cuda::std::optional<T> identity_element,
-                  cuda_stream_ref stream) noexcept
+  insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept
 {
-  return this->insert_or_apply_async(first, last, op, identity_element, stream);
+  return this->insert_or_apply_async(first, last, op, stream);
   stream.synchronize();
 }
 
@@ -275,11 +269,7 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename Op>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
-  insert_or_apply_async(InputIt first,
-                        InputIt last,
-                        Op op,
-                        cuda::std::optional<T> identity_element,
-                        cuda_stream_ref stream) noexcept
+  insert_or_apply_async(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept
 {
   auto const num = cuco::detail::distance(first, last);
   if (num == 0) { return; }
@@ -288,7 +278,7 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
 
   static_map_ns::detail::insert_or_apply<cg_size, cuco::detail::default_block_size()>
     <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
-      first, num, op, identity_element, ref(op::insert_or_apply));
+      first, num, op, ref(op::insert_or_apply));
 }
 
 template <class Key,
diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index 259faf2a0..06bbd01de 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -20,7 +20,6 @@
 #include <cuco/operator.hpp>
 
 #include <cuda/atomic>
-#include <cuda/std/optional>
 #include <thrust/tuple.h>
 
 #include <cooperative_groups.h>
@@ -580,13 +579,10 @@ class operator_impl<
    * @param value The element to insert
    * @param op The callable object to perform binary operation between existing value at the slot
    *  and the element to insert.
-   * @param identity_element An optional Identity element of the binary operation
    */
 
   template <typename Value, typename Op>
-  __device__ void insert_or_apply(Value const& value,
-                                  Op op,
-                                  cuda::std::optional<T> identity_element = {})
+  __device__ void insert_or_apply(Value const& value, Op op)
   {
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
 
@@ -596,20 +592,11 @@ class operator_impl<
 
     ref_type& ref_ = static_cast<ref_type&>(*this);
 
-    auto const val         = ref_.impl_.heterogeneous_value(value);
-    auto const key         = ref_.impl_.extract_key(val);
-    auto& probing_scheme   = ref_.impl_.probing_scheme();
-    auto storage_ref       = ref_.impl_.storage_ref();
-    auto probing_iter      = probing_scheme(key, storage_ref.window_extent());
-    auto const empty_value = ref_.impl_.empty_slot_sentinel().second;
-
-    // optimize first insert when sentinel payload value equals identity element
-    auto const optimize_insert = [&]() {
-      if (identity_element.has_value()) {
-        if (identity_element.value() == empty_value) return true;
-      }
-      return false;
-    }();
+    auto const val       = ref_.impl_.heterogeneous_value(value);
+    auto const key       = ref_.impl_.extract_key(val);
+    auto& probing_scheme = ref_.impl_.probing_scheme();
+    auto storage_ref     = ref_.impl_.storage_ref();
+    auto probing_iter    = probing_scheme(key, storage_ref.window_extent());
 
     while (true) {
       auto const window_slots = storage_ref[*probing_iter];
@@ -617,30 +604,21 @@ class operator_impl<
       for (auto& slot_content : window_slots) {
         auto const eq_res =
           ref_.impl_.predicate_.operator()<is_insert::YES>(key, slot_content.first);
-        auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
-        auto slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index;
 
         // If the key is already in the container, update the payload and return
         if (eq_res == detail::equal_result::EQUAL) {
-          op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
+          auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
+          op(
+            cuda::atomic_ref<T, Scope>{
+              ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second},
+            val.second);
           return;
         }
         if (eq_res == detail::equal_result::AVAILABLE) {
-          // if the sentinel value and identity_element are same, perform op
-          // and return, no need to wait on payload
-          if (optimize_insert) {
-            if (attempt_insert_or_apply(slot_ptr, val, op)) return;
-            continue;
-          }
-          // else, attempt stable insert
-          switch (ref_.impl_.attempt_insert_stable(slot_ptr, slot_content, val)) {
-            case insert_result::CONTINUE: continue;
-            case insert_result::SUCCESS: return;
-            case insert_result::DUPLICATE: {
-              ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
-              op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
-              return;
-            }
+          auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
+          if (attempt_insert_or_apply(
+                (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)) {
+            return;
           }
         }
       }
@@ -649,17 +627,12 @@ class operator_impl<
   }
 
   template <typename Value>
-  __device__ void insert_or_apply(Value const& value,
-                                  cuco::op::reduce::sum_tag,
-                                  cuda::std::optional<T> = {})
+  __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag)
   {
     auto& ref_ = static_cast<ref_type&>(*this);
-    ref_.insert_or_apply(
-      value,
-      [](cuda::atomic_ref<T, Scope> slot_ref, T const& payload) {
-        slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
-      },
-      static_cast<T>(0));
+    ref_.insert_or_apply(value, [](cuda::atomic_ref<T, Scope> slot_ref, T const& payload) {
+      slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
+    });
   }
 
   /**
@@ -675,14 +648,12 @@ class operator_impl<
    * @param value The element to insert
    * @param op The callable object to perform binary operation between existing value at the slot
    *  and the element to insert.
-   * @param identity_element An optional Identity element of the binary operation
    */
 
   template <typename Value, typename Op>
   __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
                                   Value const& value,
-                                  Op op,
-                                  cuda::std::optional<T> identity_element = {})
+                                  Op op)
   {
     static_assert(
       std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
@@ -690,19 +661,11 @@ class operator_impl<
 
     ref_type& ref_ = static_cast<ref_type&>(*this);
 
-    auto const val         = ref_.impl_.heterogeneous_value(value);
-    auto const key         = ref_.impl_.extract_key(val);
-    auto& probing_scheme   = ref_.impl_.probing_scheme();
-    auto storage_ref       = ref_.impl_.storage_ref();
-    auto probing_iter      = probing_scheme(group, key, storage_ref.window_extent());
-    auto const empty_value = ref_.impl_.empty_slot_sentinel().second;
-
-    auto const optimize_insert = [&]() {
-      if (identity_element.has_value()) {
-        if (identity_element.value() == empty_value) return true;
-      }
-      return false;
-    }();
+    auto const val       = ref_.impl_.heterogeneous_value(value);
+    auto const key       = ref_.impl_.extract_key(val);
+    auto& probing_scheme = ref_.impl_.probing_scheme();
+    auto storage_ref     = ref_.impl_.storage_ref();
+    auto probing_iter    = probing_scheme(group, key, storage_ref.window_extent());
 
     while (true) {
       auto const window_slots = storage_ref[*probing_iter];
@@ -719,13 +682,14 @@ class operator_impl<
         return detail::window_probing_results{res, -1};
       }();
 
-      auto* slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index;
-
       auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL);
       if (group_contains_equal) {
         auto const src_lane = __ffs(group_contains_equal) - 1;
         if (group.thread_rank() == src_lane) {
-          op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
+          op(
+            cuda::atomic_ref<T, Scope>{
+              ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second},
+            val.second);
         }
         group.sync();
         return;
@@ -734,31 +698,14 @@ class operator_impl<
       auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE);
       if (group_contains_available) {
         auto const src_lane = __ffs(group_contains_available) - 1;
+        auto const status =
+          (group.thread_rank() == src_lane)
+            ? attempt_insert_or_apply(
+                (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)
+            : false;
 
-        if (optimize_insert) {
-          auto const status = (group.thread_rank() == src_lane)
-                                ? attempt_insert_or_apply(slot_ptr, value, op)
-                                : false;
-          if (group.shfl(status, src_lane)) { return; }
-          continue;
-        }
-        auto const status = [&, target_idx = intra_window_index]() {
-          if (group.thread_rank() != src_lane) { return insert_result::CONTINUE; }
-          return ref_.impl_.attempt_insert_stable(slot_ptr, window_slots[target_idx], val);
-        }();
-
-        switch (group.shfl(status, src_lane)) {
-          case insert_result::SUCCESS: return;
-          case insert_result::DUPLICATE: {
-            if (group.thread_rank() == src_lane) {
-              ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
-              op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
-            }
-            group.sync();
-            return;
-          }
-          default: continue;
-        }
+        // Exit if inserted or assigned
+        if (group.shfl(status, src_lane)) { return; }
       } else {
         ++probing_iter;
       }
@@ -768,17 +715,12 @@ class operator_impl<
   template <typename Value>
   __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
                                   Value const& value,
-                                  cuco::op::reduce::sum_tag,
-                                  cuda::std::optional<T> = {})
+                                  cuco::op::reduce::sum_tag)
   {
     auto& ref_ = static_cast<ref_type&>(*this);
-    ref_.insert_or_apply(
-      group,
-      value,
-      [](cuda::atomic_ref<T, Scope> slot_ref, T const& payload) {
-        slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
-      },
-      static_cast<T>(0));
+    ref_.insert_or_apply(group, value, [](cuda::atomic_ref<T, Scope> slot_ref, T const& payload) {
+      slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
+    });
   }
 
  private:
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index d0d6cd3a5..a3f274286 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -29,7 +29,6 @@
 #include <cuco/utility/traits.hpp>
 
 #include <cuda/std/atomic>
-#include <cuda/std/optional>
 #include <thrust/functional.h>
 
 #if defined(CUCO_HAS_CUDA_BARRIER)
@@ -428,15 +427,10 @@ class static_map {
    * @param first Beginning of the sequence of keys
    * @param last End of the sequence of keys
    * @param op callable object to perform apply operation.
-   * @param identity_element An optional Identity element of the binary operation
    * @param stream CUDA stream used for insert
    */
   template <typename InputIt, typename Op>
-  void insert_or_apply(InputIt first,
-                       InputIt last,
-                       Op op,
-                       cuda::std::optional<T> identity_element = {},
-                       cuda_stream_ref stream                  = {}) noexcept;
+  void insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream = {}) noexcept;
 
   /**
    * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent
@@ -455,15 +449,13 @@ class static_map {
    * @param first Beginning of the sequence of keys
    * @param last End of the sequence of keys
    * @param op callable object to perform apply operation.
-   * @param identity_element An optional Identity element of the binary operation
    * @param stream CUDA stream used for insert
    */
   template <typename InputIt, typename Op>
   void insert_or_apply_async(InputIt first,
                              InputIt last,
                              Op op,
-                             cuda::std::optional<T> identity_element = {},
-                             cuda_stream_ref stream                  = {}) noexcept;
+                             cuda_stream_ref stream = {}) noexcept;
 
   /**
    * @brief Erases keys in the range `[first, last)`.
diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
index 451cb3ce2..9f717474b 100644
--- a/tests/static_map/insert_or_apply_test.cu
+++ b/tests/static_map/insert_or_apply_test.cu
@@ -19,7 +19,6 @@
 #include <cuco/static_map.cuh>
 
 #include <cuda/atomic>
-#include <cuda/std/optional>
 #include <thrust/device_vector.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -35,10 +34,7 @@
 using size_type = std::size_t;
 
 template <typename Map>
-void test_insert_or_apply(Map& map,
-                          size_type num_keys,
-                          size_type num_unique_keys,
-                          bool use_identity)
+void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys)
 {
   REQUIRE((num_keys % num_unique_keys) == 0);
 
@@ -52,17 +48,12 @@ void test_insert_or_apply(Map& map,
       return cuco::pair<Key, Value>{i % num_unique_keys, 1};
     }));
 
-  cuda::std::optional<Value> identity{};
-
-  if (use_identity) identity = 0;
-
   map.insert_or_apply(
     pairs_begin,
     pairs_begin + num_keys,
     [] __device__(cuda::atomic_ref<Value, Map::thread_scope> lhs, const Value& rhs) {
       lhs.fetch_add(rhs, cuda::memory_order_relaxed);
-    },
-    identity);
+    });
 
   REQUIRE(map.size() == num_unique_keys);
 
@@ -109,40 +100,17 @@ TEMPLATE_TEST_CASE_SIG(
     cuco::linear_probing<CGSize, cuco::murmurhash3_32<Key>>,
     cuco::double_hashing<CGSize, cuco::murmurhash3_32<Key>, cuco::murmurhash3_32<Key>>>;
 
-  using map_type = cuco::static_map<Key,
-                                    Value,
-                                    cuco::extent<size_type>,
-                                    cuda::thread_scope_device,
-                                    thrust::equal_to<Key>,
-                                    probe,
-                                    cuco::cuda_allocator<std::byte>,
-                                    cuco::storage<2>>;
-
-  // test all four case of sentienel value and identity
-  // only first case i.e sentienel = 0, identity = true will use optimized_insert code path
-  SECTION("sentienel = 0, use_identity = true")
-  {
-    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
-    test_insert_or_apply(map, num_keys, num_unique_keys, true);
-  }
-
-  SECTION("sentienel = 0, use_identity = false")
-  {
-    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
-    test_insert_or_apply(map, num_keys, num_unique_keys, false);
-  }
-
-  SECTION("sentienel = -1, use_identity = true")
-  {
-    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
-    test_insert_or_apply(map, num_keys, num_unique_keys, true);
-  }
-
-  SECTION("sentienel = -1, use_identity = false")
-  {
-    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
-    test_insert_or_apply(map, num_keys, num_unique_keys, false);
-  }
+  auto map = cuco::static_map<Key,
+                              Value,
+                              cuco::extent<size_type>,
+                              cuda::thread_scope_device,
+                              thrust::equal_to<Key>,
+                              probe,
+                              cuco::cuda_allocator<std::byte>,
+                              cuco::storage<2>>{
+    num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+
+  test_insert_or_apply(map, num_keys, num_unique_keys);
 }
 
 TEMPLATE_TEST_CASE_SIG(
@@ -162,5 +130,5 @@ TEMPLATE_TEST_CASE_SIG(
                               cuco::storage<2>>{
     num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
 
-  test_insert_or_apply(map, num_keys, num_keys, true);
+  test_insert_or_apply(map, num_keys, num_keys);
 }
\ No newline at end of file

From 5a9a31ca6265c774bc658ea0070e190b10b401e3 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Fri, 28 Jun 2024 17:33:41 +0000
Subject: [PATCH 11/16] use insert_stable and update tests

---
 .../cuco/detail/static_map/static_map_ref.inl | 74 +++++++++++--------
 tests/static_map/insert_or_apply_test.cu      | 33 ++++++---
 2 files changed, 64 insertions(+), 43 deletions(-)

diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index 06bbd01de..2212f5879 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -592,11 +592,12 @@ class operator_impl<
 
     ref_type& ref_ = static_cast<ref_type&>(*this);
 
-    auto const val       = ref_.impl_.heterogeneous_value(value);
-    auto const key       = ref_.impl_.extract_key(val);
-    auto& probing_scheme = ref_.impl_.probing_scheme();
-    auto storage_ref     = ref_.impl_.storage_ref();
-    auto probing_iter    = probing_scheme(key, storage_ref.window_extent());
+    auto const val         = ref_.impl_.heterogeneous_value(value);
+    auto const key         = ref_.impl_.extract_key(val);
+    auto& probing_scheme   = ref_.impl_.probing_scheme();
+    auto storage_ref       = ref_.impl_.storage_ref();
+    auto probing_iter      = probing_scheme(key, storage_ref.window_extent());
+    auto const empty_value = ref_.impl_.empty_slot_sentinel().second;
 
     while (true) {
       auto const window_slots = storage_ref[*probing_iter];
@@ -604,21 +605,23 @@ class operator_impl<
       for (auto& slot_content : window_slots) {
         auto const eq_res =
           ref_.impl_.predicate_.operator()<is_insert::YES>(key, slot_content.first);
+        auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
+        auto slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index;
 
         // If the key is already in the container, update the payload and return
         if (eq_res == detail::equal_result::EQUAL) {
-          auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
-          op(
-            cuda::atomic_ref<T, Scope>{
-              ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second},
-            val.second);
+          op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
           return;
         }
         if (eq_res == detail::equal_result::AVAILABLE) {
-          auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
-          if (attempt_insert_or_apply(
-                (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)) {
-            return;
+          switch (ref_.impl_.attempt_insert_stable(slot_ptr, slot_content, val)) {
+            case insert_result::SUCCESS: return;
+            case insert_result::DUPLICATE: {
+              ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
+              op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
+              return;
+            }
+            default: continue;
           }
         }
       }
@@ -661,11 +664,12 @@ class operator_impl<
 
     ref_type& ref_ = static_cast<ref_type&>(*this);
 
-    auto const val       = ref_.impl_.heterogeneous_value(value);
-    auto const key       = ref_.impl_.extract_key(val);
-    auto& probing_scheme = ref_.impl_.probing_scheme();
-    auto storage_ref     = ref_.impl_.storage_ref();
-    auto probing_iter    = probing_scheme(group, key, storage_ref.window_extent());
+    auto const val         = ref_.impl_.heterogeneous_value(value);
+    auto const key         = ref_.impl_.extract_key(val);
+    auto& probing_scheme   = ref_.impl_.probing_scheme();
+    auto storage_ref       = ref_.impl_.storage_ref();
+    auto probing_iter      = probing_scheme(group, key, storage_ref.window_extent());
+    auto const empty_value = ref_.impl_.empty_slot_sentinel().second;
 
     while (true) {
       auto const window_slots = storage_ref[*probing_iter];
@@ -682,30 +686,36 @@ class operator_impl<
         return detail::window_probing_results{res, -1};
       }();
 
+      auto* slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index;
+
       auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL);
       if (group_contains_equal) {
         auto const src_lane = __ffs(group_contains_equal) - 1;
         if (group.thread_rank() == src_lane) {
-          op(
-            cuda::atomic_ref<T, Scope>{
-              ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second},
-            val.second);
+          op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
         }
-        group.sync();
         return;
       }
 
       auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE);
       if (group_contains_available) {
         auto const src_lane = __ffs(group_contains_available) - 1;
-        auto const status =
-          (group.thread_rank() == src_lane)
-            ? attempt_insert_or_apply(
-                (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)
-            : false;
-
-        // Exit if inserted or assigned
-        if (group.shfl(status, src_lane)) { return; }
+        auto const status   = [&, target_idx = intra_window_index]() {
+          if (group.thread_rank() != src_lane) { return insert_result::CONTINUE; }
+          return ref_.impl_.attempt_insert_stable(slot_ptr, window_slots[target_idx], val);
+        }();
+
+        switch (group.shfl(status, src_lane)) {
+          case insert_result::SUCCESS: return;
+          case insert_result::DUPLICATE: {
+            if (group.thread_rank() == src_lane) {
+              ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
+              op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
+            }
+            return;
+          }
+          default: continue;
+        }
       } else {
         ++probing_iter;
       }
diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
index 9f717474b..07a05214b 100644
--- a/tests/static_map/insert_or_apply_test.cu
+++ b/tests/static_map/insert_or_apply_test.cu
@@ -100,17 +100,28 @@ TEMPLATE_TEST_CASE_SIG(
     cuco::linear_probing<CGSize, cuco::murmurhash3_32<Key>>,
     cuco::double_hashing<CGSize, cuco::murmurhash3_32<Key>, cuco::murmurhash3_32<Key>>>;
 
-  auto map = cuco::static_map<Key,
-                              Value,
-                              cuco::extent<size_type>,
-                              cuda::thread_scope_device,
-                              thrust::equal_to<Key>,
-                              probe,
-                              cuco::cuda_allocator<std::byte>,
-                              cuco::storage<2>>{
-    num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
-
-  test_insert_or_apply(map, num_keys, num_unique_keys);
+  using map_type = cuco::static_map<Key,
+                                    Value,
+                                    cuco::extent<size_type>,
+                                    cuda::thread_scope_device,
+                                    thrust::equal_to<Key>,
+                                    probe,
+                                    cuco::cuda_allocator<std::byte>,
+                                    cuco::storage<2>>;
+
+  SECTION("Sentinel equals to identity")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+
+    test_insert_or_apply(map, num_keys, num_unique_keys);
+  }
+
+  SECTION("Sentinel not equals to identity")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
+
+    test_insert_or_apply(map, num_keys, num_unique_keys);
+  }
 }
 
 TEMPLATE_TEST_CASE_SIG(

From a936ab2a50f927205ca370fa67d935b00e49b490 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Mon, 1 Jul 2024 21:22:51 +0000
Subject: [PATCH 12/16] minor improvements

---
 .../cuco/detail/static_map/static_map_ref.inl   | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index 2212f5879..d68abeadb 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -20,6 +20,7 @@
 #include <cuco/operator.hpp>
 
 #include <cuda/atomic>
+#include <cuda/std/functional>
 #include <thrust/tuple.h>
 
 #include <cooperative_groups.h>
@@ -587,7 +588,7 @@ class operator_impl<
     static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
 
     static_assert(
-      std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
+      cuda::std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
       "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref<T, Scope>, T)`");
 
     ref_type& ref_ = static_cast<ref_type&>(*this);
@@ -617,7 +618,9 @@ class operator_impl<
           switch (ref_.impl_.attempt_insert_stable(slot_ptr, slot_content, val)) {
             case insert_result::SUCCESS: return;
             case insert_result::DUPLICATE: {
-              ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
+              if constexpr (sizeof(value_type) > 8) {
+                ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
+              }
               op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
               return;
             }
@@ -633,8 +636,8 @@ class operator_impl<
   __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag)
   {
     auto& ref_ = static_cast<ref_type&>(*this);
-    ref_.insert_or_apply(value, [](cuda::atomic_ref<T, Scope> slot_ref, T const& payload) {
-      slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
+    ref_.insert_or_apply(value, [](cuda::atomic_ref<T, Scope> payload_ref, T const& payload) {
+      payload_ref.fetch_add(payload, cuda::memory_order_relaxed);
     });
   }
 
@@ -659,7 +662,7 @@ class operator_impl<
                                   Op op)
   {
     static_assert(
-      std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
+      cuda::std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
       "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref<T, Scope>, T)`");
 
     ref_type& ref_ = static_cast<ref_type&>(*this);
@@ -728,8 +731,8 @@ class operator_impl<
                                   cuco::op::reduce::sum_tag)
   {
     auto& ref_ = static_cast<ref_type&>(*this);
-    ref_.insert_or_apply(group, value, [](cuda::atomic_ref<T, Scope> slot_ref, T const& payload) {
-      slot_ref.fetch_add(payload, cuda::memory_order_relaxed);
+    ref_.insert_or_apply(group, value, [](cuda::atomic_ref<T, Scope> payload_ref, T const& payload) {
+      payload_ref.fetch_add(payload, cuda::memory_order_relaxed);
     });
   }
 

From 2eadddb615142d9da2fecd65a4c1c16430b250b9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 1 Jul 2024 21:24:35 +0000
Subject: [PATCH 13/16] [pre-commit.ci] auto code formatting

---
 include/cuco/detail/static_map/static_map_ref.inl | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index d68abeadb..62f12e4e7 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -731,9 +731,10 @@ class operator_impl<
                                   cuco::op::reduce::sum_tag)
   {
     auto& ref_ = static_cast<ref_type&>(*this);
-    ref_.insert_or_apply(group, value, [](cuda::atomic_ref<T, Scope> payload_ref, T const& payload) {
-      payload_ref.fetch_add(payload, cuda::memory_order_relaxed);
-    });
+    ref_.insert_or_apply(
+      group, value, [](cuda::atomic_ref<T, Scope> payload_ref, T const& payload) {
+        payload_ref.fetch_add(payload, cuda::memory_order_relaxed);
+      });
   }
 
  private:

From 0568e65689d028111a2f987d43dd48c3a82b16f2 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Tue, 2 Jul 2024 16:33:06 +0000
Subject: [PATCH 14/16] more minor fixes

---
 .../hash_table/static_map/insert_or_apply_bench.cu |  2 +-
 include/cuco/detail/static_map/kernels.cuh         |  8 ++++----
 include/cuco/detail/static_map/static_map.inl      |  2 +-
 include/cuco/static_map.cuh                        | 14 +++++++-------
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
index 186b548ea..6bbb26421 100644
--- a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
+++ b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
@@ -87,7 +87,7 @@ NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
                     NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
                                       defaults::VALUE_TYPE_RANGE,
                                       nvbench::type_list<distribution::uniform>))
-  .set_name("static_set_insert_or_apply_uniform_occupancy")
+  .set_name("static_map_insert_or_apply_uniform_occupancy")
   .set_type_axes_names({"Key", "Value", "Distribution"})
   .set_max_noise(defaults::MAX_NOISE)
   .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);
\ No newline at end of file
diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index ec15a66ed..f47c86cdf 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -74,19 +74,19 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_assign(InputIt first,
  * on the existing value at slot and the element to insert. If the key does not exist, inserts the
  * pair as if by insert.
  *
- * @note Callable object to perform binary operation should be able to invoke as Op(cuda::atomic<T,
- * Scope>, T>)
+ * @note Callable object to perform binary operation should be able to invoke as
+ * Op(cuda::atomic_ref<T,Scope>, T>)
  *
  * @tparam CGSize Number of threads in each CG
  * @tparam BlockSize Number of threads in each block
  * @tparam InputIt Device accessible input iterator whose `value_type` is
  * convertible to the `value_type` of the data structure
- * @tparam Op Callable type used to peform apply operation.
+ * @tparam Op Callable type used to peform `apply` operation.
  * @tparam Ref Type of non-owning device ref allowing access to storage
  *
  * @param first Beginning of the sequence of input elements
  * @param n Number of input elements
- * @param op callable object to perform apply operation.
+ * @param op Callable object to perform apply operation.
  * @param ref Non-owning container device ref used to access the slot storage
  */
 template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Op, typename Ref>
diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
index 94cb194f6..807200228 100644
--- a/include/cuco/detail/static_map/static_map.inl
+++ b/include/cuco/detail/static_map/static_map.inl
@@ -253,7 +253,7 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename Op>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
-  insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept
+  insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream)
 {
   return this->insert_or_apply_async(first, last, op, stream);
   stream.synchronize();
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index a3f274286..5256dea51 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -417,20 +417,20 @@ class static_map {
    * @note This function synchronizes the given stream. For asynchronous execution use
    * `insert_or_apply_async`.
    * @note Callable object to perform binary operation should be able to invoke as
-   *  Op(cuda::atomic<T, Scope>, T>)
+   *  Op(cuda::atomic_ref<T, Scope>, T>)
    *
    * @tparam InputIt Device accessible random access input iterator where
    * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
    * static_map<K, V>::value_type></tt> is `true`
-   * @tparam Op Callable type used to peform apply operation.
+   * @tparam Op Callable type used to peform `apply` operation.
    *
    * @param first Beginning of the sequence of keys
    * @param last End of the sequence of keys
-   * @param op callable object to perform apply operation.
+   * @param op Callable object to perform apply operation.
    * @param stream CUDA stream used for insert
    */
   template <typename InputIt, typename Op>
-  void insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream = {}) noexcept;
+  void insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream = {});
 
   /**
    * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent
@@ -439,16 +439,16 @@ class static_map {
    * inserts the pair as if by insert.
    *
    * @note Callable object to perform binary operation should be able to invoke as
-   *  Op(cuda::atomic<T, Scope>, T>)
+   *  Op(cuda::atomic_ref<T, Scope>, T>)
    *
    * @tparam InputIt Device accessible random access input iterator where
    * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
    * static_map<K, V>::value_type></tt> is `true`
-   * @tparam Op Callable type used to peform apply operation.
+   * @tparam Op Callable type used to peform `apply` operation.
    *
    * @param first Beginning of the sequence of keys
    * @param last End of the sequence of keys
-   * @param op callable object to perform apply operation.
+   * @param op Callable object to perform apply operation.
    * @param stream CUDA stream used for insert
    */
   template <typename InputIt, typename Op>

From 4ae274d93bd320400424294cb5639571a306590b Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Wed, 3 Jul 2024 18:10:02 +0000
Subject: [PATCH 15/16] more minor cleanup

---
 .../cuco/detail/static_map/static_map_ref.inl | 43 ++-----------------
 1 file changed, 3 insertions(+), 40 deletions(-)

diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index 62f12e4e7..0e59aa502 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -712,7 +712,9 @@ class operator_impl<
           case insert_result::SUCCESS: return;
           case insert_result::DUPLICATE: {
             if (group.thread_rank() == src_lane) {
-              ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
+              if constexpr (sizeof(value_type) > 8) {
+                ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
+              }
               op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
             }
             return;
@@ -736,45 +738,6 @@ class operator_impl<
         payload_ref.fetch_add(payload, cuda::memory_order_relaxed);
       });
   }
-
- private:
-  /**
-   * @brief Attempts to insert an element into a slot or update the matching payload by applying the
-   * binary operation on the payload and new value.
-   *
-   * @tparam Value Input type which is implicitly convertible to 'value_type'
-   * @tparam Op Callable type which is used as apply operation and called be
-   *   called with arguments as Op(cuda::atomic_ref<T, Scope>, T)
-
-   * @param slot value_type pointer to the slot to insert
-   * @param value The element to insert
-   * @param op The callable object to perform binary operation between existing value at the slot
-   *  and element to insert.
-   *
-   * @return Returns `true` if the given `value` is inserted or `value` has a match in the map.
-   */
-  template <typename Value, typename Op>
-  __device__ constexpr bool attempt_insert_or_apply(value_type* slot,
-                                                    Value const& value,
-                                                    Op op) noexcept
-  {
-    ref_type& ref_          = static_cast<ref_type&>(*this);
-    auto const expected_key = ref_.impl_.empty_slot_sentinel().first;
-
-    auto old_key =
-      ref_.impl_.compare_and_swap(&slot->first, expected_key, static_cast<key_type>(value.first));
-    auto* old_key_ptr = reinterpret_cast<key_type*>(&old_key);
-
-    // if key success or key was already present in the map
-    if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key) or
-        (ref_.impl_.predicate().equal_to(value.first, *old_key_ptr) ==
-         detail::equal_result::EQUAL)) {
-      // Update payload
-      op(cuda::atomic_ref<T, Scope>{slot->second}, value.second);
-      return true;
-    }
-    return false;
-  }
 };
 
 template <typename Key,

From f5adcad16d48468524c940e47ca763625cb826c2 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav Singanaboina <ssinganaboin@nvidia.com>
Date: Wed, 3 Jul 2024 22:04:46 +0000
Subject: [PATCH 16/16] replace cuda_stream_ref with cuda::stream_ref

---
 include/cuco/detail/static_map/static_map.inl | 8 ++++----
 include/cuco/static_map.cuh                   | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
index 06ba8441d..a26092196 100644
--- a/include/cuco/detail/static_map/static_map.inl
+++ b/include/cuco/detail/static_map/static_map.inl
@@ -257,10 +257,10 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename Op>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
-  insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream)
+  insert_or_apply(InputIt first, InputIt last, Op op, cuda::stream_ref stream)
 {
   return this->insert_or_apply_async(first, last, op, stream);
-  stream.synchronize();
+  stream.wait();
 }
 
 template <class Key,
@@ -273,7 +273,7 @@ template <class Key,
           class Storage>
 template <typename InputIt, typename Op>
 void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
-  insert_or_apply_async(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept
+  insert_or_apply_async(InputIt first, InputIt last, Op op, cuda::stream_ref stream) noexcept
 {
   auto const num = cuco::detail::distance(first, last);
   if (num == 0) { return; }
@@ -281,7 +281,7 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Stora
   auto const grid_size = cuco::detail::grid_size(num, cg_size);
 
   static_map_ns::detail::insert_or_apply<cg_size, cuco::detail::default_block_size()>
-    <<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+    <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
       first, num, op, ref(op::insert_or_apply));
 }
 
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index b34c1481d..f84759b0f 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -430,7 +430,7 @@ class static_map {
    * @param stream CUDA stream used for insert
    */
   template <typename InputIt, typename Op>
-  void insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream = {});
+  void insert_or_apply(InputIt first, InputIt last, Op op, cuda::stream_ref stream = {});
 
   /**
    * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent
@@ -455,7 +455,7 @@ class static_map {
   void insert_or_apply_async(InputIt first,
                              InputIt last,
                              Op op,
-                             cuda_stream_ref stream = {}) noexcept;
+                             cuda::stream_ref stream = {}) noexcept;
 
   /**
    * @brief Erases keys in the range `[first, last)`.