diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 128052b98..4651d5646 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -62,7 +62,8 @@ ConfigureBench(STATIC_MAP_BENCH
   hash_table/static_map/insert_bench.cu
   hash_table/static_map/find_bench.cu
   hash_table/static_map/contains_bench.cu
-  hash_table/static_map/erase_bench.cu)
+  hash_table/static_map/erase_bench.cu
+  hash_table/static_map/insert_or_apply_bench.cu)
 
 ###################################################################################################
 # - static_multiset benchmarks --------------------------------------------------------------------
diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
new file mode 100644
index 000000000..6bbb26421
--- /dev/null
+++ b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+#include
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::static_map::insert_or_apply` performance
+ */
+template <typename Key, typename Value, typename Dist>
+std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_apply(
+  nvbench::state& state, nvbench::type_list<Key, Value, Dist>)
+{
+  using pair_type = cuco::pair<Key, Value>;
+
+  auto const num_keys     = state.get_int64_or_default("NumInputs", defaults::N);
+  auto const occupancy    = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY);
+  auto const multiplicity = state.get_int64_or_default("Multiplicity", defaults::MULTIPLICITY);
+
+  std::size_t const size = cuco::detail::int_div_ceil(num_keys, multiplicity) / occupancy;
+
+  thrust::device_vector<Key> keys(num_keys);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  thrust::device_vector<pair_type> pairs(num_keys);
+  thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) {
+    return pair_type(key, static_cast<Value>(key));
+  });
+
+  state.add_element_count(num_keys);
+
+  cuco::static_map<Key, Value> map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+
+  state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
+    map.clear_async({launch.get_stream()});
+
+    timer.start();
+    map.insert_or_apply_async(
+      pairs.begin(), pairs.end(), cuco::op::reduce::sum, {launch.get_stream()});
+    timer.stop();
+  });
+}
+
+template <typename Key, typename Value, typename Dist>
+std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_insert_or_apply(
+  nvbench::state& state, nvbench::type_list<Key, Value, Dist>)
+{
+  state.skip("Key should be the same type as Value.");
+}
+
+NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      defaults::VALUE_TYPE_RANGE,
+                                      nvbench::type_list<distribution::uniform>))
+  .set_name("static_map_insert_or_apply_uniform_multiplicity")
+  .set_type_axes_names({"Key", "Value", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE);
+
+NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      defaults::VALUE_TYPE_RANGE,
+                                      nvbench::type_list<distribution::uniform>))
+  .set_name("static_map_insert_or_apply_uniform_occupancy")
+  .set_type_axes_names({"Key", "Value", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);
\ No newline at end of file
diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index 1052bd301..f47c86cdf 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -28,6 +28,7 @@ namespace cuco::static_map_ns::detail {
 
 CUCO_SUPPRESS_KERNEL_WARNINGS
 
+// TODO: use insert_or_assign internally
 /**
  * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to
  * `k` already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`.
@@ -67,4 +68,44 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_assign(InputIt first,
   }
 }
 
-}  // namespace cuco::static_map_ns::detail
+/**
+ * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to
+ * `k` already exists in the container, the binary callable `op` is applied to the existing value at
+ * the slot and the value to be inserted. If the key does not exist, inserts the pair as if by
+ * insert.
+ *
+ * @note The callable object used to perform the binary operation must be invocable as
+ * `Op(cuda::atomic_ref<T, Scope>, T)`.
+ *
+ * @tparam CGSize Number of threads in each CG
+ * @tparam BlockSize Number of threads in each block
+ * @tparam InputIt Device accessible input iterator whose `value_type` is
+ * convertible to the `value_type` of the data structure
+ * @tparam Op Callable type used to perform the `apply` operation
+ * @tparam Ref Type of non-owning device ref allowing access to storage
+ *
+ * @param first Beginning of the sequence of input elements
+ * @param n Number of input elements
+ * @param op Callable object used to perform the `apply` operation
+ * @param ref Non-owning container device ref used to access the slot storage
+ */
+template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Op, typename Ref>
+__global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op, Ref ref)
+{
+  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
+  auto idx               = cuco::detail::global_thread_id() / CGSize;
+
+  while (idx < n) {
+    typename std::iterator_traits<InputIt>::value_type const& insert_pair = *(first + idx);
+    if constexpr (CGSize == 1) {
+      ref.insert_or_apply(insert_pair, op);
+    } else {
+      auto const tile =
+        cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
+      ref.insert_or_apply(tile, insert_pair, op);
+    }
+    idx += loop_stride;
+  }
+}
+
+}  // namespace cuco::static_map_ns::detail
\ No newline at end of file
diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
index 9ca129038..a26092196 100644
--- a/include/cuco/detail/static_map/static_map.inl
+++ b/include/cuco/detail/static_map/static_map.inl
@@ -247,6 +247,44 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
 
+template <class Key, class T, class Extent, cuda::thread_scope Scope, class KeyEqual, class ProbingScheme, class Allocator, class Storage>
+template <typename InputIt, typename Op>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  insert_or_apply(InputIt first, InputIt last, Op op, cuda::stream_ref stream)
+{
+  this->insert_or_apply_async(first, last, op, stream);
+  stream.wait();
+}
+
+template <class Key, class T, class Extent, cuda::thread_scope Scope, class KeyEqual, class ProbingScheme, class Allocator, class Storage>
+template <typename InputIt, typename Op>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  insert_or_apply_async(InputIt first, InputIt last, Op op, cuda::stream_ref stream) noexcept
+{
+  auto const num = cuco::detail::distance(first, last);
+  if (num == 0) { return; }
+
+  auto const grid_size = cuco::detail::grid_size(num, cg_size);
+
+  static_map_ns::detail::insert_or_apply<cg_size, cuco::detail::default_block_size()>
+    <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
+      first, num, op, ref(op::insert_or_apply));
+}
+
 template <class Key,
diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -393,9 +395,6 @@ class operator_impl<
   static constexpr auto cg_size     = base_type::cg_size;
   static constexpr auto window_size = base_type::window_size;
 
-  static_assert(sizeof(T) == 4 or sizeof(T) == 8,
-                "sizeof(mapped_type) must be either 4 bytes or 8 bytes.");
-
  public:
   /**
    * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v`
@@ -549,6 +548,198 @@ class operator_impl<
   }
 };
 
+// TODO: use insert_or_apply internally
+template <typename Key,
+          typename T,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename StorageRef,
+          typename... Operators>
+class operator_impl<
+  op::insert_or_apply_tag,
+  static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>> {
+  using base_type  = static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef>;
+  using ref_type   = static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>;
+  using key_type   = typename base_type::key_type;
+  using value_type = typename base_type::value_type;
+
+  static constexpr auto cg_size     = base_type::cg_size;
+  static constexpr auto window_size = base_type::window_size;
+
+ public:
+  /**
+   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies
+   * the `Op` binary function to the mapped_type corresponding to the key `k` and the value `v`.
+   *
+   * @tparam Value Input type which is implicitly convertible to `value_type`
+   * @tparam Op Callable type which is used as the apply operation and can be
+   * called with arguments as `Op(cuda::atomic_ref<T, Scope>, T)`. `Op` strictly must
+   * have this signature to atomically apply the operation.
+   *
+   * @param value The element to insert
+   * @param op The callable object that performs the binary operation between the existing value
+   * at the slot and the element to insert
+   */
+
+  template <typename Value, typename Op>
+  __device__ void insert_or_apply(Value const& value, Op op)
+  {
+    static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
+
+    static_assert(
+      cuda::std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
+      "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref<T, Scope>, T)`");
+
+    ref_type& ref_ = static_cast<ref_type&>(*this);
+
+    auto const val         = ref_.impl_.heterogeneous_value(value);
+    auto const key         = ref_.impl_.extract_key(val);
+    auto& probing_scheme   = ref_.impl_.probing_scheme();
+    auto storage_ref       = ref_.impl_.storage_ref();
+    auto probing_iter      = probing_scheme(key, storage_ref.window_extent());
+    auto const empty_value = ref_.impl_.empty_slot_sentinel().second;
+
+    while (true) {
+      auto const window_slots = storage_ref[*probing_iter];
+
+      for (auto& slot_content : window_slots) {
+        auto const eq_res             = ref_.impl_.predicate_.operator()(key, slot_content.first);
+        auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
+        auto slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index;
+
+        // If the key is already in the container, update the payload and return
+        if (eq_res == detail::equal_result::EQUAL) {
+          op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
+          return;
+        }
+        if (eq_res == detail::equal_result::AVAILABLE) {
+          switch (ref_.impl_.attempt_insert_stable(slot_ptr, slot_content, val)) {
+            case insert_result::SUCCESS: return;
+            case insert_result::DUPLICATE: {
+              if constexpr (sizeof(value_type) > 8) {
+                ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
+              }
+              op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
+              return;
+            }
+            default: continue;
+          }
+        }
+      }
+      ++probing_iter;
+    }
+  }
+
+  template <typename Value>
+  __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag)
+  {
+    auto& ref_ = static_cast<ref_type&>(*this);
+    ref_.insert_or_apply(value, [](cuda::atomic_ref<T, Scope> payload_ref, T const& payload) {
+      payload_ref.fetch_add(payload, cuda::memory_order_relaxed);
+    });
+  }
+
+  /**
+   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies
+   * the `Op` binary function to the mapped_type corresponding to the key `k` and the value `v`.
+   *
+   * @tparam Value Input type which is implicitly convertible to `value_type`
+   * @tparam Op Callable type which is used as the apply operation and can be
+   * called with arguments as `Op(cuda::atomic_ref<T, Scope>, T)`. `Op` strictly must
+   * have this signature to atomically apply the operation.
+   *
+   * @param group The Cooperative Group used to perform the group insert
+   * @param value The element to insert
+   * @param op The callable object that performs the binary operation between the existing value
+   * at the slot and the element to insert
+   */
+
+  template <typename Value, typename Op>
+  __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+                                  Value const& value,
+                                  Op op)
+  {
+    static_assert(
+      cuda::std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
+      "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref<T, Scope>, T)`");
+
+    ref_type& ref_ = static_cast<ref_type&>(*this);
+
+    auto const val         = ref_.impl_.heterogeneous_value(value);
+    auto const key         = ref_.impl_.extract_key(val);
+    auto& probing_scheme   = ref_.impl_.probing_scheme();
+    auto storage_ref       = ref_.impl_.storage_ref();
+    auto probing_iter      = probing_scheme(group, key, storage_ref.window_extent());
+    auto const empty_value = ref_.impl_.empty_slot_sentinel().second;
+
+    while (true) {
+      auto const window_slots = storage_ref[*probing_iter];
+
+      auto const [state, intra_window_index] = [&]() {
+        auto res = detail::equal_result::UNEQUAL;
+        for (auto i = 0; i < window_size; ++i) {
+          res = ref_.impl_.predicate_.operator()(key, window_slots[i].first);
+          if (res != detail::equal_result::UNEQUAL) {
+            return detail::window_probing_results{res, i};
+          }
+        }
+        // returns dummy index `-1` for UNEQUAL
+        return detail::window_probing_results{res, -1};
+      }();
+
+      auto* slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index;
+
+      auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL);
+      if (group_contains_equal) {
+        auto const src_lane = __ffs(group_contains_equal) - 1;
+        if (group.thread_rank() == src_lane) {
+          op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
+        }
+        return;
+      }
+
+      auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE);
+      if (group_contains_available) {
+        auto const src_lane = __ffs(group_contains_available) - 1;
+        auto const status   = [&, target_idx = intra_window_index]() {
+          if (group.thread_rank() != src_lane) { return insert_result::CONTINUE; }
+          return ref_.impl_.attempt_insert_stable(slot_ptr, window_slots[target_idx], val);
+        }();
+
+        switch (group.shfl(status, src_lane)) {
+          case insert_result::SUCCESS: return;
+          case insert_result::DUPLICATE: {
+            if (group.thread_rank() == src_lane) {
+              if constexpr (sizeof(value_type) > 8) {
+                ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
+              }
+              op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
+            }
+            return;
+          }
+          default: continue;
+        }
+      } else {
+        ++probing_iter;
+      }
+    }
+  }
+
+  template <typename Value>
+  __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+                                  Value const& value,
+                                  cuco::op::reduce::sum_tag)
+  {
+    auto& ref_ = static_cast<ref_type&>(*this);
+    ref_.insert_or_apply(
+      group, value, [](cuda::atomic_ref<T, Scope> payload_ref, T const& payload) {
+        payload_ref.fetch_add(payload, cuda::memory_order_relaxed);
+      });
+  }
+};
+
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
   template <typename InputIt>
   void insert_or_assign_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept;
 
+  /**
+   * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent
+   * to `k` already exists in the container, the binary callable `op` is applied to the existing
+   * value at the slot and the value to be inserted. If the key does not exist, inserts the pair
+   * as if by insert.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `insert_or_apply_async`.
+   *
+   * @note The callable object `op` must be invocable as `Op(cuda::atomic_ref<T, Scope>, T)`.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * `std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * static_map<K, V>::value_type>` is `true`
+   * @tparam Op Callable type used to perform the `apply` operation
+   *
+   * @param first Beginning of the sequence of elements
+   * @param last End of the sequence of elements
+   * @param op Callable object used to perform the `apply` operation
+   * @param stream CUDA stream used for insert
+   */
+  template <typename InputIt, typename Op>
+  void insert_or_apply(InputIt first, InputIt last, Op op, cuda::stream_ref stream = {});
+
+  /**
+   * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent
+   * to `k` already exists in the container, the binary callable `op` is applied to the existing
+   * value at the slot and the value to be inserted. If the key does not exist, inserts the pair
+   * as if by insert.
+   *
+   * @note The callable object `op` must be invocable as `Op(cuda::atomic_ref<T, Scope>, T)`.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * `std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * static_map<K, V>::value_type>` is `true`
+   * @tparam Op Callable type used to perform the `apply` operation
+   *
+   * @param first Beginning of the sequence of elements
+   * @param last End of the sequence of elements
+   * @param op Callable object used to perform the `apply` operation
+   * @param stream CUDA stream used for insert
+   */
+  template <typename InputIt, typename Op>
+  void insert_or_apply_async(InputIt first,
+                             InputIt last,
+                             Op op,
+                             cuda::stream_ref stream = {}) noexcept;
+
   /**
    * @brief Erases keys in the range `[first, last)`.
    *
diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh
index 5c96af776..4006d84ae 100644
--- a/include/cuco/static_map_ref.cuh
+++ b/include/cuco/static_map_ref.cuh
@@ -74,7 +74,8 @@ class static_map_ref
   using impl_type = detail::
     open_addressing_ref_impl;
 
-  static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes.");
+  static_assert(sizeof(T) == 4 or sizeof(T) == 8,
+                "sizeof(mapped_type) must be either 4 bytes or 8 bytes.");
 
   static_assert(
     cuco::is_bitwise_comparable_v,
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 2cdd625e5..491ecf841 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -81,6 +81,7 @@ ConfigureTest(STATIC_MAP_TEST
   static_map/heterogeneous_lookup_test.cu
   static_map/insert_and_find_test.cu
   static_map/insert_or_assign_test.cu
+  static_map/insert_or_apply_test.cu
   static_map/key_sentinel_test.cu
   static_map/shared_memory_test.cu
   static_map/stream_test.cu
diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
new file mode 100644
index 000000000..07a05214b
--- /dev/null
+++ b/tests/static_map/insert_or_apply_test.cu
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+
+using size_type = std::size_t;
+
+template <typename Map>
+void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys)
+{
+  REQUIRE((num_keys % num_unique_keys) == 0);
+
+  using Key   = typename Map::key_type;
+  using Value = typename Map::mapped_type;
+
+  // Insert pairs
+  auto pairs_begin = thrust::make_transform_iterator(
+    thrust::counting_iterator<size_type>(0),
+    cuda::proclaim_return_type<cuco::pair<Key, Value>>([num_unique_keys] __device__(auto i) {
+      return cuco::pair{i % num_unique_keys, 1};
+    }));
+
+  map.insert_or_apply(
+    pairs_begin,
+    pairs_begin + num_keys,
+    [] __device__(cuda::atomic_ref<Value, cuda::thread_scope_device> lhs, const Value& rhs) {
+      lhs.fetch_add(rhs, cuda::memory_order_relaxed);
+    });
+
+  REQUIRE(map.size() == num_unique_keys);
+
+  thrust::device_vector<Key> d_keys(num_unique_keys);
+  thrust::device_vector<Value> d_values(num_unique_keys);
+  map.retrieve_all(d_keys.begin(), d_values.begin());
+
+  REQUIRE(cuco::test::equal(d_values.begin(),
+                            d_values.end(),
+                            thrust::make_constant_iterator(num_keys / num_unique_keys),
+                            thrust::equal_to<Value>{}));
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "static_map insert_or_apply tests",
+  "",
+  ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize),
+   Key,
+   Value,
+   Probe,
+   CGSize),
+  (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2))
+{
+  constexpr size_type num_keys{400};
+  constexpr size_type num_unique_keys{100};
+
+  using probe = std::conditional_t<
+    Probe == cuco::test::probe_sequence::linear_probing,
+    cuco::linear_probing<CGSize, cuco::murmurhash3_32<Key>>,
+    cuco::double_hashing<CGSize, cuco::murmurhash3_32<Key>, cuco::murmurhash3_32<Key>>>;
+
+  using map_type = cuco::static_map<Key,
+                                    Value,
+                                    cuco::extent<size_type>,
+                                    cuda::thread_scope_device,
+                                    thrust::equal_to<Key>,
+                                    probe,
+                                    cuco::cuda_allocator<std::byte>,
+                                    cuco::storage<2>>;
+
+  SECTION("Sentinel equal to identity")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+
+    test_insert_or_apply(map, num_keys, num_unique_keys);
+  }
+
+  SECTION("Sentinel not equal to identity")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
+
+    test_insert_or_apply(map, num_keys, num_unique_keys);
+  }
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "static_map insert_or_apply all unique keys tests", "", ((typename Key), Key), (int32_t), (int64_t))
+{
+  using Value = Key;
+
+  constexpr size_type num_keys = 100;
+
+  auto map = cuco::static_map<Key,
+                              Value,
+                              cuco::extent<size_type>,
+                              cuda::thread_scope_device,
+                              thrust::equal_to<Key>,
+                              cuco::linear_probing<1, cuco::murmurhash3_32<Key>>,
+                              cuco::cuda_allocator<std::byte>,
+                              cuco::storage<2>>{
+    num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+
+  test_insert_or_apply(map, num_keys, num_keys);
+}
\ No newline at end of file
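
Usage note (reviewer aid, not part of the change set): the snippet below is a minimal host-side sketch of how the `insert_or_apply` API added in this PR can be called with a custom reduction. The file name, capacity, input values, and the summing device lambda are illustrative assumptions rather than code from the PR; compiling the device lambda requires nvcc's `--extended-lambda`. The benchmark above passes the predefined `cuco::op::reduce::sum` tag instead of a lambda.

// example_insert_or_apply.cu -- hypothetical standalone example, not part of this PR
#include <cuco/static_map.cuh>

#include <cuda/atomic>
#include <thrust/device_vector.h>

#include <vector>

int main()
{
  using Key   = int;
  using Value = int;

  // Duplicate keys: insert_or_apply reduces the values per key instead of overwriting them.
  std::vector<cuco::pair<Key, Value>> h_pairs{{1, 10}, {2, 20}, {1, 5}, {3, 7}, {2, 1}};
  thrust::device_vector<cuco::pair<Key, Value>> d_pairs(h_pairs);

  cuco::static_map<Key, Value> map{100, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};

  // `op` must be invocable as op(cuda::atomic_ref<Value, Scope>, Value); here it sums per key.
  map.insert_or_apply(
    d_pairs.begin(), d_pairs.end(),
    [] __device__(cuda::atomic_ref<Value, cuda::thread_scope_device> slot, Value v) {
      slot.fetch_add(v, cuda::memory_order_relaxed);
    });

  // Expected result: map.size() == 3, with key 1 -> 15, key 2 -> 21, key 3 -> 7.
  return 0;
}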