diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 128052b98..4651d5646 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -62,7 +62,8 @@ ConfigureBench(STATIC_MAP_BENCH
   hash_table/static_map/insert_bench.cu
   hash_table/static_map/find_bench.cu
   hash_table/static_map/contains_bench.cu
-  hash_table/static_map/erase_bench.cu)
+  hash_table/static_map/erase_bench.cu
+  hash_table/static_map/insert_or_apply_bench.cu)
 
 ###################################################################################################
 # - static_multiset benchmarks --------------------------------------------------------------------
diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
new file mode 100644
index 000000000..6bbb26421
--- /dev/null
+++ b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+
+#include
+#include
+
+#include
+
+#include
+#include
+
+using namespace cuco::benchmark;
+using namespace cuco::utility;
+
+/**
+ * @brief A benchmark evaluating `cuco::static_map::insert_or_apply` performance
+ */
+template <typename Key, typename Value, typename Dist>
+std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_apply(
+  nvbench::state& state, nvbench::type_list<Key, Value, Dist>)
+{
+  using pair_type = cuco::pair<Key, Value>;
+
+  auto const num_keys     = state.get_int64_or_default("NumInputs", defaults::N);
+  auto const occupancy    = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY);
+  auto const multiplicity = state.get_int64_or_default("Multiplicity", defaults::MULTIPLICITY);
+
+  std::size_t const size = cuco::detail::int_div_ceil(num_keys, multiplicity) / occupancy;
+
+  thrust::device_vector<Key> keys(num_keys);
+
+  key_generator gen;
+  gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
+
+  thrust::device_vector<pair_type> pairs(num_keys);
+  thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) {
+    return pair_type(key, static_cast<Value>(key));
+  });
+
+  state.add_element_count(num_keys);
+
+  cuco::static_map<Key, Value> map{size, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+
+  state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
+    map.clear_async({launch.get_stream()});
+
+    timer.start();
+    map.insert_or_apply_async(
+      pairs.begin(), pairs.end(), cuco::op::reduce::sum, {launch.get_stream()});
+    timer.stop();
+  });
+}
+
+template <typename Key, typename Value, typename Dist>
+std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_insert_or_apply(
+  nvbench::state& state, nvbench::type_list<Key, Value, Dist>)
+{
+  state.skip("Key should be the same type as Value.");
+}
+
+NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      defaults::VALUE_TYPE_RANGE,
+                                      nvbench::type_list<distribution::uniform>))
+  .set_name("static_map_insert_or_apply_uniform_multiplicity")
+  .set_type_axes_names({"Key", "Value", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE);
+
+NVBENCH_BENCH_TYPES(static_map_insert_or_apply,
+                    NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
+                                      defaults::VALUE_TYPE_RANGE,
+                                      nvbench::type_list<distribution::uniform>))
+  .set_name("static_map_insert_or_apply_uniform_occupancy")
+  .set_type_axes_names({"Key", "Value", "Distribution"})
+  .set_max_noise(defaults::MAX_NOISE)
+  .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);
\ No newline at end of file
diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index 1052bd301..f47c86cdf 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -28,6 +28,7 @@ namespace cuco::static_map_ns::detail {
 
 CUCO_SUPPRESS_KERNEL_WARNINGS
 
+// TODO: use insert_or_assign internally
 /**
  * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to
  * `k` already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`.
@@ -67,4 +68,44 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_assign(InputIt first,
   }
 }
 
-}  // namespace cuco::static_map_ns::detail
+/**
+ * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to
+ * `k` already exists in the container, the binary callable `op` is applied to the existing value at
+ * the slot and the value to be inserted. If the key does not exist, inserts the pair as if by
+ * insert.
+ *
+ * @note The callable object used to perform the binary operation must be invocable as
+ * `Op(cuda::atomic_ref<T, Scope>, T)`.
+ *
+ * @tparam CGSize Number of threads in each CG
+ * @tparam BlockSize Number of threads in each block
+ * @tparam InputIt Device accessible input iterator whose `value_type` is
+ * convertible to the `value_type` of the data structure
+ * @tparam Op Callable type used to perform the `apply` operation
+ * @tparam Ref Type of non-owning device ref allowing access to storage
+ *
+ * @param first Beginning of the sequence of input elements
+ * @param n Number of input elements
+ * @param op Callable object used to perform the `apply` operation
+ * @param ref Non-owning container device ref used to access the slot storage
+ */
+template <int32_t CGSize, int32_t BlockSize, typename InputIt, typename Op, typename Ref>
+__global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op, Ref ref)
+{
+  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
+  auto idx               = cuco::detail::global_thread_id() / CGSize;
+
+  while (idx < n) {
+    typename std::iterator_traits<InputIt>::value_type const& insert_pair = *(first + idx);
+    if constexpr (CGSize == 1) {
+      ref.insert_or_apply(insert_pair, op);
+    } else {
+      auto const tile =
+        cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
+      ref.insert_or_apply(tile, insert_pair, op);
+    }
+    idx += loop_stride;
+  }
+}
+
+}  // namespace cuco::static_map_ns::detail
\ No newline at end of file
diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
index 9ca129038..a26092196 100644
--- a/include/cuco/detail/static_map/static_map.inl
+++ b/include/cuco/detail/static_map/static_map.inl
@@ -247,6 +247,44 @@ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
 
+template <class Key, class T, class Extent, cuda::thread_scope Scope, class KeyEqual, class ProbingScheme, class Allocator, class Storage>
+template <typename InputIt, typename Op>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  insert_or_apply(InputIt first, InputIt last, Op op, cuda::stream_ref stream)
+{
+  this->insert_or_apply_async(first, last, op, stream);
+  stream.wait();
+}
+
+template <class Key, class T, class Extent, cuda::thread_scope Scope, class KeyEqual, class ProbingScheme, class Allocator, class Storage>
+template <typename InputIt, typename Op>
+void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+  insert_or_apply_async(InputIt first, InputIt last, Op op, cuda::stream_ref stream) noexcept
+{
+  auto const num = cuco::detail::distance(first, last);
+  if (num == 0) { return; }
+
+  auto const grid_size = cuco::detail::grid_size(num, cg_size);
+
+  static_map_ns::detail::insert_or_apply<cg_size, cuco::detail::default_block_size()>
+    <<<grid_size, cuco::detail::default_block_size(), 0, stream.get()>>>(
+      first, num, op, ref(op::insert_or_apply));
+}
+
 template <class Key,
diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -393,9 +395,6 @@ class operator_impl<
   static constexpr auto cg_size     = base_type::cg_size;
   static constexpr auto window_size = base_type::window_size;
 
-  static_assert(sizeof(T) == 4 or sizeof(T) == 8,
-                "sizeof(mapped_type) must be either 4 bytes or 8 bytes.");
-
  public:
   /**
    * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v`
@@ -549,6 +548,198 @@ class operator_impl<
   }
 };
 
+// TODO: use insert_or_apply internally
+template <typename Key,
+          typename T,
+          cuda::thread_scope Scope,
+          typename KeyEqual,
+          typename ProbingScheme,
+          typename StorageRef,
+          typename... Operators>
+class operator_impl<
+  op::insert_or_apply_tag,
+  static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>> {
+  using base_type  = static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef>;
+  using ref_type   = static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>;
+  using key_type   = typename base_type::key_type;
+  using value_type = typename base_type::value_type;
+
+  static constexpr auto cg_size     = base_type::cg_size;
+  static constexpr auto window_size = base_type::window_size;
+
+ public:
+  /**
+   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies
+   * the `Op` binary function to the mapped_type corresponding to the key `k` and the value `v`.
+   *
+   * @tparam Value Input type which is implicitly convertible to `value_type`
+   * @tparam Op Callable type which is used as the apply operation and can be
+   * called with arguments as `Op(cuda::atomic_ref<T, Scope>, T)`. `Op` strictly must
+   * have this signature to atomically apply the operation.
+   *
+   * @param value The element to insert
+   * @param op The callable object that performs the binary operation between the existing value
+   * at the slot and the element to insert
+   */
+
+  template <typename Value, typename Op>
+  __device__ void insert_or_apply(Value const& value, Op op)
+  {
+    static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme");
+
+    static_assert(
+      cuda::std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
+      "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref<T, Scope>, T)`");
+
+    ref_type& ref_ = static_cast<ref_type&>(*this);
+
+    auto const val         = ref_.impl_.heterogeneous_value(value);
+    auto const key         = ref_.impl_.extract_key(val);
+    auto& probing_scheme   = ref_.impl_.probing_scheme();
+    auto storage_ref       = ref_.impl_.storage_ref();
+    auto probing_iter      = probing_scheme(key, storage_ref.window_extent());
+    auto const empty_value = ref_.impl_.empty_slot_sentinel().second;
+
+    while (true) {
+      auto const window_slots = storage_ref[*probing_iter];
+
+      for (auto& slot_content : window_slots) {
+        auto const eq_res             = ref_.impl_.predicate_.operator()(key, slot_content.first);
+        auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content);
+        auto slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index;
+
+        // If the key is already in the container, update the payload and return
+        if (eq_res == detail::equal_result::EQUAL) {
+          op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
+          return;
+        }
+        if (eq_res == detail::equal_result::AVAILABLE) {
+          switch (ref_.impl_.attempt_insert_stable(slot_ptr, slot_content, val)) {
+            case insert_result::SUCCESS: return;
+            case insert_result::DUPLICATE: {
+              if constexpr (sizeof(value_type) > 8) {
+                ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
+              }
+              op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
+              return;
+            }
+            default: continue;
+          }
+        }
+      }
+      ++probing_iter;
+    }
+  }
+
+  template <typename Value>
+  __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag)
+  {
+    auto& ref_ = static_cast<ref_type&>(*this);
+    ref_.insert_or_apply(value, [](cuda::atomic_ref<T, Scope> payload_ref, T const& payload) {
+      payload_ref.fetch_add(payload, cuda::memory_order_relaxed);
+    });
+  }
+
+  /**
+   * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies
+   * the `Op` binary function to the mapped_type corresponding to the key `k` and the value `v`.
+   *
+   * @tparam Value Input type which is implicitly convertible to `value_type`
+   * @tparam Op Callable type which is used as the apply operation and can be
+   * called with arguments as `Op(cuda::atomic_ref<T, Scope>, T)`. `Op` strictly must
+   * have this signature to atomically apply the operation.
+   *
+   * @param group The Cooperative Group used to perform the group insert
+   * @param value The element to insert
+   * @param op The callable object that performs the binary operation between the existing value
+   * at the slot and the element to insert
+   */
+
+  template <typename Value, typename Op>
+  __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+                                  Value const& value,
+                                  Op op)
+  {
+    static_assert(
+      cuda::std::is_invocable_v<Op, cuda::atomic_ref<T, Scope>, T>,
+      "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref<T, Scope>, T)`");
+
+    ref_type& ref_ = static_cast<ref_type&>(*this);
+
+    auto const val         = ref_.impl_.heterogeneous_value(value);
+    auto const key         = ref_.impl_.extract_key(val);
+    auto& probing_scheme   = ref_.impl_.probing_scheme();
+    auto storage_ref       = ref_.impl_.storage_ref();
+    auto probing_iter      = probing_scheme(group, key, storage_ref.window_extent());
+    auto const empty_value = ref_.impl_.empty_slot_sentinel().second;
+
+    while (true) {
+      auto const window_slots = storage_ref[*probing_iter];
+
+      auto const [state, intra_window_index] = [&]() {
+        auto res = detail::equal_result::UNEQUAL;
+        for (auto i = 0; i < window_size; ++i) {
+          res = ref_.impl_.predicate_.operator()(key, window_slots[i].first);
+          if (res != detail::equal_result::UNEQUAL) {
+            return detail::window_probing_results{res, i};
+          }
+        }
+        // returns dummy index `-1` for UNEQUAL
+        return detail::window_probing_results{res, -1};
+      }();
+
+      auto* slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index;
+
+      auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL);
+      if (group_contains_equal) {
+        auto const src_lane = __ffs(group_contains_equal) - 1;
+        if (group.thread_rank() == src_lane) {
+          op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
+        }
+        return;
+      }
+
+      auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE);
+      if (group_contains_available) {
+        auto const src_lane = __ffs(group_contains_available) - 1;
+        auto const status   = [&, target_idx = intra_window_index]() {
+          if (group.thread_rank() != src_lane) { return insert_result::CONTINUE; }
+          return ref_.impl_.attempt_insert_stable(slot_ptr, window_slots[target_idx], val);
+        }();
+
+        switch (group.shfl(status, src_lane)) {
+          case insert_result::SUCCESS: return;
+          case insert_result::DUPLICATE: {
+            if (group.thread_rank() == src_lane) {
+              if constexpr (sizeof(value_type) > 8) {
+                ref_.impl_.wait_for_payload(slot_ptr->second, empty_value);
+              }
+              op(cuda::atomic_ref<T, Scope>{slot_ptr->second}, val.second);
+            }
+            return;
+          }
+          default: continue;
+        }
+      } else {
+        ++probing_iter;
+      }
+    }
+  }
+
+  template <typename Value>
+  __device__ void insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+                                  Value const& value,
+                                  cuco::op::reduce::sum_tag)
+  {
+    auto& ref_ = static_cast<ref_type&>(*this);
+    ref_.insert_or_apply(
+      group, value, [](cuda::atomic_ref<T, Scope> payload_ref, T const& payload) {
+        payload_ref.fetch_add(payload, cuda::memory_order_relaxed);
+      });
+  }
+};
+
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
   template <typename InputIt>
   void insert_or_assign_async(InputIt first, InputIt last, cuda::stream_ref stream = {}) noexcept;
 
+  /**
+   * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent
+   * to `k` already exists in the container, the binary callable `op` is applied to the existing
+   * value at the slot and the value to be inserted. If the key does not exist, inserts the pair
+   * as if by insert.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   * `insert_or_apply_async`.
+   *
+   * @note The callable object `op` must be invocable as `Op(cuda::atomic_ref<T, Scope>, T)`.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * `std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * static_map<K, V>::value_type>` is `true`
+   * @tparam Op Callable type used to perform the `apply` operation
+   *
+   * @param first Beginning of the sequence of elements
+   * @param last End of the sequence of elements
+   * @param op Callable object used to perform the `apply` operation
+   * @param stream CUDA stream used for insert
+   */
+  template <typename InputIt, typename Op>
+  void insert_or_apply(InputIt first, InputIt last, Op op, cuda::stream_ref stream = {});
+
+  /**
+   * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent
+   * to `k` already exists in the container, the binary callable `op` is applied to the existing
+   * value at the slot and the value to be inserted. If the key does not exist, inserts the pair
+   * as if by insert.
+   *
+   * @note The callable object `op` must be invocable as `Op(cuda::atomic_ref<T, Scope>, T)`.
+   *
+   * @tparam InputIt Device accessible random access input iterator where
+   * `std::is_convertible<std::iterator_traits<InputIt>::value_type,
+   * static_map<K, V>::value_type>` is `true`
+   * @tparam Op Callable type used to perform the `apply` operation
+   *
+   * @param first Beginning of the sequence of elements
+   * @param last End of the sequence of elements
+   * @param op Callable object used to perform the `apply` operation
+   * @param stream CUDA stream used for insert
+   */
+  template <typename InputIt, typename Op>
+  void insert_or_apply_async(InputIt first,
+                             InputIt last,
+                             Op op,
+                             cuda::stream_ref stream = {}) noexcept;
+
   /**
    * @brief Erases keys in the range `[first, last)`.
    *
diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh
index 5c96af776..4006d84ae 100644
--- a/include/cuco/static_map_ref.cuh
+++ b/include/cuco/static_map_ref.cuh
@@ -74,7 +74,8 @@ class static_map_ref
   using impl_type = detail::
     open_addressing_ref_impl;
 
-  static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes.");
+  static_assert(sizeof(T) == 4 or sizeof(T) == 8,
+                "sizeof(mapped_type) must be either 4 bytes or 8 bytes.");
 
   static_assert(
     cuco::is_bitwise_comparable_v,
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 2cdd625e5..491ecf841 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -81,6 +81,7 @@ ConfigureTest(STATIC_MAP_TEST
   static_map/heterogeneous_lookup_test.cu
   static_map/insert_and_find_test.cu
   static_map/insert_or_assign_test.cu
+  static_map/insert_or_apply_test.cu
   static_map/key_sentinel_test.cu
   static_map/shared_memory_test.cu
   static_map/stream_test.cu
diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
new file mode 100644
index 000000000..07a05214b
--- /dev/null
+++ b/tests/static_map/insert_or_apply_test.cu
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+#include
+
+using size_type = std::size_t;
+
+template <typename Map>
+void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys)
+{
+  REQUIRE((num_keys % num_unique_keys) == 0);
+
+  using Key   = typename Map::key_type;
+  using Value = typename Map::mapped_type;
+
+  // Insert pairs
+  auto pairs_begin = thrust::make_transform_iterator(
+    thrust::counting_iterator<size_type>(0),
+    cuda::proclaim_return_type<cuco::pair<Key, Value>>([num_unique_keys] __device__(auto i) {
+      return cuco::pair{i % num_unique_keys, 1};
+    }));
+
+  map.insert_or_apply(
+    pairs_begin,
+    pairs_begin + num_keys,
+    [] __device__(cuda::atomic_ref<Value, cuda::thread_scope_device> lhs, const Value& rhs) {
+      lhs.fetch_add(rhs, cuda::memory_order_relaxed);
+    });
+
+  REQUIRE(map.size() == num_unique_keys);
+
+  thrust::device_vector<Key> d_keys(num_unique_keys);
+  thrust::device_vector<Value> d_values(num_unique_keys);
+  map.retrieve_all(d_keys.begin(), d_values.begin());
+
+  REQUIRE(cuco::test::equal(d_values.begin(),
+                            d_values.end(),
+                            thrust::make_constant_iterator(num_keys / num_unique_keys),
+                            thrust::equal_to<Value>{}));
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "static_map insert_or_apply tests",
+  "",
+  ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize),
+   Key,
+   Value,
+   Probe,
+   CGSize),
+  (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1),
+  (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2),
+  (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1),
+  (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2),
+  (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2))
+{
+  constexpr size_type num_keys{400};
+  constexpr size_type num_unique_keys{100};
+
+  using probe = std::conditional_t<
+    Probe == cuco::test::probe_sequence::linear_probing,
+    cuco::linear_probing<CGSize, cuco::murmurhash3_32<Key>>,
+    cuco::double_hashing<CGSize, cuco::murmurhash3_32<Key>, cuco::murmurhash3_32<Key>>>;
+
+  using map_type = cuco::static_map<Key,
+                                    Value,
+                                    cuco::extent<size_type>,
+                                    cuda::thread_scope_device,
+                                    thrust::equal_to<Key>,
+                                    probe,
+                                    cuco::cuda_allocator<std::byte>,
+                                    cuco::storage<2>>;
+
+  SECTION("Sentinel equal to identity")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+
+    test_insert_or_apply(map, num_keys, num_unique_keys);
+  }
+
+  SECTION("Sentinel not equal to identity")
+  {
+    auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
+
+    test_insert_or_apply(map, num_keys, num_unique_keys);
+  }
+}
+
+TEMPLATE_TEST_CASE_SIG(
+  "static_map insert_or_apply all unique keys tests", "", ((typename Key), Key), (int32_t), (int64_t))
+{
+  using Value = Key;
+
+  constexpr size_type num_keys = 100;
+
+  auto map = cuco::static_map<Key,
+                              Value,
+                              cuco::extent<size_type>,
+                              cuda::thread_scope_device,
+                              thrust::equal_to<Key>,
+                              cuco::linear_probing<1, cuco::murmurhash3_32<Key>>,
+                              cuco::cuda_allocator<std::byte>,
+                              cuco::storage<2>>{
+    num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
+
+  test_insert_or_apply(map, num_keys, num_keys);
+}
\ No newline at end of file
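
Usage note (reviewer aid, not part of the change set): the snippet below is a minimal host-side sketch of how the `insert_or_apply` API added in this PR can be called with a custom reduction. The file name, capacity, input values, and the summing device lambda are illustrative assumptions rather than code from the PR; compiling the device lambda requires nvcc's `--extended-lambda`. The benchmark above passes the predefined `cuco::op::reduce::sum` tag instead of a lambda.

// example_insert_or_apply.cu -- hypothetical standalone example, not part of this PR
#include <cuco/static_map.cuh>

#include <cuda/atomic>
#include <thrust/device_vector.h>

#include <vector>

int main()
{
  using Key   = int;
  using Value = int;

  // Duplicate keys: insert_or_apply reduces the values per key instead of overwriting them.
  std::vector<cuco::pair<Key, Value>> h_pairs{{1, 10}, {2, 20}, {1, 5}, {3, 7}, {2, 1}};
  thrust::device_vector<cuco::pair<Key, Value>> d_pairs(h_pairs);

  cuco::static_map<Key, Value> map{100, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};

  // `op` must be invocable as op(cuda::atomic_ref<Value, Scope>, Value); here it sums per key.
  map.insert_or_apply(
    d_pairs.begin(), d_pairs.end(),
    [] __device__(cuda::atomic_ref<Value, cuda::thread_scope_device> slot, Value v) {
      slot.fetch_add(v, cuda::memory_order_relaxed);
    });

  // Expected result: map.size() == 3, with key 1 -> 15, key 2 -> 21, key 3 -> 7.
  return 0;
}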