From 84fd0b169f52a9c516964fb4544ffb78eba61b74 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Sat, 14 Oct 2023 00:26:10 +0000 Subject: [PATCH 01/16] WIP add insert_or_apply --- include/cuco/detail/static_map/kernels.cuh | 39 ++++ include/cuco/detail/static_map/static_map.inl | 38 ++++ .../cuco/detail/static_map/static_map_ref.inl | 210 ++++++++++++++++++ include/cuco/operator.hpp | 16 ++ include/cuco/static_map.cuh | 11 + tests/CMakeLists.txt | 1 + tests/static_map/insert_or_apply_test.cu | 104 +++++++++ 7 files changed, 419 insertions(+) create mode 100644 tests/static_map/insert_or_apply_test.cu diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index f9171ef77..b65e59451 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -30,6 +30,7 @@ namespace experimental { namespace static_map_ns { namespace detail { +// TODO user insert_or_assign internally /** * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to * `k` already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`. @@ -67,6 +68,44 @@ __global__ void insert_or_assign(InputIt first, cuco::detail::index_type n, Ref } } +// TODO docs +/** + * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to + * `k` already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`. + * If the key does not exist, inserts the pair as if by insert. + * + * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element + * is inserted. + * + * @tparam CGSize Number of threads in each CG + * @tparam BlockSize Number of threads in each block + * @tparam InputIt Device accessible input iterator whose `value_type` is + * convertible to the `value_type` of the data structure + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param first Beginning of the sequence of input elements + * @param n Number of input elements + * @param ref Non-owning container device ref used to access the slot storage + */ +template +__global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op, Ref ref) +{ + auto const loop_stride = cuco::detail::grid_stride() / CGSize; + auto idx = cuco::detail::global_thread_id() / CGSize; + + while (idx < n) { + typename std::iterator_traits::value_type const& insert_pair = *(first + idx); + if constexpr (CGSize == 1) { + ref.insert_or_apply(insert_pair, op); + } else { + auto const tile = + cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + ref.insert_or_apply(tile, insert_pair, op); + } + idx += loop_stride; + } +} + /** * @brief Finds the equivalent map elements of all keys in the range `[first, first + n)`. * diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 9249d4fa1..80d7135e9 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -241,6 +241,44 @@ void static_map +template +void static_map:: + insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept +{ + return this->insert_or_apply_async(first, last, op, stream); + stream.synchronize(); +} + +template +template +void static_map:: + insert_or_apply_async(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept +{ + auto const num = cuco::detail::distance(first, last); + if (num == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num, cg_size); + + static_map_ns::detail::insert_or_apply + <<>>( + first, num, op, ref(op::insert_or_apply)); +} + template +class operator_impl< + op::insert_or_apply_tag, + static_map_ref> { + using base_type = static_map_ref; + using ref_type = static_map_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + static_assert(sizeof(T) == 4 or sizeof(T) == 8, + "sizeof(mapped_type) must be either 4 bytes or 8 bytes."); + + public: + // TODO docs + /** + * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v` + * to the mapped_type corresponding to the key `k`. + * + * @tparam Value Input type which is implicitly convertible to 'value_type' + * + * @param value The element to insert + */ + template + __device__ void insert_or_apply(Value const& value, Op op) noexcept + { + static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + + ref_type& ref_ = static_cast(*this); + auto const key = thrust::get<0>(value); + auto& probing_scheme = ref_.impl_.probing_scheme(); + auto storage_ref = ref_.impl_.storage_ref(); + auto probing_iter = probing_scheme(key, storage_ref.window_extent()); + + while (true) { + auto const window_slots = storage_ref[*probing_iter]; + + for (auto& slot_content : window_slots) { + auto const eq_res = ref_.impl_.predicate()(slot_content.first, key); + + // If the key is already in the container, update the payload and return + if (eq_res == detail::equal_result::EQUAL) { + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + op(((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second, + static_cast(thrust::get<1>(value))); + return; + } + if (eq_res == detail::equal_result::EMPTY or + cuco::detail::bitwise_compare(slot_content.first, ref_.impl_.erased_key_sentinel())) { + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + if (attempt_insert_or_apply( + (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)) { + return; + } + } + } + ++probing_iter; + } + } + + template + __device__ void insert_or_apply(Value const& value, + cuco::experimental::op::reduce::sum_tag) noexcept + { + auto& ref_ = static_cast(*this); + ref_.insert_or_apply(value, [](T& slot, T const& payload) { + cuda::atomic_ref slot_ref{slot}; + slot_ref.fetch_add(payload, cuda::memory_order_relaxed); + }); + } + + // TODO docs + /** + * @brief Inserts an element. + * + * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v` + * to the mapped_type corresponding to the key `k`. + * + * @tparam Value Input type which is implicitly convertible to 'value_type' + * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + */ + template + __device__ void insert_or_apply(cooperative_groups::thread_block_tile const& group, + Value const& value, + Op op) noexcept + { + ref_type& ref_ = static_cast(*this); + + auto const key = value.first; + auto& probing_scheme = ref_.impl_.probing_scheme(); + auto storage_ref = ref_.impl_.storage_ref(); + auto probing_iter = probing_scheme(group, key, storage_ref.window_extent()); + + while (true) { + auto const window_slots = storage_ref[*probing_iter]; + + auto const [state, intra_window_index] = [&]() { + for (auto i = 0; i < window_size; ++i) { + switch (ref_.impl_.predicate()(window_slots[i].first, key)) { + case detail::equal_result::EMPTY: + return detail::window_probing_results{detail::equal_result::EMPTY, i}; + case detail::equal_result::EQUAL: + return detail::window_probing_results{detail::equal_result::EQUAL, i}; + default: { + if (cuco::detail::bitwise_compare(window_slots[i].first, + ref_.impl_.erased_key_sentinel())) { + return window_probing_results{detail::equal_result::ERASED, i}; + } else { + continue; + } + } + } + } + // returns dummy index `-1` for UNEQUAL + return detail::window_probing_results{detail::equal_result::UNEQUAL, -1}; + }(); + + auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); + if (group_contains_equal) { + auto const src_lane = __ffs(group_contains_equal) - 1; + if (group.thread_rank() == src_lane) { + op(&((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second, + value.second); + } + group.sync(); + return; + } + + auto const group_contains_available = + group.ballot(state == detail::equal_result::EMPTY or state == detail::equal_result::ERASED); + if (group_contains_available) { + auto const src_lane = __ffs(group_contains_available) - 1; + auto const status = + (group.thread_rank() == src_lane) + ? attempt_insert_or_apply( + (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op) + : false; + + // Exit if inserted or assigned + if (group.shfl(status, src_lane)) { return; } + } else { + ++probing_iter; + } + } + } + + template + __device__ void insert_or_apply(cooperative_groups::thread_block_tile const& group, + Value const& value, + cuco::experimental::op::reduce::sum_tag) noexcept + { + auto& ref_ = static_cast(*this); + ref_.insert_or_apply(group, value, [](T& slot, T const& payload) { + cuda::atomic_ref slot_ref{slot}; + slot_ref.fetch_add(payload, cuda::memory_order_relaxed); + }); + } + + private: + // TODO docs + /** + * @brief Attempts to insert an element into a slot or update the matching payload with the given + * element + * + * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v` + * to the mapped_type corresponding to the key `k`. + * + * @tparam Value Input type which is implicitly convertible to 'value_type' + * + * @param group The Cooperative Group used to perform group insert + * @param value The element to insert + * + * @return Returns `true` if the given `value` is inserted or `value` has a match in the map. + */ + template + __device__ constexpr bool attempt_insert_or_apply(value_type* slot, + Value const& value, + Op op) noexcept + { + ref_type& ref_ = static_cast(*this); + auto const expected_key = ref_.impl_.empty_slot_sentinel().first; + + auto old_key = ref_.impl_.compare_and_swap( + &slot->first, expected_key, static_cast(thrust::get<0>(value))); + auto* old_key_ptr = reinterpret_cast(&old_key); + + // if key success or key was already present in the map + if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key) or + (ref_.impl_.predicate().equal_to(*old_key_ptr, + thrust::get<0>(thrust::raw_reference_cast(value))) == + detail::equal_result::EQUAL)) { + // Update payload + op(slot->second, static_cast(thrust::get<1>(value))); + return true; + } + return false; + } +}; + template void insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; + // TODO docs + template + void insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream = {}) noexcept; + + // TODO docs + template + void insert_or_apply_async(InputIt first, + InputIt last, + Op op, + cuda_stream_ref stream = {}) noexcept; + /** * @brief Erases keys in the range `[first, last)`. * diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 916e0ea42..c44de4037 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -77,6 +77,7 @@ ConfigureTest(STATIC_MAP_TEST static_map/heterogeneous_lookup_test.cu static_map/insert_and_find_test.cu static_map/insert_or_assign_test.cu + static_map/insert_or_apply_test.cu static_map/key_sentinel_test.cu static_map/shared_memory_test.cu static_map/stream_test.cu diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu new file mode 100644 index 000000000..127cfa6ba --- /dev/null +++ b/tests/static_map/insert_or_apply_test.cu @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +using size_type = std::size_t; + +template +__inline__ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys) +{ + REQUIRE((num_keys % num_unique_keys) == 0); + + using key_type = typename Map::key_type; + using mapped_type = typename Map::mapped_type; + + auto keys_begin = thrust::make_transform_iterator( + thrust::counting_iterator(0), + [num_unique_keys] __host__ __device__(key_type const& x) -> key_type { + return x % num_unique_keys; + }); + + auto values_begin = thrust::make_constant_iterator(1); + + auto pairs_begin = thrust::make_zip_iterator(thrust::make_tuple(keys_begin, values_begin)); + + map.insert_or_apply(pairs_begin, pairs_begin + num_keys, cuco::experimental::op::reduce::sum); + + REQUIRE(map.size() == num_unique_keys); + + thrust::device_vector d_keys(num_unique_keys); + thrust::device_vector d_values(num_unique_keys); + map.retrieve_all(d_keys.begin(), d_values.begin()); + + // TODO remove + for (int i = 0; i < num_unique_keys; ++i) { + std::cout << d_keys[i] << " " << d_values[i] << std::endl; + } + + REQUIRE(cuco::test::equal(d_values.begin(), + d_values.end(), + thrust::make_constant_iterator(num_keys / num_unique_keys), + thrust::equal_to{})); +} + +TEMPLATE_TEST_CASE_SIG( + "Insert or apply", + "", + ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize), + Key, + Value, + Probe, + CGSize), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1)) +{ + constexpr size_type num_keys{10}; + constexpr size_type num_unique_keys{10}; + + using probe = + std::conditional_t>, + cuco::experimental::double_hashing, + cuco::murmurhash3_32>>; + + auto map = cuco::experimental::static_map, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::experimental::storage<2>>{ + num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + test_insert_or_apply(map, num_keys, num_unique_keys); +} \ No newline at end of file From 0d76d1e43f6ac4be5a0634e8c5f6f45604b3a3fa Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 16 Oct 2023 16:43:41 +0000 Subject: [PATCH 02/16] Fix unit test --- .../cuco/detail/static_map/static_map_ref.inl | 6 ++-- tests/static_map/insert_or_apply_test.cu | 28 +++++++++++++------ 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 526876cb4..378d1d05e 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -491,7 +491,7 @@ class operator_impl< { ref_type& ref_ = static_cast(*this); - auto const key = value.first; + auto const key = thrust::get<0>(thrust::raw_reference_cast(value)); auto& probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme(group, key, storage_ref.window_extent()); @@ -524,8 +524,8 @@ class operator_impl< if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; if (group.thread_rank() == src_lane) { - op(&((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second, - value.second); + op(((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second, + static_cast(thrust::get<1>(value))); } group.sync(); return; diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index 127cfa6ba..a7cbff438 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -59,11 +59,6 @@ __inline__ void test_insert_or_apply(Map& map, size_type num_keys, size_type num thrust::device_vector d_values(num_unique_keys); map.retrieve_all(d_keys.begin(), d_values.begin()); - // TODO remove - for (int i = 0; i < num_unique_keys; ++i) { - std::cout << d_keys[i] << " " << d_values[i] << std::endl; - } - REQUIRE(cuco::test::equal(d_values.begin(), d_values.end(), thrust::make_constant_iterator(num_keys / num_unique_keys), @@ -78,10 +73,25 @@ TEMPLATE_TEST_CASE_SIG( Value, Probe, CGSize), - (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1)) + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int32_t, int64_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, int64_t, cuco::test::probe_sequence::linear_probing, 2)) { - constexpr size_type num_keys{10}; - constexpr size_type num_unique_keys{10}; + constexpr size_type num_keys{400}; + constexpr size_type num_unique_keys{100}; using probe = std::conditional_t, cuco::experimental::storage<2>>{ - num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; + num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; test_insert_or_apply(map, num_keys, num_unique_keys); } \ No newline at end of file From 54e2697d326f12c56ff7d9c678c76e0fe2fa048e Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 16 Oct 2023 17:07:08 +0000 Subject: [PATCH 03/16] Add benchmark --- benchmarks/CMakeLists.txt | 3 +- .../static_map/insert_or_apply_bench.cu | 93 +++++++++++++++++++ 2 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 benchmarks/hash_table/static_map/insert_or_apply_bench.cu diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 6b03cb98c..00c5a46f3 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -62,7 +62,8 @@ ConfigureBench(STATIC_MAP_BENCH hash_table/static_map/insert_bench.cu hash_table/static_map/find_bench.cu hash_table/static_map/contains_bench.cu - hash_table/static_map/erase_bench.cu) + hash_table/static_map/erase_bench.cu + hash_table/static_map/insert_or_apply_bench.cu) ################################################################################################### # - static_multimap benchmarks -------------------------------------------------------------------- diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu new file mode 100644 index 000000000..725f7c15e --- /dev/null +++ b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_map::insert_or_apply` performance + */ +template +std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_apply( + nvbench::state& state, nvbench::type_list) +{ + using pair_type = cuco::pair; + + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const multiplicity = state.get_int64_or_default("Multiplicity", defaults::MULTIPLICITY); + + std::size_t const size = cuco::detail::int_div_ceil(num_keys, multiplicity) / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + thrust::device_vector pairs(num_keys); + thrust::transform(keys.begin(), keys.end(), pairs.begin(), [] __device__(Key const& key) { + return pair_type(key, static_cast(key)); + }); + + state.add_element_count(num_keys); + + cuco::experimental::static_map map{size, cuco::empty_key{-1}, cuco::empty_value{0}}; + + state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + map.clear_async({launch.get_stream()}); + + timer.start(); + map.insert_or_apply_async( + pairs.begin(), pairs.end(), cuco::experimental::op::reduce::sum, {launch.get_stream()}); + timer.stop(); + }); +} + +template +std::enable_if_t<(sizeof(Key) != sizeof(Value)), void> static_map_insert_or_apply( + nvbench::state& state, nvbench::type_list) +{ + state.skip("Key should be the same type as Value."); +} + +NVBENCH_BENCH_TYPES(static_map_insert_or_apply, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_map_insert_or_apply_uniform_multiplicity") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); + +NVBENCH_BENCH_TYPES(static_map_insert_or_apply, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + defaults::VALUE_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_set_insert_or_apply_uniform_occupancy") + .set_type_axes_names({"Key", "Value", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); \ No newline at end of file From 2561965ffc308a6f31f0f31892e1c4f9f32e0303 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Mon, 24 Jun 2024 17:15:48 +0000 Subject: [PATCH 04/16] Update insert_or_apply to latest dev --- .../static_map/insert_or_apply_bench.cu | 10 +-- .../cuco/detail/static_map/static_map_ref.inl | 56 +++++++--------- tests/static_map/insert_or_apply_test.cu | 64 +++++++++---------- 3 files changed, 58 insertions(+), 72 deletions(-) diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu index 725f7c15e..e670bd75d 100644 --- a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu +++ b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu @@ -14,11 +14,11 @@ * limitations under the License. */ -#include -#include +#include +#include #include -#include +#include #include @@ -55,14 +55,14 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_appl state.add_element_count(num_keys); - cuco::experimental::static_map map{size, cuco::empty_key{-1}, cuco::empty_value{0}}; + cuco::static_map map{size, cuco::empty_key{-1}, cuco::empty_value{0}}; state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { map.clear_async({launch.get_stream()}); timer.start(); map.insert_or_apply_async( - pairs.begin(), pairs.end(), cuco::experimental::op::reduce::sum, {launch.get_stream()}); + pairs.begin(), pairs.end(), cuco::op::reduce::sum, {launch.get_stream()}); timer.stop(); }); } diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 014ce4dbd..a0fe659b2 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -16,6 +16,7 @@ #pragma once +#include #include #include @@ -586,8 +587,10 @@ class operator_impl< { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); - ref_type& ref_ = static_cast(*this); - auto const key = thrust::get<0>(value); + ref_type& ref_ = static_cast(*this); + + auto const val = ref_.impl_.heterogeneous_value(value); + auto const key = ref_.impl_.extract_key(val); auto& probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme(key, storage_ref.window_extent()); @@ -596,17 +599,17 @@ class operator_impl< auto const window_slots = storage_ref[*probing_iter]; for (auto& slot_content : window_slots) { - auto const eq_res = ref_.impl_.predicate()(slot_content.first, key); + auto const eq_res = + ref_.impl_.predicate_.operator()(key, slot_content.first); // If the key is already in the container, update the payload and return if (eq_res == detail::equal_result::EQUAL) { auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); op(((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second, - static_cast(thrust::get<1>(value))); + val.second); return; } - if (eq_res == detail::equal_result::EMPTY or - cuco::detail::bitwise_compare(slot_content.first, ref_.impl_.erased_key_sentinel())) { + if (eq_res == detail::equal_result::AVAILABLE) { auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); if (attempt_insert_or_apply( (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)) { @@ -619,8 +622,7 @@ class operator_impl< } template - __device__ void insert_or_apply(Value const& value, - cuco::experimental::op::reduce::sum_tag) noexcept + __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag) noexcept { auto& ref_ = static_cast(*this); ref_.insert_or_apply(value, [](T& slot, T const& payload) { @@ -648,7 +650,8 @@ class operator_impl< { ref_type& ref_ = static_cast(*this); - auto const key = thrust::get<0>(thrust::raw_reference_cast(value)); + auto const val = ref_.impl_.heterogeneous_value(value); + auto const key = ref_.impl_.extract_key(val); auto& probing_scheme = ref_.impl_.probing_scheme(); auto storage_ref = ref_.impl_.storage_ref(); auto probing_iter = probing_scheme(group, key, storage_ref.window_extent()); @@ -657,24 +660,15 @@ class operator_impl< auto const window_slots = storage_ref[*probing_iter]; auto const [state, intra_window_index] = [&]() { + auto res = detail::equal_result::UNEQUAL; for (auto i = 0; i < window_size; ++i) { - switch (ref_.impl_.predicate()(window_slots[i].first, key)) { - case detail::equal_result::EMPTY: - return detail::window_probing_results{detail::equal_result::EMPTY, i}; - case detail::equal_result::EQUAL: - return detail::window_probing_results{detail::equal_result::EQUAL, i}; - default: { - if (cuco::detail::bitwise_compare(window_slots[i].first, - ref_.impl_.erased_key_sentinel())) { - return window_probing_results{detail::equal_result::ERASED, i}; - } else { - continue; - } - } + res = ref_.impl_.predicate_.operator()(key, window_slots[i].first); + if (res != detail::equal_result::UNEQUAL) { + return detail::window_probing_results{res, i}; } } // returns dummy index `-1` for UNEQUAL - return detail::window_probing_results{detail::equal_result::UNEQUAL, -1}; + return detail::window_probing_results{res, -1}; }(); auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); @@ -682,14 +676,13 @@ class operator_impl< auto const src_lane = __ffs(group_contains_equal) - 1; if (group.thread_rank() == src_lane) { op(((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second, - static_cast(thrust::get<1>(value))); + val.second); } group.sync(); return; } - auto const group_contains_available = - group.ballot(state == detail::equal_result::EMPTY or state == detail::equal_result::ERASED); + auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); if (group_contains_available) { auto const src_lane = __ffs(group_contains_available) - 1; auto const status = @@ -709,7 +702,7 @@ class operator_impl< template __device__ void insert_or_apply(cooperative_groups::thread_block_tile const& group, Value const& value, - cuco::experimental::op::reduce::sum_tag) noexcept + cuco::op::reduce::sum_tag) noexcept { auto& ref_ = static_cast(*this); ref_.insert_or_apply(group, value, [](T& slot, T const& payload) { @@ -742,17 +735,16 @@ class operator_impl< ref_type& ref_ = static_cast(*this); auto const expected_key = ref_.impl_.empty_slot_sentinel().first; - auto old_key = ref_.impl_.compare_and_swap( - &slot->first, expected_key, static_cast(thrust::get<0>(value))); + auto old_key = + ref_.impl_.compare_and_swap(&slot->first, expected_key, static_cast(value.first)); auto* old_key_ptr = reinterpret_cast(&old_key); // if key success or key was already present in the map if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key) or - (ref_.impl_.predicate().equal_to(*old_key_ptr, - thrust::get<0>(thrust::raw_reference_cast(value))) == + (ref_.impl_.predicate().equal_to(value.first, *old_key_ptr) == detail::equal_result::EQUAL)) { // Update payload - op(slot->second, static_cast(thrust::get<1>(value))); + op(slot->second, value.second); return true; } return false; diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index a7cbff438..eade154ea 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include @@ -25,12 +25,11 @@ #include #include -#include +#include +#include #include -#include - using size_type = std::size_t; template @@ -38,31 +37,28 @@ __inline__ void test_insert_or_apply(Map& map, size_type num_keys, size_type num { REQUIRE((num_keys % num_unique_keys) == 0); - using key_type = typename Map::key_type; - using mapped_type = typename Map::mapped_type; - - auto keys_begin = thrust::make_transform_iterator( - thrust::counting_iterator(0), - [num_unique_keys] __host__ __device__(key_type const& x) -> key_type { - return x % num_unique_keys; - }); - - auto values_begin = thrust::make_constant_iterator(1); + using Key = typename Map::key_type; + using Value = typename Map::mapped_type; - auto pairs_begin = thrust::make_zip_iterator(thrust::make_tuple(keys_begin, values_begin)); + // Insert pairs + auto pairs_begin = thrust::make_transform_iterator( + thrust::counting_iterator(0), + cuda::proclaim_return_type>([num_unique_keys] __device__(auto i) { + return cuco::pair{i % num_unique_keys, 1}; + })); - map.insert_or_apply(pairs_begin, pairs_begin + num_keys, cuco::experimental::op::reduce::sum); + map.insert_or_apply(pairs_begin, pairs_begin + num_keys, cuco::op::reduce::sum); REQUIRE(map.size() == num_unique_keys); - thrust::device_vector d_keys(num_unique_keys); - thrust::device_vector d_values(num_unique_keys); + thrust::device_vector d_keys(num_unique_keys); + thrust::device_vector d_values(num_unique_keys); map.retrieve_all(d_keys.begin(), d_values.begin()); REQUIRE(cuco::test::equal(d_values.begin(), d_values.end(), - thrust::make_constant_iterator(num_keys / num_unique_keys), - thrust::equal_to{})); + thrust::make_constant_iterator(num_keys / num_unique_keys), + thrust::equal_to{})); } TEMPLATE_TEST_CASE_SIG( @@ -93,21 +89,19 @@ TEMPLATE_TEST_CASE_SIG( constexpr size_type num_keys{400}; constexpr size_type num_unique_keys{100}; - using probe = - std::conditional_t>, - cuco::experimental::double_hashing, - cuco::murmurhash3_32>>; - - auto map = cuco::experimental::static_map, - cuda::thread_scope_device, - thrust::equal_to, - probe, - cuco::cuda_allocator, - cuco::experimental::storage<2>>{ + using probe = std::conditional_t< + Probe == cuco::test::probe_sequence::linear_probing, + cuco::linear_probing>, + cuco::double_hashing, cuco::murmurhash3_32>>; + + auto map = cuco::static_map, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{ num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; test_insert_or_apply(map, num_keys, num_unique_keys); From 5e94f0621ae0e8b9d4440fead3e7b3f87772f9eb Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Mon, 24 Jun 2024 23:57:45 +0000 Subject: [PATCH 05/16] Update insert_or_apply to accept Op of a specific signature --- .../cuco/detail/static_map/static_map_ref.inl | 71 ++++++++++++------- tests/static_map/insert_or_apply_test.cu | 28 +++++++- 2 files changed, 72 insertions(+), 27 deletions(-) diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index a0fe659b2..678c481fb 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -573,20 +573,30 @@ class operator_impl< "sizeof(mapped_type) must be either 4 bytes or 8 bytes."); public: - // TODO docs /** - * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v` - * to the mapped_type corresponding to the key `k`. + * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies + `Op` + * binary function to the mapped_type corresponding to the key `k` and the value `v`. * * @tparam Value Input type which is implicitly convertible to 'value_type' - * + * @tparam Op Callable type which is used as apply operation and can be + * called with arguments as Op(cuda::atomic_ref, T). Op strictly must + * have this signature to atomically apply the operation. + * @param value The element to insert + * @param op The callable object to perform binary operation between existing value at the slot + * and the element to insert. */ + template __device__ void insert_or_apply(Value const& value, Op op) noexcept { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); + static_assert( + std::is_invocable_v, T>, + "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref, T)`"); + ref_type& ref_ = static_cast(*this); auto const val = ref_.impl_.heterogeneous_value(value); @@ -605,8 +615,10 @@ class operator_impl< // If the key is already in the container, update the payload and return if (eq_res == detail::equal_result::EQUAL) { auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); - op(((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second, - val.second); + op( + cuda::atomic_ref{ + ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second}, + val.second); return; } if (eq_res == detail::equal_result::AVAILABLE) { @@ -625,29 +637,35 @@ class operator_impl< __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag) noexcept { auto& ref_ = static_cast(*this); - ref_.insert_or_apply(value, [](T& slot, T const& payload) { - cuda::atomic_ref slot_ref{slot}; + ref_.insert_or_apply(value, [](cuda::atomic_ref slot_ref, T const& payload) { slot_ref.fetch_add(payload, cuda::memory_order_relaxed); }); } - // TODO docs /** - * @brief Inserts an element. - * - * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v` - * to the mapped_type corresponding to the key `k`. + * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, applies + * `Op` binary function to the mapped_type corresponding to the key `k` and the value `v`. * * @tparam Value Input type which is implicitly convertible to 'value_type' + * @tparam Op Callable type which is used as apply operation and can be + * called with arguments as Op(cuda::atomic_ref, T). Op strictly must + * have this signature to atomically apply the operation. * * @param group The Cooperative Group used to perform group insert * @param value The element to insert + * @param op The callable object to perform binary operation between existing value at the slot + * and the element to insert. */ + template __device__ void insert_or_apply(cooperative_groups::thread_block_tile const& group, Value const& value, Op op) noexcept { + static_assert( + std::is_invocable_v, T>, + "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref, T)`"); + ref_type& ref_ = static_cast(*this); auto const val = ref_.impl_.heterogeneous_value(value); @@ -675,8 +693,10 @@ class operator_impl< if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; if (group.thread_rank() == src_lane) { - op(((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second, - val.second); + op( + cuda::atomic_ref{ + ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second}, + val.second); } group.sync(); return; @@ -705,25 +725,24 @@ class operator_impl< cuco::op::reduce::sum_tag) noexcept { auto& ref_ = static_cast(*this); - ref_.insert_or_apply(group, value, [](T& slot, T const& payload) { - cuda::atomic_ref slot_ref{slot}; + ref_.insert_or_apply(group, value, [](cuda::atomic_ref slot_ref, T const& payload) { slot_ref.fetch_add(payload, cuda::memory_order_relaxed); }); } private: - // TODO docs /** - * @brief Attempts to insert an element into a slot or update the matching payload with the given - * element - * - * @brief Inserts a key-value pair `{k, v}` if it's not present in the map. Otherwise, assigns `v` - * to the mapped_type corresponding to the key `k`. + * @brief Attempts to insert an element into a slot or update the matching payload by applying the + * binary operation on the payload and new value. * * @tparam Value Input type which is implicitly convertible to 'value_type' - * - * @param group The Cooperative Group used to perform group insert + * @tparam Op Callable type which is used as apply operation and called be + * called with arguments as Op(cuda::atomic_ref, T) + + * @param slot value_type pointer to the slot to insert * @param value The element to insert + * @param op The callable object to perform binary operation between existing value at the slot + * and element to insert. * * @return Returns `true` if the given `value` is inserted or `value` has a match in the map. */ @@ -744,7 +763,7 @@ class operator_impl< (ref_.impl_.predicate().equal_to(value.first, *old_key_ptr) == detail::equal_result::EQUAL)) { // Update payload - op(slot->second, value.second); + op(cuda::atomic_ref{slot->second}, value.second); return true; } return false; diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index eade154ea..39d7d6df8 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -18,6 +18,7 @@ #include +#include #include #include #include @@ -47,7 +48,12 @@ __inline__ void test_insert_or_apply(Map& map, size_type num_keys, size_type num return cuco::pair{i % num_unique_keys, 1}; })); - map.insert_or_apply(pairs_begin, pairs_begin + num_keys, cuco::op::reduce::sum); + map.insert_or_apply( + pairs_begin, + pairs_begin + num_keys, + [] __device__(cuda::atomic_ref lhs, const Value& rhs) { + lhs.fetch_add(rhs, cuda::memory_order_relaxed); + }); REQUIRE(map.size() == num_unique_keys); @@ -105,4 +111,24 @@ TEMPLATE_TEST_CASE_SIG( num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; test_insert_or_apply(map, num_keys, num_unique_keys); +} + +TEMPLATE_TEST_CASE_SIG( + "Insert or apply all unique keys", "", ((typename Key)), (int32_t), (int64_t)) +{ + using Value = Key; + + constexpr size_type num_keys = 100; + + auto map = cuco::static_map, + cuda::thread_scope_device, + thrust::equal_to, + cuco::linear_probing<1, cuco::murmurhash3_32>, + cuco::cuda_allocator, + cuco::storage<2>>{ + num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; + + test_insert_or_apply(map, num_keys, num_keys); } \ No newline at end of file From e7bca30fc8a78cce2cfe65cb50e4920940febad2 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Tue, 25 Jun 2024 00:38:26 +0000 Subject: [PATCH 06/16] Clean up and add docs --- include/cuco/detail/static_map/kernels.cuh | 72 +++------------------- include/cuco/static_map.cuh | 42 ++++++++++++- 2 files changed, 47 insertions(+), 67 deletions(-) diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index fee007344..ec15a66ed 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -68,23 +68,25 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_assign(InputIt first, } } -// TODO docs /** * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent to - * `k` already exists in the container, assigns `v` to the mapped_type corresponding to the key `k`. - * If the key does not exist, inserts the pair as if by insert. + * `k` already exists in the container, then binary operation is applied using `op` callable object + * on the existing value at slot and the element to insert. If the key does not exist, inserts the + * pair as if by insert. * - * @note If multiple elements in `[first, first + n)` compare equal, it is unspecified which element - * is inserted. + * @note Callable object to perform binary operation should be able to invoke as Op(cuda::atomic, T>) * * @tparam CGSize Number of threads in each CG * @tparam BlockSize Number of threads in each block * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the `value_type` of the data structure + * @tparam Op Callable type used to peform apply operation. * @tparam Ref Type of non-owning device ref allowing access to storage * * @param first Beginning of the sequence of input elements * @param n Number of input elements + * @param op callable object to perform apply operation. * @param ref Non-owning container device ref used to access the slot storage */ template @@ -106,64 +108,4 @@ __global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op } } -/** - * @brief Finds the equivalent map elements of all keys in the range `[first, first + n)`. - * - * @note If the key `*(first + i)` has a match in the container, copies the payload of its matched - * element to `(output_begin + i)`. Else, copies the empty value sentinel. Uses the CUDA Cooperative - * Groups API to leverage groups of multiple threads to find each key. This provides a significant - * boost in throughput compared to the non Cooperative Group `find` at moderate to high load - * factors. - * - * @tparam CGSize Number of threads in each CG - * @tparam BlockSize The size of the thread block - * @tparam InputIt Device accessible input iterator - * @tparam OutputIt Device accessible output iterator assignable from the map's `mapped_type` - * @tparam Ref Type of non-owning device ref allowing access to storage - * - * @param first Beginning of the sequence of keys - * @param n Number of keys to query - * @param output_begin Beginning of the sequence of matched payloads retrieved for each key - * @param ref Non-owning map device ref used to access the slot storage - */ -template -__global__ void find(InputIt first, cuco::detail::index_type n, OutputIt output_begin, Ref ref) -{ - namespace cg = cooperative_groups; - - auto const block = cg::this_thread_block(); - auto const thread_idx = block.thread_rank(); - auto const loop_stride = cuco::detail::grid_stride() / CGSize; - auto idx = cuco::detail::global_thread_id() / CGSize; - - __shared__ typename Ref::mapped_type output_buffer[BlockSize / CGSize]; - - while (idx - thread_idx < n) { // the whole thread block falls into the same iteration - if (idx < n) { - typename std::iterator_traits::value_type const& key = *(first + idx); - if constexpr (CGSize == 1) { - auto const found = ref.find(key); - /* - * The ld.relaxed.gpu instruction causes L1 to flush more frequently, causing increased - * sector stores from L2 to global memory. By writing results to shared memory and then - * synchronizing before writing back to global, we no longer rely on L1, preventing the - * increase in sector stores from L2 to global and improving performance. - */ - output_buffer[thread_idx] = - found == ref.end() ? ref.empty_value_sentinel() : (*found).second; - block.sync(); - *(output_begin + idx) = output_buffer[thread_idx]; - } else { - auto const tile = cg::tiled_partition(block); - auto const found = ref.find(tile, key); - - if (tile.thread_rank() == 0) { - *(output_begin + idx) = found == ref.end() ? ref.empty_value_sentinel() : (*found).second; - } - } - } - idx += loop_stride; - } -} - } // namespace cuco::static_map_ns::detail \ No newline at end of file diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 88a24c974..a3f274286 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -408,11 +408,49 @@ class static_map { template void insert_or_assign_async(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept; - // TODO docs + /** + * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent + * to `k` already exists in the container, then binary operation is applied using `op` callable + * object on the existing value at slot and the element to insert. If the key does not exist, + * inserts the pair as if by insert. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `insert_or_apply_async`. + * @note Callable object to perform binary operation should be able to invoke as + * Op(cuda::atomic, T>) + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_map::value_type> is `true` + * @tparam Op Callable type used to peform apply operation. + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param op callable object to perform apply operation. + * @param stream CUDA stream used for insert + */ template void insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream = {}) noexcept; - // TODO docs + /** + * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent + * to `k` already exists in the container, then binary operation is applied using `op` callable + * object on the existing value at slot and the element to insert. If the key does not exist, + * inserts the pair as if by insert. + * + * @note Callable object to perform binary operation should be able to invoke as + * Op(cuda::atomic, T>) + * + * @tparam InputIt Device accessible random access input iterator where + * std::is_convertible::value_type, + * static_map::value_type> is `true` + * @tparam Op Callable type used to peform apply operation. + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param op callable object to perform apply operation. + * @param stream CUDA stream used for insert + */ template void insert_or_apply_async(InputIt first, InputIt last, From df3cd87ca7c712747206a59cbb4093f38926d614 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Tue, 25 Jun 2024 03:10:15 +0000 Subject: [PATCH 07/16] doxygen fixes --- include/cuco/operator.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp index d4dbf2c65..4aa466c17 100644 --- a/include/cuco/operator.hpp +++ b/include/cuco/operator.hpp @@ -42,7 +42,7 @@ struct insert_or_assign_tag { * @brief `insert_or_apply` operator tag */ struct insert_or_apply_tag { -} inline constexpr insert_or_apply; +} inline constexpr insert_or_apply; ///< `cuco::insert_or_apply` operator /** * @brief `erase` operator tag From 01fa6c373d7894534a0757de4ab92ea505441085 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Tue, 25 Jun 2024 22:25:14 +0000 Subject: [PATCH 08/16] minor fixes based on review comments --- .../static_map/insert_or_apply_bench.cu | 2 +- .../cuco/detail/static_map/static_map_ref.inl | 19 ++++++------------- include/cuco/static_map_ref.cuh | 3 ++- tests/static_map/insert_or_apply_test.cu | 8 ++++---- 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu index e670bd75d..186b548ea 100644 --- a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu +++ b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 678c481fb..06bbd01de 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -375,7 +375,6 @@ class operator_impl< } }; -// TODO use insert_or_apply internally template - __device__ void insert_or_apply(Value const& value, Op op) noexcept + __device__ void insert_or_apply(Value const& value, Op op) { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); @@ -634,7 +627,7 @@ class operator_impl< } template - __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag) noexcept + __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag) { auto& ref_ = static_cast(*this); ref_.insert_or_apply(value, [](cuda::atomic_ref slot_ref, T const& payload) { @@ -660,7 +653,7 @@ class operator_impl< template __device__ void insert_or_apply(cooperative_groups::thread_block_tile const& group, Value const& value, - Op op) noexcept + Op op) { static_assert( std::is_invocable_v, T>, @@ -722,7 +715,7 @@ class operator_impl< template __device__ void insert_or_apply(cooperative_groups::thread_block_tile const& group, Value const& value, - cuco::op::reduce::sum_tag) noexcept + cuco::op::reduce::sum_tag) { auto& ref_ = static_cast(*this); ref_.insert_or_apply(group, value, [](cuda::atomic_ref slot_ref, T const& payload) { diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh index 5c96af776..4006d84ae 100644 --- a/include/cuco/static_map_ref.cuh +++ b/include/cuco/static_map_ref.cuh @@ -74,7 +74,8 @@ class static_map_ref using impl_type = detail:: open_addressing_ref_impl; - static_assert(sizeof(T) <= 8, "Container does not support payload types larger than 8 bytes."); + static_assert(sizeof(T) == 4 or sizeof(T) == 8, + "sizeof(mapped_type) must be either 4 bytes or 8 bytes."); static_assert( cuco::is_bitwise_comparable_v, diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index 39d7d6df8..9f717474b 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,7 +34,7 @@ using size_type = std::size_t; template -__inline__ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys) +void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys) { REQUIRE((num_keys % num_unique_keys) == 0); @@ -68,7 +68,7 @@ __inline__ void test_insert_or_apply(Map& map, size_type num_keys, size_type num } TEMPLATE_TEST_CASE_SIG( - "Insert or apply", + "static_map insert_or_apply tests", "", ((typename Key, typename Value, cuco::test::probe_sequence Probe, int CGSize), Key, @@ -114,7 +114,7 @@ TEMPLATE_TEST_CASE_SIG( } TEMPLATE_TEST_CASE_SIG( - "Insert or apply all unique keys", "", ((typename Key)), (int32_t), (int64_t)) + "static_map insert_or_apply all unique keys tests", "", ((typename Key)), (int32_t), (int64_t)) { using Value = Key; From 45adafc4b9abd75bce1d8ce80bc4cbcb5b2afc74 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Fri, 28 Jun 2024 03:13:29 +0000 Subject: [PATCH 09/16] Add identity element --- .../static_map/insert_or_apply_bench.cu | 21 ++- include/cuco/detail/static_map/kernels.cuh | 21 ++- include/cuco/detail/static_map/static_map.inl | 18 ++- .../cuco/detail/static_map/static_map_ref.inl | 138 +++++++++++++----- include/cuco/static_map.cuh | 12 +- tests/static_map/insert_or_apply_test.cu | 60 ++++++-- 6 files changed, 202 insertions(+), 68 deletions(-) diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu index 186b548ea..d103647a1 100644 --- a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu +++ b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu @@ -28,6 +28,8 @@ using namespace cuco::benchmark; using namespace cuco::utility; +const auto USE_IDENTITY = std::vector{0, 1}; +const auto MULTIPLICITY_RANGE = std::vector{1, 2, 4, 8, 16, 32, 64, 128}; /** * @brief A benchmark evaluating `cuco::static_map::insert_or_apply` performance */ @@ -40,9 +42,13 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_appl auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); auto const multiplicity = state.get_int64_or_default("Multiplicity", defaults::MULTIPLICITY); + auto const use_identity = state.get_int64_or_default("UseIdentity", 1); std::size_t const size = cuco::detail::int_div_ceil(num_keys, multiplicity) / occupancy; + cuda::std::optional identity{}; + if (use_identity) identity = 0; + thrust::device_vector keys(num_keys); key_generator gen; @@ -57,12 +63,17 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_appl cuco::static_map map{size, cuco::empty_key{-1}, cuco::empty_value{0}}; + using Map = decltype(map); + + auto const op = [] __device__(cuda::atomic_ref lhs, const Value& rhs) { + lhs.fetch_add(rhs, cuda::memory_order_relaxed); + }; + state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { map.clear_async({launch.get_stream()}); timer.start(); - map.insert_or_apply_async( - pairs.begin(), pairs.end(), cuco::op::reduce::sum, {launch.get_stream()}); + map.insert_or_apply_async(pairs.begin(), pairs.end(), op, identity, {launch.get_stream()}); timer.stop(); }); } @@ -81,7 +92,8 @@ NVBENCH_BENCH_TYPES(static_map_insert_or_apply, .set_name("static_map_insert_or_apply_uniform_multiplicity") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) - .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); + .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE) + .add_int64_axis("UseIdentity", USE_IDENTITY); NVBENCH_BENCH_TYPES(static_map_insert_or_apply, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, @@ -90,4 +102,5 @@ NVBENCH_BENCH_TYPES(static_map_insert_or_apply, .set_name("static_set_insert_or_apply_uniform_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) - .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); \ No newline at end of file + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE) + .add_int64_axis("UseIdentity", USE_IDENTITY); diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index ec15a66ed..3646e8c9c 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -20,6 +20,7 @@ #include #include +#include #include @@ -82,15 +83,27 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_assign(InputIt first, * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the `value_type` of the data structure * @tparam Op Callable type used to peform apply operation. + * @tparam T Type of optional idenitity element which is convertible + * to `value_type` of the data structure * @tparam Ref Type of non-owning device ref allowing access to storage * * @param first Beginning of the sequence of input elements * @param n Number of input elements * @param op callable object to perform apply operation. + * @param identity_element An optional Identity element of the binary operation * @param ref Non-owning container device ref used to access the slot storage */ -template -__global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op, Ref ref) +template +__global__ void insert_or_apply(InputIt first, + cuco::detail::index_type n, + Op op, + cuda::std::optional identity_element, + Ref ref) { auto const loop_stride = cuco::detail::grid_stride() / CGSize; auto idx = cuco::detail::global_thread_id() / CGSize; @@ -98,11 +111,11 @@ __global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op while (idx < n) { typename std::iterator_traits::value_type const& insert_pair = *(first + idx); if constexpr (CGSize == 1) { - ref.insert_or_apply(insert_pair, op); + ref.insert_or_apply(insert_pair, op, identity_element); } else { auto const tile = cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); - ref.insert_or_apply(tile, insert_pair, op); + ref.insert_or_apply(tile, insert_pair, op, identity_element); } idx += loop_stride; } diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 94cb194f6..80c858e26 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -22,6 +22,8 @@ #include #include +#include + #include namespace cuco { @@ -253,9 +255,13 @@ template template void static_map:: - insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept + insert_or_apply(InputIt first, + InputIt last, + Op op, + cuda::std::optional identity_element, + cuda_stream_ref stream) noexcept { - return this->insert_or_apply_async(first, last, op, stream); + return this->insert_or_apply_async(first, last, op, identity_element, stream); stream.synchronize(); } @@ -269,7 +275,11 @@ template template void static_map:: - insert_or_apply_async(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept + insert_or_apply_async(InputIt first, + InputIt last, + Op op, + cuda::std::optional identity_element, + cuda_stream_ref stream) noexcept { auto const num = cuco::detail::distance(first, last); if (num == 0) { return; } @@ -278,7 +288,7 @@ void static_map <<>>( - first, num, op, ref(op::insert_or_apply)); + first, num, op, identity_element, ref(op::insert_or_apply)); } template #include +#include #include #include @@ -579,10 +580,13 @@ class operator_impl< * @param value The element to insert * @param op The callable object to perform binary operation between existing value at the slot * and the element to insert. + * @param identity_element An optional Identity element of the binary operation */ template - __device__ void insert_or_apply(Value const& value, Op op) + __device__ void insert_or_apply(Value const& value, + Op op, + cuda::std::optional identity_element = {}) { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); @@ -592,11 +596,20 @@ class operator_impl< ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); - auto& probing_scheme = ref_.impl_.probing_scheme(); - auto storage_ref = ref_.impl_.storage_ref(); - auto probing_iter = probing_scheme(key, storage_ref.window_extent()); + auto const val = ref_.impl_.heterogeneous_value(value); + auto const key = ref_.impl_.extract_key(val); + auto& probing_scheme = ref_.impl_.probing_scheme(); + auto storage_ref = ref_.impl_.storage_ref(); + auto probing_iter = probing_scheme(key, storage_ref.window_extent()); + auto const empty_value = ref_.impl_.empty_slot_sentinel().second; + + // optimize first insert when sentinel payload value equals identity element + auto const optimize_insert = [&]() { + if (identity_element.has_value()) { + if (identity_element.value() == empty_value) return true; + } + return false; + }(); while (true) { auto const window_slots = storage_ref[*probing_iter]; @@ -604,21 +617,30 @@ class operator_impl< for (auto& slot_content : window_slots) { auto const eq_res = ref_.impl_.predicate_.operator()(key, slot_content.first); + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + auto slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index; // If the key is already in the container, update the payload and return if (eq_res == detail::equal_result::EQUAL) { - auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); - op( - cuda::atomic_ref{ - ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second}, - val.second); + op(cuda::atomic_ref{slot_ptr->second}, val.second); return; } if (eq_res == detail::equal_result::AVAILABLE) { - auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); - if (attempt_insert_or_apply( - (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)) { - return; + // if the sentinel value and identity_element are same, perform op + // and return, no need to wait on payload + if (optimize_insert) { + if (attempt_insert_or_apply(slot_ptr, val, op)) return; + continue; + } + // else, attempt stable insert + switch (ref_.impl_.attempt_insert_stable(slot_ptr, slot_content, val)) { + case insert_result::CONTINUE: continue; + case insert_result::SUCCESS: return; + case insert_result::DUPLICATE: { + ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + op(cuda::atomic_ref{slot_ptr->second}, val.second); + return; + } } } } @@ -627,12 +649,17 @@ class operator_impl< } template - __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag) + __device__ void insert_or_apply(Value const& value, + cuco::op::reduce::sum_tag, + cuda::std::optional = {}) { auto& ref_ = static_cast(*this); - ref_.insert_or_apply(value, [](cuda::atomic_ref slot_ref, T const& payload) { - slot_ref.fetch_add(payload, cuda::memory_order_relaxed); - }); + ref_.insert_or_apply( + value, + [](cuda::atomic_ref slot_ref, T const& payload) { + slot_ref.fetch_add(payload, cuda::memory_order_relaxed); + }, + static_cast(0)); } /** @@ -648,12 +675,14 @@ class operator_impl< * @param value The element to insert * @param op The callable object to perform binary operation between existing value at the slot * and the element to insert. + * @param identity_element An optional Identity element of the binary operation */ template __device__ void insert_or_apply(cooperative_groups::thread_block_tile const& group, Value const& value, - Op op) + Op op, + cuda::std::optional identity_element = {}) { static_assert( std::is_invocable_v, T>, @@ -661,11 +690,19 @@ class operator_impl< ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); - auto& probing_scheme = ref_.impl_.probing_scheme(); - auto storage_ref = ref_.impl_.storage_ref(); - auto probing_iter = probing_scheme(group, key, storage_ref.window_extent()); + auto const val = ref_.impl_.heterogeneous_value(value); + auto const key = ref_.impl_.extract_key(val); + auto& probing_scheme = ref_.impl_.probing_scheme(); + auto storage_ref = ref_.impl_.storage_ref(); + auto probing_iter = probing_scheme(group, key, storage_ref.window_extent()); + auto const empty_value = ref_.impl_.empty_slot_sentinel().second; + + auto const optimize_insert = [&]() { + if (identity_element.has_value()) { + if (identity_element.value() == empty_value) return true; + } + return false; + }(); while (true) { auto const window_slots = storage_ref[*probing_iter]; @@ -682,14 +719,13 @@ class operator_impl< return detail::window_probing_results{res, -1}; }(); + auto* slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index; + auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; if (group.thread_rank() == src_lane) { - op( - cuda::atomic_ref{ - ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second}, - val.second); + op(cuda::atomic_ref{slot_ptr->second}, val.second); } group.sync(); return; @@ -698,14 +734,31 @@ class operator_impl< auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); if (group_contains_available) { auto const src_lane = __ffs(group_contains_available) - 1; - auto const status = - (group.thread_rank() == src_lane) - ? attempt_insert_or_apply( - (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op) - : false; - // Exit if inserted or assigned - if (group.shfl(status, src_lane)) { return; } + if (optimize_insert) { + auto const status = (group.thread_rank() == src_lane) + ? attempt_insert_or_apply(slot_ptr, value, op) + : false; + if (group.shfl(status, src_lane)) { return; } + continue; + } + auto const status = [&, target_idx = intra_window_index]() { + if (group.thread_rank() != src_lane) { return insert_result::CONTINUE; } + return ref_.impl_.attempt_insert_stable(slot_ptr, window_slots[target_idx], val); + }(); + + switch (group.shfl(status, src_lane)) { + case insert_result::SUCCESS: return; + case insert_result::DUPLICATE: { + if (group.thread_rank() == src_lane) { + ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + op(cuda::atomic_ref{slot_ptr->second}, val.second); + } + group.sync(); + return; + } + default: continue; + } } else { ++probing_iter; } @@ -715,12 +768,17 @@ class operator_impl< template __device__ void insert_or_apply(cooperative_groups::thread_block_tile const& group, Value const& value, - cuco::op::reduce::sum_tag) + cuco::op::reduce::sum_tag, + cuda::std::optional = {}) { auto& ref_ = static_cast(*this); - ref_.insert_or_apply(group, value, [](cuda::atomic_ref slot_ref, T const& payload) { - slot_ref.fetch_add(payload, cuda::memory_order_relaxed); - }); + ref_.insert_or_apply( + group, + value, + [](cuda::atomic_ref slot_ref, T const& payload) { + slot_ref.fetch_add(payload, cuda::memory_order_relaxed); + }, + static_cast(0)); } private: diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index a3f274286..d0d6cd3a5 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -29,6 +29,7 @@ #include #include +#include #include #if defined(CUCO_HAS_CUDA_BARRIER) @@ -427,10 +428,15 @@ class static_map { * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param op callable object to perform apply operation. + * @param identity_element An optional Identity element of the binary operation * @param stream CUDA stream used for insert */ template - void insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream = {}) noexcept; + void insert_or_apply(InputIt first, + InputIt last, + Op op, + cuda::std::optional identity_element = {}, + cuda_stream_ref stream = {}) noexcept; /** * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent @@ -449,13 +455,15 @@ class static_map { * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param op callable object to perform apply operation. + * @param identity_element An optional Identity element of the binary operation * @param stream CUDA stream used for insert */ template void insert_or_apply_async(InputIt first, InputIt last, Op op, - cuda_stream_ref stream = {}) noexcept; + cuda::std::optional identity_element = {}, + cuda_stream_ref stream = {}) noexcept; /** * @brief Erases keys in the range `[first, last)`. diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index 9f717474b..451cb3ce2 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include #include @@ -34,7 +35,10 @@ using size_type = std::size_t; template -void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys) +void test_insert_or_apply(Map& map, + size_type num_keys, + size_type num_unique_keys, + bool use_identity) { REQUIRE((num_keys % num_unique_keys) == 0); @@ -48,12 +52,17 @@ void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_key return cuco::pair{i % num_unique_keys, 1}; })); + cuda::std::optional identity{}; + + if (use_identity) identity = 0; + map.insert_or_apply( pairs_begin, pairs_begin + num_keys, [] __device__(cuda::atomic_ref lhs, const Value& rhs) { lhs.fetch_add(rhs, cuda::memory_order_relaxed); - }); + }, + identity); REQUIRE(map.size() == num_unique_keys); @@ -100,17 +109,40 @@ TEMPLATE_TEST_CASE_SIG( cuco::linear_probing>, cuco::double_hashing, cuco::murmurhash3_32>>; - auto map = cuco::static_map, - cuda::thread_scope_device, - thrust::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>{ - num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; - - test_insert_or_apply(map, num_keys, num_unique_keys); + using map_type = cuco::static_map, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + // test all four case of sentienel value and identity + // only first case i.e sentienel = 0, identity = true will use optimized_insert code path + SECTION("sentienel = 0, use_identity = true") + { + auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; + test_insert_or_apply(map, num_keys, num_unique_keys, true); + } + + SECTION("sentienel = 0, use_identity = false") + { + auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; + test_insert_or_apply(map, num_keys, num_unique_keys, false); + } + + SECTION("sentienel = -1, use_identity = true") + { + auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; + test_insert_or_apply(map, num_keys, num_unique_keys, true); + } + + SECTION("sentienel = -1, use_identity = false") + { + auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; + test_insert_or_apply(map, num_keys, num_unique_keys, false); + } } TEMPLATE_TEST_CASE_SIG( @@ -130,5 +162,5 @@ TEMPLATE_TEST_CASE_SIG( cuco::storage<2>>{ num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; - test_insert_or_apply(map, num_keys, num_keys); + test_insert_or_apply(map, num_keys, num_keys, true); } \ No newline at end of file From e00abec08b16bdf3ab9e409506844f35a0d01afc Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Fri, 28 Jun 2024 16:59:16 +0000 Subject: [PATCH 10/16] Revert "Add identity element" This reverts commit 45adafc4b9abd75bce1d8ce80bc4cbcb5b2afc74, Because there is no performance improvement in adding identity element optimization. --- .../static_map/insert_or_apply_bench.cu | 21 +-- include/cuco/detail/static_map/kernels.cuh | 21 +-- include/cuco/detail/static_map/static_map.inl | 18 +-- .../cuco/detail/static_map/static_map_ref.inl | 138 +++++------------- include/cuco/static_map.cuh | 12 +- tests/static_map/insert_or_apply_test.cu | 60 ++------ 6 files changed, 68 insertions(+), 202 deletions(-) diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu index d103647a1..186b548ea 100644 --- a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu +++ b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu @@ -28,8 +28,6 @@ using namespace cuco::benchmark; using namespace cuco::utility; -const auto USE_IDENTITY = std::vector{0, 1}; -const auto MULTIPLICITY_RANGE = std::vector{1, 2, 4, 8, 16, 32, 64, 128}; /** * @brief A benchmark evaluating `cuco::static_map::insert_or_apply` performance */ @@ -42,13 +40,9 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_appl auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); auto const multiplicity = state.get_int64_or_default("Multiplicity", defaults::MULTIPLICITY); - auto const use_identity = state.get_int64_or_default("UseIdentity", 1); std::size_t const size = cuco::detail::int_div_ceil(num_keys, multiplicity) / occupancy; - cuda::std::optional identity{}; - if (use_identity) identity = 0; - thrust::device_vector keys(num_keys); key_generator gen; @@ -63,17 +57,12 @@ std::enable_if_t<(sizeof(Key) == sizeof(Value)), void> static_map_insert_or_appl cuco::static_map map{size, cuco::empty_key{-1}, cuco::empty_value{0}}; - using Map = decltype(map); - - auto const op = [] __device__(cuda::atomic_ref lhs, const Value& rhs) { - lhs.fetch_add(rhs, cuda::memory_order_relaxed); - }; - state.exec(nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { map.clear_async({launch.get_stream()}); timer.start(); - map.insert_or_apply_async(pairs.begin(), pairs.end(), op, identity, {launch.get_stream()}); + map.insert_or_apply_async( + pairs.begin(), pairs.end(), cuco::op::reduce::sum, {launch.get_stream()}); timer.stop(); }); } @@ -92,8 +81,7 @@ NVBENCH_BENCH_TYPES(static_map_insert_or_apply, .set_name("static_map_insert_or_apply_uniform_multiplicity") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) - .add_int64_axis("Multiplicity", MULTIPLICITY_RANGE) - .add_int64_axis("UseIdentity", USE_IDENTITY); + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); NVBENCH_BENCH_TYPES(static_map_insert_or_apply, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, @@ -102,5 +90,4 @@ NVBENCH_BENCH_TYPES(static_map_insert_or_apply, .set_name("static_set_insert_or_apply_uniform_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) - .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE) - .add_int64_axis("UseIdentity", USE_IDENTITY); + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); \ No newline at end of file diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index 3646e8c9c..ec15a66ed 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -20,7 +20,6 @@ #include #include -#include #include @@ -83,27 +82,15 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_assign(InputIt first, * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the `value_type` of the data structure * @tparam Op Callable type used to peform apply operation. - * @tparam T Type of optional idenitity element which is convertible - * to `value_type` of the data structure * @tparam Ref Type of non-owning device ref allowing access to storage * * @param first Beginning of the sequence of input elements * @param n Number of input elements * @param op callable object to perform apply operation. - * @param identity_element An optional Identity element of the binary operation * @param ref Non-owning container device ref used to access the slot storage */ -template -__global__ void insert_or_apply(InputIt first, - cuco::detail::index_type n, - Op op, - cuda::std::optional identity_element, - Ref ref) +template +__global__ void insert_or_apply(InputIt first, cuco::detail::index_type n, Op op, Ref ref) { auto const loop_stride = cuco::detail::grid_stride() / CGSize; auto idx = cuco::detail::global_thread_id() / CGSize; @@ -111,11 +98,11 @@ __global__ void insert_or_apply(InputIt first, while (idx < n) { typename std::iterator_traits::value_type const& insert_pair = *(first + idx); if constexpr (CGSize == 1) { - ref.insert_or_apply(insert_pair, op, identity_element); + ref.insert_or_apply(insert_pair, op); } else { auto const tile = cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); - ref.insert_or_apply(tile, insert_pair, op, identity_element); + ref.insert_or_apply(tile, insert_pair, op); } idx += loop_stride; } diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 80c858e26..94cb194f6 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -22,8 +22,6 @@ #include #include -#include - #include namespace cuco { @@ -255,13 +253,9 @@ template template void static_map:: - insert_or_apply(InputIt first, - InputIt last, - Op op, - cuda::std::optional identity_element, - cuda_stream_ref stream) noexcept + insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept { - return this->insert_or_apply_async(first, last, op, identity_element, stream); + return this->insert_or_apply_async(first, last, op, stream); stream.synchronize(); } @@ -275,11 +269,7 @@ template template void static_map:: - insert_or_apply_async(InputIt first, - InputIt last, - Op op, - cuda::std::optional identity_element, - cuda_stream_ref stream) noexcept + insert_or_apply_async(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept { auto const num = cuco::detail::distance(first, last); if (num == 0) { return; } @@ -288,7 +278,7 @@ void static_map <<>>( - first, num, op, identity_element, ref(op::insert_or_apply)); + first, num, op, ref(op::insert_or_apply)); } template #include -#include #include #include @@ -580,13 +579,10 @@ class operator_impl< * @param value The element to insert * @param op The callable object to perform binary operation between existing value at the slot * and the element to insert. - * @param identity_element An optional Identity element of the binary operation */ template - __device__ void insert_or_apply(Value const& value, - Op op, - cuda::std::optional identity_element = {}) + __device__ void insert_or_apply(Value const& value, Op op) { static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); @@ -596,20 +592,11 @@ class operator_impl< ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); - auto& probing_scheme = ref_.impl_.probing_scheme(); - auto storage_ref = ref_.impl_.storage_ref(); - auto probing_iter = probing_scheme(key, storage_ref.window_extent()); - auto const empty_value = ref_.impl_.empty_slot_sentinel().second; - - // optimize first insert when sentinel payload value equals identity element - auto const optimize_insert = [&]() { - if (identity_element.has_value()) { - if (identity_element.value() == empty_value) return true; - } - return false; - }(); + auto const val = ref_.impl_.heterogeneous_value(value); + auto const key = ref_.impl_.extract_key(val); + auto& probing_scheme = ref_.impl_.probing_scheme(); + auto storage_ref = ref_.impl_.storage_ref(); + auto probing_iter = probing_scheme(key, storage_ref.window_extent()); while (true) { auto const window_slots = storage_ref[*probing_iter]; @@ -617,30 +604,21 @@ class operator_impl< for (auto& slot_content : window_slots) { auto const eq_res = ref_.impl_.predicate_.operator()(key, slot_content.first); - auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); - auto slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index; // If the key is already in the container, update the payload and return if (eq_res == detail::equal_result::EQUAL) { - op(cuda::atomic_ref{slot_ptr->second}, val.second); + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + op( + cuda::atomic_ref{ + ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second}, + val.second); return; } if (eq_res == detail::equal_result::AVAILABLE) { - // if the sentinel value and identity_element are same, perform op - // and return, no need to wait on payload - if (optimize_insert) { - if (attempt_insert_or_apply(slot_ptr, val, op)) return; - continue; - } - // else, attempt stable insert - switch (ref_.impl_.attempt_insert_stable(slot_ptr, slot_content, val)) { - case insert_result::CONTINUE: continue; - case insert_result::SUCCESS: return; - case insert_result::DUPLICATE: { - ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); - op(cuda::atomic_ref{slot_ptr->second}, val.second); - return; - } + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + if (attempt_insert_or_apply( + (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)) { + return; } } } @@ -649,17 +627,12 @@ class operator_impl< } template - __device__ void insert_or_apply(Value const& value, - cuco::op::reduce::sum_tag, - cuda::std::optional = {}) + __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag) { auto& ref_ = static_cast(*this); - ref_.insert_or_apply( - value, - [](cuda::atomic_ref slot_ref, T const& payload) { - slot_ref.fetch_add(payload, cuda::memory_order_relaxed); - }, - static_cast(0)); + ref_.insert_or_apply(value, [](cuda::atomic_ref slot_ref, T const& payload) { + slot_ref.fetch_add(payload, cuda::memory_order_relaxed); + }); } /** @@ -675,14 +648,12 @@ class operator_impl< * @param value The element to insert * @param op The callable object to perform binary operation between existing value at the slot * and the element to insert. - * @param identity_element An optional Identity element of the binary operation */ template __device__ void insert_or_apply(cooperative_groups::thread_block_tile const& group, Value const& value, - Op op, - cuda::std::optional identity_element = {}) + Op op) { static_assert( std::is_invocable_v, T>, @@ -690,19 +661,11 @@ class operator_impl< ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); - auto& probing_scheme = ref_.impl_.probing_scheme(); - auto storage_ref = ref_.impl_.storage_ref(); - auto probing_iter = probing_scheme(group, key, storage_ref.window_extent()); - auto const empty_value = ref_.impl_.empty_slot_sentinel().second; - - auto const optimize_insert = [&]() { - if (identity_element.has_value()) { - if (identity_element.value() == empty_value) return true; - } - return false; - }(); + auto const val = ref_.impl_.heterogeneous_value(value); + auto const key = ref_.impl_.extract_key(val); + auto& probing_scheme = ref_.impl_.probing_scheme(); + auto storage_ref = ref_.impl_.storage_ref(); + auto probing_iter = probing_scheme(group, key, storage_ref.window_extent()); while (true) { auto const window_slots = storage_ref[*probing_iter]; @@ -719,13 +682,14 @@ class operator_impl< return detail::window_probing_results{res, -1}; }(); - auto* slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index; - auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; if (group.thread_rank() == src_lane) { - op(cuda::atomic_ref{slot_ptr->second}, val.second); + op( + cuda::atomic_ref{ + ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second}, + val.second); } group.sync(); return; @@ -734,31 +698,14 @@ class operator_impl< auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); if (group_contains_available) { auto const src_lane = __ffs(group_contains_available) - 1; + auto const status = + (group.thread_rank() == src_lane) + ? attempt_insert_or_apply( + (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op) + : false; - if (optimize_insert) { - auto const status = (group.thread_rank() == src_lane) - ? attempt_insert_or_apply(slot_ptr, value, op) - : false; - if (group.shfl(status, src_lane)) { return; } - continue; - } - auto const status = [&, target_idx = intra_window_index]() { - if (group.thread_rank() != src_lane) { return insert_result::CONTINUE; } - return ref_.impl_.attempt_insert_stable(slot_ptr, window_slots[target_idx], val); - }(); - - switch (group.shfl(status, src_lane)) { - case insert_result::SUCCESS: return; - case insert_result::DUPLICATE: { - if (group.thread_rank() == src_lane) { - ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); - op(cuda::atomic_ref{slot_ptr->second}, val.second); - } - group.sync(); - return; - } - default: continue; - } + // Exit if inserted or assigned + if (group.shfl(status, src_lane)) { return; } } else { ++probing_iter; } @@ -768,17 +715,12 @@ class operator_impl< template __device__ void insert_or_apply(cooperative_groups::thread_block_tile const& group, Value const& value, - cuco::op::reduce::sum_tag, - cuda::std::optional = {}) + cuco::op::reduce::sum_tag) { auto& ref_ = static_cast(*this); - ref_.insert_or_apply( - group, - value, - [](cuda::atomic_ref slot_ref, T const& payload) { - slot_ref.fetch_add(payload, cuda::memory_order_relaxed); - }, - static_cast(0)); + ref_.insert_or_apply(group, value, [](cuda::atomic_ref slot_ref, T const& payload) { + slot_ref.fetch_add(payload, cuda::memory_order_relaxed); + }); } private: diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index d0d6cd3a5..a3f274286 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -29,7 +29,6 @@ #include #include -#include #include #if defined(CUCO_HAS_CUDA_BARRIER) @@ -428,15 +427,10 @@ class static_map { * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param op callable object to perform apply operation. - * @param identity_element An optional Identity element of the binary operation * @param stream CUDA stream used for insert */ template - void insert_or_apply(InputIt first, - InputIt last, - Op op, - cuda::std::optional identity_element = {}, - cuda_stream_ref stream = {}) noexcept; + void insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream = {}) noexcept; /** * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent @@ -455,15 +449,13 @@ class static_map { * @param first Beginning of the sequence of keys * @param last End of the sequence of keys * @param op callable object to perform apply operation. - * @param identity_element An optional Identity element of the binary operation * @param stream CUDA stream used for insert */ template void insert_or_apply_async(InputIt first, InputIt last, Op op, - cuda::std::optional identity_element = {}, - cuda_stream_ref stream = {}) noexcept; + cuda_stream_ref stream = {}) noexcept; /** * @brief Erases keys in the range `[first, last)`. diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index 451cb3ce2..9f717474b 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -19,7 +19,6 @@ #include #include -#include #include #include #include @@ -35,10 +34,7 @@ using size_type = std::size_t; template -void test_insert_or_apply(Map& map, - size_type num_keys, - size_type num_unique_keys, - bool use_identity) +void test_insert_or_apply(Map& map, size_type num_keys, size_type num_unique_keys) { REQUIRE((num_keys % num_unique_keys) == 0); @@ -52,17 +48,12 @@ void test_insert_or_apply(Map& map, return cuco::pair{i % num_unique_keys, 1}; })); - cuda::std::optional identity{}; - - if (use_identity) identity = 0; - map.insert_or_apply( pairs_begin, pairs_begin + num_keys, [] __device__(cuda::atomic_ref lhs, const Value& rhs) { lhs.fetch_add(rhs, cuda::memory_order_relaxed); - }, - identity); + }); REQUIRE(map.size() == num_unique_keys); @@ -109,40 +100,17 @@ TEMPLATE_TEST_CASE_SIG( cuco::linear_probing>, cuco::double_hashing, cuco::murmurhash3_32>>; - using map_type = cuco::static_map, - cuda::thread_scope_device, - thrust::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>; - - // test all four case of sentienel value and identity - // only first case i.e sentienel = 0, identity = true will use optimized_insert code path - SECTION("sentienel = 0, use_identity = true") - { - auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; - test_insert_or_apply(map, num_keys, num_unique_keys, true); - } - - SECTION("sentienel = 0, use_identity = false") - { - auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; - test_insert_or_apply(map, num_keys, num_unique_keys, false); - } - - SECTION("sentienel = -1, use_identity = true") - { - auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; - test_insert_or_apply(map, num_keys, num_unique_keys, true); - } - - SECTION("sentienel = -1, use_identity = false") - { - auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; - test_insert_or_apply(map, num_keys, num_unique_keys, false); - } + auto map = cuco::static_map, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>{ + num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; + + test_insert_or_apply(map, num_keys, num_unique_keys); } TEMPLATE_TEST_CASE_SIG( @@ -162,5 +130,5 @@ TEMPLATE_TEST_CASE_SIG( cuco::storage<2>>{ num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; - test_insert_or_apply(map, num_keys, num_keys, true); + test_insert_or_apply(map, num_keys, num_keys); } \ No newline at end of file From 5a9a31ca6265c774bc658ea0070e190b10b401e3 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Fri, 28 Jun 2024 17:33:41 +0000 Subject: [PATCH 11/16] use insert_stable and update tests --- .../cuco/detail/static_map/static_map_ref.inl | 74 +++++++++++-------- tests/static_map/insert_or_apply_test.cu | 33 ++++++--- 2 files changed, 64 insertions(+), 43 deletions(-) diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 06bbd01de..2212f5879 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -592,11 +592,12 @@ class operator_impl< ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); - auto& probing_scheme = ref_.impl_.probing_scheme(); - auto storage_ref = ref_.impl_.storage_ref(); - auto probing_iter = probing_scheme(key, storage_ref.window_extent()); + auto const val = ref_.impl_.heterogeneous_value(value); + auto const key = ref_.impl_.extract_key(val); + auto& probing_scheme = ref_.impl_.probing_scheme(); + auto storage_ref = ref_.impl_.storage_ref(); + auto probing_iter = probing_scheme(key, storage_ref.window_extent()); + auto const empty_value = ref_.impl_.empty_slot_sentinel().second; while (true) { auto const window_slots = storage_ref[*probing_iter]; @@ -604,21 +605,23 @@ class operator_impl< for (auto& slot_content : window_slots) { auto const eq_res = ref_.impl_.predicate_.operator()(key, slot_content.first); + auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); + auto slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index; // If the key is already in the container, update the payload and return if (eq_res == detail::equal_result::EQUAL) { - auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); - op( - cuda::atomic_ref{ - ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second}, - val.second); + op(cuda::atomic_ref{slot_ptr->second}, val.second); return; } if (eq_res == detail::equal_result::AVAILABLE) { - auto const intra_window_index = thrust::distance(window_slots.begin(), &slot_content); - if (attempt_insert_or_apply( - (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op)) { - return; + switch (ref_.impl_.attempt_insert_stable(slot_ptr, slot_content, val)) { + case insert_result::SUCCESS: return; + case insert_result::DUPLICATE: { + ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + op(cuda::atomic_ref{slot_ptr->second}, val.second); + return; + } + default: continue; } } } @@ -661,11 +664,12 @@ class operator_impl< ref_type& ref_ = static_cast(*this); - auto const val = ref_.impl_.heterogeneous_value(value); - auto const key = ref_.impl_.extract_key(val); - auto& probing_scheme = ref_.impl_.probing_scheme(); - auto storage_ref = ref_.impl_.storage_ref(); - auto probing_iter = probing_scheme(group, key, storage_ref.window_extent()); + auto const val = ref_.impl_.heterogeneous_value(value); + auto const key = ref_.impl_.extract_key(val); + auto& probing_scheme = ref_.impl_.probing_scheme(); + auto storage_ref = ref_.impl_.storage_ref(); + auto probing_iter = probing_scheme(group, key, storage_ref.window_extent()); + auto const empty_value = ref_.impl_.empty_slot_sentinel().second; while (true) { auto const window_slots = storage_ref[*probing_iter]; @@ -682,30 +686,36 @@ class operator_impl< return detail::window_probing_results{res, -1}; }(); + auto* slot_ptr = (storage_ref.data() + *probing_iter)->data() + intra_window_index; + auto const group_contains_equal = group.ballot(state == detail::equal_result::EQUAL); if (group_contains_equal) { auto const src_lane = __ffs(group_contains_equal) - 1; if (group.thread_rank() == src_lane) { - op( - cuda::atomic_ref{ - ((storage_ref.data() + *probing_iter)->data() + intra_window_index)->second}, - val.second); + op(cuda::atomic_ref{slot_ptr->second}, val.second); } - group.sync(); return; } auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); if (group_contains_available) { auto const src_lane = __ffs(group_contains_available) - 1; - auto const status = - (group.thread_rank() == src_lane) - ? attempt_insert_or_apply( - (storage_ref.data() + *probing_iter)->data() + intra_window_index, value, op) - : false; - - // Exit if inserted or assigned - if (group.shfl(status, src_lane)) { return; } + auto const status = [&, target_idx = intra_window_index]() { + if (group.thread_rank() != src_lane) { return insert_result::CONTINUE; } + return ref_.impl_.attempt_insert_stable(slot_ptr, window_slots[target_idx], val); + }(); + + switch (group.shfl(status, src_lane)) { + case insert_result::SUCCESS: return; + case insert_result::DUPLICATE: { + if (group.thread_rank() == src_lane) { + ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + op(cuda::atomic_ref{slot_ptr->second}, val.second); + } + return; + } + default: continue; + } } else { ++probing_iter; } diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index 9f717474b..07a05214b 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -100,17 +100,28 @@ TEMPLATE_TEST_CASE_SIG( cuco::linear_probing>, cuco::double_hashing, cuco::murmurhash3_32>>; - auto map = cuco::static_map, - cuda::thread_scope_device, - thrust::equal_to, - probe, - cuco::cuda_allocator, - cuco::storage<2>>{ - num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; - - test_insert_or_apply(map, num_keys, num_unique_keys); + using map_type = cuco::static_map, + cuda::thread_scope_device, + thrust::equal_to, + probe, + cuco::cuda_allocator, + cuco::storage<2>>; + + SECTION("Sentinel equals to identity") + { + auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; + + test_insert_or_apply(map, num_keys, num_unique_keys); + } + + SECTION("Sentinel not equals to identity") + { + auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; + + test_insert_or_apply(map, num_keys, num_unique_keys); + } } TEMPLATE_TEST_CASE_SIG( From a936ab2a50f927205ca370fa67d935b00e49b490 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Mon, 1 Jul 2024 21:22:51 +0000 Subject: [PATCH 12/16] minor improvements --- .../cuco/detail/static_map/static_map_ref.inl | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 2212f5879..d68abeadb 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -587,7 +588,7 @@ class operator_impl< static_assert(cg_size == 1, "Non-CG operation is incompatible with the current probing scheme"); static_assert( - std::is_invocable_v, T>, + cuda::std::is_invocable_v, T>, "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref, T)`"); ref_type& ref_ = static_cast(*this); @@ -617,7 +618,9 @@ class operator_impl< switch (ref_.impl_.attempt_insert_stable(slot_ptr, slot_content, val)) { case insert_result::SUCCESS: return; case insert_result::DUPLICATE: { - ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + if constexpr (sizeof(value_type) > 8) { + ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + } op(cuda::atomic_ref{slot_ptr->second}, val.second); return; } @@ -633,8 +636,8 @@ class operator_impl< __device__ void insert_or_apply(Value const& value, cuco::op::reduce::sum_tag) { auto& ref_ = static_cast(*this); - ref_.insert_or_apply(value, [](cuda::atomic_ref slot_ref, T const& payload) { - slot_ref.fetch_add(payload, cuda::memory_order_relaxed); + ref_.insert_or_apply(value, [](cuda::atomic_ref payload_ref, T const& payload) { + payload_ref.fetch_add(payload, cuda::memory_order_relaxed); }); } @@ -659,7 +662,7 @@ class operator_impl< Op op) { static_assert( - std::is_invocable_v, T>, + cuda::std::is_invocable_v, T>, "insert_or_apply expects `Op` to be a callable as `Op(cuda::atomic_ref, T)`"); ref_type& ref_ = static_cast(*this); @@ -728,8 +731,8 @@ class operator_impl< cuco::op::reduce::sum_tag) { auto& ref_ = static_cast(*this); - ref_.insert_or_apply(group, value, [](cuda::atomic_ref slot_ref, T const& payload) { - slot_ref.fetch_add(payload, cuda::memory_order_relaxed); + ref_.insert_or_apply(group, value, [](cuda::atomic_ref payload_ref, T const& payload) { + payload_ref.fetch_add(payload, cuda::memory_order_relaxed); }); } From 2eadddb615142d9da2fecd65a4c1c16430b250b9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 1 Jul 2024 21:24:35 +0000 Subject: [PATCH 13/16] [pre-commit.ci] auto code formatting --- include/cuco/detail/static_map/static_map_ref.inl | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index d68abeadb..62f12e4e7 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -731,9 +731,10 @@ class operator_impl< cuco::op::reduce::sum_tag) { auto& ref_ = static_cast(*this); - ref_.insert_or_apply(group, value, [](cuda::atomic_ref payload_ref, T const& payload) { - payload_ref.fetch_add(payload, cuda::memory_order_relaxed); - }); + ref_.insert_or_apply( + group, value, [](cuda::atomic_ref payload_ref, T const& payload) { + payload_ref.fetch_add(payload, cuda::memory_order_relaxed); + }); } private: From 0568e65689d028111a2f987d43dd48c3a82b16f2 Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Tue, 2 Jul 2024 16:33:06 +0000 Subject: [PATCH 14/16] more minor fixes --- .../hash_table/static_map/insert_or_apply_bench.cu | 2 +- include/cuco/detail/static_map/kernels.cuh | 8 ++++---- include/cuco/detail/static_map/static_map.inl | 2 +- include/cuco/static_map.cuh | 14 +++++++------- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu index 186b548ea..6bbb26421 100644 --- a/benchmarks/hash_table/static_map/insert_or_apply_bench.cu +++ b/benchmarks/hash_table/static_map/insert_or_apply_bench.cu @@ -87,7 +87,7 @@ NVBENCH_BENCH_TYPES(static_map_insert_or_apply, NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, defaults::VALUE_TYPE_RANGE, nvbench::type_list)) - .set_name("static_set_insert_or_apply_uniform_occupancy") + .set_name("static_map_insert_or_apply_uniform_occupancy") .set_type_axes_names({"Key", "Value", "Distribution"}) .set_max_noise(defaults::MAX_NOISE) .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); \ No newline at end of file diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index ec15a66ed..f47c86cdf 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -74,19 +74,19 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_assign(InputIt first, * on the existing value at slot and the element to insert. If the key does not exist, inserts the * pair as if by insert. * - * @note Callable object to perform binary operation should be able to invoke as Op(cuda::atomic, T>) + * @note Callable object to perform binary operation should be able to invoke as + * Op(cuda::atomic_ref, T>) * * @tparam CGSize Number of threads in each CG * @tparam BlockSize Number of threads in each block * @tparam InputIt Device accessible input iterator whose `value_type` is * convertible to the `value_type` of the data structure - * @tparam Op Callable type used to peform apply operation. + * @tparam Op Callable type used to peform `apply` operation. * @tparam Ref Type of non-owning device ref allowing access to storage * * @param first Beginning of the sequence of input elements * @param n Number of input elements - * @param op callable object to perform apply operation. + * @param op Callable object to perform apply operation. * @param ref Non-owning container device ref used to access the slot storage */ template diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 94cb194f6..807200228 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -253,7 +253,7 @@ template template void static_map:: - insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept + insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream) { return this->insert_or_apply_async(first, last, op, stream); stream.synchronize(); diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index a3f274286..5256dea51 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -417,20 +417,20 @@ class static_map { * @note This function synchronizes the given stream. For asynchronous execution use * `insert_or_apply_async`. * @note Callable object to perform binary operation should be able to invoke as - * Op(cuda::atomic, T>) + * Op(cuda::atomic_ref, T>) * * @tparam InputIt Device accessible random access input iterator where * std::is_convertible::value_type, * static_map::value_type> is `true` - * @tparam Op Callable type used to peform apply operation. + * @tparam Op Callable type used to peform `apply` operation. * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys - * @param op callable object to perform apply operation. + * @param op Callable object to perform apply operation. * @param stream CUDA stream used for insert */ template - void insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream = {}) noexcept; + void insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream = {}); /** * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent @@ -439,16 +439,16 @@ class static_map { * inserts the pair as if by insert. * * @note Callable object to perform binary operation should be able to invoke as - * Op(cuda::atomic, T>) + * Op(cuda::atomic_ref, T>) * * @tparam InputIt Device accessible random access input iterator where * std::is_convertible::value_type, * static_map::value_type> is `true` - * @tparam Op Callable type used to peform apply operation. + * @tparam Op Callable type used to peform `apply` operation. * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys - * @param op callable object to perform apply operation. + * @param op Callable object to perform apply operation. * @param stream CUDA stream used for insert */ template From 4ae274d93bd320400424294cb5639571a306590b Mon Sep 17 00:00:00 2001 From: Srinivas Yadav Singanaboina Date: Wed, 3 Jul 2024 18:10:02 +0000 Subject: [PATCH 15/16] more minor cleanup --- .../cuco/detail/static_map/static_map_ref.inl | 43 ++----------------- 1 file changed, 3 insertions(+), 40 deletions(-) diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 62f12e4e7..0e59aa502 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -712,7 +712,9 @@ class operator_impl< case insert_result::SUCCESS: return; case insert_result::DUPLICATE: { if (group.thread_rank() == src_lane) { - ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + if constexpr (sizeof(value_type) > 8) { + ref_.impl_.wait_for_payload(slot_ptr->second, empty_value); + } op(cuda::atomic_ref{slot_ptr->second}, val.second); } return; @@ -736,45 +738,6 @@ class operator_impl< payload_ref.fetch_add(payload, cuda::memory_order_relaxed); }); } - - private: - /** - * @brief Attempts to insert an element into a slot or update the matching payload by applying the - * binary operation on the payload and new value. - * - * @tparam Value Input type which is implicitly convertible to 'value_type' - * @tparam Op Callable type which is used as apply operation and called be - * called with arguments as Op(cuda::atomic_ref, T) - - * @param slot value_type pointer to the slot to insert - * @param value The element to insert - * @param op The callable object to perform binary operation between existing value at the slot - * and element to insert. - * - * @return Returns `true` if the given `value` is inserted or `value` has a match in the map. - */ - template - __device__ constexpr bool attempt_insert_or_apply(value_type* slot, - Value const& value, - Op op) noexcept - { - ref_type& ref_ = static_cast(*this); - auto const expected_key = ref_.impl_.empty_slot_sentinel().first; - - auto old_key = - ref_.impl_.compare_and_swap(&slot->first, expected_key, static_cast(value.first)); - auto* old_key_ptr = reinterpret_cast(&old_key); - - // if key success or key was already present in the map - if (cuco::detail::bitwise_compare(*old_key_ptr, expected_key) or - (ref_.impl_.predicate().equal_to(value.first, *old_key_ptr) == - detail::equal_result::EQUAL)) { - // Update payload - op(cuda::atomic_ref{slot->second}, value.second); - return true; - } - return false; - } }; template Date: Wed, 3 Jul 2024 22:04:46 +0000 Subject: [PATCH 16/16] replace cuda_stream_ref with cuda::stream_ref --- include/cuco/detail/static_map/static_map.inl | 8 ++++---- include/cuco/static_map.cuh | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl index 06ba8441d..a26092196 100644 --- a/include/cuco/detail/static_map/static_map.inl +++ b/include/cuco/detail/static_map/static_map.inl @@ -257,10 +257,10 @@ template template void static_map:: - insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream) + insert_or_apply(InputIt first, InputIt last, Op op, cuda::stream_ref stream) { return this->insert_or_apply_async(first, last, op, stream); - stream.synchronize(); + stream.wait(); } template template void static_map:: - insert_or_apply_async(InputIt first, InputIt last, Op op, cuda_stream_ref stream) noexcept + insert_or_apply_async(InputIt first, InputIt last, Op op, cuda::stream_ref stream) noexcept { auto const num = cuco::detail::distance(first, last); if (num == 0) { return; } @@ -281,7 +281,7 @@ void static_map - <<>>( + <<>>( first, num, op, ref(op::insert_or_apply)); } diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index b34c1481d..f84759b0f 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -430,7 +430,7 @@ class static_map { * @param stream CUDA stream used for insert */ template - void insert_or_apply(InputIt first, InputIt last, Op op, cuda_stream_ref stream = {}); + void insert_or_apply(InputIt first, InputIt last, Op op, cuda::stream_ref stream = {}); /** * @brief For any key-value pair `{k, v}` in the range `[first, first + n)`, if a key equivalent @@ -455,7 +455,7 @@ class static_map { void insert_or_apply_async(InputIt first, InputIt last, Op op, - cuda_stream_ref stream = {}) noexcept; + cuda::stream_ref stream = {}) noexcept; /** * @brief Erases keys in the range `[first, last)`.