From 85de9c2a91774675d2d8ffd8a19841aa70abbea3 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Thu, 11 May 2023 21:30:34 +0000 Subject: [PATCH 01/41] Trie checkin --- include/cuco/detail/trie/trie.inl | 197 ++++++++++++++++++++++++++ include/cuco/detail/trie/trie_ref.inl | 112 +++++++++++++++ include/cuco/operator.hpp | 6 + include/cuco/trie.cuh | 142 +++++++++++++++++++ include/cuco/trie_ref.cuh | 39 +++++ tests/CMakeLists.txt | 5 + tests/trie/lookup_test.cu | 111 +++++++++++++++ 7 files changed, 612 insertions(+) create mode 100644 include/cuco/detail/trie/trie.inl create mode 100644 include/cuco/detail/trie/trie_ref.inl create mode 100644 include/cuco/trie.cuh create mode 100644 include/cuco/trie_ref.cuh create mode 100644 tests/trie/lookup_test.cu diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl new file mode 100644 index 000000000..84c8bec56 --- /dev/null +++ b/include/cuco/detail/trie/trie.inl @@ -0,0 +1,197 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cuco { +namespace experimental { + +template +trie::trie() + : levels_(2), + d_levels_ptr_(nullptr), + num_levels_(2), + n_keys_(0), + n_nodes_(1), + last_key_(), + device_ptr_(nullptr) +{ + levels_[0].louds.append(0); + levels_[0].louds.append(1); + levels_[1].louds.append(1); + levels_[0].outs.append(0); + levels_[0].labels.push_back(root_label_); +} + +template +trie::~trie() noexcept(false) +{ + if (d_levels_ptr_) { CUCO_CUDA_TRY(cudaFree(d_levels_ptr_)); } + if (device_ptr_) { CUCO_CUDA_TRY(cudaFree(device_ptr_)); } +} + +template +void trie::insert(const std::vector& key) +{ + if (key == last_key_) { return; } // Ignore duplicate keys + assert(n_keys_ == 0 || key > last_key_); // Keys are expected to be inserted in sorted order + + if (key.empty()) { + levels_[0].outs.set(0, 1); + ++levels_[1].offset; + ++n_keys_; + return; + } + + if (key.size() + 1 >= levels_.size()) { levels_.resize(key.size() + 2); } + + // Find first position where label is different from last_key + // Trie is not updated till that position is reached, simply skip to next position + uint32_t pos = 0; + for (; pos < key.size(); ++pos) { + auto& level = levels_[pos + 1]; + T label = key[pos]; + + if ((pos == last_key_.size()) || (label != level.labels.back())) { + level.louds.set_last(0); + level.louds.append(1); + level.outs.append(0); + level.labels.push_back(label); + ++n_nodes_; + break; + } + } + + // Process remaining labels after divergence point from last_key + // Each such label will create a new edge and node pair in trie + for (++pos; pos < key.size(); ++pos) { + auto& level = levels_[pos + 1]; + level.louds.append(0); + level.louds.append(1); + level.outs.append(0); + level.labels.push_back(key[pos]); + ++n_nodes_; + } + + levels_[key.size() + 1].louds.append(1); // Mark end of current key + ++levels_[key.size() + 1].offset; + levels_[key.size()].outs.set_last(1); // Set terminal bit indicating valid path + + ++n_keys_; + last_key_ = key; +} + +// Helper to move vector from host to device +// Host vector is clear to avoid duplication. Device pointer is returned +template +T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& device_vector) +{ + device_vector = host_vector; + host_vector.clear(); + return thrust::raw_pointer_cast(device_vector.data()); +} + +template +void trie::build() +{ + // Perform build level-by-level for all levels, followed by a deep-copy from host to device + + // Host-side per-level bit-vector refs + std::vector louds_refs, outs_refs; + uint64_t offset = 0; + + for (auto& level : levels_) { + level.louds.build(); + louds_refs.push_back(level.louds.ref(bv_read)); + + level.outs.build(); + outs_refs.push_back(level.outs.ref(bv_read)); + + // Move labels to device + level.d_labels_ptr = move_vector_to_device(level.labels, level.d_labels); + + offset += level.offset; + level.offset = offset; + } + + // Move bitvector refs to device + d_louds_refs_ptr_ = move_vector_to_device(louds_refs, d_louds_refs_); + d_outs_refs_ptr_ = move_vector_to_device(outs_refs, d_outs_refs_); + + num_levels_ = levels_.size(); + + // Move levels to device + CUCO_CUDA_TRY(cudaMalloc(&d_levels_ptr_, sizeof(level) * num_levels_)); + CUCO_CUDA_TRY( + cudaMemcpy(d_levels_ptr_, &levels_[0], sizeof(level) * num_levels_, cudaMemcpyHostToDevice)); + + // Finally create a device copy of full trie structure + CUCO_CUDA_TRY(cudaMalloc(&device_ptr_, sizeof(trie))); + CUCO_CUDA_TRY(cudaMemcpy(device_ptr_, this, sizeof(trie), cudaMemcpyHostToDevice)); +} + +template +template +void trie::lookup(KeyIt keys_begin, + OffsetIt offsets_begin, + OffsetIt offsets_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const +{ + auto const num_keys = cuco::detail::distance(offsets_begin, offsets_end) - 1; + if (num_keys == 0) { return; } + + auto const grid_size = + (num_keys - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE) + 1; + + auto ref_ = this->ref(cuco::experimental::trie_lookup); + + trie_lookup_kernel<<>>( + ref_, keys_begin, offsets_begin, outputs_begin, num_keys); +} + +template +__global__ void trie_lookup_kernel( + TrieRef ref, KeyIt keys, OffsetIt offsets, OutputIt outputs, uint64_t num_keys) +{ + uint32_t const loop_stride = gridDim.x * blockDim.x; + uint32_t key_id = blockDim.x * blockIdx.x + threadIdx.x; + + while (key_id < num_keys) { + const auto key_start_pos = keys + offsets[key_id]; + const uint64_t key_length = offsets[key_id + 1] - offsets[key_id]; + + outputs[key_id] = ref.lookup_key(key_start_pos, key_length); + key_id += loop_stride; + } +} + +template +template +auto trie::ref(Operators...) const noexcept +{ + static_assert(sizeof...(Operators), "No operators specified"); + return ref_type{device_ptr_}; +} + +template +trie::level::level() : louds{}, outs{}, labels{}, d_labels_ptr{nullptr}, offset{0} +{ +} + +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/detail/trie/trie_ref.inl b/include/cuco/detail/trie/trie_ref.inl new file mode 100644 index 000000000..22bd2945d --- /dev/null +++ b/include/cuco/detail/trie/trie_ref.inl @@ -0,0 +1,112 @@ +#include + +namespace cuco { +namespace experimental { + +template +__host__ __device__ constexpr trie_ref::trie_ref(const trie* trie) noexcept + : trie_(trie) +{ +} + +namespace detail { + +template +class operator_impl> { + using ref_type = trie_ref; + + public: + /** + * @brief Lookup a single key in trie + * + * @param key Iterator to first character of search key + * @param length Number of characters in key + * + * @return Index of key if it exists in trie, -1 otherwise + */ + template + [[nodiscard]] __device__ uint64_t lookup_key(KeyIt key, uint64_t length) const noexcept + { + auto const& trie = static_cast(*this).trie_; + + // Level-by-level search. node_id is updated at each level + uint32_t node_id = 0; + for (uint32_t cur_depth = 1; cur_depth <= length; cur_depth++) { + if (!search_label_in_children(key[cur_depth - 1], node_id, cur_depth)) { return -1lu; } + } + + // Check for terminal node bit that indicates a valid key + uint64_t leaf_level_id = length; + if (!trie->d_outs_refs_ptr_[leaf_level_id].get(node_id)) { return -1lu; } + + // Key exists in trie, generate the index + auto offset = trie->d_levels_ptr_[leaf_level_id].offset; + auto rank = trie->d_outs_refs_ptr_[leaf_level_id].rank(node_id); + + return offset + rank; + } + + private: + /** + * @brief Find position of last child of a node + * + * @param louds louds bitvector of current level + * @param node_id node index in current level + * + * @return Position of last child + */ + template + [[nodiscard]] __device__ uint32_t get_last_child_position(BitVectorRef louds, + uint32_t& node_id) const noexcept + { + uint32_t node_pos = 0; + if (node_id != 0) { + node_pos = louds.select(node_id - 1) + 1; + node_id = node_pos - node_id; + } + + uint32_t pos_end = louds.find_next_set(node_pos); + return node_id + (pos_end - node_pos); + } + + /** + * @brief Search for a target label in children nodes of a parent node + * + * @param target Label to search for + * @param node_id Index of parent node + * @param level_id Index of current level + * + * @return Boolean indicating success of search process + */ + [[nodiscard]] __device__ bool search_label_in_children(T target, + uint32_t& node_id, + uint32_t level_id) const noexcept + { + auto const& trie = static_cast(*this).trie_; + auto louds = trie->d_louds_refs_ptr_[level_id]; + + uint32_t end = get_last_child_position(louds, node_id); // Position of last child + uint32_t begin = node_id; // Position of first child, initialized after find_last_child call + + auto& level = trie->d_levels_ptr_[level_id]; + auto labels = level.d_labels_ptr; + + // Binary search labels array of current level + while (begin < end) { + node_id = (begin + end) / 2; + auto label = labels[node_id]; + if (target < label) { + end = node_id; + } else if (target > label) { + begin = node_id + 1; + } else { + break; + } + } + return begin < end; + } +}; + +} // namespace detail +} // namespace experimental +} // namespace cuco diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp index 77cf2c133..ca2953f40 100644 --- a/include/cuco/operator.hpp +++ b/include/cuco/operator.hpp @@ -51,6 +51,12 @@ struct contains_tag { struct find_tag { } inline constexpr find; +/** + * @brief `trie_lookup` operator tag + */ +struct trie_lookup_tag { +} inline constexpr trie_lookup; + } // namespace op } // namespace experimental } // namespace cuco diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh new file mode 100644 index 000000000..6b295a5ad --- /dev/null +++ b/include/cuco/trie.cuh @@ -0,0 +1,142 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace cuco { +namespace experimental { + +/** + * @brief Trie class + * + * @tparam T type of individual characters of vector keys (eg. char or int) + */ +template +class trie { + public: + trie(); + ~trie() noexcept(false); + + /** + * @brief Insert new key into trie + * + * @param key Key to insert + */ + void insert(const std::vector& key); + + /** + * @brief Build level-by-level trie indexes after inserting all keys + * + * In addition, a snapshot of current trie state is copied to device + */ + void build(); + + /** + * @brief Bulk lookup vector of keys + * + * @tparam KeyIt Device-accessible iterator to individual characters of keys + * @tparam OffsetIt Device-accessible iterator to positions of key boundaries + * @tparam OutputIt Device-accessible iterator to lookup result + * + * @param keys_begin Begin iterator to individual key characters + * @param offsets_begin Begin iterator to offsets of key boundaries + * @param offsets_end End iterator to offsets + * @param outputs_begin Begin iterator to results + * @param stream Stream to execute lookup kernel + */ + template + void lookup(KeyIt keys_begin, + OffsetIt offsets_begin, + OffsetIt offsets_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) const; + + /** + * @brief Get number of keys inserted into trie + * + * @return Number of keys + */ + uint64_t n_keys() const { return n_keys_; } + + template + using ref_type = + cuco::experimental::trie_ref; ///< Non-owning container ref type + + /** + * @brief Get device ref with operators. + * + * @tparam Operators Set of `cuco::op` to be provided by the ref + * + * @param ops List of operators, e.g., `cuco::bv_read` + * + * @return Device ref of the current `trie` object + */ + template + [[nodiscard]] auto ref(Operators... ops) const noexcept; + + /** + * @brief Struct to represent each trie level + */ + struct level { + level(); + level(level&& other) = default; + + bit_vector<> louds; ///< Indicates links to next and previous level + bit_vector<> outs; ///< Indicates terminal nodes of valid keys + + std::vector labels; ///< Stores individual characters of keys + thrust::device_vector d_labels; ///< Device-side copy of `labels` + T* d_labels_ptr; ///< Raw pointer to d_labels + + uint64_t offset; ///< Count of nodes in all parent levels + }; + + level* d_levels_ptr_; ///< Device-side array of levels + + using bv_read_ref = bit_vector_ref::device_storage_ref, bv_read_tag>; + bv_read_ref* d_louds_refs_ptr_; ///< Refs to louds bitvectors of each level + bv_read_ref* d_outs_refs_ptr_; ///< Refs to out bitvectors of each level + + private: + static constexpr T root_label_ = sizeof(T) == 1 ? ' ' : static_cast(-1); ///< Sentinel value + uint64_t num_levels_; ///< Number of trie levels + std::vector levels_; ///< Host-side array of levels + + uint64_t n_keys_; ///< Number of keys inserted into trie + uint64_t n_nodes_; ///< Number of nodes in trie + std::vector last_key_; ///< Last key inserted into trie + + trie* device_ptr_; ///< Device-side copy of trie structure + + using bv_refs_vector = thrust::device_vector; + bv_refs_vector d_louds_refs_; ///< refs to per-level louds bitvectors + bv_refs_vector d_outs_refs_; ///< refs to per-level outs bitvectors +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/include/cuco/trie_ref.cuh b/include/cuco/trie_ref.cuh new file mode 100644 index 000000000..cdc7d86ad --- /dev/null +++ b/include/cuco/trie_ref.cuh @@ -0,0 +1,39 @@ +#pragma once + +#include + +namespace cuco { +namespace experimental { + +template +class trie; + +/** + * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary + * operations defined in `include/cuco/operator.hpp` + * + * @tparam T Trie key type + * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` + */ +template +class trie_ref : public detail::operator_impl>... { + public: + /** + * @brief Constructs trie_ref. + * + * @param trie Non-owning ref of trie + */ + __host__ __device__ explicit constexpr trie_ref(const trie* trie) noexcept; + + private: + const trie* trie_; + + // Mixins need to be friends with this class in order to access private members + template + friend class detail::operator_impl; +}; + +} // namespace experimental +} // namespace cuco + +#include diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 3deeeddf1..b837a3964 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -105,3 +105,8 @@ ConfigureTest(DYNAMIC_BITSET_TEST dynamic_bitset/rank_test.cu dynamic_bitset/select_test.cu dynamic_bitset/size_test.cu) + +################################################################################################### +# - trie tests ------------------------------------------------------------------------------ +ConfigureTest(TRIE_TEST + trie/lookup_test.cu) diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu new file mode 100644 index 000000000..9f8154e02 --- /dev/null +++ b/tests/trie/lookup_test.cu @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include +#include +#include +#include + +#include + +struct valid_key { + valid_key(uint64_t num_keys) : num_keys_(num_keys) {} + __host__ __device__ bool operator()(uint64_t x) const { return x < num_keys_; } + const uint64_t num_keys_; +}; + +template +void generate_keys(thrust::host_vector& keys, + thrust::host_vector& offsets, + size_t num_keys, + size_t max_key_value, + size_t max_key_length) +{ + for (size_t key_id = 0; key_id < num_keys; key_id++) { + size_t cur_key_length = 1 + (std::rand() % max_key_length); + offsets.push_back(cur_key_length); + for (size_t pos = 0; pos < cur_key_length; pos++) { + keys.push_back(1 + (std::rand() % max_key_value)); + } + } + + // Add a dummy 0 to simplify subsequent scan + offsets.push_back(0); + thrust::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin()); // in-place scan +} + +TEST_CASE("Lookup test", "") +{ + using KeyType = int; + + std::size_t num_keys = 64 * 1024; + std::size_t max_key_value = 1000; + std::size_t max_key_length = 32; + thrust::host_vector keys; + thrust::host_vector offsets; + + generate_keys(keys, offsets, num_keys, max_key_value, max_key_length); + + cuco::experimental::trie trie; + + { + std::vector> all_keys; + for (size_t key_id = 0; key_id < num_keys; key_id++) { + std::vector cur_key; + for (size_t pos = offsets[key_id]; pos < offsets[key_id + 1]; pos++) { + cur_key.push_back(keys[pos]); + } + all_keys.push_back(cur_key); + } + + struct vectorKeyCompare { + bool operator()(const std::vector& lhs, const std::vector& rhs) + { + for (size_t pos = 0; pos < min(lhs.size(), rhs.size()); pos++) { + if (lhs[pos] < rhs[pos]) { + return true; + } else if (lhs[pos] > rhs[pos]) { + return false; + } + } + return lhs.size() <= rhs.size(); + } + }; + sort(all_keys.begin(), all_keys.end(), vectorKeyCompare()); + + for (auto key : all_keys) { + trie.insert(key); + } + } + + trie.build(); + + { + thrust::device_vector lookup_result(num_keys, -1lu); + thrust::device_vector device_keys = keys; + thrust::device_vector device_offsets = offsets; + + trie.lookup( + device_keys.begin(), device_offsets.begin(), device_offsets.end(), lookup_result.begin()); + + REQUIRE(cuco::test::all_of(lookup_result.begin(), lookup_result.end(), valid_key(num_keys))); + } +} From c724f614a7a25133503e5b4e4ebe571eae13941c Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Sun, 27 Aug 2023 06:58:20 +0000 Subject: [PATCH 02/41] Coding style --- include/cuco/detail/trie/trie.inl | 30 +++++++------- include/cuco/trie.cuh | 66 ++++++++++++++++--------------- 2 files changed, 50 insertions(+), 46 deletions(-) diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index 84c8bec56..9d04a76a3 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -22,13 +22,13 @@ namespace experimental { template trie::trie() - : levels_(2), - d_levels_ptr_(nullptr), - num_levels_(2), - n_keys_(0), - n_nodes_(1), - last_key_(), - device_ptr_(nullptr) + : levels_{2}, + d_levels_ptr_{nullptr}, + num_levels_{2}, + n_keys_{0}, + n_nodes_{1}, + last_key_{}, + device_ptr_{nullptr} { levels_[0].louds.append(0); levels_[0].louds.append(1); @@ -61,7 +61,7 @@ void trie::insert(const std::vector& key) // Find first position where label is different from last_key // Trie is not updated till that position is reached, simply skip to next position - uint32_t pos = 0; + size_type pos = 0; for (; pos < key.size(); ++pos) { auto& level = levels_[pos + 1]; T label = key[pos]; @@ -112,7 +112,7 @@ void trie::build() // Host-side per-level bit-vector refs std::vector louds_refs, outs_refs; - uint64_t offset = 0; + size_type offset = 0; for (auto& level : levels_) { level.louds.build(); @@ -152,10 +152,10 @@ void trie::lookup(KeyIt keys_begin, OutputIt outputs_begin, cuda_stream_ref stream) const { - auto const num_keys = cuco::detail::distance(offsets_begin, offsets_end) - 1; + auto num_keys = cuco::detail::distance(offsets_begin, offsets_end) - 1; if (num_keys == 0) { return; } - auto const grid_size = + auto grid_size = (num_keys - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE) + 1; auto ref_ = this->ref(cuco::experimental::trie_lookup); @@ -168,12 +168,12 @@ template #include -#include -#include #include namespace cuco { @@ -74,16 +72,14 @@ class trie { OutputIt outputs_begin, cuda_stream_ref stream = {}) const; + using size_type = std::size_t; ///< size type + /** - * @brief Get number of keys inserted into trie + * @brief Get current size i.e. number of keys inserted * * @return Number of keys */ - uint64_t n_keys() const { return n_keys_; } - - template - using ref_type = - cuco::experimental::trie_ref; ///< Non-owning container ref type + size_type constexpr size() const { return n_keys_; } /** * @brief Get device ref with operators. @@ -97,12 +93,41 @@ class trie { template [[nodiscard]] auto ref(Operators... ops) const noexcept; + private: + size_type n_keys_; ///< Number of keys inserted into trie + size_type n_nodes_; ///< Number of nodes in trie + std::vector last_key_; ///< Last key inserted into trie + + static constexpr T root_label_ = sizeof(T) == 1 ? ' ' : static_cast(-1); ///< Sentinel value + + struct level; + size_type num_levels_; ///< Number of trie levels + std::vector levels_; ///< Host-side array of levels + level* d_levels_ptr_; ///< Device-side array of levels + + using bv_read_ref = bit_vector_ref::device_storage_ref, bv_read_tag>; ///< Read ref + thrust::device_vector d_louds_refs_; ///< refs to per-level louds bitvectors + thrust::device_vector d_outs_refs_; ///< refs to per-level outs bitvectors + + bv_read_ref* d_louds_refs_ptr_; ///< Raw pointer to d_louds_refs_ + bv_read_ref* d_outs_refs_ptr_; ///< Raw pointer to d_outs_refs_ + + trie* device_ptr_; ///< Device-side copy of trie + + template + using ref_type = + cuco::experimental::trie_ref; ///< Non-owning container ref type + + // Mixins need to be friends with this class in order to access private members + template + friend class detail::operator_impl; + /** * @brief Struct to represent each trie level */ struct level { level(); - level(level&& other) = default; + level(level&&) = default; ///< Move constructor bit_vector<> louds; ///< Indicates links to next and previous level bit_vector<> outs; ///< Indicates terminal nodes of valid keys @@ -111,29 +136,8 @@ class trie { thrust::device_vector d_labels; ///< Device-side copy of `labels` T* d_labels_ptr; ///< Raw pointer to d_labels - uint64_t offset; ///< Count of nodes in all parent levels + size_type offset; ///< Count of nodes in all parent levels }; - - level* d_levels_ptr_; ///< Device-side array of levels - - using bv_read_ref = bit_vector_ref::device_storage_ref, bv_read_tag>; - bv_read_ref* d_louds_refs_ptr_; ///< Refs to louds bitvectors of each level - bv_read_ref* d_outs_refs_ptr_; ///< Refs to out bitvectors of each level - - private: - static constexpr T root_label_ = sizeof(T) == 1 ? ' ' : static_cast(-1); ///< Sentinel value - uint64_t num_levels_; ///< Number of trie levels - std::vector levels_; ///< Host-side array of levels - - uint64_t n_keys_; ///< Number of keys inserted into trie - uint64_t n_nodes_; ///< Number of nodes in trie - std::vector last_key_; ///< Last key inserted into trie - - trie* device_ptr_; ///< Device-side copy of trie structure - - using bv_refs_vector = thrust::device_vector; - bv_refs_vector d_louds_refs_; ///< refs to per-level louds bitvectors - bv_refs_vector d_outs_refs_; ///< refs to per-level outs bitvectors }; } // namespace experimental From ac5af70390ac3260fc26178bb845dc7d1994fc70 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Sun, 27 Aug 2023 21:47:24 +0000 Subject: [PATCH 03/41] Change template parameter name --- include/cuco/detail/trie/trie.inl | 42 +++++++++++++-------------- include/cuco/detail/trie/trie_ref.inl | 13 +++++---- include/cuco/trie.cuh | 25 ++++++++-------- include/cuco/trie_ref.cuh | 12 ++++---- 4 files changed, 47 insertions(+), 45 deletions(-) diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index 9d04a76a3..fd405280f 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -20,8 +20,8 @@ namespace cuco { namespace experimental { -template -trie::trie() +template +trie::trie() : levels_{2}, d_levels_ptr_{nullptr}, num_levels_{2}, @@ -37,15 +37,15 @@ trie::trie() levels_[0].labels.push_back(root_label_); } -template -trie::~trie() noexcept(false) +template +trie::~trie() noexcept(false) { if (d_levels_ptr_) { CUCO_CUDA_TRY(cudaFree(d_levels_ptr_)); } if (device_ptr_) { CUCO_CUDA_TRY(cudaFree(device_ptr_)); } } -template -void trie::insert(const std::vector& key) +template +void trie::insert(const std::vector& key) { if (key == last_key_) { return; } // Ignore duplicate keys assert(n_keys_ == 0 || key > last_key_); // Keys are expected to be inserted in sorted order @@ -64,7 +64,7 @@ void trie::insert(const std::vector& key) size_type pos = 0; for (; pos < key.size(); ++pos) { auto& level = levels_[pos + 1]; - T label = key[pos]; + auto label = key[pos]; if ((pos == last_key_.size()) || (label != level.labels.back())) { level.louds.set_last(0); @@ -105,8 +105,8 @@ T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& return thrust::raw_pointer_cast(device_vector.data()); } -template -void trie::build() +template +void trie::build() { // Perform build level-by-level for all levels, followed by a deep-copy from host to device @@ -140,17 +140,17 @@ void trie::build() cudaMemcpy(d_levels_ptr_, &levels_[0], sizeof(level) * num_levels_, cudaMemcpyHostToDevice)); // Finally create a device copy of full trie structure - CUCO_CUDA_TRY(cudaMalloc(&device_ptr_, sizeof(trie))); - CUCO_CUDA_TRY(cudaMemcpy(device_ptr_, this, sizeof(trie), cudaMemcpyHostToDevice)); + CUCO_CUDA_TRY(cudaMalloc(&device_ptr_, sizeof(trie))); + CUCO_CUDA_TRY(cudaMemcpy(device_ptr_, this, sizeof(trie), cudaMemcpyHostToDevice)); } -template +template template -void trie::lookup(KeyIt keys_begin, - OffsetIt offsets_begin, - OffsetIt offsets_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const +void trie::lookup(KeyIt keys_begin, + OffsetIt offsets_begin, + OffsetIt offsets_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const { auto num_keys = cuco::detail::distance(offsets_begin, offsets_end) - 1; if (num_keys == 0) { return; } @@ -180,16 +180,16 @@ __global__ void trie_lookup_kernel( } } -template +template template -auto trie::ref(Operators...) const noexcept +auto trie::ref(Operators...) const noexcept { static_assert(sizeof...(Operators), "No operators specified"); return ref_type{device_ptr_}; } -template -trie::level::level() : louds{}, outs{}, labels{}, d_labels_ptr{nullptr}, offset{0} +template +trie::level::level() : louds{}, outs{}, labels{}, d_labels_ptr{nullptr}, offset{0} { } diff --git a/include/cuco/detail/trie/trie_ref.inl b/include/cuco/detail/trie/trie_ref.inl index 22bd2945d..e885b4b6b 100644 --- a/include/cuco/detail/trie/trie_ref.inl +++ b/include/cuco/detail/trie/trie_ref.inl @@ -3,17 +3,18 @@ namespace cuco { namespace experimental { -template -__host__ __device__ constexpr trie_ref::trie_ref(const trie* trie) noexcept +template +__host__ __device__ constexpr trie_ref::trie_ref( + const trie* trie) noexcept : trie_(trie) { } namespace detail { -template -class operator_impl> { - using ref_type = trie_ref; +template +class operator_impl> { + using ref_type = trie_ref; public: /** @@ -78,7 +79,7 @@ class operator_impl> { * * @return Boolean indicating success of search process */ - [[nodiscard]] __device__ bool search_label_in_children(T target, + [[nodiscard]] __device__ bool search_label_in_children(label_type target, uint32_t& node_id, uint32_t level_id) const noexcept { diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index 650235385..7ff237f6d 100644 --- a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -30,9 +30,9 @@ namespace experimental { /** * @brief Trie class * - * @tparam T type of individual characters of vector keys (eg. char or int) + * @tparam label_type type of individual characters of vector keys (eg. char or int) */ -template +template class trie { public: trie(); @@ -43,7 +43,7 @@ class trie { * * @param key Key to insert */ - void insert(const std::vector& key); + void insert(const std::vector& key); /** * @brief Build level-by-level trie indexes after inserting all keys @@ -94,11 +94,12 @@ class trie { [[nodiscard]] auto ref(Operators... ops) const noexcept; private: - size_type n_keys_; ///< Number of keys inserted into trie - size_type n_nodes_; ///< Number of nodes in trie - std::vector last_key_; ///< Last key inserted into trie + size_type n_keys_; ///< Number of keys inserted into trie + size_type n_nodes_; ///< Number of nodes in trie + std::vector last_key_; ///< Last key inserted into trie - static constexpr T root_label_ = sizeof(T) == 1 ? ' ' : static_cast(-1); ///< Sentinel value + static constexpr label_type root_label_ = + sizeof(label_type) == 1 ? ' ' : static_cast(-1); ///< Sentinel value struct level; size_type num_levels_; ///< Number of trie levels @@ -112,11 +113,11 @@ class trie { bv_read_ref* d_louds_refs_ptr_; ///< Raw pointer to d_louds_refs_ bv_read_ref* d_outs_refs_ptr_; ///< Raw pointer to d_outs_refs_ - trie* device_ptr_; ///< Device-side copy of trie + trie* device_ptr_; ///< Device-side copy of trie template using ref_type = - cuco::experimental::trie_ref; ///< Non-owning container ref type + cuco::experimental::trie_ref; ///< Non-owning container ref type // Mixins need to be friends with this class in order to access private members template @@ -132,9 +133,9 @@ class trie { bit_vector<> louds; ///< Indicates links to next and previous level bit_vector<> outs; ///< Indicates terminal nodes of valid keys - std::vector labels; ///< Stores individual characters of keys - thrust::device_vector d_labels; ///< Device-side copy of `labels` - T* d_labels_ptr; ///< Raw pointer to d_labels + std::vector labels; ///< Stores individual characters of keys + thrust::device_vector d_labels; ///< Device-side copy of `labels` + label_type* d_labels_ptr; ///< Raw pointer to d_labels size_type offset; ///< Count of nodes in all parent levels }; diff --git a/include/cuco/trie_ref.cuh b/include/cuco/trie_ref.cuh index cdc7d86ad..c5bffd126 100644 --- a/include/cuco/trie_ref.cuh +++ b/include/cuco/trie_ref.cuh @@ -5,28 +5,28 @@ namespace cuco { namespace experimental { -template +template class trie; /** * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary * operations defined in `include/cuco/operator.hpp` * - * @tparam T Trie key type + * @tparam label_type Trie label type * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` */ -template -class trie_ref : public detail::operator_impl>... { +template +class trie_ref : public detail::operator_impl>... { public: /** * @brief Constructs trie_ref. * * @param trie Non-owning ref of trie */ - __host__ __device__ explicit constexpr trie_ref(const trie* trie) noexcept; + __host__ __device__ explicit constexpr trie_ref(const trie* trie) noexcept; private: - const trie* trie_; + const trie* trie_; // Mixins need to be friends with this class in order to access private members template From ffdde2369fefc53c63d20aefc744e5bce153aaef Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Mon, 28 Aug 2023 00:51:17 +0000 Subject: [PATCH 04/41] Includes --- include/cuco/detail/trie/trie_ref.inl | 2 -- include/cuco/trie.cuh | 6 ------ tests/trie/lookup_test.cu | 2 -- 3 files changed, 10 deletions(-) diff --git a/include/cuco/detail/trie/trie_ref.inl b/include/cuco/detail/trie/trie_ref.inl index e885b4b6b..81aeeb4f5 100644 --- a/include/cuco/detail/trie/trie_ref.inl +++ b/include/cuco/detail/trie/trie_ref.inl @@ -1,5 +1,3 @@ -#include - namespace cuco { namespace experimental { diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index 7ff237f6d..d58087e4b 100644 --- a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -15,15 +15,9 @@ * limitations under the License. */ -#include #include #include -#include - -#include -#include - namespace cuco { namespace experimental { diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu index 9f8154e02..ed1bf7721 100644 --- a/tests/trie/lookup_test.cu +++ b/tests/trie/lookup_test.cu @@ -17,10 +17,8 @@ #include #include -#include #include -#include #include #include From 18e95019485c0f7719d091ef8c3c1142813354d8 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Mon, 28 Aug 2023 02:36:41 +0000 Subject: [PATCH 05/41] Misc coding style changes --- include/cuco/detail/trie/trie.inl | 97 ++++++++++++++------------- include/cuco/detail/trie/trie_ref.inl | 33 ++++----- include/cuco/trie.cuh | 29 ++++---- 3 files changed, 80 insertions(+), 79 deletions(-) diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index fd405280f..a8756deec 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -21,20 +21,20 @@ namespace cuco { namespace experimental { template -trie::trie() - : levels_{2}, - d_levels_ptr_{nullptr}, - num_levels_{2}, - n_keys_{0}, - n_nodes_{1}, +constexpr trie::trie() + : num_keys_{0}, + num_nodes_{1}, last_key_{}, + num_levels_{2}, + levels_{2}, + d_levels_ptr_{nullptr}, device_ptr_{nullptr} { - levels_[0].louds.append(0); - levels_[0].louds.append(1); - levels_[1].louds.append(1); - levels_[0].outs.append(0); - levels_[0].labels.push_back(root_label_); + levels_[0].louds_.append(0); + levels_[0].louds_.append(1); + levels_[1].louds_.append(1); + levels_[0].outs_.append(0); + levels_[0].labels_.push_back(root_label_); } template @@ -45,15 +45,15 @@ trie::~trie() noexcept(false) } template -void trie::insert(const std::vector& key) +void trie::insert(const std::vector& key) noexcept { - if (key == last_key_) { return; } // Ignore duplicate keys - assert(n_keys_ == 0 || key > last_key_); // Keys are expected to be inserted in sorted order + if (key == last_key_) { return; } // Ignore duplicate keys + assert(num_keys_ == 0 || key > last_key_); // Keys are expected to be inserted in sorted order if (key.empty()) { - levels_[0].outs.set(0, 1); - ++levels_[1].offset; - ++n_keys_; + levels_[0].outs_.set(0, 1); + ++levels_[1].offset_; + ++num_keys_; return; } @@ -66,37 +66,37 @@ void trie::insert(const std::vector& key) auto& level = levels_[pos + 1]; auto label = key[pos]; - if ((pos == last_key_.size()) || (label != level.labels.back())) { - level.louds.set_last(0); - level.louds.append(1); - level.outs.append(0); - level.labels.push_back(label); - ++n_nodes_; + if ((pos == last_key_.size()) || (label != level.labels_.back())) { + level.louds_.set_last(0); + level.louds_.append(1); + level.outs_.append(0); + level.labels_.push_back(label); + ++num_nodes_; break; } } // Process remaining labels after divergence point from last_key - // Each such label will create a new edge and node pair in trie + // Each such label will create a new edge and node pair for (++pos; pos < key.size(); ++pos) { auto& level = levels_[pos + 1]; - level.louds.append(0); - level.louds.append(1); - level.outs.append(0); - level.labels.push_back(key[pos]); - ++n_nodes_; + level.louds_.append(0); + level.louds_.append(1); + level.outs_.append(0); + level.labels_.push_back(key[pos]); + ++num_nodes_; } - levels_[key.size() + 1].louds.append(1); // Mark end of current key - ++levels_[key.size() + 1].offset; - levels_[key.size()].outs.set_last(1); // Set terminal bit indicating valid path + levels_[key.size() + 1].louds_.append(1); // Mark end of current key + ++levels_[key.size() + 1].offset_; + levels_[key.size()].outs_.set_last(1); // Set terminal bit indicating valid path - ++n_keys_; + ++num_keys_; last_key_ = key; } // Helper to move vector from host to device -// Host vector is clear to avoid duplication. Device pointer is returned +// Host vector is cleared to avoid duplication. Device pointer is returned template T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& device_vector) { @@ -106,7 +106,7 @@ T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& } template -void trie::build() +void trie::build() noexcept(false) { // Perform build level-by-level for all levels, followed by a deep-copy from host to device @@ -115,17 +115,17 @@ void trie::build() size_type offset = 0; for (auto& level : levels_) { - level.louds.build(); - louds_refs.push_back(level.louds.ref(bv_read)); + level.louds_.build(); + louds_refs.push_back(level.louds_.ref(bv_read)); - level.outs.build(); - outs_refs.push_back(level.outs.ref(bv_read)); + level.outs_.build(); + outs_refs.push_back(level.outs_.ref(bv_read)); // Move labels to device - level.d_labels_ptr = move_vector_to_device(level.labels, level.d_labels); + level.d_labels_ptr_ = move_vector_to_device(level.labels_, level.d_labels_); - offset += level.offset; - level.offset = offset; + offset += level.offset_; + level.offset_ = offset; } // Move bitvector refs to device @@ -150,7 +150,7 @@ void trie::lookup(KeyIt keys_begin, OffsetIt offsets_begin, OffsetIt offsets_end, OutputIt outputs_begin, - cuda_stream_ref stream) const + cuda_stream_ref stream) const noexcept { auto num_keys = cuco::detail::distance(offsets_begin, offsets_end) - 1; if (num_keys == 0) { return; } @@ -166,14 +166,14 @@ void trie::lookup(KeyIt keys_begin, template __global__ void trie_lookup_kernel( - TrieRef ref, KeyIt keys, OffsetIt offsets, OutputIt outputs, uint64_t num_keys) + TrieRef ref, KeyIt keys, OffsetIt offsets, OutputIt outputs, size_t num_keys) { - size_t loop_stride = gridDim.x * blockDim.x; - size_t key_id = blockDim.x * blockIdx.x + threadIdx.x; + auto loop_stride = gridDim.x * blockDim.x; + auto key_id = blockDim.x * blockIdx.x + threadIdx.x; while (key_id < num_keys) { auto key_start_pos = keys + offsets[key_id]; - size_t key_length = offsets[key_id + 1] - offsets[key_id]; + auto key_length = offsets[key_id + 1] - offsets[key_id]; outputs[key_id] = ref.lookup_key(key_start_pos, key_length); key_id += loop_stride; @@ -189,7 +189,8 @@ auto trie::ref(Operators...) const noexcept } template -trie::level::level() : louds{}, outs{}, labels{}, d_labels_ptr{nullptr}, offset{0} +trie::level::level() + : louds_{}, outs_{}, labels_{}, d_labels_{}, d_labels_ptr_{nullptr}, offset_{0} { } diff --git a/include/cuco/detail/trie/trie_ref.inl b/include/cuco/detail/trie/trie_ref.inl index 81aeeb4f5..054dd6917 100644 --- a/include/cuco/detail/trie/trie_ref.inl +++ b/include/cuco/detail/trie/trie_ref.inl @@ -4,7 +4,7 @@ namespace experimental { template __host__ __device__ constexpr trie_ref::trie_ref( const trie* trie) noexcept - : trie_(trie) + : trie_{trie} { } @@ -12,7 +12,8 @@ namespace detail { template class operator_impl> { - using ref_type = trie_ref; + using ref_type = trie_ref; + using size_type = size_t; public: /** @@ -24,22 +25,22 @@ class operator_impl> { * @return Index of key if it exists in trie, -1 otherwise */ template - [[nodiscard]] __device__ uint64_t lookup_key(KeyIt key, uint64_t length) const noexcept + [[nodiscard]] __device__ size_type lookup_key(KeyIt key, size_type length) const noexcept { auto const& trie = static_cast(*this).trie_; // Level-by-level search. node_id is updated at each level - uint32_t node_id = 0; - for (uint32_t cur_depth = 1; cur_depth <= length; cur_depth++) { + size_type node_id = 0; + for (size_type cur_depth = 1; cur_depth <= length; cur_depth++) { if (!search_label_in_children(key[cur_depth - 1], node_id, cur_depth)) { return -1lu; } } // Check for terminal node bit that indicates a valid key - uint64_t leaf_level_id = length; + size_type leaf_level_id = length; if (!trie->d_outs_refs_ptr_[leaf_level_id].get(node_id)) { return -1lu; } // Key exists in trie, generate the index - auto offset = trie->d_levels_ptr_[leaf_level_id].offset; + auto offset = trie->d_levels_ptr_[leaf_level_id].offset_; auto rank = trie->d_outs_refs_ptr_[leaf_level_id].rank(node_id); return offset + rank; @@ -55,16 +56,16 @@ class operator_impl> { * @return Position of last child */ template - [[nodiscard]] __device__ uint32_t get_last_child_position(BitVectorRef louds, - uint32_t& node_id) const noexcept + [[nodiscard]] __device__ size_type get_last_child_position(BitVectorRef louds, + size_type& node_id) const noexcept { - uint32_t node_pos = 0; + size_type node_pos = 0; if (node_id != 0) { node_pos = louds.select(node_id - 1) + 1; node_id = node_pos - node_id; } - uint32_t pos_end = louds.find_next_set(node_pos); + auto pos_end = louds.find_next_set(node_pos); return node_id + (pos_end - node_pos); } @@ -78,17 +79,17 @@ class operator_impl> { * @return Boolean indicating success of search process */ [[nodiscard]] __device__ bool search_label_in_children(label_type target, - uint32_t& node_id, - uint32_t level_id) const noexcept + size_type& node_id, + size_type level_id) const noexcept { auto const& trie = static_cast(*this).trie_; auto louds = trie->d_louds_refs_ptr_[level_id]; - uint32_t end = get_last_child_position(louds, node_id); // Position of last child - uint32_t begin = node_id; // Position of first child, initialized after find_last_child call + auto end = get_last_child_position(louds, node_id); // Position of last child + auto begin = node_id; // Position of first child, initialized after find_last_child call auto& level = trie->d_levels_ptr_[level_id]; - auto labels = level.d_labels_ptr; + auto labels = level.d_labels_ptr_; // Binary search labels array of current level while (begin < end) { diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index d58087e4b..c09b14799 100644 --- a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -29,7 +29,7 @@ namespace experimental { template class trie { public: - trie(); + constexpr trie(); ~trie() noexcept(false); /** @@ -37,14 +37,14 @@ class trie { * * @param key Key to insert */ - void insert(const std::vector& key); + void insert(const std::vector& key) noexcept; /** * @brief Build level-by-level trie indexes after inserting all keys * * In addition, a snapshot of current trie state is copied to device */ - void build(); + void build() noexcept(false); /** * @brief Bulk lookup vector of keys @@ -64,7 +64,7 @@ class trie { OffsetIt offsets_begin, OffsetIt offsets_end, OutputIt outputs_begin, - cuda_stream_ref stream = {}) const; + cuda_stream_ref stream = {}) const noexcept; using size_type = std::size_t; ///< size type @@ -73,7 +73,7 @@ class trie { * * @return Number of keys */ - size_type constexpr size() const { return n_keys_; } + size_type constexpr size() const noexcept { return num_keys_; } /** * @brief Get device ref with operators. @@ -88,12 +88,11 @@ class trie { [[nodiscard]] auto ref(Operators... ops) const noexcept; private: - size_type n_keys_; ///< Number of keys inserted into trie - size_type n_nodes_; ///< Number of nodes in trie + size_type num_keys_; ///< Number of keys inserted into trie + size_type num_nodes_; ///< Number of internal nodes std::vector last_key_; ///< Last key inserted into trie - static constexpr label_type root_label_ = - sizeof(label_type) == 1 ? ' ' : static_cast(-1); ///< Sentinel value + static constexpr label_type root_label_ = sizeof(label_type) == 1 ? ' ' : -1; ///< Sentinel value struct level; size_type num_levels_; ///< Number of trie levels @@ -124,14 +123,14 @@ class trie { level(); level(level&&) = default; ///< Move constructor - bit_vector<> louds; ///< Indicates links to next and previous level - bit_vector<> outs; ///< Indicates terminal nodes of valid keys + bit_vector<> louds_; ///< Indicates links to next and previous level + bit_vector<> outs_; ///< Indicates terminal nodes of valid keys - std::vector labels; ///< Stores individual characters of keys - thrust::device_vector d_labels; ///< Device-side copy of `labels` - label_type* d_labels_ptr; ///< Raw pointer to d_labels + std::vector labels_; ///< Stores individual characters of keys + thrust::device_vector d_labels_; ///< Device-side copy of `labels` + label_type* d_labels_ptr_; ///< Raw pointer to d_labels - size_type offset; ///< Count of nodes in all parent levels + size_type offset_; ///< Cumulative node count in parent levels }; }; From c5c1ec5758b46b7a113feee5aabe44743889cbfc Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 29 Aug 2023 03:28:56 +0000 Subject: [PATCH 06/41] Remove bitvector template parameter --- include/cuco/trie.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index c09b14799..876265986 100644 --- a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -99,7 +99,7 @@ class trie { std::vector levels_; ///< Host-side array of levels level* d_levels_ptr_; ///< Device-side array of levels - using bv_read_ref = bit_vector_ref::device_storage_ref, bv_read_tag>; ///< Read ref + using bv_read_ref = bit_vector_ref; ///< Read ref thrust::device_vector d_louds_refs_; ///< refs to per-level louds bitvectors thrust::device_vector d_outs_refs_; ///< refs to per-level outs bitvectors @@ -123,8 +123,8 @@ class trie { level(); level(level&&) = default; ///< Move constructor - bit_vector<> louds_; ///< Indicates links to next and previous level - bit_vector<> outs_; ///< Indicates terminal nodes of valid keys + bit_vector louds_; ///< Indicates links to next and previous level + bit_vector outs_; ///< Indicates terminal nodes of valid keys std::vector labels_; ///< Stores individual characters of keys thrust::device_vector d_labels_; ///< Device-side copy of `labels` From 152f71620f1df179ba74b8598ae0bd7efaac7cc0 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 29 Aug 2023 05:37:28 +0000 Subject: [PATCH 07/41] Remove some host-side structures --- include/cuco/detail/trie/trie.inl | 29 ++++++--------------------- include/cuco/detail/trie/trie_ref.inl | 8 ++++---- include/cuco/trie.cuh | 13 ++++++------ 3 files changed, 16 insertions(+), 34 deletions(-) diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index a8756deec..1b6c72276 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -95,42 +95,26 @@ void trie::insert(const std::vector& key) noexcept last_key_ = key; } -// Helper to move vector from host to device -// Host vector is cleared to avoid duplication. Device pointer is returned -template -T* move_vector_to_device(std::vector& host_vector, thrust::device_vector& device_vector) -{ - device_vector = host_vector; - host_vector.clear(); - return thrust::raw_pointer_cast(device_vector.data()); -} - template void trie::build() noexcept(false) { // Perform build level-by-level for all levels, followed by a deep-copy from host to device - - // Host-side per-level bit-vector refs - std::vector louds_refs, outs_refs; size_type offset = 0; - for (auto& level : levels_) { level.louds_.build(); - louds_refs.push_back(level.louds_.ref(bv_read)); + louds_refs_.push_back(level.louds_.ref(bv_read)); level.outs_.build(); - outs_refs.push_back(level.outs_.ref(bv_read)); + outs_refs_.push_back(level.outs_.ref(bv_read)); - // Move labels to device - level.d_labels_ptr_ = move_vector_to_device(level.labels_, level.d_labels_); + level.labels_ptr_ = thrust::raw_pointer_cast(level.labels_.data()); offset += level.offset_; level.offset_ = offset; } - // Move bitvector refs to device - d_louds_refs_ptr_ = move_vector_to_device(louds_refs, d_louds_refs_); - d_outs_refs_ptr_ = move_vector_to_device(outs_refs, d_outs_refs_); + louds_refs_ptr_ = thrust::raw_pointer_cast(louds_refs_.data()); + outs_refs_ptr_ = thrust::raw_pointer_cast(outs_refs_.data()); num_levels_ = levels_.size(); @@ -189,8 +173,7 @@ auto trie::ref(Operators...) const noexcept } template -trie::level::level() - : louds_{}, outs_{}, labels_{}, d_labels_{}, d_labels_ptr_{nullptr}, offset_{0} +trie::level::level() : louds_{}, outs_{}, labels_{}, labels_ptr_{nullptr}, offset_{0} { } diff --git a/include/cuco/detail/trie/trie_ref.inl b/include/cuco/detail/trie/trie_ref.inl index 054dd6917..2eaa57ae6 100644 --- a/include/cuco/detail/trie/trie_ref.inl +++ b/include/cuco/detail/trie/trie_ref.inl @@ -37,11 +37,11 @@ class operator_impl> { // Check for terminal node bit that indicates a valid key size_type leaf_level_id = length; - if (!trie->d_outs_refs_ptr_[leaf_level_id].get(node_id)) { return -1lu; } + if (!trie->outs_refs_ptr_[leaf_level_id].get(node_id)) { return -1lu; } // Key exists in trie, generate the index auto offset = trie->d_levels_ptr_[leaf_level_id].offset_; - auto rank = trie->d_outs_refs_ptr_[leaf_level_id].rank(node_id); + auto rank = trie->outs_refs_ptr_[leaf_level_id].rank(node_id); return offset + rank; } @@ -83,13 +83,13 @@ class operator_impl> { size_type level_id) const noexcept { auto const& trie = static_cast(*this).trie_; - auto louds = trie->d_louds_refs_ptr_[level_id]; + auto louds = trie->louds_refs_ptr_[level_id]; auto end = get_last_child_position(louds, node_id); // Position of last child auto begin = node_id; // Position of first child, initialized after find_last_child call auto& level = trie->d_levels_ptr_[level_id]; - auto labels = level.d_labels_ptr_; + auto labels = level.labels_ptr_; // Binary search labels array of current level while (begin < end) { diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index 876265986..a9c45c190 100644 --- a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -100,11 +100,11 @@ class trie { level* d_levels_ptr_; ///< Device-side array of levels using bv_read_ref = bit_vector_ref; ///< Read ref - thrust::device_vector d_louds_refs_; ///< refs to per-level louds bitvectors - thrust::device_vector d_outs_refs_; ///< refs to per-level outs bitvectors + thrust::device_vector louds_refs_; ///< refs to per-level louds bitvectors + thrust::device_vector outs_refs_; ///< refs to per-level outs bitvectors - bv_read_ref* d_louds_refs_ptr_; ///< Raw pointer to d_louds_refs_ - bv_read_ref* d_outs_refs_ptr_; ///< Raw pointer to d_outs_refs_ + bv_read_ref* louds_refs_ptr_; ///< Raw pointer to d_louds_refs_ + bv_read_ref* outs_refs_ptr_; ///< Raw pointer to d_outs_refs_ trie* device_ptr_; ///< Device-side copy of trie @@ -126,9 +126,8 @@ class trie { bit_vector louds_; ///< Indicates links to next and previous level bit_vector outs_; ///< Indicates terminal nodes of valid keys - std::vector labels_; ///< Stores individual characters of keys - thrust::device_vector d_labels_; ///< Device-side copy of `labels` - label_type* d_labels_ptr_; ///< Raw pointer to d_labels + thrust::device_vector labels_; ///< Stores individual characters of keys + label_type* labels_ptr_; ///< Raw pointer to labels size_type offset_; ///< Cumulative node count in parent levels }; From 643ba319d4c265b7b9989eacc274f571b0cc17d4 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 30 Aug 2023 02:38:17 +0000 Subject: [PATCH 08/41] bit_vector -> dynamic_bitset --- include/cuco/detail/trie/trie.inl | 4 ++-- include/cuco/trie.cuh | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index 1b6c72276..e42d44799 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -102,10 +102,10 @@ void trie::build() noexcept(false) size_type offset = 0; for (auto& level : levels_) { level.louds_.build(); - louds_refs_.push_back(level.louds_.ref(bv_read)); + louds_refs_.push_back(level.louds_.ref()); level.outs_.build(); - outs_refs_.push_back(level.outs_.ref(bv_read)); + outs_refs_.push_back(level.outs_.ref()); level.labels_ptr_ = thrust::raw_pointer_cast(level.labels_.data()); diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index a9c45c190..e94fd316b 100644 --- a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -15,7 +15,7 @@ * limitations under the License. */ -#include +#include #include namespace cuco { @@ -99,12 +99,12 @@ class trie { std::vector levels_; ///< Host-side array of levels level* d_levels_ptr_; ///< Device-side array of levels - using bv_read_ref = bit_vector_ref; ///< Read ref - thrust::device_vector louds_refs_; ///< refs to per-level louds bitvectors - thrust::device_vector outs_refs_; ///< refs to per-level outs bitvectors + using bitset_ref = detail::dynamic_bitset<>::ref_type; ///< Read ref + thrust::device_vector louds_refs_; ///< refs to per-level louds bitvectors + thrust::device_vector outs_refs_; ///< refs to per-level outs bitvectors - bv_read_ref* louds_refs_ptr_; ///< Raw pointer to d_louds_refs_ - bv_read_ref* outs_refs_ptr_; ///< Raw pointer to d_outs_refs_ + bitset_ref* louds_refs_ptr_; ///< Raw pointer to d_louds_refs_ + bitset_ref* outs_refs_ptr_; ///< Raw pointer to d_outs_refs_ trie* device_ptr_; ///< Device-side copy of trie @@ -123,8 +123,8 @@ class trie { level(); level(level&&) = default; ///< Move constructor - bit_vector louds_; ///< Indicates links to next and previous level - bit_vector outs_; ///< Indicates terminal nodes of valid keys + detail::dynamic_bitset<> louds_; ///< Indicates links to next and previous level + detail::dynamic_bitset<> outs_; ///< Indicates terminal nodes of valid keys thrust::device_vector labels_; ///< Stores individual characters of keys label_type* labels_ptr_; ///< Raw pointer to labels From 53b2ba43919ad827e45da8bc778c850f4e47b30d Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Thu, 7 Sep 2023 22:18:12 +0000 Subject: [PATCH 09/41] dynamic_bitset API change --- include/cuco/detail/trie/trie.inl | 31 +++++++++++---------------- include/cuco/detail/trie/trie_ref.inl | 4 ++-- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index e42d44799..1735c4d12 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -30,10 +30,10 @@ constexpr trie::trie() d_levels_ptr_{nullptr}, device_ptr_{nullptr} { - levels_[0].louds_.append(0); - levels_[0].louds_.append(1); - levels_[1].louds_.append(1); - levels_[0].outs_.append(0); + levels_[0].louds_.push_back(0); + levels_[0].louds_.push_back(1); + levels_[1].louds_.push_back(1); + levels_[0].outs_.push_back(0); levels_[0].labels_.push_back(root_label_); } @@ -68,8 +68,8 @@ void trie::insert(const std::vector& key) noexcept if ((pos == last_key_.size()) || (label != level.labels_.back())) { level.louds_.set_last(0); - level.louds_.append(1); - level.outs_.append(0); + level.louds_.push_back(1); + level.outs_.push_back(0); level.labels_.push_back(label); ++num_nodes_; break; @@ -80,14 +80,14 @@ void trie::insert(const std::vector& key) noexcept // Each such label will create a new edge and node pair for (++pos; pos < key.size(); ++pos) { auto& level = levels_[pos + 1]; - level.louds_.append(0); - level.louds_.append(1); - level.outs_.append(0); + level.louds_.push_back(0); + level.louds_.push_back(1); + level.outs_.push_back(0); level.labels_.push_back(key[pos]); ++num_nodes_; } - levels_[key.size() + 1].louds_.append(1); // Mark end of current key + levels_[key.size() + 1].louds_.push_back(1); // Mark end of current key ++levels_[key.size() + 1].offset_; levels_[key.size()].outs_.set_last(1); // Set terminal bit indicating valid path @@ -101,10 +101,7 @@ void trie::build() noexcept(false) // Perform build level-by-level for all levels, followed by a deep-copy from host to device size_type offset = 0; for (auto& level : levels_) { - level.louds_.build(); louds_refs_.push_back(level.louds_.ref()); - - level.outs_.build(); outs_refs_.push_back(level.outs_.ref()); level.labels_ptr_ = thrust::raw_pointer_cast(level.labels_.data()); @@ -139,12 +136,10 @@ void trie::lookup(KeyIt keys_begin, auto num_keys = cuco::detail::distance(offsets_begin, offsets_end) - 1; if (num_keys == 0) { return; } - auto grid_size = - (num_keys - 1) / (detail::CUCO_DEFAULT_STRIDE * detail::CUCO_DEFAULT_BLOCK_SIZE) + 1; - - auto ref_ = this->ref(cuco::experimental::trie_lookup); + auto const grid_size = cuco::detail::grid_size(num_keys); + auto ref_ = this->ref(cuco::experimental::trie_lookup); - trie_lookup_kernel<<>>( + trie_lookup_kernel<<>>( ref_, keys_begin, offsets_begin, outputs_begin, num_keys); } diff --git a/include/cuco/detail/trie/trie_ref.inl b/include/cuco/detail/trie/trie_ref.inl index 2eaa57ae6..d4f70e3c6 100644 --- a/include/cuco/detail/trie/trie_ref.inl +++ b/include/cuco/detail/trie/trie_ref.inl @@ -37,7 +37,7 @@ class operator_impl> { // Check for terminal node bit that indicates a valid key size_type leaf_level_id = length; - if (!trie->outs_refs_ptr_[leaf_level_id].get(node_id)) { return -1lu; } + if (!trie->outs_refs_ptr_[leaf_level_id].test(node_id)) { return -1lu; } // Key exists in trie, generate the index auto offset = trie->d_levels_ptr_[leaf_level_id].offset_; @@ -65,7 +65,7 @@ class operator_impl> { node_id = node_pos - node_id; } - auto pos_end = louds.find_next_set(node_pos); + auto pos_end = louds.find_next(node_pos); return node_id + (pos_end - node_pos); } From fdca1c2f28150a0362d8b51e790943b002ba1efb Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Sun, 10 Sep 2023 06:14:02 +0000 Subject: [PATCH 10/41] Template variable naming style --- include/cuco/detail/trie/trie.inl | 40 +++++++++++++-------------- include/cuco/detail/trie/trie_ref.inl | 14 +++++----- include/cuco/trie.cuh | 20 +++++++------- include/cuco/trie_ref.cuh | 12 ++++---- 4 files changed, 43 insertions(+), 43 deletions(-) diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index 1735c4d12..6538b4965 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -20,8 +20,8 @@ namespace cuco { namespace experimental { -template -constexpr trie::trie() +template +constexpr trie::trie() : num_keys_{0}, num_nodes_{1}, last_key_{}, @@ -37,15 +37,15 @@ constexpr trie::trie() levels_[0].labels_.push_back(root_label_); } -template -trie::~trie() noexcept(false) +template +trie::~trie() noexcept(false) { if (d_levels_ptr_) { CUCO_CUDA_TRY(cudaFree(d_levels_ptr_)); } if (device_ptr_) { CUCO_CUDA_TRY(cudaFree(device_ptr_)); } } -template -void trie::insert(const std::vector& key) noexcept +template +void trie::insert(const std::vector& key) noexcept { if (key == last_key_) { return; } // Ignore duplicate keys assert(num_keys_ == 0 || key > last_key_); // Keys are expected to be inserted in sorted order @@ -95,8 +95,8 @@ void trie::insert(const std::vector& key) noexcept last_key_ = key; } -template -void trie::build() noexcept(false) +template +void trie::build() noexcept(false) { // Perform build level-by-level for all levels, followed by a deep-copy from host to device size_type offset = 0; @@ -121,17 +121,17 @@ void trie::build() noexcept(false) cudaMemcpy(d_levels_ptr_, &levels_[0], sizeof(level) * num_levels_, cudaMemcpyHostToDevice)); // Finally create a device copy of full trie structure - CUCO_CUDA_TRY(cudaMalloc(&device_ptr_, sizeof(trie))); - CUCO_CUDA_TRY(cudaMemcpy(device_ptr_, this, sizeof(trie), cudaMemcpyHostToDevice)); + CUCO_CUDA_TRY(cudaMalloc(&device_ptr_, sizeof(trie))); + CUCO_CUDA_TRY(cudaMemcpy(device_ptr_, this, sizeof(trie), cudaMemcpyHostToDevice)); } -template +template template -void trie::lookup(KeyIt keys_begin, - OffsetIt offsets_begin, - OffsetIt offsets_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +void trie::lookup(KeyIt keys_begin, + OffsetIt offsets_begin, + OffsetIt offsets_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto num_keys = cuco::detail::distance(offsets_begin, offsets_end) - 1; if (num_keys == 0) { return; } @@ -159,16 +159,16 @@ __global__ void trie_lookup_kernel( } } -template +template template -auto trie::ref(Operators...) const noexcept +auto trie::ref(Operators...) const noexcept { static_assert(sizeof...(Operators), "No operators specified"); return ref_type{device_ptr_}; } -template -trie::level::level() : louds_{}, outs_{}, labels_{}, labels_ptr_{nullptr}, offset_{0} +template +trie::level::level() : louds_{}, outs_{}, labels_{}, labels_ptr_{nullptr}, offset_{0} { } diff --git a/include/cuco/detail/trie/trie_ref.inl b/include/cuco/detail/trie/trie_ref.inl index d4f70e3c6..8b39f3f63 100644 --- a/include/cuco/detail/trie/trie_ref.inl +++ b/include/cuco/detail/trie/trie_ref.inl @@ -1,18 +1,18 @@ namespace cuco { namespace experimental { -template -__host__ __device__ constexpr trie_ref::trie_ref( - const trie* trie) noexcept +template +__host__ __device__ constexpr trie_ref::trie_ref( + const trie* trie) noexcept : trie_{trie} { } namespace detail { -template -class operator_impl> { - using ref_type = trie_ref; +template +class operator_impl> { + using ref_type = trie_ref; using size_type = size_t; public: @@ -78,7 +78,7 @@ class operator_impl> { * * @return Boolean indicating success of search process */ - [[nodiscard]] __device__ bool search_label_in_children(label_type target, + [[nodiscard]] __device__ bool search_label_in_children(LabelType target, size_type& node_id, size_type level_id) const noexcept { diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index e94fd316b..410c87a94 100644 --- a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -26,7 +26,7 @@ namespace experimental { * * @tparam label_type type of individual characters of vector keys (eg. char or int) */ -template +template class trie { public: constexpr trie(); @@ -37,7 +37,7 @@ class trie { * * @param key Key to insert */ - void insert(const std::vector& key) noexcept; + void insert(const std::vector& key) noexcept; /** * @brief Build level-by-level trie indexes after inserting all keys @@ -88,11 +88,11 @@ class trie { [[nodiscard]] auto ref(Operators... ops) const noexcept; private: - size_type num_keys_; ///< Number of keys inserted into trie - size_type num_nodes_; ///< Number of internal nodes - std::vector last_key_; ///< Last key inserted into trie + size_type num_keys_; ///< Number of keys inserted into trie + size_type num_nodes_; ///< Number of internal nodes + std::vector last_key_; ///< Last key inserted into trie - static constexpr label_type root_label_ = sizeof(label_type) == 1 ? ' ' : -1; ///< Sentinel value + static constexpr LabelType root_label_ = sizeof(LabelType) == 1 ? ' ' : -1; ///< Sentinel value struct level; size_type num_levels_; ///< Number of trie levels @@ -106,11 +106,11 @@ class trie { bitset_ref* louds_refs_ptr_; ///< Raw pointer to d_louds_refs_ bitset_ref* outs_refs_ptr_; ///< Raw pointer to d_outs_refs_ - trie* device_ptr_; ///< Device-side copy of trie + trie* device_ptr_; ///< Device-side copy of trie template using ref_type = - cuco::experimental::trie_ref; ///< Non-owning container ref type + cuco::experimental::trie_ref; ///< Non-owning container ref type // Mixins need to be friends with this class in order to access private members template @@ -126,8 +126,8 @@ class trie { detail::dynamic_bitset<> louds_; ///< Indicates links to next and previous level detail::dynamic_bitset<> outs_; ///< Indicates terminal nodes of valid keys - thrust::device_vector labels_; ///< Stores individual characters of keys - label_type* labels_ptr_; ///< Raw pointer to labels + thrust::device_vector labels_; ///< Stores individual characters of keys + LabelType* labels_ptr_; ///< Raw pointer to labels size_type offset_; ///< Cumulative node count in parent levels }; diff --git a/include/cuco/trie_ref.cuh b/include/cuco/trie_ref.cuh index c5bffd126..57a0ee5dd 100644 --- a/include/cuco/trie_ref.cuh +++ b/include/cuco/trie_ref.cuh @@ -5,28 +5,28 @@ namespace cuco { namespace experimental { -template +template class trie; /** * @brief Device non-owning "ref" type that can be used in device code to perform arbitrary * operations defined in `include/cuco/operator.hpp` * - * @tparam label_type Trie label type + * @tparam LabelType Trie label type * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` */ -template -class trie_ref : public detail::operator_impl>... { +template +class trie_ref : public detail::operator_impl>... { public: /** * @brief Constructs trie_ref. * * @param trie Non-owning ref of trie */ - __host__ __device__ explicit constexpr trie_ref(const trie* trie) noexcept; + __host__ __device__ explicit constexpr trie_ref(const trie* trie) noexcept; private: - const trie* trie_; + const trie* trie_; // Mixins need to be friends with this class in order to access private members template From 10f1b05a688a894b6626d82fcf7e1fdd46c2ebd3 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Sun, 10 Sep 2023 06:26:11 +0000 Subject: [PATCH 11/41] Minor --- include/cuco/detail/trie/trie.inl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index 6538b4965..86ef763a1 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -147,8 +147,8 @@ template Date: Sun, 10 Sep 2023 07:55:10 +0000 Subject: [PATCH 12/41] Add allocator template parameter --- include/cuco/detail/trie/trie.inl | 40 ++++++++++++++------------- include/cuco/detail/trie/trie_ref.inl | 12 ++++---- include/cuco/trie.cuh | 14 ++++++++-- include/cuco/trie_ref.cuh | 11 ++++---- 4 files changed, 44 insertions(+), 33 deletions(-) diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index 86ef763a1..9e18f409f 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -20,9 +20,10 @@ namespace cuco { namespace experimental { -template -constexpr trie::trie() - : num_keys_{0}, +template +constexpr trie::trie(Allocator const& allocator) + : allocator_{allocator}, + num_keys_{0}, num_nodes_{1}, last_key_{}, num_levels_{2}, @@ -37,15 +38,15 @@ constexpr trie::trie() levels_[0].labels_.push_back(root_label_); } -template -trie::~trie() noexcept(false) +template +trie::~trie() noexcept(false) { if (d_levels_ptr_) { CUCO_CUDA_TRY(cudaFree(d_levels_ptr_)); } if (device_ptr_) { CUCO_CUDA_TRY(cudaFree(device_ptr_)); } } -template -void trie::insert(const std::vector& key) noexcept +template +void trie::insert(const std::vector& key) noexcept { if (key == last_key_) { return; } // Ignore duplicate keys assert(num_keys_ == 0 || key > last_key_); // Keys are expected to be inserted in sorted order @@ -95,8 +96,8 @@ void trie::insert(const std::vector& key) noexcept last_key_ = key; } -template -void trie::build() noexcept(false) +template +void trie::build() noexcept(false) { // Perform build level-by-level for all levels, followed by a deep-copy from host to device size_type offset = 0; @@ -125,13 +126,13 @@ void trie::build() noexcept(false) CUCO_CUDA_TRY(cudaMemcpy(device_ptr_, this, sizeof(trie), cudaMemcpyHostToDevice)); } -template +template template -void trie::lookup(KeyIt keys_begin, - OffsetIt offsets_begin, - OffsetIt offsets_end, - OutputIt outputs_begin, - cuda_stream_ref stream) const noexcept +void trie::lookup(KeyIt keys_begin, + OffsetIt offsets_begin, + OffsetIt offsets_end, + OutputIt outputs_begin, + cuda_stream_ref stream) const noexcept { auto num_keys = cuco::detail::distance(offsets_begin, offsets_end) - 1; if (num_keys == 0) { return; } @@ -159,16 +160,17 @@ __global__ void trie_lookup_kernel( } } -template +template template -auto trie::ref(Operators...) const noexcept +auto trie::ref(Operators...) const noexcept { static_assert(sizeof...(Operators), "No operators specified"); return ref_type{device_ptr_}; } -template -trie::level::level() : louds_{}, outs_{}, labels_{}, labels_ptr_{nullptr}, offset_{0} +template +trie::level::level() + : louds_{}, outs_{}, labels_{}, labels_ptr_{nullptr}, offset_{0} { } diff --git a/include/cuco/detail/trie/trie_ref.inl b/include/cuco/detail/trie/trie_ref.inl index 8b39f3f63..33e505539 100644 --- a/include/cuco/detail/trie/trie_ref.inl +++ b/include/cuco/detail/trie/trie_ref.inl @@ -1,18 +1,18 @@ namespace cuco { namespace experimental { -template -__host__ __device__ constexpr trie_ref::trie_ref( - const trie* trie) noexcept +template +__host__ __device__ constexpr trie_ref::trie_ref( + const trie* trie) noexcept : trie_{trie} { } namespace detail { -template -class operator_impl> { - using ref_type = trie_ref; +template +class operator_impl> { + using ref_type = trie_ref; using size_type = size_t; public: diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index 410c87a94..5fab14303 100644 --- a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -25,11 +25,17 @@ namespace experimental { * @brief Trie class * * @tparam label_type type of individual characters of vector keys (eg. char or int) + * @tparam Allocator Type of allocator used for device storage */ -template +template > class trie { public: - constexpr trie(); + /** + * @brief Constructs an empty trie + * + * @param allocator Allocator used for allocating device storage + */ + constexpr trie(Allocator const& allocator = Allocator{}); ~trie() noexcept(false); /** @@ -88,6 +94,7 @@ class trie { [[nodiscard]] auto ref(Operators... ops) const noexcept; private: + Allocator allocator_; ///< Allocator size_type num_keys_; ///< Number of keys inserted into trie size_type num_nodes_; ///< Number of internal nodes std::vector last_key_; ///< Last key inserted into trie @@ -110,7 +117,8 @@ class trie { template using ref_type = - cuco::experimental::trie_ref; ///< Non-owning container ref type + cuco::experimental::trie_ref; ///< Non-owning container ref + ///< type // Mixins need to be friends with this class in order to access private members template diff --git a/include/cuco/trie_ref.cuh b/include/cuco/trie_ref.cuh index 57a0ee5dd..244d3adfb 100644 --- a/include/cuco/trie_ref.cuh +++ b/include/cuco/trie_ref.cuh @@ -5,7 +5,7 @@ namespace cuco { namespace experimental { -template +template class trie; /** @@ -15,18 +15,19 @@ class trie; * @tparam LabelType Trie label type * @tparam Operators Device operator options defined in `include/cuco/operator.hpp` */ -template -class trie_ref : public detail::operator_impl>... { +template +class trie_ref + : public detail::operator_impl>... { public: /** * @brief Constructs trie_ref. * * @param trie Non-owning ref of trie */ - __host__ __device__ explicit constexpr trie_ref(const trie* trie) noexcept; + __host__ __device__ explicit constexpr trie_ref(const trie* trie) noexcept; private: - const trie* trie_; + const trie* trie_; // Mixins need to be friends with this class in order to access private members template From d2e339b0b788cf2c342dca645c0cc5c5e923e4ed Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Sun, 10 Sep 2023 07:55:57 +0000 Subject: [PATCH 13/41] Misc coding style --- include/cuco/detail/trie/trie_ref.inl | 10 +++++----- include/cuco/trie.cuh | 4 ++-- tests/trie/lookup_test.cu | 16 ++++++++-------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/cuco/detail/trie/trie_ref.inl b/include/cuco/detail/trie/trie_ref.inl index 33e505539..be385ed0a 100644 --- a/include/cuco/detail/trie/trie_ref.inl +++ b/include/cuco/detail/trie/trie_ref.inl @@ -50,14 +50,14 @@ class operator_impl - [[nodiscard]] __device__ size_type get_last_child_position(BitVectorRef louds, - size_type& node_id) const noexcept + template + [[nodiscard]] __device__ size_type last_child_position(BitsetRef louds, + size_type& node_id) const noexcept { size_type node_pos = 0; if (node_id != 0) { @@ -85,7 +85,7 @@ class operator_impl(*this).trie_; auto louds = trie->louds_refs_ptr_[level_id]; - auto end = get_last_child_position(louds, node_id); // Position of last child + auto end = last_child_position(louds, node_id); // Position of last child auto begin = node_id; // Position of first child, initialized after find_last_child call auto& level = trie->d_levels_ptr_[level_id]; diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index 5fab14303..099da41f3 100644 --- a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -107,8 +107,8 @@ class trie { level* d_levels_ptr_; ///< Device-side array of levels using bitset_ref = detail::dynamic_bitset<>::ref_type; ///< Read ref - thrust::device_vector louds_refs_; ///< refs to per-level louds bitvectors - thrust::device_vector outs_refs_; ///< refs to per-level outs bitvectors + thrust::device_vector louds_refs_; ///< refs to per-level louds bitsets + thrust::device_vector outs_refs_; ///< refs to per-level outs bitsets bitset_ref* louds_refs_ptr_; ///< Raw pointer to d_louds_refs_ bitset_ref* outs_refs_ptr_; ///< Raw pointer to d_outs_refs_ diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu index ed1bf7721..b431aa002 100644 --- a/tests/trie/lookup_test.cu +++ b/tests/trie/lookup_test.cu @@ -25,14 +25,14 @@ #include struct valid_key { - valid_key(uint64_t num_keys) : num_keys_(num_keys) {} - __host__ __device__ bool operator()(uint64_t x) const { return x < num_keys_; } - const uint64_t num_keys_; + valid_key(size_t num_keys) : num_keys_(num_keys) {} + __host__ __device__ bool operator()(size_t x) const { return x < num_keys_; } + const size_t num_keys_; }; template void generate_keys(thrust::host_vector& keys, - thrust::host_vector& offsets, + thrust::host_vector& offsets, size_t num_keys, size_t max_key_value, size_t max_key_length) @@ -58,7 +58,7 @@ TEST_CASE("Lookup test", "") std::size_t max_key_value = 1000; std::size_t max_key_length = 32; thrust::host_vector keys; - thrust::host_vector offsets; + thrust::host_vector offsets; generate_keys(keys, offsets, num_keys, max_key_value, max_key_length); @@ -97,9 +97,9 @@ TEST_CASE("Lookup test", "") trie.build(); { - thrust::device_vector lookup_result(num_keys, -1lu); - thrust::device_vector device_keys = keys; - thrust::device_vector device_offsets = offsets; + thrust::device_vector lookup_result(num_keys, -1lu); + thrust::device_vector device_keys = keys; + thrust::device_vector device_offsets = offsets; trie.lookup( device_keys.begin(), device_offsets.begin(), device_offsets.end(), lookup_result.begin()); From 74effe6758dcddd208ade996decfc2af49079cee Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Sun, 10 Sep 2023 17:36:23 +0000 Subject: [PATCH 14/41] Comments --- include/cuco/detail/trie/trie.inl | 2 +- include/cuco/detail/trie/trie_ref.inl | 11 ++++++++--- include/cuco/trie.cuh | 18 ++++++++++++------ 3 files changed, 21 insertions(+), 10 deletions(-) diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index 9e18f409f..4accd08fc 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -155,7 +155,7 @@ __global__ void trie_lookup_kernel( auto key_start_pos = keys + offsets[key_id]; auto key_length = offsets[key_id + 1] - offsets[key_id]; - outputs[key_id] = ref.lookup_key(key_start_pos, key_length); + outputs[key_id] = ref.lookup(key_start_pos, key_length); key_id += loop_stride; } } diff --git a/include/cuco/detail/trie/trie_ref.inl b/include/cuco/detail/trie/trie_ref.inl index be385ed0a..b9ace0421 100644 --- a/include/cuco/detail/trie/trie_ref.inl +++ b/include/cuco/detail/trie/trie_ref.inl @@ -17,15 +17,18 @@ class operator_impl - [[nodiscard]] __device__ size_type lookup_key(KeyIt key, size_type length) const noexcept + [[nodiscard]] __device__ size_type lookup(KeyIt key, size_type length) const noexcept { auto const& trie = static_cast(*this).trie_; @@ -50,6 +53,8 @@ class operator_impl From c6d300faba53f97c866fb4a7d39c32f52a9d773f Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 12 Sep 2023 22:06:03 +0000 Subject: [PATCH 15/41] Run a dummy test() on bitsets --- .../cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl | 1 + include/cuco/detail/trie/trie.inl | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index d56ef9d7c..b1fcb3ea2 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -82,6 +82,7 @@ constexpr void dynamic_bitset::test(KeyIt keys_begin, { build(); + if (n_bits_ == 0) { return; } auto const num_keys = cuco::detail::distance(keys_begin, keys_end); if (num_keys == 0) { return; } diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index 4accd08fc..3d018c548 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -101,8 +101,16 @@ void trie::build() noexcept(false) { // Perform build level-by-level for all levels, followed by a deep-copy from host to device size_type offset = 0; + + thrust::device_vector test_keys(1, 0); + thrust::device_vector test_results(1); + for (auto& level : levels_) { + // Run host-bulk test on bitvectors to initiate internal build() + level.louds_.test(test_keys.begin(), test_keys.end(), test_results.begin()); louds_refs_.push_back(level.louds_.ref()); + + level.outs_.test(test_keys.begin(), test_keys.end(), test_results.begin()); outs_refs_.push_back(level.outs_.ref()); level.labels_ptr_ = thrust::raw_pointer_cast(level.labels_.data()); From c57e491a90302fdda4e75ac49e286bf282b6ebe1 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Tue, 12 Sep 2023 22:07:02 +0000 Subject: [PATCH 16/41] Use allocator for member classes --- include/cuco/detail/trie/trie.inl | 5 +++-- include/cuco/trie.cuh | 32 ++++++++++++++++++------------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index 3d018c548..92844f2f1 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -130,8 +130,9 @@ void trie::build() noexcept(false) cudaMemcpy(d_levels_ptr_, &levels_[0], sizeof(level) * num_levels_, cudaMemcpyHostToDevice)); // Finally create a device copy of full trie structure - CUCO_CUDA_TRY(cudaMalloc(&device_ptr_, sizeof(trie))); - CUCO_CUDA_TRY(cudaMemcpy(device_ptr_, this, sizeof(trie), cudaMemcpyHostToDevice)); + CUCO_CUDA_TRY(cudaMalloc(&device_ptr_, sizeof(trie))); + CUCO_CUDA_TRY( + cudaMemcpy(device_ptr_, this, sizeof(trie), cudaMemcpyHostToDevice)); } template diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index 4ae46cd5e..f1ee35e59 100644 --- a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -112,19 +112,22 @@ class trie { std::vector levels_; ///< Host-side array of levels level* d_levels_ptr_; ///< Device-side array of levels - using bitset_ref = detail::dynamic_bitset<>::ref_type; ///< Read ref - thrust::device_vector louds_refs_; ///< refs to per-level louds bitsets - thrust::device_vector outs_refs_; ///< refs to per-level outs bitsets + using bitset_ref = typename detail::dynamic_bitset::ref_type; ///< Read ref + /// Type of the allocator to (de)allocate bitset refs + using bitset_allocator_type = typename std::allocator_traits::rebind_alloc; + ///< refs to per-level louds bitsets + thrust::device_vector louds_refs_; + ///< refs to per-level outs bitsets + thrust::device_vector outs_refs_; - bitset_ref* louds_refs_ptr_; ///< Raw pointer to d_louds_refs_ - bitset_ref* outs_refs_ptr_; ///< Raw pointer to d_outs_refs_ + bitset_ref* louds_refs_ptr_; ///< Raw device pointer to louds_refs_ + bitset_ref* outs_refs_ptr_; ///< Raw device pointer to outs_refs_ - trie* device_ptr_; ///< Device-side copy of trie + trie* device_ptr_; ///< Device-side copy of trie + ///< Non-owning container ref type template - using ref_type = - cuco::experimental::trie_ref; ///< Non-owning container ref - ///< type + using ref_type = cuco::experimental::trie_ref; // Mixins need to be friends with this class in order to access private members template @@ -137,11 +140,14 @@ class trie { level(); level(level&&) = default; ///< Move constructor - detail::dynamic_bitset<> louds_; ///< Indicates links to next and previous level - detail::dynamic_bitset<> outs_; ///< Indicates terminal nodes of valid keys + detail::dynamic_bitset louds_; ///< Indicates links to next and previous level + detail::dynamic_bitset outs_; ///< Indicates terminal nodes of valid keys - thrust::device_vector labels_; ///< Stores individual characters of keys - LabelType* labels_ptr_; ///< Raw pointer to labels + /// Type of the allocator to (de)allocate labels + using label_allocator_type = typename std::allocator_traits::rebind_alloc; + ///< Stores individual characters of keys + thrust::device_vector labels_; + LabelType* labels_ptr_; ///< Raw device pointer to labels size_type offset_; ///< Cumulative node count in parent levels }; From 4e4a08f0c532bac8bfe6953e56f9505806ec80e9 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 13 Sep 2023 00:38:42 +0000 Subject: [PATCH 17/41] Buffer bitset updates on host --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 10 ++++ .../trie/dynamic_bitset/dynamic_bitset.inl | 15 ++++++ include/cuco/detail/trie/trie.inl | 36 +++++++------ include/cuco/trie.cuh | 50 +++++++++++++++++-- 4 files changed, 94 insertions(+), 17 deletions(-) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 8383669fc..68c30c5b8 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -102,6 +102,16 @@ class dynamic_bitset { */ constexpr dynamic_bitset(Allocator const& allocator = Allocator{}); + /** + * @brief Inserts words in the range [word_begin, word_end) + * + * @param words_begin Begin iterator to words list + * @param words_end End iterator to words list + * @param n_bits Number of bits to be inserted + */ + template + constexpr void insert(WordIt words_begin, WordIt words_end, size_type n_bits); + /** * @brief Appends the given element `value` to the end of the bitset * diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index b1fcb3ea2..5525c6684 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -44,6 +44,21 @@ constexpr dynamic_bitset::dynamic_bitset(Allocator const& allocator) { } +template +template +constexpr void dynamic_bitset::insert(WordIt words_begin, + WordIt words_end, + size_type n_bits) +{ + if (n_bits == 0) { return; } + size_t num_blocks = (n_bits - 1) / bits_per_block + 1; + assert(num_blocks == cuco::detail::distance(words_begin, words_end)); + + words_.reserve(num_blocks); + words_.insert(words_.end(), words_begin, words_end); + n_bits_ = n_bits; +} + template constexpr void dynamic_bitset::push_back(bool bit) noexcept { diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index 92844f2f1..f1ce68a6f 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -31,10 +31,10 @@ constexpr trie::trie(Allocator const& allocator) d_levels_ptr_{nullptr}, device_ptr_{nullptr} { - levels_[0].louds_.push_back(0); - levels_[0].louds_.push_back(1); - levels_[1].louds_.push_back(1); - levels_[0].outs_.push_back(0); + levels_[0].h_louds_.push_back(0); + levels_[0].h_louds_.push_back(1); + levels_[1].h_louds_.push_back(1); + levels_[0].h_outs_.push_back(0); levels_[0].labels_.push_back(root_label_); } @@ -52,7 +52,7 @@ void trie::insert(const std::vector& key) noexc assert(num_keys_ == 0 || key > last_key_); // Keys are expected to be inserted in sorted order if (key.empty()) { - levels_[0].outs_.set(0, 1); + levels_[0].h_outs_.set(0, 1); ++levels_[1].offset_; ++num_keys_; return; @@ -68,9 +68,9 @@ void trie::insert(const std::vector& key) noexc auto label = key[pos]; if ((pos == last_key_.size()) || (label != level.labels_.back())) { - level.louds_.set_last(0); - level.louds_.push_back(1); - level.outs_.push_back(0); + level.h_louds_.set_last(0); + level.h_louds_.push_back(1); + level.h_outs_.push_back(0); level.labels_.push_back(label); ++num_nodes_; break; @@ -81,16 +81,16 @@ void trie::insert(const std::vector& key) noexc // Each such label will create a new edge and node pair for (++pos; pos < key.size(); ++pos) { auto& level = levels_[pos + 1]; - level.louds_.push_back(0); - level.louds_.push_back(1); - level.outs_.push_back(0); + level.h_louds_.push_back(0); + level.h_louds_.push_back(1); + level.h_outs_.push_back(0); level.labels_.push_back(key[pos]); ++num_nodes_; } - levels_[key.size() + 1].louds_.push_back(1); // Mark end of current key + levels_[key.size() + 1].h_louds_.push_back(1); // Mark end of current key ++levels_[key.size() + 1].offset_; - levels_[key.size()].outs_.set_last(1); // Set terminal bit indicating valid path + levels_[key.size()].h_outs_.set_last(1); // Set terminal bit indicating valid path ++num_keys_; last_key_ = key; @@ -106,10 +106,18 @@ void trie::build() noexcept(false) thrust::device_vector test_results(1); for (auto& level : levels_) { + level.louds_.insert( + level.h_louds_.words_.begin(), level.h_louds_.words_.end(), level.h_louds_.n_bits_); + level.h_louds_.clear(); + // Run host-bulk test on bitvectors to initiate internal build() level.louds_.test(test_keys.begin(), test_keys.end(), test_results.begin()); louds_refs_.push_back(level.louds_.ref()); + level.outs_.insert( + level.h_outs_.words_.begin(), level.h_outs_.words_.end(), level.h_outs_.n_bits_); + level.h_outs_.clear(); + level.outs_.test(test_keys.begin(), test_keys.end(), test_results.begin()); outs_refs_.push_back(level.outs_.ref()); @@ -179,7 +187,7 @@ auto trie::ref(Operators...) const noexcept template trie::level::level() - : louds_{}, outs_{}, labels_{}, labels_ptr_{nullptr}, offset_{0} + : h_louds_{}, h_outs_{}, louds_{}, outs_{}, labels_{}, labels_ptr_{nullptr}, offset_{0} { } diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index f1ee35e59..b24578bac 100644 --- a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -17,6 +17,7 @@ #include #include +#include namespace cuco { namespace experimental { @@ -112,7 +113,8 @@ class trie { std::vector levels_; ///< Host-side array of levels level* d_levels_ptr_; ///< Device-side array of levels - using bitset_ref = typename detail::dynamic_bitset::ref_type; ///< Read ref + using bitset_type = typename detail::dynamic_bitset; ///< Bitset type + using bitset_ref = typename bitset_type::ref_type; ///< Bitset ref /// Type of the allocator to (de)allocate bitset refs using bitset_allocator_type = typename std::allocator_traits::rebind_alloc; ///< refs to per-level louds bitsets @@ -133,6 +135,45 @@ class trie { template friend class detail::operator_impl; + // Host bitset to buffer bit updates before bulk initializing dynamic_bitset on device + // TODO: This struct replicates code from dynamic_bitset. Remove these parts from dynamic_bitset? + struct host_bitset { + using word_type = typename bitset_type::word_type; + thrust::host_vector words_; + size_type n_bits_; + + host_bitset() noexcept : n_bits_{0} {} + void push_back(bool bit) noexcept + { + if (n_bits_ % bits_per_block == 0) { + words_.resize(words_.size() + words_per_block); // Extend storage by one block + } + + set(n_bits_++, bit); + } + void set(size_type index, bool bit) noexcept + { + size_type word_id = index / bits_per_word; + size_type bit_id = index % bits_per_word; + if (bit) { + words_[word_id] |= 1UL << bit_id; + } else { + words_[word_id] &= ~(1UL << bit_id); + } + } + void set_last(bool bit) noexcept { set(n_bits_ - 1, bit); } + void clear() noexcept + { + words_.clear(); + n_bits_ = 0; + } + + private: + const size_type words_per_block = bitset_type::words_per_block; + const size_type bits_per_block = bitset_type::bits_per_block; + const size_type bits_per_word = bitset_type::bits_per_word; + }; + /** * @brief Struct to represent each trie level */ @@ -140,8 +181,11 @@ class trie { level(); level(level&&) = default; ///< Move constructor - detail::dynamic_bitset louds_; ///< Indicates links to next and previous level - detail::dynamic_bitset outs_; ///< Indicates terminal nodes of valid keys + bitset_type louds_; ///< Indicates links to next and previous level + bitset_type outs_; ///< Indicates terminal nodes of valid keys + + host_bitset h_louds_; ///< Host buffer for louds + host_bitset h_outs_; ///< Host buffer for outs /// Type of the allocator to (de)allocate labels using label_allocator_type = typename std::allocator_traits::rebind_alloc; From 6a8af914dab56939aa73a3461cb6e21fbe425f49 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Mon, 2 Oct 2023 04:30:12 +0000 Subject: [PATCH 18/41] Minor changes in lookup test --- tests/trie/lookup_test.cu | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu index b431aa002..68fe23d3d 100644 --- a/tests/trie/lookup_test.cu +++ b/tests/trie/lookup_test.cu @@ -34,19 +34,17 @@ template void generate_keys(thrust::host_vector& keys, thrust::host_vector& offsets, size_t num_keys, - size_t max_key_value, size_t max_key_length) { for (size_t key_id = 0; key_id < num_keys; key_id++) { size_t cur_key_length = 1 + (std::rand() % max_key_length); offsets.push_back(cur_key_length); for (size_t pos = 0; pos < cur_key_length; pos++) { - keys.push_back(1 + (std::rand() % max_key_value)); + keys.push_back(std::rand()); } } - // Add a dummy 0 to simplify subsequent scan - offsets.push_back(0); + offsets.push_back(0); // Extend size by 1 for subsequent scan thrust::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin()); // in-place scan } @@ -55,12 +53,11 @@ TEST_CASE("Lookup test", "") using KeyType = int; std::size_t num_keys = 64 * 1024; - std::size_t max_key_value = 1000; - std::size_t max_key_length = 32; + std::size_t max_key_length = 6; thrust::host_vector keys; thrust::host_vector offsets; - generate_keys(keys, offsets, num_keys, max_key_value, max_key_length); + generate_keys(keys, offsets, num_keys, max_key_length); cuco::experimental::trie trie; @@ -75,7 +72,7 @@ TEST_CASE("Lookup test", "") } struct vectorKeyCompare { - bool operator()(const std::vector& lhs, const std::vector& rhs) + bool operator()(const std::vector& lhs, const std::vector& rhs) const { for (size_t pos = 0; pos < min(lhs.size(), rhs.size()); pos++) { if (lhs[pos] < rhs[pos]) { @@ -92,9 +89,9 @@ TEST_CASE("Lookup test", "") for (auto key : all_keys) { trie.insert(key); } - } - trie.build(); + trie.build(); + } { thrust::device_vector lookup_result(num_keys, -1lu); From 248d71fe5ae9a38769e3729a76fdb12d49f5d18e Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 11 Oct 2023 05:43:57 +0000 Subject: [PATCH 19/41] Use iterators as parameters to insert() --- include/cuco/detail/trie/trie.inl | 39 ++++++++++++++++++++----------- include/cuco/trie.cuh | 8 +++++-- 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index f1ce68a6f..0fba69cb1 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -46,28 +46,35 @@ trie::~trie() noexcept(false) } template -void trie::insert(const std::vector& key) noexcept +template +void trie::insert(KeyIt keys_begin, KeyIt keys_end) noexcept { - if (key == last_key_) { return; } // Ignore duplicate keys - assert(num_keys_ == 0 || key > last_key_); // Keys are expected to be inserted in sorted order + size_t key_length = std::distance(keys_begin, keys_end); - if (key.empty()) { + bool same_as_last_key = key_length == last_key_.size(); + for (size_t pos = 0; same_as_last_key && pos < last_key_.size(); pos++) { + if (keys_begin[pos] != last_key_[pos]) { same_as_last_key = false; } + } + if (same_as_last_key) { return; } // Ignore duplicate keys + // assert(num_keys_ == 0 || key > last_key_); // Keys are expected to be inserted in sorted order + + if (key_length == 0) { levels_[0].h_outs_.set(0, 1); ++levels_[1].offset_; ++num_keys_; return; } - if (key.size() + 1 >= levels_.size()) { levels_.resize(key.size() + 2); } + if (key_length + 1 >= levels_.size()) { levels_.resize(key_length + 2); } // Find first position where label is different from last_key // Trie is not updated till that position is reached, simply skip to next position size_type pos = 0; - for (; pos < key.size(); ++pos) { + for (; pos < key_length; ++pos) { auto& level = levels_[pos + 1]; - auto label = key[pos]; + auto label = keys_begin[pos]; - if ((pos == last_key_.size()) || (label != level.labels_.back())) { + if (pos == last_key_.size() || label != level.labels_.back()) { level.h_louds_.set_last(0); level.h_louds_.push_back(1); level.h_outs_.push_back(0); @@ -79,21 +86,25 @@ void trie::insert(const std::vector& key) noexc // Process remaining labels after divergence point from last_key // Each such label will create a new edge and node pair - for (++pos; pos < key.size(); ++pos) { + for (++pos; pos < key_length; ++pos) { auto& level = levels_[pos + 1]; level.h_louds_.push_back(0); level.h_louds_.push_back(1); level.h_outs_.push_back(0); - level.labels_.push_back(key[pos]); + level.labels_.push_back(keys_begin[pos]); ++num_nodes_; } - levels_[key.size() + 1].h_louds_.push_back(1); // Mark end of current key - ++levels_[key.size() + 1].offset_; - levels_[key.size()].h_outs_.set_last(1); // Set terminal bit indicating valid path + levels_[key_length + 1].h_louds_.push_back(1); // Mark end of current key + ++levels_[key_length + 1].offset_; + levels_[key_length].h_outs_.set_last(1); // Set terminal bit indicating valid path ++num_keys_; - last_key_ = key; + + last_key_.resize(key_length); + for (size_t pos = 0; pos < key_length; pos++) { + last_key_[pos] = keys_begin[pos]; + } } template diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index b24578bac..59d4dedd9 100644 --- a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -42,9 +42,13 @@ class trie { /** * @brief Insert a single key into trie * - * @param key Key to insert + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to trie's + * `LabelType` + * @param keys_begin Begin iterator to list of labels of input key + * @param keys_end End iterator to list of labels of input key */ - void insert(const std::vector& key) noexcept; + template + void insert(KeyIt keys_begin, KeyIt keys_end) noexcept; /** * @brief Build level-by-level trie indexes after inserting all keys From 25eaa9d4abe8ad1ca7e189fabf024c290799f93e Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 11 Oct 2023 05:45:06 +0000 Subject: [PATCH 20/41] Move key generation utilities into different file --- tests/trie/common.hpp | 40 ++++++++++++++++++++++++++++++++++++++ tests/trie/lookup_test.cu | 41 +++------------------------------------ 2 files changed, 43 insertions(+), 38 deletions(-) create mode 100644 tests/trie/common.hpp diff --git a/tests/trie/common.hpp b/tests/trie/common.hpp new file mode 100644 index 000000000..6f1d6e0be --- /dev/null +++ b/tests/trie/common.hpp @@ -0,0 +1,40 @@ +#pragma once + +struct valid_key { + valid_key(size_t num_keys) : num_keys_(num_keys) {} + __host__ __device__ bool operator()(size_t x) const { return x < num_keys_; } + const size_t num_keys_; +}; + +template +void generate_keys(thrust::host_vector& keys, + thrust::host_vector& offsets, + size_t num_keys, + size_t max_key_length) +{ + for (size_t key_id = 0; key_id < num_keys; key_id++) { + size_t cur_key_length = 1 + (std::rand() % max_key_length); + offsets.push_back(cur_key_length); + for (size_t pos = 0; pos < cur_key_length; pos++) { + keys.push_back(std::rand() % 100000); + } + } + + offsets.push_back(0); // Extend size by 1 for subsequent scan + thrust::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin()); // in-place scan +} + +template +struct vectorKeyCompare { + bool operator()(const std::vector& lhs, const std::vector& rhs) const + { + for (size_t pos = 0; pos < min(lhs.size(), rhs.size()); pos++) { + if (lhs[pos] < rhs[pos]) { + return true; + } else if (lhs[pos] > rhs[pos]) { + return false; + } + } + return lhs.size() <= rhs.size(); + } +}; diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu index 68fe23d3d..c929e77fb 100644 --- a/tests/trie/lookup_test.cu +++ b/tests/trie/lookup_test.cu @@ -24,29 +24,7 @@ #include -struct valid_key { - valid_key(size_t num_keys) : num_keys_(num_keys) {} - __host__ __device__ bool operator()(size_t x) const { return x < num_keys_; } - const size_t num_keys_; -}; - -template -void generate_keys(thrust::host_vector& keys, - thrust::host_vector& offsets, - size_t num_keys, - size_t max_key_length) -{ - for (size_t key_id = 0; key_id < num_keys; key_id++) { - size_t cur_key_length = 1 + (std::rand() % max_key_length); - offsets.push_back(cur_key_length); - for (size_t pos = 0; pos < cur_key_length; pos++) { - keys.push_back(std::rand()); - } - } - - offsets.push_back(0); // Extend size by 1 for subsequent scan - thrust::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin()); // in-place scan -} +#include "common.hpp" TEST_CASE("Lookup test", "") { @@ -71,23 +49,10 @@ TEST_CASE("Lookup test", "") all_keys.push_back(cur_key); } - struct vectorKeyCompare { - bool operator()(const std::vector& lhs, const std::vector& rhs) const - { - for (size_t pos = 0; pos < min(lhs.size(), rhs.size()); pos++) { - if (lhs[pos] < rhs[pos]) { - return true; - } else if (lhs[pos] > rhs[pos]) { - return false; - } - } - return lhs.size() <= rhs.size(); - } - }; - sort(all_keys.begin(), all_keys.end(), vectorKeyCompare()); + sort(all_keys.begin(), all_keys.end(), vectorKeyCompare()); for (auto key : all_keys) { - trie.insert(key); + trie.insert(key.begin(), key.end()); } trie.build(); From 2753a38e53c4359673b8a707d958bb61bb347cf4 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 11 Oct 2023 19:53:46 +0000 Subject: [PATCH 21/41] Limit grid size to 128 --- include/cuco/detail/trie/trie.inl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index 0fba69cb1..340ac8219 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -165,7 +165,7 @@ void trie::lookup(KeyIt keys_begin, auto num_keys = cuco::detail::distance(offsets_begin, offsets_end) - 1; if (num_keys == 0) { return; } - auto const grid_size = cuco::detail::grid_size(num_keys); + auto const grid_size = min(128lu, cuco::detail::grid_size(num_keys)); auto ref_ = this->ref(cuco::experimental::trie_lookup); trie_lookup_kernel<<>>( From 01c35dfd6d7297e7cee2dd4af20535fbd4008f87 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 11 Oct 2023 21:32:32 +0000 Subject: [PATCH 22/41] Add performance test --- tests/CMakeLists.txt | 3 +- tests/trie/common.hpp | 17 +++++ tests/trie/perf_test.cu | 146 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 165 insertions(+), 1 deletion(-) create mode 100644 tests/trie/perf_test.cu diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b837a3964..bfb731231 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -109,4 +109,5 @@ ConfigureTest(DYNAMIC_BITSET_TEST ################################################################################################### # - trie tests ------------------------------------------------------------------------------ ConfigureTest(TRIE_TEST - trie/lookup_test.cu) + trie/lookup_test.cu + trie/perf_test.cu) diff --git a/tests/trie/common.hpp b/tests/trie/common.hpp index 6f1d6e0be..4abaa1eb4 100644 --- a/tests/trie/common.hpp +++ b/tests/trie/common.hpp @@ -38,3 +38,20 @@ struct vectorKeyCompare { return lhs.size() <= rhs.size(); } }; + +inline std::chrono::high_resolution_clock::time_point current_time() +{ + return std::chrono::high_resolution_clock::now(); +} +inline size_t elapsed_seconds(std::chrono::high_resolution_clock::time_point begin) +{ + return std::chrono::duration_cast(current_time() - begin).count(); +} +inline size_t elapsed_milliseconds(std::chrono::high_resolution_clock::time_point begin) +{ + return std::chrono::duration_cast(current_time() - begin).count(); +} +inline size_t elapsed_microseconds(std::chrono::high_resolution_clock::time_point begin) +{ + return std::chrono::duration_cast(current_time() - begin).count(); +} diff --git a/tests/trie/perf_test.cu b/tests/trie/perf_test.cu new file mode 100644 index 000000000..1e0464381 --- /dev/null +++ b/tests/trie/perf_test.cu @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "common.hpp" + +using namespace std; + +vector read_input_keys(const char* filename, size_t num_keys) +{ + ifstream input_file(filename); + if (!input_file.is_open()) { + std::cout << "Error opening file: " << filename << std::endl; + exit(1); + } + vector keys; + string line; + while (keys.size() < num_keys and getline(input_file, line)) { + keys.push_back(line); + } + return keys; +} + +template +vector split_str_into_ints(const string& key) +{ + stringstream ss(key); + vector tokens; + string buf; + + while (ss >> buf) { + tokens.push_back(stoi(buf)); + } + return tokens; +} + +template +vector> generate_split_keys(const vector& keys) +{ + vector> split_keys(keys.size()); + for (size_t i = 0; i < keys.size(); i++) { + split_keys[i] = split_str_into_ints(keys[i]); + } + return split_keys; +} + +template +void find_pivots(const vector>& keys, + std::vector& pivot_vals, + std::vector& pivot_offsets) +{ + pivot_vals.push_back(keys[0][1]); + pivot_offsets.push_back(0); + + for (size_t pos = 1; pos < keys.size(); pos++) { + if (keys[pos][1] != keys[pos - 1][1]) { + pivot_vals.push_back(keys[pos][1]); + pivot_offsets.push_back(pos); + } + } + pivot_offsets.push_back(keys.size()); +} + +TEST_CASE("Perf test", "") +{ + using KeyType = int; + + const char* input_filename = "trie_dataset.txt"; + auto keys = generate_split_keys(read_input_keys(input_filename, 1000 * 1000)); + size_t num_keys = keys.size(); + std::cout << "Num keys " << num_keys << std::endl; + + auto begin = current_time(); + cuco::experimental::trie trie; + for (auto& key : keys) { + trie.insert(key.begin(), key.end()); + } + auto insert_sec = elapsed_seconds(begin); + std::cout << "Insert time " << insert_sec << "s "; + std::cout << std::setprecision(2) << (1. * num_keys / insert_sec) / 1000 << "K keys/sec" + << std::endl; + + begin = current_time(); + trie.build(); + auto build_msec = elapsed_milliseconds(begin); + + std::cout << "Build time " << build_msec << "ms "; + std::cout << std::setprecision(2) << (1. * num_keys / build_msec) / 1000 << "M keys/sec" + << std::endl; + + std::random_shuffle(keys.begin(), keys.end()); + + thrust::host_vector lookup_inputs; + thrust::host_vector lookup_offsets; + lookup_offsets.push_back(0); + for (auto key : keys) { + for (auto subkey : key) { + lookup_inputs.push_back(subkey); + } + lookup_offsets.push_back(lookup_offsets.back() + key.size()); + } + std::cout << "Average key length " << std::setprecision(2) + << 1. * lookup_offsets.back() / num_keys << std::endl; + + thrust::device_vector d_lookup_inputs = lookup_inputs; + thrust::device_vector d_lookup_offsets = lookup_offsets; + thrust::device_vector d_lookup_result(num_keys, -1lu); + + begin = current_time(); + trie.lookup(d_lookup_inputs.begin(), + d_lookup_offsets.begin(), + d_lookup_offsets.end(), + d_lookup_result.begin()); + auto lookup_usec = elapsed_microseconds(begin); + std::cout << "Lookup time " << lookup_usec << "us "; + std::cout << std::setprecision(2) << (num_keys / lookup_usec) / 1000 << "B keys/sec" << std::endl; + + REQUIRE(cuco::test::all_of(d_lookup_result.begin(), d_lookup_result.end(), valid_key(num_keys))); +} From 715173306a37bed5b565cfeabc4f3ea976e03ed1 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Thu, 12 Oct 2023 07:02:24 +0000 Subject: [PATCH 23/41] Store labels in host vector during insertions This improves insertion perfomance massively --- include/cuco/detail/trie/trie.inl | 19 +++++++++++++----- include/cuco/trie.cuh | 2 ++ tests/trie/perf_test.cu | 32 +++++++++++++++++++------------ 3 files changed, 36 insertions(+), 17 deletions(-) diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index 340ac8219..b49f740af 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -35,7 +35,7 @@ constexpr trie::trie(Allocator const& allocator) levels_[0].h_louds_.push_back(1); levels_[1].h_louds_.push_back(1); levels_[0].h_outs_.push_back(0); - levels_[0].labels_.push_back(root_label_); + levels_[0].h_labels_.push_back(root_label_); } template @@ -74,11 +74,11 @@ void trie::insert(KeyIt keys_begin, KeyIt keys_end) noexce auto& level = levels_[pos + 1]; auto label = keys_begin[pos]; - if (pos == last_key_.size() || label != level.labels_.back()) { + if (pos == last_key_.size() || label != level.h_labels_.back()) { level.h_louds_.set_last(0); level.h_louds_.push_back(1); level.h_outs_.push_back(0); - level.labels_.push_back(label); + level.h_labels_.push_back(label); ++num_nodes_; break; } @@ -91,7 +91,7 @@ void trie::insert(KeyIt keys_begin, KeyIt keys_end) noexce level.h_louds_.push_back(0); level.h_louds_.push_back(1); level.h_outs_.push_back(0); - level.labels_.push_back(keys_begin[pos]); + level.h_labels_.push_back(keys_begin[pos]); ++num_nodes_; } @@ -132,6 +132,8 @@ void trie::build() noexcept(false) level.outs_.test(test_keys.begin(), test_keys.end(), test_results.begin()); outs_refs_.push_back(level.outs_.ref()); + level.labels_ = level.h_labels_; + level.h_labels_.clear(); level.labels_ptr_ = thrust::raw_pointer_cast(level.labels_.data()); offset += level.offset_; @@ -198,7 +200,14 @@ auto trie::ref(Operators...) const noexcept template trie::level::level() - : h_louds_{}, h_outs_{}, louds_{}, outs_{}, labels_{}, labels_ptr_{nullptr}, offset_{0} + : h_louds_{}, + h_outs_{}, + louds_{}, + outs_{}, + labels_{}, + labels_ptr_{nullptr}, + h_labels_{}, + offset_{0} { } diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index 59d4dedd9..1b03f8111 100644 --- a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -197,6 +197,8 @@ class trie { thrust::device_vector labels_; LabelType* labels_ptr_; ///< Raw device pointer to labels + std::vector h_labels_; ///< Host copy of labels, using std::vector for performance + size_type offset_; ///< Cumulative node count in parent levels }; }; diff --git a/tests/trie/perf_test.cu b/tests/trie/perf_test.cu index 1e0464381..d1944aadd 100644 --- a/tests/trie/perf_test.cu +++ b/tests/trie/perf_test.cu @@ -93,7 +93,7 @@ TEST_CASE("Perf test", "") using KeyType = int; const char* input_filename = "trie_dataset.txt"; - auto keys = generate_split_keys(read_input_keys(input_filename, 1000 * 1000)); + auto keys = generate_split_keys(read_input_keys(input_filename, 45 * 1000 * 1000)); size_t num_keys = keys.size(); std::cout << "Num keys " << num_keys << std::endl; @@ -102,17 +102,18 @@ TEST_CASE("Perf test", "") for (auto& key : keys) { trie.insert(key.begin(), key.end()); } - auto insert_sec = elapsed_seconds(begin); - std::cout << "Insert time " << insert_sec << "s "; - std::cout << std::setprecision(2) << (1. * num_keys / insert_sec) / 1000 << "K keys/sec" + auto insert_msec = elapsed_milliseconds(begin); + + std::cout << "Insert " << std::setprecision(2) << insert_msec / 1000. << "s @ "; + std::cout << std::setprecision(2) << (1. * num_keys / insert_msec) / 1000 << "M keys/sec" << std::endl; begin = current_time(); trie.build(); auto build_msec = elapsed_milliseconds(begin); - std::cout << "Build time " << build_msec << "ms "; - std::cout << std::setprecision(2) << (1. * num_keys / build_msec) / 1000 << "M keys/sec" + std::cout << "Build " << build_msec << "ms @ "; + std::cout << std::setprecision(3) << (1. * num_keys / build_msec) / 1000 << "M keys/sec" << std::endl; std::random_shuffle(keys.begin(), keys.end()); @@ -126,21 +127,28 @@ TEST_CASE("Perf test", "") } lookup_offsets.push_back(lookup_offsets.back() + key.size()); } - std::cout << "Average key length " << std::setprecision(2) - << 1. * lookup_offsets.back() / num_keys << std::endl; + // std::cout << "Average key length " << std::setprecision(2) + // << 1. * lookup_offsets.back() / num_keys << std::endl; thrust::device_vector d_lookup_inputs = lookup_inputs; thrust::device_vector d_lookup_offsets = lookup_offsets; thrust::device_vector d_lookup_result(num_keys, -1lu); + cudaStream_t stream; + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + begin = current_time(); trie.lookup(d_lookup_inputs.begin(), d_lookup_offsets.begin(), d_lookup_offsets.end(), - d_lookup_result.begin()); - auto lookup_usec = elapsed_microseconds(begin); - std::cout << "Lookup time " << lookup_usec << "us "; - std::cout << std::setprecision(2) << (num_keys / lookup_usec) / 1000 << "B keys/sec" << std::endl; + d_lookup_result.begin(), + stream); + cudaStreamSynchronize(stream); + auto lookup_msec = elapsed_milliseconds(begin); + + std::cout << "Lookup " << lookup_msec << "ms @ "; + std::cout << std::setprecision(2) << (1. * num_keys / lookup_msec) / 1000.0 << "M keys/sec" + << std::endl; REQUIRE(cuco::test::all_of(d_lookup_result.begin(), d_lookup_result.end(), valid_key(num_keys))); } From 9f7d6d392474c81633e2ecfbda2ef47a58064038 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Thu, 12 Oct 2023 20:18:11 +0000 Subject: [PATCH 24/41] Parallelize input preprocessing --- tests/CMakeLists.txt | 5 +++-- tests/trie/perf_test.cu | 23 ++++++++++++++++------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index bfb731231..7032eff01 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -34,12 +34,13 @@ endif() ################################################################################################### function(ConfigureTest TEST_NAME) add_executable(${TEST_NAME} ${ARGN}) - target_link_libraries(${TEST_NAME} PRIVATE Catch2::Catch2WithMain cuco CUDA::cudart) + target_link_options(${TEST_NAME} PRIVATE -fopenmp) + target_link_libraries(${TEST_NAME} PRIVATE Catch2::Catch2WithMain cuco CUDA::cudart Threads::Threads) target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) set_target_properties(${TEST_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests") target_compile_options(${TEST_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra - --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage) + --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage -fopenmp) catch_discover_tests(${TEST_NAME} EXTRA_ARGS --allow-running-no-tests) endfunction(ConfigureTest) diff --git a/tests/trie/perf_test.cu b/tests/trie/perf_test.cu index d1944aadd..de49ff0c6 100644 --- a/tests/trie/perf_test.cu +++ b/tests/trie/perf_test.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -65,6 +66,7 @@ template vector> generate_split_keys(const vector& keys) { vector> split_keys(keys.size()); +#pragma omp parallel for for (size_t i = 0; i < keys.size(); i++) { split_keys[i] = split_str_into_ints(keys[i]); } @@ -118,15 +120,22 @@ TEST_CASE("Perf test", "") std::random_shuffle(keys.begin(), keys.end()); - thrust::host_vector lookup_inputs; - thrust::host_vector lookup_offsets; - lookup_offsets.push_back(0); - for (auto key : keys) { - for (auto subkey : key) { - lookup_inputs.push_back(subkey); + thrust::host_vector lookup_offsets(num_keys + 1); + lookup_offsets[0] = 0; +#pragma omp parallel for + for (size_t i = 0; i < num_keys; i++) { + lookup_offsets[i + 1] = keys[i].size(); + } + std::partial_sum(lookup_offsets.begin(), lookup_offsets.end(), lookup_offsets.begin()); + + thrust::host_vector lookup_inputs(lookup_offsets.back()); +#pragma omp parallel for + for (size_t i = 0; i < num_keys; i++) { + for (size_t pos = 0; pos < keys[i].size(); pos++) { + lookup_inputs[lookup_offsets[i] + pos] = keys[i][pos]; } - lookup_offsets.push_back(lookup_offsets.back() + key.size()); } + // std::cout << "Average key length " << std::setprecision(2) // << 1. * lookup_offsets.back() / num_keys << std::endl; From 342bcee7721aef08d25b61dcf30855a62b033abe Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Thu, 12 Oct 2023 20:33:34 +0000 Subject: [PATCH 25/41] Move dynamic_bitset tests into tests/trie --- tests/CMakeLists.txt | 10 +++++----- tests/{ => trie}/dynamic_bitset/find_next_test.cu | 0 tests/{ => trie}/dynamic_bitset/get_test.cu | 0 tests/{ => trie}/dynamic_bitset/rank_test.cu | 0 tests/{ => trie}/dynamic_bitset/select_test.cu | 0 tests/{ => trie}/dynamic_bitset/size_test.cu | 0 6 files changed, 5 insertions(+), 5 deletions(-) rename tests/{ => trie}/dynamic_bitset/find_next_test.cu (100%) rename tests/{ => trie}/dynamic_bitset/get_test.cu (100%) rename tests/{ => trie}/dynamic_bitset/rank_test.cu (100%) rename tests/{ => trie}/dynamic_bitset/select_test.cu (100%) rename tests/{ => trie}/dynamic_bitset/size_test.cu (100%) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 7032eff01..1c366b9c1 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -101,11 +101,11 @@ ConfigureTest(STATIC_MULTIMAP_TEST ################################################################################################### # - dynamic_bitset tests -------------------------------------------------------------------------- ConfigureTest(DYNAMIC_BITSET_TEST - dynamic_bitset/find_next_test.cu - dynamic_bitset/get_test.cu - dynamic_bitset/rank_test.cu - dynamic_bitset/select_test.cu - dynamic_bitset/size_test.cu) + trie/dynamic_bitset/find_next_test.cu + trie/dynamic_bitset/get_test.cu + trie/dynamic_bitset/rank_test.cu + trie/dynamic_bitset/select_test.cu + trie/dynamic_bitset/size_test.cu) ################################################################################################### # - trie tests ------------------------------------------------------------------------------ diff --git a/tests/dynamic_bitset/find_next_test.cu b/tests/trie/dynamic_bitset/find_next_test.cu similarity index 100% rename from tests/dynamic_bitset/find_next_test.cu rename to tests/trie/dynamic_bitset/find_next_test.cu diff --git a/tests/dynamic_bitset/get_test.cu b/tests/trie/dynamic_bitset/get_test.cu similarity index 100% rename from tests/dynamic_bitset/get_test.cu rename to tests/trie/dynamic_bitset/get_test.cu diff --git a/tests/dynamic_bitset/rank_test.cu b/tests/trie/dynamic_bitset/rank_test.cu similarity index 100% rename from tests/dynamic_bitset/rank_test.cu rename to tests/trie/dynamic_bitset/rank_test.cu diff --git a/tests/dynamic_bitset/select_test.cu b/tests/trie/dynamic_bitset/select_test.cu similarity index 100% rename from tests/dynamic_bitset/select_test.cu rename to tests/trie/dynamic_bitset/select_test.cu diff --git a/tests/dynamic_bitset/size_test.cu b/tests/trie/dynamic_bitset/size_test.cu similarity index 100% rename from tests/dynamic_bitset/size_test.cu rename to tests/trie/dynamic_bitset/size_test.cu From 81f9d4631d5245be2616c3cbbcfb4c251258131c Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Thu, 12 Oct 2023 21:20:21 +0000 Subject: [PATCH 26/41] Consolidate trie utils in a single file --- tests/trie/lookup_test.cu | 2 +- tests/trie/perf_test.cu | 65 +---------------------- tests/trie/{common.hpp => trie_utils.hpp} | 60 +++++++++++++++++++++ 3 files changed, 62 insertions(+), 65 deletions(-) rename tests/trie/{common.hpp => trie_utils.hpp} (55%) diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu index c929e77fb..b96691113 100644 --- a/tests/trie/lookup_test.cu +++ b/tests/trie/lookup_test.cu @@ -24,7 +24,7 @@ #include -#include "common.hpp" +#include "trie_utils.hpp" TEST_CASE("Lookup test", "") { diff --git a/tests/trie/perf_test.cu b/tests/trie/perf_test.cu index de49ff0c6..c875a5260 100644 --- a/tests/trie/perf_test.cu +++ b/tests/trie/perf_test.cu @@ -18,77 +18,14 @@ #include -#include -#include #include #include -#include #include #include -#include -#include #include -#include "common.hpp" - -using namespace std; - -vector read_input_keys(const char* filename, size_t num_keys) -{ - ifstream input_file(filename); - if (!input_file.is_open()) { - std::cout << "Error opening file: " << filename << std::endl; - exit(1); - } - vector keys; - string line; - while (keys.size() < num_keys and getline(input_file, line)) { - keys.push_back(line); - } - return keys; -} - -template -vector split_str_into_ints(const string& key) -{ - stringstream ss(key); - vector tokens; - string buf; - - while (ss >> buf) { - tokens.push_back(stoi(buf)); - } - return tokens; -} - -template -vector> generate_split_keys(const vector& keys) -{ - vector> split_keys(keys.size()); -#pragma omp parallel for - for (size_t i = 0; i < keys.size(); i++) { - split_keys[i] = split_str_into_ints(keys[i]); - } - return split_keys; -} - -template -void find_pivots(const vector>& keys, - std::vector& pivot_vals, - std::vector& pivot_offsets) -{ - pivot_vals.push_back(keys[0][1]); - pivot_offsets.push_back(0); - - for (size_t pos = 1; pos < keys.size(); pos++) { - if (keys[pos][1] != keys[pos - 1][1]) { - pivot_vals.push_back(keys[pos][1]); - pivot_offsets.push_back(pos); - } - } - pivot_offsets.push_back(keys.size()); -} +#include "trie_utils.hpp" TEST_CASE("Perf test", "") { diff --git a/tests/trie/common.hpp b/tests/trie/trie_utils.hpp similarity index 55% rename from tests/trie/common.hpp rename to tests/trie/trie_utils.hpp index 4abaa1eb4..52d66a62c 100644 --- a/tests/trie/common.hpp +++ b/tests/trie/trie_utils.hpp @@ -1,5 +1,9 @@ #pragma once +#include +#include +#include + struct valid_key { valid_key(size_t num_keys) : num_keys_(num_keys) {} __host__ __device__ bool operator()(size_t x) const { return x < num_keys_; } @@ -39,6 +43,62 @@ struct vectorKeyCompare { } }; +inline std::vector read_input_keys(const char* filename, size_t num_keys) +{ + std::ifstream input_file(filename); + if (!input_file.is_open()) { + std::cout << "Error opening file: " << filename << std::endl; + exit(1); + } + std::vector keys; + std::string line; + while (keys.size() < num_keys and getline(input_file, line)) { + keys.push_back(line); + } + return keys; +} + +template +std::vector split_str_into_ints(const std::string& key) +{ + std::stringstream ss(key); + std::vector tokens; + std::string buf; + + while (ss >> buf) { + tokens.push_back(stoi(buf)); + } + return tokens; +} + +template +std::vector> generate_split_keys(const std::vector& keys) +{ + std::vector> split_keys(keys.size()); +#pragma omp parallel for + for (size_t i = 0; i < keys.size(); i++) { + split_keys[i] = split_str_into_ints(keys[i]); + } + return split_keys; +} + +template +void find_pivots(const std::vector>& keys, + std::vector& pivot_vals, + std::vector& pivot_offsets) +{ + pivot_vals.push_back(keys[0][1]); + pivot_offsets.push_back(0); + + for (size_t pos = 1; pos < keys.size(); pos++) { + if (keys[pos][1] != keys[pos - 1][1]) { + pivot_vals.push_back(keys[pos][1]); + pivot_offsets.push_back(pos); + } + } + pivot_offsets.push_back(keys.size()); +} + inline std::chrono::high_resolution_clock::time_point current_time() { return std::chrono::high_resolution_clock::now(); From 03821efe1e1fc733513bc0ad315f7757d8f73d94 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 13 Oct 2023 01:35:03 +0000 Subject: [PATCH 27/41] find_next host-bulk API in dynamic_bitset --- .../trie/dynamic_bitset/dynamic_bitset.cuh | 20 +++++++++++++ .../trie/dynamic_bitset/dynamic_bitset.inl | 19 ++++++++++++ .../detail/trie/dynamic_bitset/kernels.cuh | 29 +++++++++++++++++++ 3 files changed, 68 insertions(+) diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index 68c30c5b8..06d845045 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -156,6 +156,26 @@ class dynamic_bitset { OutputIt outputs_begin, cuda_stream_ref stream = {}) noexcept; + /** + * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores + * position of first set bit including or after position `keys_begin[i]`, to `output_begin[i]`. + * + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param keys_begin Begin iterator to list of positions to be queried + * @param keys_end End iterator to positions list + * @param outputs_begin Begin iterator to outputs of find_next operation + * @param stream Stream to execute find_next kernel + */ + template + constexpr void find_next(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream = {}) noexcept; + /** * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores total * count of `1` bits preceeding (but not including) position `keys_begin[i]` to `output_begin[i]`. diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl index 5525c6684..34d6c9a70 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl @@ -107,6 +107,25 @@ constexpr void dynamic_bitset::test(KeyIt keys_begin, ref(), keys_begin, outputs_begin, num_keys); } +template +template +constexpr void dynamic_bitset::find_next(KeyIt keys_begin, + KeyIt keys_end, + OutputIt outputs_begin, + cuda_stream_ref stream) noexcept + +{ + build(); + if (n_bits_ == 0) { return; } + auto const num_keys = cuco::detail::distance(keys_begin, keys_end); + if (num_keys == 0) { return; } + + auto const grid_size = cuco::detail::grid_size(num_keys); + + bitset_find_next_kernel<<>>( + ref(), keys_begin, outputs_begin, num_keys); +} + template template constexpr void dynamic_bitset::rank(KeyIt keys_begin, diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh index c92ab60b2..b67ba96ee 100644 --- a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh @@ -55,6 +55,35 @@ __global__ void bitset_test_kernel(BitsetRef ref, } } +/* + * @brief Compute position of next set bit for a range of keys + * + * @tparam BitsetRef Bitset reference type + * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's + * `size_type` + * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's + * `size_type` + * + * @param ref Bitset ref + * @param keys Begin iterator to keys + * @param outputs Begin iterator to outputs + * @param num_keys Number of input keys + */ +template +__global__ void bitset_find_next_kernel(BitsetRef ref, + KeyIt keys, + OutputIt outputs, + cuco::detail::index_type num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + outputs[key_id] = ref.find_next(keys[key_id]); + key_id += stride; + } +} + /* * @brief Gather rank values for a range of keys * From f70d528e2b6684e3ded03f23007cf3456a95066c Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 13 Oct 2023 01:47:26 +0000 Subject: [PATCH 28/41] Dynamic bitset benchmarks --- benchmarks/CMakeLists.txt | 9 +++ .../trie/dynamic_bitset/find_next_bench.cu | 62 +++++++++++++++++++ benchmarks/trie/dynamic_bitset/rank_bench.cu | 62 +++++++++++++++++++ .../trie/dynamic_bitset/select_bench.cu | 62 +++++++++++++++++++ benchmarks/trie/dynamic_bitset/size_bench.cu | 56 +++++++++++++++++ benchmarks/trie/dynamic_bitset/test_bench.cu | 62 +++++++++++++++++++ 6 files changed, 313 insertions(+) create mode 100644 benchmarks/trie/dynamic_bitset/find_next_bench.cu create mode 100644 benchmarks/trie/dynamic_bitset/rank_bench.cu create mode 100644 benchmarks/trie/dynamic_bitset/select_bench.cu create mode 100644 benchmarks/trie/dynamic_bitset/size_bench.cu create mode 100644 benchmarks/trie/dynamic_bitset/test_bench.cu diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 3635336e8..b16f01337 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -83,3 +83,12 @@ ConfigureBench(DYNAMIC_MAP_BENCH # - hash function benchmarks ---------------------------------------------------------------------- ConfigureBench(HASH_BENCH hash_bench.cu) + +################################################################################################### +# - dynamic_bitset benchmarks ------------------------------------------------------------------------- +ConfigureBench(DYNAMIC_BITSET_BENCH + trie/dynamic_bitset/find_next_bench.cu + trie/dynamic_bitset/rank_bench.cu + trie/dynamic_bitset/select_bench.cu + trie/dynamic_bitset/size_bench.cu + trie/dynamic_bitset/test_bench.cu) diff --git a/benchmarks/trie/dynamic_bitset/find_next_bench.cu b/benchmarks/trie/dynamic_bitset/find_next_bench.cu new file mode 100644 index 000000000..029378422 --- /dev/null +++ b/benchmarks/trie/dynamic_bitset/find_next_bench.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::experimental::detail::dynamic_bitset::find_next` performance + */ +template +void dynamic_bitset_find_next(nvbench::state& state, nvbench::type_list) +{ + auto const num_bits = state.get_int64_or_default("NumInputs", defaults::N); + using word_type = typename cuco::experimental::detail::dynamic_bitset<>::word_type; + auto const bits_per_word = cuco::experimental::detail::dynamic_bitset<>::bits_per_word; + thrust::host_vector keys((num_bits - 1) / bits_per_word + 1); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::detail::dynamic_bitset bitset; + bitset.insert(keys.begin(), keys.end(), num_bits); + + const size_t query_size = min(1000 * 1000lu, num_bits / 10); + thrust::device_vector inputs(query_size); + thrust::sequence(inputs.begin(), inputs.end(), 0); + thrust::device_vector outputs(query_size); + + state.add_element_count(query_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + bitset.find_next(inputs.begin(), inputs.end(), outputs.begin()); + }); +} + +NVBENCH_BENCH_TYPES(dynamic_bitset_find_next, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("dynamic_bitset_find_next") + .set_type_axes_names({"Distribution"}) + .set_max_noise(defaults::MAX_NOISE); diff --git a/benchmarks/trie/dynamic_bitset/rank_bench.cu b/benchmarks/trie/dynamic_bitset/rank_bench.cu new file mode 100644 index 000000000..20e6a10cf --- /dev/null +++ b/benchmarks/trie/dynamic_bitset/rank_bench.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::experimental::detail::dynamic_bitset::rank` performance + */ +template +void dynamic_bitset_rank(nvbench::state& state, nvbench::type_list) +{ + auto const num_bits = state.get_int64_or_default("NumInputs", defaults::N); + using word_type = typename cuco::experimental::detail::dynamic_bitset<>::word_type; + auto const bits_per_word = cuco::experimental::detail::dynamic_bitset<>::bits_per_word; + thrust::host_vector keys((num_bits - 1) / bits_per_word + 1); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::detail::dynamic_bitset bitset; + bitset.insert(keys.begin(), keys.end(), num_bits); + + const size_t query_size = min(1000 * 1000lu, num_bits / 10); + thrust::device_vector inputs(query_size); + thrust::sequence(inputs.begin(), inputs.end(), 0); + thrust::device_vector outputs(query_size); + + state.add_element_count(query_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + bitset.rank(inputs.begin(), inputs.end(), outputs.begin()); + }); +} + +NVBENCH_BENCH_TYPES(dynamic_bitset_rank, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("dynamic_bitset_rank") + .set_type_axes_names({"Distribution"}) + .set_max_noise(defaults::MAX_NOISE); diff --git a/benchmarks/trie/dynamic_bitset/select_bench.cu b/benchmarks/trie/dynamic_bitset/select_bench.cu new file mode 100644 index 000000000..9530b5379 --- /dev/null +++ b/benchmarks/trie/dynamic_bitset/select_bench.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::experimental::detail::dynamic_bitset::select` performance + */ +template +void dynamic_bitset_select(nvbench::state& state, nvbench::type_list) +{ + auto const num_bits = state.get_int64_or_default("NumInputs", defaults::N); + using word_type = typename cuco::experimental::detail::dynamic_bitset<>::word_type; + auto const bits_per_word = cuco::experimental::detail::dynamic_bitset<>::bits_per_word; + thrust::host_vector keys((num_bits - 1) / bits_per_word + 1); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::detail::dynamic_bitset bitset; + bitset.insert(keys.begin(), keys.end(), num_bits); + + const size_t query_size = min(1000 * 1000lu, num_bits / 10); + thrust::device_vector inputs(query_size); + thrust::sequence(inputs.begin(), inputs.end(), 0); + thrust::device_vector outputs(query_size); + + state.add_element_count(query_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + bitset.select(inputs.begin(), inputs.end(), outputs.begin()); + }); +} + +NVBENCH_BENCH_TYPES(dynamic_bitset_select, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("dynamic_bitset_select") + .set_type_axes_names({"Distribution"}) + .set_max_noise(defaults::MAX_NOISE); diff --git a/benchmarks/trie/dynamic_bitset/size_bench.cu b/benchmarks/trie/dynamic_bitset/size_bench.cu new file mode 100644 index 000000000..98a004bd1 --- /dev/null +++ b/benchmarks/trie/dynamic_bitset/size_bench.cu @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::experimental::detail::dynamic_bitset::size` performance + */ +template +void dynamic_bitset_size(nvbench::state& state, nvbench::type_list) +{ + auto const num_bits = state.get_int64_or_default("NumInputs", defaults::N); + using word_type = typename cuco::experimental::detail::dynamic_bitset<>::word_type; + auto const bits_per_word = cuco::experimental::detail::dynamic_bitset<>::bits_per_word; + thrust::host_vector keys((num_bits - 1) / bits_per_word + 1); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::detail::dynamic_bitset bitset; + bitset.insert(keys.begin(), keys.end(), num_bits); + + state.add_element_count(1); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { auto const size = bitset.size(); }); +} + +NVBENCH_BENCH_TYPES(dynamic_bitset_size, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("dynamic_bitset_size") + .set_type_axes_names({"Distribution"}) + .set_max_noise(defaults::MAX_NOISE); diff --git a/benchmarks/trie/dynamic_bitset/test_bench.cu b/benchmarks/trie/dynamic_bitset/test_bench.cu new file mode 100644 index 000000000..4786a909a --- /dev/null +++ b/benchmarks/trie/dynamic_bitset/test_bench.cu @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::experimental::detail::dynamic_bitset::test` performance + */ +template +void dynamic_bitset_test(nvbench::state& state, nvbench::type_list) +{ + auto const num_bits = state.get_int64_or_default("NumInputs", defaults::N); + using word_type = typename cuco::experimental::detail::dynamic_bitset<>::word_type; + auto const bits_per_word = cuco::experimental::detail::dynamic_bitset<>::bits_per_word; + thrust::host_vector keys((num_bits - 1) / bits_per_word + 1); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + cuco::experimental::detail::dynamic_bitset bitset; + bitset.insert(keys.begin(), keys.end(), num_bits); + + const size_t query_size = min(1000 * 1000lu, num_bits / 10); + thrust::device_vector inputs(query_size); + thrust::sequence(inputs.begin(), inputs.end(), 0); + thrust::device_vector outputs(query_size); + + state.add_element_count(query_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + bitset.test(inputs.begin(), inputs.end(), outputs.begin()); + }); +} + +NVBENCH_BENCH_TYPES(dynamic_bitset_test, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("dynamic_bitset_test") + .set_type_axes_names({"Distribution"}) + .set_max_noise(defaults::MAX_NOISE); From bd4549ef2aa3305b114e8d72edf6c1d0edc00e8a Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 13 Oct 2023 02:15:04 +0000 Subject: [PATCH 29/41] Consistent use of LabelType A key is a list of labels. Iterators use LabelIt, rather than KeyIt. --- include/cuco/detail/trie/trie.inl | 24 +++++++++++------------ include/cuco/detail/trie/trie_ref.inl | 12 ++++++------ include/cuco/trie.cuh | 27 +++++++++++++------------- tests/trie/lookup_test.cu | 14 +++++++------- tests/trie/perf_test.cu | 10 +++++----- tests/trie/trie_utils.hpp | 28 +++++++++++++-------------- 6 files changed, 57 insertions(+), 58 deletions(-) diff --git a/include/cuco/detail/trie/trie.inl b/include/cuco/detail/trie/trie.inl index b49f740af..ca202d236 100644 --- a/include/cuco/detail/trie/trie.inl +++ b/include/cuco/detail/trie/trie.inl @@ -46,14 +46,14 @@ trie::~trie() noexcept(false) } template -template -void trie::insert(KeyIt keys_begin, KeyIt keys_end) noexcept +template +void trie::insert(LabelIt labels_begin, LabelIt labels_end) noexcept { - size_t key_length = std::distance(keys_begin, keys_end); + size_t key_length = std::distance(labels_begin, labels_end); bool same_as_last_key = key_length == last_key_.size(); for (size_t pos = 0; same_as_last_key && pos < last_key_.size(); pos++) { - if (keys_begin[pos] != last_key_[pos]) { same_as_last_key = false; } + if (labels_begin[pos] != last_key_[pos]) { same_as_last_key = false; } } if (same_as_last_key) { return; } // Ignore duplicate keys // assert(num_keys_ == 0 || key > last_key_); // Keys are expected to be inserted in sorted order @@ -72,7 +72,7 @@ void trie::insert(KeyIt keys_begin, KeyIt keys_end) noexce size_type pos = 0; for (; pos < key_length; ++pos) { auto& level = levels_[pos + 1]; - auto label = keys_begin[pos]; + auto label = labels_begin[pos]; if (pos == last_key_.size() || label != level.h_labels_.back()) { level.h_louds_.set_last(0); @@ -91,7 +91,7 @@ void trie::insert(KeyIt keys_begin, KeyIt keys_end) noexce level.h_louds_.push_back(0); level.h_louds_.push_back(1); level.h_outs_.push_back(0); - level.h_labels_.push_back(keys_begin[pos]); + level.h_labels_.push_back(labels_begin[pos]); ++num_nodes_; } @@ -103,7 +103,7 @@ void trie::insert(KeyIt keys_begin, KeyIt keys_end) noexce last_key_.resize(key_length); for (size_t pos = 0; pos < key_length; pos++) { - last_key_[pos] = keys_begin[pos]; + last_key_[pos] = labels_begin[pos]; } } @@ -157,8 +157,8 @@ void trie::build() noexcept(false) } template -template -void trie::lookup(KeyIt keys_begin, +template +void trie::lookup(LabelIt labels_begin, OffsetIt offsets_begin, OffsetIt offsets_end, OutputIt outputs_begin, @@ -171,12 +171,12 @@ void trie::lookup(KeyIt keys_begin, auto ref_ = this->ref(cuco::experimental::trie_lookup); trie_lookup_kernel<<>>( - ref_, keys_begin, offsets_begin, outputs_begin, num_keys); + ref_, labels_begin, offsets_begin, outputs_begin, num_keys); } -template +template __global__ void trie_lookup_kernel( - TrieRef ref, KeyIt keys, OffsetIt offsets, OutputIt outputs, size_t num_keys) + TrieRef ref, LabelIt keys, OffsetIt offsets, OutputIt outputs, size_t num_keys) { auto key_id = cuco::detail::global_thread_id(); auto const loop_stride = cuco::detail::grid_stride(); diff --git a/include/cuco/detail/trie/trie_ref.inl b/include/cuco/detail/trie/trie_ref.inl index b9ace0421..d7cb9427a 100644 --- a/include/cuco/detail/trie/trie_ref.inl +++ b/include/cuco/detail/trie/trie_ref.inl @@ -19,23 +19,23 @@ class operator_impl - [[nodiscard]] __device__ size_type lookup(KeyIt key, size_type length) const noexcept + template + [[nodiscard]] __device__ size_type lookup(LabelIt labels, size_type length) const noexcept { auto const& trie = static_cast(*this).trie_; // Level-by-level search. node_id is updated at each level size_type node_id = 0; for (size_type cur_depth = 1; cur_depth <= length; cur_depth++) { - if (!search_label_in_children(key[cur_depth - 1], node_id, cur_depth)) { return -1lu; } + if (!search_label_in_children(labels[cur_depth - 1], node_id, cur_depth)) { return -1lu; } } // Check for terminal node bit that indicates a valid key diff --git a/include/cuco/trie.cuh b/include/cuco/trie.cuh index 1b03f8111..6b1837fde 100644 --- a/include/cuco/trie.cuh +++ b/include/cuco/trie.cuh @@ -25,7 +25,7 @@ namespace experimental { /** * @brief Trie class * - * @tparam label_type type of individual characters of vector keys (eg. char or int) + * @tparam label_type type of individual elements of vector keys (eg. char or int) * @tparam Allocator Type of allocator used for device storage */ template > @@ -42,13 +42,13 @@ class trie { /** * @brief Insert a single key into trie * - * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to trie's + * @tparam LabelIt Device-accessible iterator whose `value_type` can be converted to trie's * `LabelType` - * @param keys_begin Begin iterator to list of labels of input key - * @param keys_end End iterator to list of labels of input key + * @param labels_begin Begin iterator to list of labels of input key + * @param labels_end End iterator to list of labels of input key */ - template - void insert(KeyIt keys_begin, KeyIt keys_end) noexcept; + template + void insert(LabelIt labels_begin, LabelIt labels_end) noexcept; /** * @brief Build level-by-level trie indexes after inserting all keys @@ -59,25 +59,25 @@ class trie { /** * @brief For every pair (`offsets_begin[i]`, `offsets_begin[i + 1]`) in the range - * `[offsets_begin, offsets_end)`, checks if the key defined by characters in the range - * [`keys_begin[offsets_begin[i]]`, `keys_begin[offsets_begin[i + 1]]`) is present in trie. + * `[offsets_begin, offsets_end)`, checks if the key defined by labels in the range + * [`labels_begin[offsets_begin[i]]`, `labels_begin[offsets_begin[i + 1]]`) is present in trie. * Stores the index of key if it exists in trie (-1 otherwise) in `outputs_begin[i]` * - * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to trie's + * @tparam LabelIt Device-accessible iterator whose `value_type` can be converted to trie's * `LabelType` * @tparam OffsetIt Device-accessible iterator whose `value_type` can be converted to trie's * `size_type` * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean * type * - * @param keys_begin Begin iterator to individual key characters + * @param labels_begin Begin iterator to labels list of all keys * @param offsets_begin Begin iterator to offsets of key boundaries * @param offsets_end End iterator to offsets * @param outputs_begin Begin iterator to lookup results * @param stream Stream to execute lookup kernel */ - template - void lookup(KeyIt keys_begin, + template + void lookup(LabelIt labels_begin, OffsetIt offsets_begin, OffsetIt offsets_end, OutputIt outputs_begin, @@ -193,8 +193,7 @@ class trie { /// Type of the allocator to (de)allocate labels using label_allocator_type = typename std::allocator_traits::rebind_alloc; - ///< Stores individual characters of keys - thrust::device_vector labels_; + thrust::device_vector labels_; ///< Labels at this level LabelType* labels_ptr_; ///< Raw device pointer to labels std::vector h_labels_; ///< Host copy of labels, using std::vector for performance diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu index b96691113..b48d4b293 100644 --- a/tests/trie/lookup_test.cu +++ b/tests/trie/lookup_test.cu @@ -28,28 +28,28 @@ TEST_CASE("Lookup test", "") { - using KeyType = int; + using LabelType = int; std::size_t num_keys = 64 * 1024; std::size_t max_key_length = 6; - thrust::host_vector keys; + thrust::host_vector keys; thrust::host_vector offsets; generate_keys(keys, offsets, num_keys, max_key_length); - cuco::experimental::trie trie; + cuco::experimental::trie trie; { - std::vector> all_keys; + std::vector> all_keys; for (size_t key_id = 0; key_id < num_keys; key_id++) { - std::vector cur_key; + std::vector cur_key; for (size_t pos = offsets[key_id]; pos < offsets[key_id + 1]; pos++) { cur_key.push_back(keys[pos]); } all_keys.push_back(cur_key); } - sort(all_keys.begin(), all_keys.end(), vectorKeyCompare()); + sort(all_keys.begin(), all_keys.end(), vectorKeyCompare()); for (auto key : all_keys) { trie.insert(key.begin(), key.end()); @@ -60,7 +60,7 @@ TEST_CASE("Lookup test", "") { thrust::device_vector lookup_result(num_keys, -1lu); - thrust::device_vector device_keys = keys; + thrust::device_vector device_keys = keys; thrust::device_vector device_offsets = offsets; trie.lookup( diff --git a/tests/trie/perf_test.cu b/tests/trie/perf_test.cu index c875a5260..e0e7d3ac3 100644 --- a/tests/trie/perf_test.cu +++ b/tests/trie/perf_test.cu @@ -29,15 +29,15 @@ TEST_CASE("Perf test", "") { - using KeyType = int; + using LabelType = int; const char* input_filename = "trie_dataset.txt"; - auto keys = generate_split_keys(read_input_keys(input_filename, 45 * 1000 * 1000)); + auto keys = generate_split_keys(read_input_keys(input_filename, 45 * 1000 * 1000)); size_t num_keys = keys.size(); std::cout << "Num keys " << num_keys << std::endl; auto begin = current_time(); - cuco::experimental::trie trie; + cuco::experimental::trie trie; for (auto& key : keys) { trie.insert(key.begin(), key.end()); } @@ -76,8 +76,8 @@ TEST_CASE("Perf test", "") // std::cout << "Average key length " << std::setprecision(2) // << 1. * lookup_offsets.back() / num_keys << std::endl; - thrust::device_vector d_lookup_inputs = lookup_inputs; - thrust::device_vector d_lookup_offsets = lookup_offsets; + thrust::device_vector d_lookup_inputs = lookup_inputs; + thrust::device_vector d_lookup_offsets = lookup_offsets; thrust::device_vector d_lookup_result(num_keys, -1lu); cudaStream_t stream; diff --git a/tests/trie/trie_utils.hpp b/tests/trie/trie_utils.hpp index 52d66a62c..5e1e0138a 100644 --- a/tests/trie/trie_utils.hpp +++ b/tests/trie/trie_utils.hpp @@ -10,8 +10,8 @@ struct valid_key { const size_t num_keys_; }; -template -void generate_keys(thrust::host_vector& keys, +template +void generate_keys(thrust::host_vector& keys, thrust::host_vector& offsets, size_t num_keys, size_t max_key_length) @@ -28,9 +28,9 @@ void generate_keys(thrust::host_vector& keys, thrust::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin()); // in-place scan } -template +template struct vectorKeyCompare { - bool operator()(const std::vector& lhs, const std::vector& rhs) const + bool operator()(const std::vector& lhs, const std::vector& rhs) const { for (size_t pos = 0; pos < min(lhs.size(), rhs.size()); pos++) { if (lhs[pos] < rhs[pos]) { @@ -58,11 +58,11 @@ inline std::vector read_input_keys(const char* filename, size_t num return keys; } -template -std::vector split_str_into_ints(const std::string& key) +template +std::vector split_str_into_ints(const std::string& key) { std::stringstream ss(key); - std::vector tokens; + std::vector tokens; std::string buf; while (ss >> buf) { @@ -71,20 +71,20 @@ std::vector split_str_into_ints(const std::string& key) return tokens; } -template -std::vector> generate_split_keys(const std::vector& keys) +template +std::vector> generate_split_keys(const std::vector& keys) { - std::vector> split_keys(keys.size()); + std::vector> split_keys(keys.size()); #pragma omp parallel for for (size_t i = 0; i < keys.size(); i++) { - split_keys[i] = split_str_into_ints(keys[i]); + split_keys[i] = split_str_into_ints(keys[i]); } return split_keys; } -template -void find_pivots(const std::vector>& keys, - std::vector& pivot_vals, +template +void find_pivots(const std::vector>& keys, + std::vector& pivot_vals, std::vector& pivot_offsets) { pivot_vals.push_back(keys[0][1]); From 5f67117a6e80f43070767f9ac3ebc290f8a44492 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 13 Oct 2023 05:09:15 +0000 Subject: [PATCH 30/41] Trie benchmarks --- benchmarks/CMakeLists.txt | 6 ++ benchmarks/trie/insert_bench.cu | 59 +++++++++++++++++++ benchmarks/trie/lookup_bench.cu | 61 +++++++++++++++++++ tests/CMakeLists.txt | 3 +- tests/trie/perf_test.cu | 100 -------------------------------- tests/trie/trie_utils.hpp | 18 ++++++ 6 files changed, 145 insertions(+), 102 deletions(-) create mode 100644 benchmarks/trie/insert_bench.cu create mode 100644 benchmarks/trie/lookup_bench.cu delete mode 100644 tests/trie/perf_test.cu diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index b16f01337..3617426f7 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -92,3 +92,9 @@ ConfigureBench(DYNAMIC_BITSET_BENCH trie/dynamic_bitset/select_bench.cu trie/dynamic_bitset/size_bench.cu trie/dynamic_bitset/test_bench.cu) + +################################################################################################### +# - trie benchmarks ------------------------------------------------------------------------- +ConfigureBench(TRIE_BENCH + trie/insert_bench.cu + trie/lookup_bench.cu) diff --git a/benchmarks/trie/insert_bench.cu b/benchmarks/trie/insert_bench.cu new file mode 100644 index 000000000..8afbde7d8 --- /dev/null +++ b/benchmarks/trie/insert_bench.cu @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "../../tests/trie/trie_utils.hpp" +#include +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::experimental::trie::insert` performance + */ +void trie_insert(nvbench::state& state) +{ + using LabelType = int; + cuco::experimental::trie trie; + + auto const num_keys = 64 * 1024; + std::vector> keys; + + bool synthetic_dataset = true; + if (synthetic_dataset) { + thrust::host_vector labels; + thrust::host_vector offsets; + auto const max_key_length = 6; + generate_keys(labels, offsets, num_keys, max_key_length); + keys = sorted_keys(labels, offsets); + } else { + keys = generate_split_keys(read_input_keys("trie_dataset.txt", num_keys)); + } + + state.add_element_count(num_keys); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + for (auto& key : keys) { + trie.insert(key.begin(), key.end()); + } + }); +} + +NVBENCH_BENCH(trie_insert).set_name("trie_insert").set_max_noise(defaults::MAX_NOISE); diff --git a/benchmarks/trie/lookup_bench.cu b/benchmarks/trie/lookup_bench.cu new file mode 100644 index 000000000..be380065d --- /dev/null +++ b/benchmarks/trie/lookup_bench.cu @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "../../tests/trie/trie_utils.hpp" +#include +#include + +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::experimental::trie::lookup` performance + */ +void trie_lookup(nvbench::state& state) +{ + auto const num_keys = 64 * 1024; + auto const max_key_length = 6; + + using LabelType = int; + cuco::experimental::trie trie; + + thrust::host_vector labels; + thrust::host_vector offsets; + generate_keys(labels, offsets, num_keys, max_key_length); + + auto keys = sorted_keys(labels, offsets); + for (auto key : keys) { + trie.insert(key.begin(), key.end()); + } + trie.build(); + + const size_t query_size = num_keys / 10; + thrust::device_vector inputs(labels.begin(), labels.begin() + offsets[query_size]); + thrust::device_vector d_offsets(offsets.begin(), offsets.begin() + query_size); + thrust::device_vector outputs(query_size, -1lu); + + state.add_element_count(query_size); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + trie.lookup(inputs.begin(), d_offsets.begin(), d_offsets.end(), outputs.begin()); + }); +} + +NVBENCH_BENCH(trie_lookup).set_name("trie_lookup").set_max_noise(defaults::MAX_NOISE); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1c366b9c1..62569500f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -110,5 +110,4 @@ ConfigureTest(DYNAMIC_BITSET_TEST ################################################################################################### # - trie tests ------------------------------------------------------------------------------ ConfigureTest(TRIE_TEST - trie/lookup_test.cu - trie/perf_test.cu) + trie/lookup_test.cu) diff --git a/tests/trie/perf_test.cu b/tests/trie/perf_test.cu deleted file mode 100644 index e0e7d3ac3..000000000 --- a/tests/trie/perf_test.cu +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2023, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include - -#include -#include -#include -#include - -#include - -#include "trie_utils.hpp" - -TEST_CASE("Perf test", "") -{ - using LabelType = int; - - const char* input_filename = "trie_dataset.txt"; - auto keys = generate_split_keys(read_input_keys(input_filename, 45 * 1000 * 1000)); - size_t num_keys = keys.size(); - std::cout << "Num keys " << num_keys << std::endl; - - auto begin = current_time(); - cuco::experimental::trie trie; - for (auto& key : keys) { - trie.insert(key.begin(), key.end()); - } - auto insert_msec = elapsed_milliseconds(begin); - - std::cout << "Insert " << std::setprecision(2) << insert_msec / 1000. << "s @ "; - std::cout << std::setprecision(2) << (1. * num_keys / insert_msec) / 1000 << "M keys/sec" - << std::endl; - - begin = current_time(); - trie.build(); - auto build_msec = elapsed_milliseconds(begin); - - std::cout << "Build " << build_msec << "ms @ "; - std::cout << std::setprecision(3) << (1. * num_keys / build_msec) / 1000 << "M keys/sec" - << std::endl; - - std::random_shuffle(keys.begin(), keys.end()); - - thrust::host_vector lookup_offsets(num_keys + 1); - lookup_offsets[0] = 0; -#pragma omp parallel for - for (size_t i = 0; i < num_keys; i++) { - lookup_offsets[i + 1] = keys[i].size(); - } - std::partial_sum(lookup_offsets.begin(), lookup_offsets.end(), lookup_offsets.begin()); - - thrust::host_vector lookup_inputs(lookup_offsets.back()); -#pragma omp parallel for - for (size_t i = 0; i < num_keys; i++) { - for (size_t pos = 0; pos < keys[i].size(); pos++) { - lookup_inputs[lookup_offsets[i] + pos] = keys[i][pos]; - } - } - - // std::cout << "Average key length " << std::setprecision(2) - // << 1. * lookup_offsets.back() / num_keys << std::endl; - - thrust::device_vector d_lookup_inputs = lookup_inputs; - thrust::device_vector d_lookup_offsets = lookup_offsets; - thrust::device_vector d_lookup_result(num_keys, -1lu); - - cudaStream_t stream; - cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); - - begin = current_time(); - trie.lookup(d_lookup_inputs.begin(), - d_lookup_offsets.begin(), - d_lookup_offsets.end(), - d_lookup_result.begin(), - stream); - cudaStreamSynchronize(stream); - auto lookup_msec = elapsed_milliseconds(begin); - - std::cout << "Lookup " << lookup_msec << "ms @ "; - std::cout << std::setprecision(2) << (1. * num_keys / lookup_msec) / 1000.0 << "M keys/sec" - << std::endl; - - REQUIRE(cuco::test::all_of(d_lookup_result.begin(), d_lookup_result.end(), valid_key(num_keys))); -} diff --git a/tests/trie/trie_utils.hpp b/tests/trie/trie_utils.hpp index 5e1e0138a..d641c5316 100644 --- a/tests/trie/trie_utils.hpp +++ b/tests/trie/trie_utils.hpp @@ -3,6 +3,7 @@ #include #include #include +#include struct valid_key { valid_key(size_t num_keys) : num_keys_(num_keys) {} @@ -43,6 +44,23 @@ struct vectorKeyCompare { } }; +template +std::vector> sorted_keys(thrust::host_vector& labels, + thrust::host_vector& offsets) +{ + std::vector> keys; + size_t num_keys = offsets.size() - 1; + for (size_t key_id = 0; key_id < num_keys; key_id++) { + std::vector cur_key; + for (size_t pos = offsets[key_id]; pos < offsets[key_id + 1]; pos++) { + cur_key.push_back(labels[pos]); + } + keys.push_back(cur_key); + } + sort(keys.begin(), keys.end(), vectorKeyCompare()); + return keys; +} + inline std::vector read_input_keys(const char* filename, size_t num_keys) { std::ifstream input_file(filename); From 660bdc0ba17b160e101cdb48cacea1bf45af391a Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 13 Oct 2023 06:04:01 +0000 Subject: [PATCH 31/41] Reorganize trie utils --- benchmarks/trie/insert_bench.cu | 4 +-- benchmarks/trie/lookup_bench.cu | 4 +-- tests/trie/lookup_test.cu | 41 +++++++----------------- tests/trie/trie_utils.hpp | 56 ++++++++++++--------------------- 4 files changed, 36 insertions(+), 69 deletions(-) diff --git a/benchmarks/trie/insert_bench.cu b/benchmarks/trie/insert_bench.cu index 8afbde7d8..0e86fbd1f 100644 --- a/benchmarks/trie/insert_bench.cu +++ b/benchmarks/trie/insert_bench.cu @@ -42,10 +42,10 @@ void trie_insert(nvbench::state& state) thrust::host_vector labels; thrust::host_vector offsets; auto const max_key_length = 6; - generate_keys(labels, offsets, num_keys, max_key_length); + generate_labels(labels, offsets, num_keys, max_key_length); keys = sorted_keys(labels, offsets); } else { - keys = generate_split_keys(read_input_keys("trie_dataset.txt", num_keys)); + keys = read_keys("trie_dataset.txt", num_keys); } state.add_element_count(num_keys); diff --git a/benchmarks/trie/lookup_bench.cu b/benchmarks/trie/lookup_bench.cu index be380065d..3090d872e 100644 --- a/benchmarks/trie/lookup_bench.cu +++ b/benchmarks/trie/lookup_bench.cu @@ -39,8 +39,8 @@ void trie_lookup(nvbench::state& state) thrust::host_vector labels; thrust::host_vector offsets; - generate_keys(labels, offsets, num_keys, max_key_length); + generate_labels(labels, offsets, num_keys, max_key_length); auto keys = sorted_keys(labels, offsets); for (auto key : keys) { trie.insert(key.begin(), key.end()); @@ -50,7 +50,7 @@ void trie_lookup(nvbench::state& state) const size_t query_size = num_keys / 10; thrust::device_vector inputs(labels.begin(), labels.begin() + offsets[query_size]); thrust::device_vector d_offsets(offsets.begin(), offsets.begin() + query_size); - thrust::device_vector outputs(query_size, -1lu); + thrust::device_vector outputs(query_size); state.add_element_count(query_size); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu index b48d4b293..e61695186 100644 --- a/tests/trie/lookup_test.cu +++ b/tests/trie/lookup_test.cu @@ -32,40 +32,23 @@ TEST_CASE("Lookup test", "") std::size_t num_keys = 64 * 1024; std::size_t max_key_length = 6; - thrust::host_vector keys; + + thrust::host_vector labels; thrust::host_vector offsets; - generate_keys(keys, offsets, num_keys, max_key_length); + generate_labels(labels, offsets, num_keys, max_key_length); + auto keys = sorted_keys(labels, offsets); cuco::experimental::trie trie; - - { - std::vector> all_keys; - for (size_t key_id = 0; key_id < num_keys; key_id++) { - std::vector cur_key; - for (size_t pos = offsets[key_id]; pos < offsets[key_id + 1]; pos++) { - cur_key.push_back(keys[pos]); - } - all_keys.push_back(cur_key); - } - - sort(all_keys.begin(), all_keys.end(), vectorKeyCompare()); - - for (auto key : all_keys) { - trie.insert(key.begin(), key.end()); - } - - trie.build(); + for (auto key : keys) { + trie.insert(key.begin(), key.end()); } + trie.build(); - { - thrust::device_vector lookup_result(num_keys, -1lu); - thrust::device_vector device_keys = keys; - thrust::device_vector device_offsets = offsets; - - trie.lookup( - device_keys.begin(), device_offsets.begin(), device_offsets.end(), lookup_result.begin()); + thrust::device_vector d_labels = labels; + thrust::device_vector d_offsets = offsets; + thrust::device_vector result(num_keys, -1lu); - REQUIRE(cuco::test::all_of(lookup_result.begin(), lookup_result.end(), valid_key(num_keys))); - } + trie.lookup(d_labels.begin(), d_offsets.begin(), d_offsets.end(), result.begin()); + REQUIRE(cuco::test::all_of(result.begin(), result.end(), valid_key(num_keys))); } diff --git a/tests/trie/trie_utils.hpp b/tests/trie/trie_utils.hpp index d641c5316..5427c1858 100644 --- a/tests/trie/trie_utils.hpp +++ b/tests/trie/trie_utils.hpp @@ -12,10 +12,10 @@ struct valid_key { }; template -void generate_keys(thrust::host_vector& keys, - thrust::host_vector& offsets, - size_t num_keys, - size_t max_key_length) +void generate_labels(thrust::host_vector& keys, + thrust::host_vector& offsets, + size_t num_keys, + size_t max_key_length) { for (size_t key_id = 0; key_id < num_keys; key_id++) { size_t cur_key_length = 1 + (std::rand() % max_key_length); @@ -61,32 +61,17 @@ std::vector> sorted_keys(thrust::host_vector& return keys; } -inline std::vector read_input_keys(const char* filename, size_t num_keys) -{ - std::ifstream input_file(filename); - if (!input_file.is_open()) { - std::cout << "Error opening file: " << filename << std::endl; - exit(1); - } - std::vector keys; - std::string line; - while (keys.size() < num_keys and getline(input_file, line)) { - keys.push_back(line); - } - return keys; -} - template -std::vector split_str_into_ints(const std::string& key) +std::vector split_key_into_labels(const std::string& key) { std::stringstream ss(key); - std::vector tokens; + std::vector labels; std::string buf; while (ss >> buf) { - tokens.push_back(stoi(buf)); + labels.push_back(stoi(buf)); } - return tokens; + return labels; } template @@ -95,26 +80,25 @@ std::vector> generate_split_keys(const std::vector> split_keys(keys.size()); #pragma omp parallel for for (size_t i = 0; i < keys.size(); i++) { - split_keys[i] = split_str_into_ints(keys[i]); + split_keys[i] = split_key_into_labels(keys[i]); } return split_keys; } template -void find_pivots(const std::vector>& keys, - std::vector& pivot_vals, - std::vector& pivot_offsets) +inline std::vector> read_keys(const char* filename, size_t num_keys) { - pivot_vals.push_back(keys[0][1]); - pivot_offsets.push_back(0); - - for (size_t pos = 1; pos < keys.size(); pos++) { - if (keys[pos][1] != keys[pos - 1][1]) { - pivot_vals.push_back(keys[pos][1]); - pivot_offsets.push_back(pos); - } + std::ifstream input_file(filename); + if (!input_file.is_open()) { + std::cout << "Error opening file: " << filename << std::endl; + exit(1); + } + std::vector keys; + std::string line; + while (keys.size() < num_keys and getline(input_file, line)) { + keys.push_back(line); } - pivot_offsets.push_back(keys.size()); + return generate_split_keys(keys); } inline std::chrono::high_resolution_clock::time_point current_time() From 6f2b0b88641d0057a31f5559432a6ba8273c125a Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 13 Oct 2023 07:20:03 +0000 Subject: [PATCH 32/41] Use cuco key generators --- tests/trie/trie_utils.hpp | 42 +++++++++++++++------------------------ 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/tests/trie/trie_utils.hpp b/tests/trie/trie_utils.hpp index 5427c1858..5dd136070 100644 --- a/tests/trie/trie_utils.hpp +++ b/tests/trie/trie_utils.hpp @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -12,21 +13,27 @@ struct valid_key { }; template -void generate_labels(thrust::host_vector& keys, +void generate_labels(thrust::host_vector& labels, thrust::host_vector& offsets, size_t num_keys, size_t max_key_length) { - for (size_t key_id = 0; key_id < num_keys; key_id++) { - size_t cur_key_length = 1 + (std::rand() % max_key_length); - offsets.push_back(cur_key_length); - for (size_t pos = 0; pos < cur_key_length; pos++) { - keys.push_back(std::rand() % 100000); - } + cuco::utility::key_generator gen; + + cuco::utility::distribution::unique lengths_dist; + offsets.resize(num_keys); + gen.generate(lengths_dist, offsets.begin(), offsets.end()); + + for (auto& offset : offsets) { + offset = 1 + (offset % max_key_length); } - offsets.push_back(0); // Extend size by 1 for subsequent scan - thrust::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin()); // in-place scan + offsets.push_back(0); + thrust::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin()); + + cuco::utility::distribution::gaussian labels_dist{0.5}; + labels.resize(offsets.back()); + gen.generate(labels_dist, labels.begin(), labels.end()); } template @@ -100,20 +107,3 @@ inline std::vector> read_keys(const char* filename, size_ } return generate_split_keys(keys); } - -inline std::chrono::high_resolution_clock::time_point current_time() -{ - return std::chrono::high_resolution_clock::now(); -} -inline size_t elapsed_seconds(std::chrono::high_resolution_clock::time_point begin) -{ - return std::chrono::duration_cast(current_time() - begin).count(); -} -inline size_t elapsed_milliseconds(std::chrono::high_resolution_clock::time_point begin) -{ - return std::chrono::duration_cast(current_time() - begin).count(); -} -inline size_t elapsed_microseconds(std::chrono::high_resolution_clock::time_point begin) -{ - return std::chrono::duration_cast(current_time() - begin).count(); -} From 799d96855a4fcb34d3d55eb11a7adec5684fb7a4 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Fri, 13 Oct 2023 20:15:55 +0000 Subject: [PATCH 33/41] Add key length and key count nvbench axes --- benchmarks/trie/insert_bench.cu | 27 ++++++++++++++------------- benchmarks/trie/lookup_bench.cu | 18 +++++++++++++----- tests/trie/lookup_test.cu | 6 +++++- tests/trie/trie_utils.hpp | 8 ++++---- 4 files changed, 36 insertions(+), 23 deletions(-) diff --git a/benchmarks/trie/insert_bench.cu b/benchmarks/trie/insert_bench.cu index 0e86fbd1f..1a0dea6c6 100644 --- a/benchmarks/trie/insert_bench.cu +++ b/benchmarks/trie/insert_bench.cu @@ -31,22 +31,19 @@ using namespace cuco::utility; */ void trie_insert(nvbench::state& state) { + auto const num_keys = state.get_int64_or_default("NumKeys", 100 * 1000); + auto const max_key_length = state.get_int64_or_default("MaxKeyLength", 10); + using LabelType = int; cuco::experimental::trie trie; - auto const num_keys = 64 * 1024; - std::vector> keys; + thrust::host_vector labels; + thrust::host_vector offsets; - bool synthetic_dataset = true; - if (synthetic_dataset) { - thrust::host_vector labels; - thrust::host_vector offsets; - auto const max_key_length = 6; - generate_labels(labels, offsets, num_keys, max_key_length); - keys = sorted_keys(labels, offsets); - } else { - keys = read_keys("trie_dataset.txt", num_keys); - } + distribution::unique lengths_dist; + distribution::gaussian labels_dist{0.5}; + generate_labels(labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); + auto keys = sorted_keys(labels, offsets); state.add_element_count(num_keys); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { @@ -56,4 +53,8 @@ void trie_insert(nvbench::state& state) }); } -NVBENCH_BENCH(trie_insert).set_name("trie_insert").set_max_noise(defaults::MAX_NOISE); +NVBENCH_BENCH(trie_insert) + .set_name("trie_insert") + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("NumKeys", std::vector{100 * 1000, 1000 * 1000}) + .add_int64_axis("MaxKeyLength", std::vector{4, 8, 16}); diff --git a/benchmarks/trie/lookup_bench.cu b/benchmarks/trie/lookup_bench.cu index 3090d872e..7f6f301cf 100644 --- a/benchmarks/trie/lookup_bench.cu +++ b/benchmarks/trie/lookup_bench.cu @@ -31,8 +31,8 @@ using namespace cuco::utility; */ void trie_lookup(nvbench::state& state) { - auto const num_keys = 64 * 1024; - auto const max_key_length = 6; + auto const num_keys = state.get_int64_or_default("NumKeys", 100 * 1000); + auto const max_key_length = state.get_int64_or_default("MaxKeyLength", 10); using LabelType = int; cuco::experimental::trie trie; @@ -40,14 +40,17 @@ void trie_lookup(nvbench::state& state) thrust::host_vector labels; thrust::host_vector offsets; - generate_labels(labels, offsets, num_keys, max_key_length); + distribution::unique lengths_dist; + distribution::gaussian labels_dist{0.5}; + generate_labels(labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); auto keys = sorted_keys(labels, offsets); + for (auto key : keys) { trie.insert(key.begin(), key.end()); } trie.build(); - const size_t query_size = num_keys / 10; + const size_t query_size = min(1000 * 1000lu, num_keys / 10); thrust::device_vector inputs(labels.begin(), labels.begin() + offsets[query_size]); thrust::device_vector d_offsets(offsets.begin(), offsets.begin() + query_size); thrust::device_vector outputs(query_size); @@ -58,4 +61,9 @@ void trie_lookup(nvbench::state& state) }); } -NVBENCH_BENCH(trie_lookup).set_name("trie_lookup").set_max_noise(defaults::MAX_NOISE); +NVBENCH_BENCH(trie_lookup) + .set_name("trie_lookup") + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("NumKeys", + std::vector{100 * 1000, 1000 * 1000, 10 * 1000 * 1000}) + .add_int64_axis("MaxKeyLength", std::vector{4, 8, 16}); diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu index e61695186..6c259434b 100644 --- a/tests/trie/lookup_test.cu +++ b/tests/trie/lookup_test.cu @@ -26,6 +26,8 @@ #include "trie_utils.hpp" +using namespace cuco::utility; + TEST_CASE("Lookup test", "") { using LabelType = int; @@ -36,7 +38,9 @@ TEST_CASE("Lookup test", "") thrust::host_vector labels; thrust::host_vector offsets; - generate_labels(labels, offsets, num_keys, max_key_length); + distribution::unique lengths_dist; + distribution::gaussian labels_dist{0.5}; + generate_labels(labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); auto keys = sorted_keys(labels, offsets); cuco::experimental::trie trie; diff --git a/tests/trie/trie_utils.hpp b/tests/trie/trie_utils.hpp index 5dd136070..0f6cef856 100644 --- a/tests/trie/trie_utils.hpp +++ b/tests/trie/trie_utils.hpp @@ -12,15 +12,16 @@ struct valid_key { const size_t num_keys_; }; -template +template void generate_labels(thrust::host_vector& labels, thrust::host_vector& offsets, size_t num_keys, - size_t max_key_length) + size_t max_key_length, + LengthsDist lengths_dist, + LabelsDist labels_dist) { cuco::utility::key_generator gen; - cuco::utility::distribution::unique lengths_dist; offsets.resize(num_keys); gen.generate(lengths_dist, offsets.begin(), offsets.end()); @@ -31,7 +32,6 @@ void generate_labels(thrust::host_vector& labels, offsets.push_back(0); thrust::exclusive_scan(offsets.begin(), offsets.end(), offsets.begin()); - cuco::utility::distribution::gaussian labels_dist{0.5}; labels.resize(offsets.back()); gen.generate(labels_dist, labels.begin(), labels.end()); } From 025c59a2b9e3852b845a3b714a8f428c920417b0 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 18 Oct 2023 01:19:13 +0000 Subject: [PATCH 34/41] Add namespaces --- benchmarks/trie/insert_bench.cu | 7 ++++--- benchmarks/trie/lookup_bench.cu | 7 ++++--- tests/trie/lookup_test.cu | 10 +++++----- tests/{trie => }/trie_utils.hpp | 8 ++++++++ 4 files changed, 21 insertions(+), 11 deletions(-) rename tests/{trie => }/trie_utils.hpp (96%) diff --git a/benchmarks/trie/insert_bench.cu b/benchmarks/trie/insert_bench.cu index 1a0dea6c6..6db0d6a50 100644 --- a/benchmarks/trie/insert_bench.cu +++ b/benchmarks/trie/insert_bench.cu @@ -17,7 +17,7 @@ #include #include -#include "../../tests/trie/trie_utils.hpp" +#include <../tests/trie_utils.hpp> #include #include @@ -42,8 +42,9 @@ void trie_insert(nvbench::state& state) distribution::unique lengths_dist; distribution::gaussian labels_dist{0.5}; - generate_labels(labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); - auto keys = sorted_keys(labels, offsets); + cuco::test::trie::generate_labels( + labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); + auto keys = cuco::test::trie::sorted_keys(labels, offsets); state.add_element_count(num_keys); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { diff --git a/benchmarks/trie/lookup_bench.cu b/benchmarks/trie/lookup_bench.cu index 7f6f301cf..635d0b7b8 100644 --- a/benchmarks/trie/lookup_bench.cu +++ b/benchmarks/trie/lookup_bench.cu @@ -17,7 +17,7 @@ #include #include -#include "../../tests/trie/trie_utils.hpp" +#include <../tests/trie_utils.hpp> #include #include @@ -42,8 +42,9 @@ void trie_lookup(nvbench::state& state) distribution::unique lengths_dist; distribution::gaussian labels_dist{0.5}; - generate_labels(labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); - auto keys = sorted_keys(labels, offsets); + cuco::test::trie::generate_labels( + labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); + auto keys = cuco::test::trie::sorted_keys(labels, offsets); for (auto key : keys) { trie.insert(key.begin(), key.end()); diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu index 6c259434b..31eea6d52 100644 --- a/tests/trie/lookup_test.cu +++ b/tests/trie/lookup_test.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include @@ -24,8 +25,6 @@ #include -#include "trie_utils.hpp" - using namespace cuco::utility; TEST_CASE("Lookup test", "") @@ -40,8 +39,9 @@ TEST_CASE("Lookup test", "") distribution::unique lengths_dist; distribution::gaussian labels_dist{0.5}; - generate_labels(labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); - auto keys = sorted_keys(labels, offsets); + cuco::test::trie::generate_labels( + labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); + auto keys = cuco::test::trie::sorted_keys(labels, offsets); cuco::experimental::trie trie; for (auto key : keys) { @@ -54,5 +54,5 @@ TEST_CASE("Lookup test", "") thrust::device_vector result(num_keys, -1lu); trie.lookup(d_labels.begin(), d_offsets.begin(), d_offsets.end(), result.begin()); - REQUIRE(cuco::test::all_of(result.begin(), result.end(), valid_key(num_keys))); + REQUIRE(cuco::test::all_of(result.begin(), result.end(), cuco::test::trie::valid_key(num_keys))); } diff --git a/tests/trie/trie_utils.hpp b/tests/trie_utils.hpp similarity index 96% rename from tests/trie/trie_utils.hpp rename to tests/trie_utils.hpp index 0f6cef856..1036e2843 100644 --- a/tests/trie/trie_utils.hpp +++ b/tests/trie_utils.hpp @@ -6,6 +6,10 @@ #include #include +namespace cuco { +namespace test { +namespace trie { + struct valid_key { valid_key(size_t num_keys) : num_keys_(num_keys) {} __host__ __device__ bool operator()(size_t x) const { return x < num_keys_; } @@ -107,3 +111,7 @@ inline std::vector> read_keys(const char* filename, size_ } return generate_split_keys(keys); } + +} // namespace trie +} // namespace test +} // namespace cuco From 6184e5062cc664f965abeb37e397292efaceedcb Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 18 Oct 2023 01:42:04 +0000 Subject: [PATCH 35/41] Remove custom key comparator --- tests/trie_utils.hpp | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/tests/trie_utils.hpp b/tests/trie_utils.hpp index 1036e2843..607ddc2c4 100644 --- a/tests/trie_utils.hpp +++ b/tests/trie_utils.hpp @@ -40,21 +40,6 @@ void generate_labels(thrust::host_vector& labels, gen.generate(labels_dist, labels.begin(), labels.end()); } -template -struct vectorKeyCompare { - bool operator()(const std::vector& lhs, const std::vector& rhs) const - { - for (size_t pos = 0; pos < min(lhs.size(), rhs.size()); pos++) { - if (lhs[pos] < rhs[pos]) { - return true; - } else if (lhs[pos] > rhs[pos]) { - return false; - } - } - return lhs.size() <= rhs.size(); - } -}; - template std::vector> sorted_keys(thrust::host_vector& labels, thrust::host_vector& offsets) @@ -68,7 +53,7 @@ std::vector> sorted_keys(thrust::host_vector& } keys.push_back(cur_key); } - sort(keys.begin(), keys.end(), vectorKeyCompare()); + sort(keys.begin(), keys.end()); return keys; } From 7f1d083d27c2e67cda42d30e0145791c6100e9c3 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 18 Oct 2023 03:27:29 +0000 Subject: [PATCH 36/41] Include multiple label types Both in tests and benchmarks --- benchmarks/trie/insert_bench.cu | 6 +++--- benchmarks/trie/lookup_bench.cu | 9 ++++----- tests/trie/lookup_test.cu | 11 ++++++++--- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/benchmarks/trie/insert_bench.cu b/benchmarks/trie/insert_bench.cu index 6db0d6a50..1f78cd458 100644 --- a/benchmarks/trie/insert_bench.cu +++ b/benchmarks/trie/insert_bench.cu @@ -29,12 +29,12 @@ using namespace cuco::utility; /** * @brief A benchmark evaluating `cuco::experimental::trie::insert` performance */ -void trie_insert(nvbench::state& state) +template +void trie_insert(nvbench::state& state, nvbench::type_list) { auto const num_keys = state.get_int64_or_default("NumKeys", 100 * 1000); auto const max_key_length = state.get_int64_or_default("MaxKeyLength", 10); - using LabelType = int; cuco::experimental::trie trie; thrust::host_vector labels; @@ -54,7 +54,7 @@ void trie_insert(nvbench::state& state) }); } -NVBENCH_BENCH(trie_insert) +NVBENCH_BENCH_TYPES(trie_insert, NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("trie_insert") .set_max_noise(defaults::MAX_NOISE) .add_int64_axis("NumKeys", std::vector{100 * 1000, 1000 * 1000}) diff --git a/benchmarks/trie/lookup_bench.cu b/benchmarks/trie/lookup_bench.cu index 635d0b7b8..b026726ef 100644 --- a/benchmarks/trie/lookup_bench.cu +++ b/benchmarks/trie/lookup_bench.cu @@ -29,12 +29,12 @@ using namespace cuco::utility; /** * @brief A benchmark evaluating `cuco::experimental::trie::lookup` performance */ -void trie_lookup(nvbench::state& state) +template +void trie_lookup(nvbench::state& state, nvbench::type_list) { auto const num_keys = state.get_int64_or_default("NumKeys", 100 * 1000); auto const max_key_length = state.get_int64_or_default("MaxKeyLength", 10); - using LabelType = int; cuco::experimental::trie trie; thrust::host_vector labels; @@ -62,9 +62,8 @@ void trie_lookup(nvbench::state& state) }); } -NVBENCH_BENCH(trie_lookup) +NVBENCH_BENCH_TYPES(trie_lookup, NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("trie_lookup") .set_max_noise(defaults::MAX_NOISE) - .add_int64_axis("NumKeys", - std::vector{100 * 1000, 1000 * 1000, 10 * 1000 * 1000}) + .add_int64_axis("NumKeys", std::vector{100 * 1000, 1000 * 1000}) .add_int64_axis("MaxKeyLength", std::vector{4, 8, 16}); diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu index 31eea6d52..90ea7c132 100644 --- a/tests/trie/lookup_test.cu +++ b/tests/trie/lookup_test.cu @@ -27,10 +27,9 @@ using namespace cuco::utility; -TEST_CASE("Lookup test", "") +template +void trie_lookup_test() { - using LabelType = int; - std::size_t num_keys = 64 * 1024; std::size_t max_key_length = 6; @@ -56,3 +55,9 @@ TEST_CASE("Lookup test", "") trie.lookup(d_labels.begin(), d_offsets.begin(), d_offsets.end(), result.begin()); REQUIRE(cuco::test::all_of(result.begin(), result.end(), cuco::test::trie::valid_key(num_keys))); } + +TEST_CASE("Trie lookup", "") +{ + trie_lookup_test(); + trie_lookup_test(); +} From 5ed049857a02d2fe51b6c1343d1ac5ee9cc5128f Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 18 Oct 2023 03:51:22 +0000 Subject: [PATCH 37/41] Change key distribution to uniform --- benchmarks/trie/dynamic_bitset/find_next_bench.cu | 2 +- benchmarks/trie/dynamic_bitset/rank_bench.cu | 2 +- benchmarks/trie/dynamic_bitset/select_bench.cu | 2 +- benchmarks/trie/dynamic_bitset/size_bench.cu | 2 +- benchmarks/trie/dynamic_bitset/test_bench.cu | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/benchmarks/trie/dynamic_bitset/find_next_bench.cu b/benchmarks/trie/dynamic_bitset/find_next_bench.cu index 029378422..c6d38745c 100644 --- a/benchmarks/trie/dynamic_bitset/find_next_bench.cu +++ b/benchmarks/trie/dynamic_bitset/find_next_bench.cu @@ -56,7 +56,7 @@ void dynamic_bitset_find_next(nvbench::state& state, nvbench::type_list) } NVBENCH_BENCH_TYPES(dynamic_bitset_find_next, - NVBENCH_TYPE_AXES(nvbench::type_list)) + NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("dynamic_bitset_find_next") .set_type_axes_names({"Distribution"}) .set_max_noise(defaults::MAX_NOISE); diff --git a/benchmarks/trie/dynamic_bitset/rank_bench.cu b/benchmarks/trie/dynamic_bitset/rank_bench.cu index 20e6a10cf..6b41cf029 100644 --- a/benchmarks/trie/dynamic_bitset/rank_bench.cu +++ b/benchmarks/trie/dynamic_bitset/rank_bench.cu @@ -56,7 +56,7 @@ void dynamic_bitset_rank(nvbench::state& state, nvbench::type_list) } NVBENCH_BENCH_TYPES(dynamic_bitset_rank, - NVBENCH_TYPE_AXES(nvbench::type_list)) + NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("dynamic_bitset_rank") .set_type_axes_names({"Distribution"}) .set_max_noise(defaults::MAX_NOISE); diff --git a/benchmarks/trie/dynamic_bitset/select_bench.cu b/benchmarks/trie/dynamic_bitset/select_bench.cu index 9530b5379..755018ea3 100644 --- a/benchmarks/trie/dynamic_bitset/select_bench.cu +++ b/benchmarks/trie/dynamic_bitset/select_bench.cu @@ -56,7 +56,7 @@ void dynamic_bitset_select(nvbench::state& state, nvbench::type_list) } NVBENCH_BENCH_TYPES(dynamic_bitset_select, - NVBENCH_TYPE_AXES(nvbench::type_list)) + NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("dynamic_bitset_select") .set_type_axes_names({"Distribution"}) .set_max_noise(defaults::MAX_NOISE); diff --git a/benchmarks/trie/dynamic_bitset/size_bench.cu b/benchmarks/trie/dynamic_bitset/size_bench.cu index 98a004bd1..d31ecebbd 100644 --- a/benchmarks/trie/dynamic_bitset/size_bench.cu +++ b/benchmarks/trie/dynamic_bitset/size_bench.cu @@ -50,7 +50,7 @@ void dynamic_bitset_size(nvbench::state& state, nvbench::type_list) } NVBENCH_BENCH_TYPES(dynamic_bitset_size, - NVBENCH_TYPE_AXES(nvbench::type_list)) + NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("dynamic_bitset_size") .set_type_axes_names({"Distribution"}) .set_max_noise(defaults::MAX_NOISE); diff --git a/benchmarks/trie/dynamic_bitset/test_bench.cu b/benchmarks/trie/dynamic_bitset/test_bench.cu index 4786a909a..46109c146 100644 --- a/benchmarks/trie/dynamic_bitset/test_bench.cu +++ b/benchmarks/trie/dynamic_bitset/test_bench.cu @@ -56,7 +56,7 @@ void dynamic_bitset_test(nvbench::state& state, nvbench::type_list) } NVBENCH_BENCH_TYPES(dynamic_bitset_test, - NVBENCH_TYPE_AXES(nvbench::type_list)) + NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("dynamic_bitset_test") .set_type_axes_names({"Distribution"}) .set_max_noise(defaults::MAX_NOISE); From 6842144e3f750e5806e70116560f382c30af1fb4 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 18 Oct 2023 04:05:08 +0000 Subject: [PATCH 38/41] Move file --- benchmarks/trie/insert_bench.cu | 2 +- benchmarks/trie/lookup_bench.cu | 2 +- tests/trie/lookup_test.cu | 2 +- tests/{trie_utils.hpp => trie/utils.hpp} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename tests/{trie_utils.hpp => trie/utils.hpp} (100%) diff --git a/benchmarks/trie/insert_bench.cu b/benchmarks/trie/insert_bench.cu index 1f78cd458..fa21b2347 100644 --- a/benchmarks/trie/insert_bench.cu +++ b/benchmarks/trie/insert_bench.cu @@ -17,7 +17,7 @@ #include #include -#include <../tests/trie_utils.hpp> +#include <../tests/trie/utils.hpp> #include #include diff --git a/benchmarks/trie/lookup_bench.cu b/benchmarks/trie/lookup_bench.cu index b026726ef..5ae8f0b1f 100644 --- a/benchmarks/trie/lookup_bench.cu +++ b/benchmarks/trie/lookup_bench.cu @@ -17,7 +17,7 @@ #include #include -#include <../tests/trie_utils.hpp> +#include <../tests/trie/utils.hpp> #include #include diff --git a/tests/trie/lookup_test.cu b/tests/trie/lookup_test.cu index 90ea7c132..cdc38cd35 100644 --- a/tests/trie/lookup_test.cu +++ b/tests/trie/lookup_test.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include #include diff --git a/tests/trie_utils.hpp b/tests/trie/utils.hpp similarity index 100% rename from tests/trie_utils.hpp rename to tests/trie/utils.hpp From 4dcb7ceb487392b2ff7742a7fd6b5b93a6f34072 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 18 Oct 2023 18:59:46 +0000 Subject: [PATCH 39/41] Delete file read utilities --- tests/trie/utils.hpp | 40 ---------------------------------------- 1 file changed, 40 deletions(-) diff --git a/tests/trie/utils.hpp b/tests/trie/utils.hpp index 607ddc2c4..7eae100af 100644 --- a/tests/trie/utils.hpp +++ b/tests/trie/utils.hpp @@ -57,46 +57,6 @@ std::vector> sorted_keys(thrust::host_vector& return keys; } -template -std::vector split_key_into_labels(const std::string& key) -{ - std::stringstream ss(key); - std::vector labels; - std::string buf; - - while (ss >> buf) { - labels.push_back(stoi(buf)); - } - return labels; -} - -template -std::vector> generate_split_keys(const std::vector& keys) -{ - std::vector> split_keys(keys.size()); -#pragma omp parallel for - for (size_t i = 0; i < keys.size(); i++) { - split_keys[i] = split_key_into_labels(keys[i]); - } - return split_keys; -} - -template -inline std::vector> read_keys(const char* filename, size_t num_keys) -{ - std::ifstream input_file(filename); - if (!input_file.is_open()) { - std::cout << "Error opening file: " << filename << std::endl; - exit(1); - } - std::vector keys; - std::string line; - while (keys.size() < num_keys and getline(input_file, line)) { - keys.push_back(line); - } - return generate_split_keys(keys); -} - } // namespace trie } // namespace test } // namespace cuco From 989237805b85a9e0232d45f5273fb39e4153444b Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 18 Oct 2023 19:51:15 +0000 Subject: [PATCH 40/41] Change nvbench axes order --- benchmarks/trie/insert_bench.cu | 4 ++-- benchmarks/trie/lookup_bench.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/trie/insert_bench.cu b/benchmarks/trie/insert_bench.cu index fa21b2347..90c243ed0 100644 --- a/benchmarks/trie/insert_bench.cu +++ b/benchmarks/trie/insert_bench.cu @@ -57,5 +57,5 @@ void trie_insert(nvbench::state& state, nvbench::type_list) NVBENCH_BENCH_TYPES(trie_insert, NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("trie_insert") .set_max_noise(defaults::MAX_NOISE) - .add_int64_axis("NumKeys", std::vector{100 * 1000, 1000 * 1000}) - .add_int64_axis("MaxKeyLength", std::vector{4, 8, 16}); + .add_int64_axis("MaxKeyLength", std::vector{4, 8, 16}) + .add_int64_axis("NumKeys", std::vector{100 * 1000, 1000 * 1000}); diff --git a/benchmarks/trie/lookup_bench.cu b/benchmarks/trie/lookup_bench.cu index 5ae8f0b1f..55a30b168 100644 --- a/benchmarks/trie/lookup_bench.cu +++ b/benchmarks/trie/lookup_bench.cu @@ -65,5 +65,5 @@ void trie_lookup(nvbench::state& state, nvbench::type_list) NVBENCH_BENCH_TYPES(trie_lookup, NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("trie_lookup") .set_max_noise(defaults::MAX_NOISE) - .add_int64_axis("NumKeys", std::vector{100 * 1000, 1000 * 1000}) - .add_int64_axis("MaxKeyLength", std::vector{4, 8, 16}); + .add_int64_axis("MaxKeyLength", std::vector{4, 8, 16}) + .add_int64_axis("NumKeys", std::vector{100 * 1000, 1000 * 1000}); From fc3fa7aeb008a5d4db1a78663fd46ce83f515ac1 Mon Sep 17 00:00:00 2001 From: Anurag Mukkara Date: Wed, 18 Oct 2023 22:19:01 +0000 Subject: [PATCH 41/41] Trie examples --- examples/CMakeLists.txt | 2 + examples/trie/device_ref_example.cu | 89 +++++++++++++++++++++++++++++ examples/trie/host_bulk_example.cu | 69 ++++++++++++++++++++++ 3 files changed, 160 insertions(+) create mode 100644 examples/trie/device_ref_example.cu create mode 100644 examples/trie/host_bulk_example.cu diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index d78627eee..5b0ff128e 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -40,3 +40,5 @@ ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/sta ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu") ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu") +ConfigureExample(TRIE_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/trie/host_bulk_example.cu") +ConfigureExample(TRIE_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/trie/device_ref_example.cu") diff --git a/examples/trie/device_ref_example.cu b/examples/trie/device_ref_example.cu new file mode 100644 index 000000000..d26e0524c --- /dev/null +++ b/examples/trie/device_ref_example.cu @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <../tests/trie/utils.hpp> +#include + +#include +#include +#include +#include + +using namespace cuco::utility; + +template +__global__ void lookup_kernel( + TrieRef ref, LabelIt keys, OffsetIt offsets, OutputIt outputs, size_t num_keys) +{ + auto key_id = cuco::detail::global_thread_id(); + auto const loop_stride = cuco::detail::grid_stride(); + + while (key_id < num_keys) { + auto key_start_pos = keys + offsets[key_id]; + auto key_length = offsets[key_id + 1] - offsets[key_id]; + + outputs[key_id] = ref.lookup(key_start_pos, key_length); + key_id += loop_stride; + } +} + +/** + * @file device_ref_example.cu + * @brief Demonstrates usage of the trie device-side APIs. + * + * trie provides a non-owning reference which can be used to interact with + * the container from within device code. + * + */ +int main(void) +{ + using LabelType = int; + + std::size_t num_keys = 64 * 1024; + std::size_t max_key_length = 6; + + thrust::host_vector labels; + thrust::host_vector offsets; + + distribution::unique lengths_dist; + distribution::gaussian labels_dist{0.5}; + + cuco::test::trie::generate_labels( + labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); + auto keys = cuco::test::trie::sorted_keys(labels, offsets); + + cuco::experimental::trie trie; + for (auto key : keys) { + trie.insert(key.begin(), key.end()); + } + trie.build(); + + thrust::device_vector d_labels = labels; + thrust::device_vector d_offsets = offsets; + thrust::device_vector result(num_keys, -1lu); + + trie_lookup_kernel<<<128, 128>>>(trie.ref(cuco::experimental::trie_lookup), + d_labels.begin(), + d_offsets.begin(), + result.begin(), + num_keys); + + bool const all_keys_found = + thrust::all_of(result.begin(), result.end(), cuco::test::trie::valid_key(num_keys)); + if (all_keys_found) { std::cout << "Success! Found all keys.\n"; } + + return 0; +} diff --git a/examples/trie/host_bulk_example.cu b/examples/trie/host_bulk_example.cu new file mode 100644 index 000000000..6cf6ac7ef --- /dev/null +++ b/examples/trie/host_bulk_example.cu @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <../tests/trie/utils.hpp> +#include + +#include +#include +#include +#include + +using namespace cuco::utility; + +/** + * @file host_bulk_example.cu + * @brief Demonstrates usage of the trie "bulk" host APIs. + * + * The bulk APIs are only invocable from the host and are used for doing operations like `insert` or + * `lookup` on a set of keys. + * + */ +int main(void) +{ + using LabelType = int; + + std::size_t num_keys = 64 * 1024; + std::size_t max_key_length = 6; + + thrust::host_vector labels; + thrust::host_vector offsets; + + distribution::unique lengths_dist; + distribution::gaussian labels_dist{0.5}; + + cuco::test::trie::generate_labels( + labels, offsets, num_keys, max_key_length, lengths_dist, labels_dist); + auto keys = cuco::test::trie::sorted_keys(labels, offsets); + + cuco::experimental::trie trie; + for (auto key : keys) { + trie.insert(key.begin(), key.end()); + } + trie.build(); + + thrust::device_vector d_labels = labels; + thrust::device_vector d_offsets = offsets; + thrust::device_vector result(num_keys, -1lu); + + trie.lookup(d_labels.begin(), d_offsets.begin(), d_offsets.end(), result.begin()); + + bool const all_keys_found = + thrust::all_of(result.begin(), result.end(), cuco::test::trie::valid_key(num_keys)); + if (all_keys_found) { std::cout << "Success! Found all keys.\n"; } + + return 0; +}