Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
104 commits
Select commit Hold shift + click to select a range
b674ee2
Initial checkin of bitvector
amukkara May 11, 2023
8e31b2d
Add bitvector get test
amukkara May 11, 2023
c3e253e
Add more bitvector tests
amukkara May 11, 2023
0d56be4
Add trie tests
amukkara May 12, 2023
528778a
Move implementations to .inl files
amukkara May 16, 2023
5e9ad38
Add bit_vector reference classes
amukkara May 18, 2023
b5b8f5a
Use aow_storage for bit_vector structures
amukkara May 19, 2023
97cf357
Minor
amukkara May 20, 2023
2259c2a
Remove trie
amukkara May 22, 2023
52307cb
Clang format
amukkara May 24, 2023
903e36d
Coalesce all bitvector operations into single tag
amukkara May 24, 2023
0749c04
Naming
amukkara May 26, 2023
b795601
Allocate and size aow arrays after host side build
amukkara May 26, 2023
d29e304
Handle empty bitvector with zero bits
amukkara May 26, 2023
03a212c
Minor
amukkara May 26, 2023
e8f186d
Remove bitvector memory footprint logic
amukkara May 28, 2023
9e8f68e
Doxygen comments
amukkara May 29, 2023
201c894
Avoid C-style casts
amukkara May 30, 2023
5e3bdbf
Consistent use of 64-bit integer types
amukkara May 30, 2023
149109b
Rename method from `add` to `append`
amukkara May 30, 2023
d545194
Remove `Key` template parameter
amukkara May 30, 2023
0e411be
Minor
amukkara May 30, 2023
88dc1a6
Cherrypick bitvector files from trie branch
amukkara Aug 17, 2023
fe68a91
Use cuda math instrincs in device functions
amukkara Aug 18, 2023
3894d5a
Use cuda::std::array
amukkara Aug 18, 2023
80835d7
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Aug 18, 2023
10c0900
Add constexpr and noexcept specifiers
amukkara Aug 18, 2023
460f989
Spacing
amukkara Aug 18, 2023
d063cc5
Remove unnecessary template parameters
amukkara Aug 18, 2023
a371e27
Allocator argument in constructor
amukkara Aug 18, 2023
91b628a
Use size_type and slot_type
amukkara Aug 18, 2023
9ac0da9
Explicitly define frequently used constants
amukkara Aug 18, 2023
81ed984
Comments
amukkara Aug 18, 2023
8e40ef0
Improve order of function implementations
amukkara Aug 18, 2023
660d807
Refactor selects entry addition
amukkara Aug 21, 2023
0c233fe
Merge branch 'NVIDIA:dev' into trie-bitvector
amukkara Aug 22, 2023
cf232c1
Bulk bitvector get operation
amukkara Aug 22, 2023
3eb6402
Add device-ref set operation
amukkara Aug 22, 2023
f72684f
Bulk set API
amukkara Aug 22, 2023
74694b9
Use size_type in tests
amukkara Aug 22, 2023
9d5100d
Add static constexpr
amukkara Aug 23, 2023
e248c9d
Minor coding style
amukkara Aug 23, 2023
1d57d3a
Minor
amukkara Aug 23, 2023
721a5ab
cuda::std popcount intrinsic
amukkara Aug 23, 2023
dbd5313
Comments
amukkara Aug 23, 2023
6e16961
get_word operation
amukkara Aug 23, 2023
525e5c1
Comment out set test check
amukkara Aug 23, 2023
6028a79
Generic template parameters
amukkara Aug 23, 2023
f74bee9
Comments
amukkara Aug 23, 2023
eb7f957
Use unique_ptrs
amukkara Aug 23, 2023
8687daa
Use cuda::std intrinsics
amukkara Aug 23, 2023
005bd5d
Curly braces in initialization list
amukkara Aug 23, 2023
7485d28
Remove unused header
amukkara Aug 23, 2023
8feefc8
[pre-commit.ci] auto code formatting
pre-commit-ci[bot] Aug 23, 2023
d538c34
Merge branch 'trie-bitvector' of https://github.com/amukkara/cuCollec…
amukkara Aug 23, 2023
c5ec254
Collect aow refs in a single struct
amukkara Aug 24, 2023
4451d7e
Fix includes
amukkara Aug 24, 2023
4b46a9c
Avoid repeated definition of same magic number
amukkara Aug 24, 2023
3e4a413
Larger bitvector sizes in tests
amukkara Aug 25, 2023
f378626
Device kernels for rank and select generation
amukkara Aug 25, 2023
d195766
Misc fixes in bitvector build
amukkara Aug 26, 2023
aaa2261
Move constructor
amukkara Aug 27, 2023
d237263
Bulk API for rank and select operations
amukkara Aug 27, 2023
07aa813
Remove bulk set operation
amukkara Aug 29, 2023
fb07de9
Remove aow_storage structures
amukkara Aug 29, 2023
017fd1c
Grow bitvector on device
amukkara Aug 29, 2023
7436684
Move bit_vector to detail namespace
PointKernel Aug 29, 2023
30209fb
Add missing headers
PointKernel Aug 29, 2023
15033a3
Clean up type aliases and static constexpr
PointKernel Aug 29, 2023
d9914bf
Add missing headers
PointKernel Aug 29, 2023
7290190
Add missing headers
PointKernel Aug 29, 2023
9fdbc17
Add allocator template parameter
PointKernel Aug 29, 2023
82d1e26
Clean up docs
PointKernel Aug 29, 2023
793cf28
Move kernels to a separate file
PointKernel Aug 29, 2023
0e29dbc
Make bit_vector_ref a nested type
PointKernel Aug 29, 2023
2314801
Remove bv read operator tag
PointKernel Aug 29, 2023
0348daf
Move implementation details to inl file
PointKernel Aug 29, 2023
f804b89
Rename bit_vector as dynamic_bitset
PointKernel Aug 29, 2023
b65d5b1
Merge remote-tracking branch 'upstream/dev' into trie-bitvector
PointKernel Aug 30, 2023
0cf4bac
Cleanups: constexpr instead of inline, TODO, etc
PointKernel Aug 30, 2023
c7faed0
Change names of rank, select variables
amukkara Aug 30, 2023
a85b7b7
Rename members and methods of `rank`
amukkara Sep 1, 2023
ec6bb11
Rename bitvector to bitset
amukkara Sep 1, 2023
59457e9
Remove `get_` prefixes in method names
amukkara Sep 1, 2023
5b86c00
Use rank_type
amukkara Sep 1, 2023
c753f10
Rename bulk API methods
amukkara Sep 1, 2023
b26f326
Use cuco::detail::index_type in kernels
amukkara Sep 1, 2023
f96468d
Change some API to match boost dynamic_bitset
amukkara Sep 1, 2023
9f452c0
Rename file to match previous API change
amukkara Sep 1, 2023
0f1db0f
More API changes
amukkara Sep 1, 2023
63ed552
Rename slot_type to word_type
amukkara Sep 1, 2023
81b8e90
Specify iterator's value_type in doxygen comments
amukkara Sep 1, 2023
b2c88de
Comments
amukkara Sep 1, 2023
31b9e60
Merge remote-tracking branch 'upstream/dev' into trie-bitvector
PointKernel Sep 6, 2023
37ebd0c
Use detail CUDA utilities to determine grid size
PointKernel Sep 6, 2023
a629730
Make build() a private member
amukkara Sep 6, 2023
cedc5d4
Merge branch 'trie-bitvector' of github.com:amukkara/cuCollections in…
amukkara Sep 6, 2023
9a3018d
Minor doc updates
PointKernel Sep 8, 2023
e1527ec
Clean up kernels with cuda utilities
PointKernel Sep 8, 2023
4d0d78a
Minor style cleanup
PointKernel Sep 8, 2023
fceb5f9
Cleanups: renaming + update docs
PointKernel Sep 8, 2023
adab866
Consistently use the same allocator for intermediate vars
PointKernel Sep 8, 2023
2b8851e
Make build process exposed to CUDA stream
PointKernel Sep 8, 2023
8cf54b8
Cleanups + deallocate before return
PointKernel Sep 8, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
375 changes: 375 additions & 0 deletions include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,375 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuco/cuda_stream_ref.hpp>

#include <thrust/device_malloc_allocator.h>
#include <thrust/device_vector.h>

#include <cuda/std/array>

#include <climits>
#include <cstddef>

namespace cuco {
namespace experimental {
namespace detail {

/**
* @brief Struct to store ranks of bits at 256-bit intervals (or blocks)
*
* This struct encodes a list of four rank values using base + offset format
* e.g. [1000, 1005, 1006, 1009] is stored as base = 1000, offsets = [5, 6, 9]
* base uses 40 bits, split between one uint32_t and one uint8_t
* each offset uses 8 bits
*/
struct rank {
uint32_t base_hi_; ///< Upper 32 bits of base
uint8_t base_lo_; ///< Lower 8 bits of base
cuda::std::array<uint8_t, 3> offsets_; ///< Offsets for 64-bit sub-intervals, relative to base

/**
* @brief Gets base rank of current 256-bit interval
*
* @return The base rank
*/
__host__ __device__ constexpr uint64_t base() const noexcept
{
return (static_cast<uint64_t>(base_hi_) << CHAR_BIT) | base_lo_;
}

/**
* @brief Sets base rank of current 256-bit interval
*
* @param base Base rank
*/
__host__ __device__ constexpr void set_base(uint64_t base) noexcept
{
base_hi_ = static_cast<uint32_t>(base >> CHAR_BIT);
base_lo_ = static_cast<uint8_t>(base);
}
};

/**
* @brief Bitset class with rank and select index structures
*
* In addition to standard bitset set/test operations, this class provides
* rank and select operation API. It maintains index structures to make both these
* new operations close to constant time.
*
* Current limitations:
* - Stream controls are partially supported due to the use of `thrust::device_vector` as storage
* - Device ref doesn't support modifiers like `set`, `reset`, etc.
*
* @tparam Allocator Type of allocator used for device storage
*/
// TODO: have to use device_malloc_allocator for now otherwise the container cannot grow
template <class Allocator = thrust::device_malloc_allocator<std::byte>>
class dynamic_bitset {
public:
using size_type = std::size_t; ///< size type to specify bit index
using word_type = uint64_t; ///< word type
/// Type of the allocator to (de)allocate words
using allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<word_type>;

/// Number of bits per block. Note this is a tradeoff between space efficiency and perf.
static constexpr size_type words_per_block = 4;
/// Number of bits in a word
static constexpr size_type bits_per_word = sizeof(word_type) * CHAR_BIT;
/// Number of bits in a block
static constexpr size_type bits_per_block = words_per_block * bits_per_word;

/**
* @brief Constructs an empty bitset
*
* @param allocator Allocator used for allocating device storage
*/
constexpr dynamic_bitset(Allocator const& allocator = Allocator{});

/**
* @brief Appends the given element `value` to the end of the bitset
*
* This API may involve data reallocation if the current storage is exhausted.
*
* @param value Boolean value of the new bit to be added
*/
constexpr void push_back(bool value) noexcept;

/**
* @brief Sets the target bit indexed by `index` to a specified `value`.
*
* @param index Position of bit to be modified
* @param value New value of the target bit
*/
constexpr void set(size_type index, bool value) noexcept;

/**
* @brief Sets the last bit to a specified value
*
* @param value New value of the last bit
*/
constexpr void set_last(bool value) noexcept;

/**
* @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the
* boolean value at position `keys_begin[i]` to `output_begin[i]`.
*
* @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
* `size_type`
* @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean
* type
*
* @param keys_begin Begin iterator to keys list whose values are queried
* @param keys_end End iterator to keys list
* @param outputs_begin Begin iterator to outputs of test operation
* @param stream Stream to execute test kernel
*/
template <typename KeyIt, typename OutputIt>
constexpr void test(KeyIt keys_begin,
KeyIt keys_end,
OutputIt outputs_begin,
cuda_stream_ref stream = {}) noexcept;

/**
* @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores total
* count of `1` bits preceeding (but not including) position `keys_begin[i]` to `output_begin[i]`.
*
* @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
* `size_type`
* @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's
* `size_type`
*
* @param keys_begin Begin iterator to keys list whose ranks are queried
* @param keys_end End iterator to keys list
* @param outputs_begin Begin iterator to outputs ranks list
* @param stream Stream to execute ranks kernel
*/
template <typename KeyIt, typename OutputIt>
constexpr void rank(KeyIt keys_begin,
KeyIt keys_end,
OutputIt outputs_begin,
cuda_stream_ref stream = {}) noexcept;

/**
* @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the
* position of `keys_begin[i]`th `1` bit to `output_begin[i]`.
*
* @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
* `size_type`
* @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's
* `size_type`
*
* @param keys_begin Begin iterator to keys list whose select values are queried
* @param keys_end End iterator to keys list
* @param outputs_begin Begin iterator to outputs selects list
* @param stream Stream to execute selects kernel
*/
template <typename KeyIt, typename OutputIt>
constexpr void select(KeyIt keys_begin,
KeyIt keys_end,
OutputIt outputs_begin,
cuda_stream_ref stream = {}) noexcept;

using rank_type = cuco::experimental::detail::rank; ///< Rank type

/**
*@brief Struct to hold all storage refs needed by reference
*/
// TODO: this is not a real ref type, to be changed
struct storage_ref_type {
const word_type* words_ref_; ///< Words ref

const rank_type* ranks_true_ref_; ///< Ranks ref for 1 bits
const size_type* selects_true_ref_; ///< Selects ref for 1 bits

const rank_type* ranks_false_ref_; ///< Ranks ref for 0 bits
const size_type* selects_false_ref_; ///< Selects ref 0 bits
};

/**
* @brief Device non-owning reference type of dynamic_bitset
*/
class reference {
public:
/**
* @brief Constructs a reference
*
* @param storage Struct with non-owning refs to bitset storage arrays
*/
__host__ __device__ explicit constexpr reference(storage_ref_type storage) noexcept;

/**
* @brief Access value of a single bit
*
* @param key Position of bit
*
* @return Value of bit at position specified by key
*/
[[nodiscard]] __device__ constexpr bool test(size_type key) const noexcept;

/**
* @brief Access a single word of internal storage
*
* @param word_id Index of word
*
* @return Word at position specified by index
*/
[[nodiscard]] __device__ constexpr word_type word(size_type word_id) const noexcept;

/**
* @brief Find position of first set bit starting from a given position (inclusive)
*
* @param key Position of starting bit
*
* @return Index of next set bit
*/
[[nodiscard]] __device__ size_type find_next(size_type key) const noexcept;

/**
* @brief Find number of set bits (rank) in all positions before the input position (exclusive)
*
* @param key Input bit position
*
* @return Rank of input position
*/
[[nodiscard]] __device__ constexpr size_type rank(size_type key) const noexcept;

/**
* @brief Find position of Nth set (1) bit counting from start
*
* @param count Input N
*
* @return Position of Nth set bit
*/
[[nodiscard]] __device__ constexpr size_type select(size_type count) const noexcept;

/**
* @brief Find position of Nth not-set (0) bit counting from start
*
* @param count Input N
*
* @return Position of Nth not-set bit
*/
[[nodiscard]] __device__ constexpr size_type select_false(size_type count) const noexcept;

private:
/**
* @brief Helper function for select operation that computes an initial rank estimate
*
* @param count Input count for which select operation is being performed
* @param selects Selects array
* @param ranks Ranks array
*
* @return index in ranks which corresponds to highest rank less than count (least upper bound)
*/
template <typename SelectsRef, typename RanksRef>
[[nodiscard]] __device__ constexpr size_type initial_rank_estimate(
size_type count, const SelectsRef& selects, const RanksRef& ranks) const noexcept;

/**
* @brief Subtract rank estimate from input count and return an increment to word_id
*
* @tparam Rank type
*
* @param count Input count that will be updated
* @param rank Initial rank estimate for count
*
* @return Increment to word_id based on rank values
*/
template <typename Rank>
[[nodiscard]] __device__ constexpr size_type subtract_rank_from_count(size_type& count,
Rank rank) const noexcept;

/**
* @brief Find position of Nth set bit in a 64-bit word
*
* @param N Input count
*
* @return Position of Nth set bit
*/
[[nodiscard]] __device__ size_type select_bit_in_word(size_type N,
word_type word) const noexcept;

storage_ref_type storage_; ///< Non-owning storage
};

using ref_type = reference; ///< Non-owning container ref type

/**
* @brief Gets non-owning device ref of the current object
*
* @return Device ref of the current `dynamic_bitset` object
*/
[[nodiscard]] constexpr ref_type ref() const noexcept;

/**
* @brief Gets the number of bits dynamic_bitset holds
*
* @return Number of bits dynamic_bitset holds
*/
[[nodiscard]] constexpr size_type size() const noexcept;

private:
/// Type of the allocator to (de)allocate ranks
using rank_allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<rank_type>;
/// Type of the allocator to (de)allocate indices
using size_allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<size_type>;

allocator_type allocator_; ///< Words allocator
size_type n_bits_; ///< Number of bits dynamic_bitset currently holds
bool is_built_; ///< Flag indicating whether the rank and select indices are built or not

/// Words vector that represents all bits
thrust::device_vector<word_type, allocator_type> words_;
/// Rank values for every 256-th bit (4-th word)
thrust::device_vector<rank_type, rank_allocator_type> ranks_true_;
/// Same as ranks_ but for `0` bits
thrust::device_vector<rank_type, rank_allocator_type> ranks_false_;
/// Block indices of (0, 256, 512...)th `1` bit
thrust::device_vector<size_type, size_allocator_type> selects_true_;
/// Same as selects_, but for `0` bits
thrust::device_vector<size_type, size_allocator_type> selects_false_;

/**
* @brief Builds indexes for rank and select
*
* @param stream Stream to execute kernels
*/
constexpr void build(cuda_stream_ref stream = {}) noexcept;

/**
* @brief Populates rank and select indexes for true or false bits
*
* @param ranks Output array of ranks
* @param selects Output array of selects
* @param flip_bits If true, negate bits to construct indexes for false bits
* @param stream Stream to execute kernels
*/
constexpr void build_ranks_and_selects(
thrust::device_vector<rank_type, rank_allocator_type>& ranks,
thrust::device_vector<size_type, size_allocator_type>& selects,
bool flip_bits,
cuda_stream_ref stream = {});
};

} // namespace detail
} // namespace experimental
} // namespace cuco

#include <cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl>
Loading