From 9f0c40eca01edacd3b7760c605a73644e2c3ed07 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 4 Jul 2025 16:37:51 -0700
Subject: [PATCH 01/24] Simple lookup working

---
 examples/CMakeLists.txt                       |  29 +-
 examples/roaring_bitmap/bitmapwithoutruns.bin | Bin 0 -> 72616 bytes
 examples/roaring_bitmap/host_bulk_example.cu  |  75 +++++
 .../detail/roaring_bitmap/roaring_bitmap.inl  |  94 ++++++
 .../roaring_bitmap/roaring_bitmap_impl.cuh    | 279 ++++++++++++++++++
 .../roaring_bitmap/roaring_bitmap_ref.inl     |  80 +++++
 include/cuco/roaring_bitmap.cuh               |  85 ++++++
 include/cuco/roaring_bitmap_ref.cuh           |  66 +++++
 8 files changed, 694 insertions(+), 14 deletions(-)
 create mode 100644 examples/roaring_bitmap/bitmapwithoutruns.bin
 create mode 100644 examples/roaring_bitmap/host_bulk_example.cu
 create mode 100644 include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
 create mode 100644 include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
 create mode 100644 include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
 create mode 100644 include/cuco/roaring_bitmap.cuh
 create mode 100644 include/cuco/roaring_bitmap_ref.cuh

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 12b508404..e2328b496 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -33,17 +33,18 @@ endfunction(ConfigureExample)
 ### Example sources ###############################################################################
 ###################################################################################################
 
-ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu")
-ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu")
-ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu")
-ConfigureExample(STATIC_SET_SHARED_MEMORY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/shared_memory_example.cu")
-ConfigureExample(STATIC_SET_MAPPING_TABLE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/mapping_table_example.cu")
-ConfigureExample(STATIC_MULTISET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multiset/host_bulk_example.cu")
-ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu")
-ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_ref_example.cu")
-ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")
-ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu")
-ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu")
-ConfigureExample(HYPERLOGLOG_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/host_bulk_example.cu")
-ConfigureExample(HYPERLOGLOG_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/device_ref_example.cu")
-ConfigureExample(BLOOM_FILTER_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/bloom_filter/host_bulk_example.cu")
+# ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu")
+# ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu")
+# ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu")
+# ConfigureExample(STATIC_SET_SHARED_MEMORY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/shared_memory_example.cu")
+# ConfigureExample(STATIC_SET_MAPPING_TABLE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/mapping_table_example.cu")
+# ConfigureExample(STATIC_MULTISET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multiset/host_bulk_example.cu")
+# ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu")
+# ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_ref_example.cu")
+# ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")
+# ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu")
+# ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu")
+# ConfigureExample(HYPERLOGLOG_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/host_bulk_example.cu")
+# ConfigureExample(HYPERLOGLOG_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/device_ref_example.cu")
+# ConfigureExample(BLOOM_FILTER_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/bloom_filter/host_bulk_example.cu")
+ConfigureExample(ROARING_BITMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/roaring_bitmap/host_bulk_example.cu")
diff --git a/examples/roaring_bitmap/bitmapwithoutruns.bin b/examples/roaring_bitmap/bitmapwithoutruns.bin
new file mode 100644
index 0000000000000000000000000000000000000000..a99fd50aff79b98fa93b3219fc6226fa238d72e2
GIT binary patch
literal 72616
zcmeI((;Mqb-01P7u^Kf^(llz)c6l$`wr$(CZQHi(wU=$%wte>dN1Susi}^kCedcB^
zW<K|mGd%<`(Em&U2!>n;g=imxDp$dwDpiR8cJjX{C?7{d^C~dV-@h#MH*@{}PCxpu
z6!3q!|5f0>8vNIQ|5^Y+%Xj_P&HwuO-y!H3a*o)}%;$&8?bYf=emjLHB)_G)m+n}`
z^;xIq?4P%J!SY43mvqXK72j34R%3si<qgI+>E5D#o8lePchP!4eV+9{H+cK-`J;!A
zZ$G*E^!&3^%oi4aTXt{Nv32V=P2bjkSMz=44`x5=oFvb_yKwEw{u|5hjDOJmN&Oea
z-=zPbeSv;F`+W}Hj?JfrbM3|IN`5_sDY)PKdnt~kS)YD-=Kk56=PsW=dts+IS^8bM
zYnAp_TV88?z3z?bH!I#MeLJla)a_Z%bA7iD{GXSA0Rlh(2mk>f00e*l5C8%|00;m9
zAOHk_01yBI|2G6mO!UD50zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_
z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_
z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_
z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_
z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2>d@2fFK0IAqrw49+Dsh(jWt}
zAQ$qXB$NtD3#ErLL0O?3P;Mw6R1hix6^BYeWuXdCWvCid6RHE%hZ;dmp%zeUs2$W1
z>H>9#dO>}m0nlJ*7&H<Z1C56!K~td_&}?WPv=CYXEr(V?YoQI$W@sC<6WRmqhYmtV
zpySXf=qz*rx(r=|ZbEmU`_LokDf9w*4ZVXtLSLZo&@bpO6o4_9gc&#pi*N{5;RtNP
z4(!1RI3=70{s+zoXMwZBx!}BT0k|++3@!<mfy={{;Hq#9xHeo5ZU{Glo5QW(wr~fy
zGu#dC3HO2f!-L?V@CbM`JPw`+Pl2byv*5Y#0(ddJ3|<MZf!D*E;H~fucsINc{ue$B
zAA?WAXW;YjCHN|Q1HKL4gCD|A;OFow_$~Ya{tSPEf5LxYIDiI-03F~0LO>3L19~7D
zumiC`Jdh%gI*=}qA&@zcEs!&iCy+l-C{Q#|B2YR|E>JO0B~U$3D^NGkAka9_EYLF0
zCeS|6DbO|0BhWk0FEB7LBrrTMDlj%MAuu^GEif}MCon&-D6llJBCtBJF0e7MC9plP
zE3h|kAaE#fG;kttI&dy<F>obtJ#Z^<H}D|vIPfg+GVmtwKJY2<HSi<wI{+aFf+G~d
zB0M4?3ZfwfVj(W#BS|C`k`_siWJ0ncIgs2)KBOR01SyV`Ldqf)kjh9kq$W}asgE>5
znj$Tb)<`>~Bhm%wj`TwMA_I`Y$S`CiG6orsOhTq2GmzQHJY*rV1X+%(Le?T1kj=<8
zWGAu**^eAVjv&X8Q^;B50&*F-hTKH%Aor0+$W!D6@)~)Ed_=w=-;rO)UnGEHD2Xy?
z5Eao7s-h9pL><&a6KF~_4f+q75zT^TM{}Wh(E?~;v=~|vErXUvE1^}<8fa~_9@-FX
zf;LB6p>5F)XlJw=+7s=A_D2VyL(viFXmlJp5uJigM`xjP(FN#YbQ!u5U4yPiH=$e6
z9q4X!ANntP7(IrbM9-k-(M#x6^agqxy@x(TpP<jtSLj>x1Ns^LhW<qVpfHAF1V&>V
zCSWoa#&j%-*;ovVV=1uISUM~NmKn>2<;3z}`LRM+QLF@38Y_oY#HwJ`v07MNtO3>-
zYlgMN+F<RmPFPp02i6<whYiGrV8gLd*jQ`=HW{0S&BW$l^RY$PQfvjb8e4~L#I|7D
zv0d0+>;QHMJBppaPGje=i`W(HI(7@Yi#@;|W6!Xc*c<FU_6hrn{lI=>5RTwDPT?%h
z;}Wjm8gAeg?&3b4#8csE@$`5mJS(09&yDB93*tra;&>^%EM5Vxj90^J;&t%)cq6<i
z-U4rpx5GQ)UGVOBFT5{403VDG!$;y{@bUO0d@4QzpN-GM7vf9s<@hRmExrNYjBmqt
z;(PG@_(A*#ejGoApT#fWm+@=(P5cgiAAf{D#b4mB@pt$~{0sga|Aqg>0|Z8p1VaP~
zkq8kg5g|;%Av_{Mq$JW1{}368EJSu97m=4JKolm55haN-M0uhTQI)7c)F$c?4T&a1
zbD|Z|mgqoqCb|(li9SSsVh}Nu7(t9C#t{>VDa3SQ7BQDtKrAMf5i5x`#Cl>Av6a|C
z>?ZaR{}P9ZW5h|~3~`>gL|i3q5VwhY#6#i<@tk-?yd^#mpNVh8PvQ>&lPF1$G|7<y
zDU)GRC!?fI#>hCCf=o@OBQubh$!ug!G7p)bEJPM1OOU0>a%4ra3R#`3Mb;%7kd4V^
zWJ|IQ*`Dk~b|rg|y~%#$KynB<oE$}tB`1)R$!X+Fat=A4TtqG<SCFg8b>v2J3%Q-#
zMeZdJkcY^l<O%XLd5*kDULmiOx5&HX1M)HXjC@JHA>Wgq$gkuN@;3=l2!&G=#Zo*a
zQ3|C|24zt$<x@#26_u7sPi3OAQaPyHR6eR8RfH-|m7>a06{yNoHL506hpJCCqMA}I
zsMb_Fsw35f>Q42d`cea^!PGEnBsGQ_PfenxQZuO8)I4e-wS-zut)kXa8>r3HHfkre
zhuTjaq>fO>sZ-Qh>H>9{x<=ik?ojusN7Pg51@)SGM}4HeP~WLv)L$w<V>C%KbdVP5
z5UtV?+N2%YqZ4#WIt~2~osrH$XQy+~dFcXlVY(Pyk}gA+rz_D_=^Auxx*pw-ZbCPw
zThVRl4s>U_8{L!cL-(f#(L?DG^k{k<J&~S5Pp4<mbLj>2VtN_9l3qivr#I1C=^gZL
zdLR8SeV9H*pQO*w=jluIRr&^fo4!Xsq@U2w=~why`UCx${zm_#|IjdlG6X|293wC?
z6J~TK%GgYdi8Cpf)J!@i1CyD_#^hx3F!`B6Oi`u;Q<^EqRAj0!)tOpMU8Vukm}$ne
zWZE$8nNCbsrU%oT>BkIYhA_jKQOsCo0yCMJ#>`~qF!Py3%u;3rvzl4QY-F}D+nHU=
zUgiLEh&jrfU`{jVn2XF6<~nnWxyw9Y9y8CFm&_aHJ@bkA%KTt{GZ2fgI7_iC%d--z
zuo`Qy7VEM;n`Be5Y1#B_CN?XZgU!w6V+*oH*y3y{wk%tLt;|+qYqE9N`fMY%Dcgc=
z&9-AZvR&BjY%jJiJAfU`4r52MW7zTRBz7u0gPqOJV;8bZ*yZdhb}hSs-OO%dcd~of
z{p>;Z2z#78#hzs^u$S3u>`nF#d!K#8K4o99ui1C(NA?T*o&ClBWdj_>ksQMXIgtx-
zDi`5Q&fz>R!KLKVaQ|=_xh!0EE*F=VE5H@zig6{mGF*AC5?7V0!PVyKaSgd9Tyw4!
z*Ou$Rb>_NpJ-I$ye{K*rlpDc~=EiXoxhdRqZWcF}Tfi;mmT@b&HQah`6StMy!R_Yu
zasP6MxntZ(?hJRHyTo1PZg97`d)!0r3HO|P#l7V|aG$wv+)wTg2M5t05u}4$PzcJw
za8M6MgLW_$j0aN$QwP%pGXygSvjuYo^91t;3k8b?O9V>?%LOY2s|2eDYX$2D8w48%
zn+012+XUMOI|aK2djxw2`vnIEhXjWQM+L_QCj=)4rv+yQ=LF{m7X_CFR|HoF*9A8Q
zw*<EbcLnzb4+IYdj|NW!PY2HhF9xp!uLo}h?*<<P9|xZWUk2X<-v>VhzXpE<e+MBR
z;c=egS)S)5Ug0&~;4R+eeLl&j;?wf!`AmFPJ_nzh&&L<!i}1zyQhZsy0$-W0#@FQQ
z@b&pdd{e#!-<og7cjUY9-T7X8Uw!~Tm><TE<j3&i`APg#eg;3ApT{rcm+;H^Rs33h
z1HYNy#_#0!@ca3L{1N^*e~LfLU*IqE*Z7<K9sWN5h=0nz;9v9a_>cS-{yYDR|H}si
zOdthD2nwPQ5>z1~n1UmCLPAI>q!IoRG74FQ>_RRfuTVfJEEE$;3T1@yLM5T9P(!FK
z)Ds#CO@!t`E1|8>LFg=W6M71Lg#N-HVW==d7%hwwCJIx8>B1~wuCPE@EG!dN3TuS*
z!X{y>utV4_>=XVK4hzSGlfoI{yl_dlD%=om3-^SF!V}@S@Je_qd=Neh--MsS9|0Co
zkq~K-69rKg!=f%mMO%!CaWRFMT1+Qq5HpL}#GGOtF~3+yEGm`|ON-^iieeS9x>!rB
zD>e`ti_OHAVjHo&*h%av_7HoE{ltOd5OKITN*pUr5GRY%#F^q8alW`nTq>>*SBvY!
zjp7z@ySPi-D;^LJiATi~;%V`mcu~9}UKekPcf|+dWAT~zQhX!67e9$##UJ8t5t0xI
zmneyqcuA5JNs|o8l3dA`l2R%ut(0EMBxRLyNV%naQbDPRR9q@0m6a+;m8EJ@O{tDl
zUuq;Zm0C!xrFK$Bsf*NI>LvA+21tXYVbVxxj5J=FBu$lONVBDR(n4v8v|L&xt(7)N
zo26~iPHB&{Upgoqk&a8Jq_ffm>9TZ9x+&d}?n{rPr_u}Qwe(K<D1DK>OTVPQQb5LJ
zQfB0!EXpBSl_Rn#JF+Jy<dkw6`5!r>oJGzq=aTcv1?0kVF}b8%MlLT`lB>!!<l1sQ
zxuM)dZZ5Zy+sYl}&T==or`$*GFAtK3$|K~_@;G^-JVl-^&ywfL3*^P}GI^!EMqV#(
zlDEn`<lXW<`Cs|4d`vzmpOMeYm*lJR4f(cwPktyrk)O-2<hSw%`Lp~@{we>F;Sd@k
zLUf1=2_ZQY4(Xw2$PUFq@lc9T>QK5+hEV2EwouMco>2Z!p-|CKiBRcKxlqMWl~DCi
ztx(-igHYp8vrx-Wn^5~ur%=~Wk5KPWztF(YkkIhZsL<HZgwW*Bw9w4ZoY4HxqR`UN
ziqPuNy3oeZmeBUluF&4lfzYAQ(a?#|>Cm~*#n6?|_0X-*-Oz*3<IuCv%g~$9`_QM*
z*U*p9?+~OQ3a(HJtMH1XD2k>Silw-UuOyXJN?Ikol1a&`<WO=e`ILf65v90NN-3*U
zP%10cl$uH%rM}WgX{xkPS}X08j!GA$yV6VPs|-*EE5nqL${1z5GD(@L%ur@4^OS|k
z5@orvN?EIHP&O;ul%2{RWxsMzIieg_PAO-V3(95XnsQUQquf^>DNmIb%4_AF@=^Js
zd{=%cf0aNO3zK0c91M%$P*@E|!e-bBd*MVlWjIavpK!)-mT>lPu5jLPfpFn)v2e+7
znQ-}VrEt}7jd1O7y>P>DlW_BJt8m+Jhj8a`w{XvJpK$;1pzzS}i16s}xbVdAl<@TM
ztnl3Mg7D(-vhd3An(+GYrtsGAj_~gAzVN@{!{KA$li@Sr^WjV3tKl2r+u?iRhv6sT
z=iyi3x8V=r&*5+3pW#1YSVdJrrBzN9R9OwHx*An&HKxYZ6l!WUoti<-tY%Yls(IA>
zY9Y0#T0$+YmQyRLRn+QgEw!%NKy9oxQ(LNS)b?s8wX51g?XC7x2dYEV;p!-LtU5uR
ztWHyBs&mx&>LPWix<Xy8u2VOvTh#69E_JVZKs}@$RZpm=)pP1a^@@63y`|n&AE=Mj
zXX;D!jrv~wq<&R@sJ~T6Lo{5YG*;s^NmDdUGc-$cHD60=skF3OdM%TdRm-8}*79iu
zwIW(^t&~<)tDsfZs%bT~I$C|Lk=9gep|#f9X&tpLT6e9N)>j*#4c3NfBegNwcx{q4
zRhyyB*5+vowI$keZI!lG+n{aMwrM-HJ=%Wlpmsz%uAS1(Y8SN2+BNN_c1OFfJ<^_P
zFSOU%JME+PMf<M((*9}z9n(pj(Sy3Ehjdks=%()Io}SQC>S^?U^o)8IJ-ePu&#M>E
z3+u)7l6o1vyk1GKs@KqK>-F@8dK105-b!z)chEcQ-SnP%AHBamNFS<?&`0az^ojZu
zeY!qNpQ|s>7wgOPmHHZey}n7`s_)Qu>-+S7^~3rx{iJ?IKd)cXuj)7S+xk8Iq5edF
zuD{aX>L2va`ZxWj{zr!+XoQH+5iTM`<VZN8N1_os5{twmDI%#O=^_~-nIqXEIU{)@
z`6GoQMI$95r6c7c6(dz5)g!ecbt4TTjU&w>EhB9r?IWEcT_Zgry(9f110zEs!y}_2
zV<QtHlOxk2Gb3{%^COEQOCu{Ht0U_o8zWmH+atRodm{%ThayKKCnBdK=OPy)S0dLV
zw<32V4<e5v&mu1)ZzAs_pCVr)KO(;)kbxMuK^d&U8<L?Inqe50;Tpb?G*TI9jr2w)
zBdd|a$Zg~^3K~U>;zlW>tWm+JY*aI98g-2NMkAxC(ZXnLv@<#yU5xHVFQcz9z!+={
zGe#O?jPb@KW2!O3m~G5678*;8<;E&wt+Bz_Y-}@j8hecW#zEtVaojj%oHZ^OmyK)2
zP2-Mn-*{v^HC`C6jd#XJ<BRd#_+|Vx0#PhVMww_ZDn>(5H5!SUQ77s}6Va5>G|_*e
z8KYUE*`v9ld7}lQg`>ryC8K4c<)f9NRiibcwWIZ-4Wmt>&7-ZNZKEBcoul2NJ)?c1
z{iB1TL!%?2qod=Z6Qfh2)1$MZbE6BQi=)e;E2C?o>!X{ZTcbOoyQBM}|3(i-k3~;L
z&qU8hFGa6LZ$xiL??oR*pG2QWUq#<WKSV!AzeRsW|3qOEH3^e8Ia4rYGi>T+)U?f*
z88=gysm*j|1~apn&CF@$G4q>+%%Wxqv$R>xtY}s-tDCjVx@H5jvDwUQX|^%jo1M(A
zW)HKs+0Ptk4l#$Dqs+191aq=E&75h@G3T3$%%$cEbG5n7+-Pnwx0}1nz2*V)ka^TR
zVV*Y6nHS9~=5_OydDnbkJ~p43FU>dRd-Id|)%;=pHX#eKaEr27i?<|8u{6uDEX%cg
zD`};&(pu@QOjcGahn3sPXBD)HSjDYUR#~fpRoSX$)wJqZ^{qx$Q>%s5+G=NYw7OW`
ztzK4NYk)P_8fJ~O##rO6N!C<rhBe!oXDzgrSj(+d)>><Wwb|Nc?X>n-`>li45$m{h
z$~tRZur6EItee&y>%R5KdTPC}UR&?1kJcCKyY<WZYXxk~CT+$J+M*q@RXbvvwqtvC
z!cJ+YvH!6%+F9)Eb}l=wUBE7E7qd&+W$f~HCA+F!!>(=Dvm4q??B;eWyRF^9?re9n
zd)j^M{`Meys6E0SZI81j+EeW5_AGm@y}({<FSA$LYwY#*CVQ*B!`^N0v;VaZ+sEva
z_8I%UeaXIR->`4n_w0xE6Z^US%6@Bqus_@1?4R}@8+K5KaA=2f1V?tlj_yPq+le`G
zCxw&RN#|s6GCSFvoK7Amzf;I5>XdLwJLQ~;P8FxRQ_HFAG;kU_&777_8>hX~$?59!
zaC$raoPo{|XSg%U8S6}NCOgxdna&(%zO%?#>a1{9JL{Z{&K768v&-4*9B>XfN1YSS
zY3H1C(YfMWcWya%od?ci=b7`;dE>lyJ~>~VAI@(FauFAIDVKG5S8^3sa}C#WUDtP$
zZYnpeo8HajW_5G8x!rtjLAQuo+%4snbt|})-D+-4w~kxiZR9p}Tez*=c5X+vi`(7p
z<@R+4xP#qc?nrlxJKmk-PIYIvv)y^_LU)O~++F3abvL-1-EHnpcaOW@J?I{BkGrSb
zv+f1=vU|<F>E3bgyN}$b?hE&|`_BF7esRCMzudoWAcn=r7!wP|#8@b%#v(B@=ES^M
zB9=0iCiYJ(V=PN7dn{KhZ>&J9aI9FYWUNfAe5_KeYOF@AcC22kVXR54d8}2eZLCAA
zbF5peXRJ@Ge{4`}Xlz7mbZlH~Vr)umdTdr~Zfrqpaco&^Wo%7seQZ-~YivhscWhtm
z-`L^UvDnGjnb`T*rP$Tjjo9tjz1YLpli2gvtJvGvhuG)Xx7g3vpBU_+9^ug*=Lw$d
zg+1MidbSty;$8|bwU^Gz;AQr*c{#m2UVg8TSJW%vmG;Vc6}>86b+49J*K6Q4_L_Mu
zy*6HZuano+>*4kG`gsGrA>MFrlsDF!;7#_Xc{9B^-h6M7x71tVt@hS=8@(;wc5j!r
z*E`@H@{W2Zywlz}@1l3byYAic?s^Zr$KEsVrT4~r?|t&VdOy719^@lF?o&SN^S<OO
zzUCXg<-5M`C;e1@T0gy?$<ONN@N@h5{DOWFzqnt@FY8zEEBn>_ntmO>zTe1i>bLM)
z`|bRWeiy&H-^=go5AX;3!~Bu{7=OG!$)D=a@Mrt;{DuA!f4RTPU+Zu1H~ZWCo&Fww
zzkkp_;ve@<`DgtL{$>A~f78F?-}fK+PyHAEYyX}9(f{Iq_ka0+{XiUxlW`^<jEnJ5
zT#ZNKX55K;@kBgjJWc$cc*b~^c=mX%c;0w{c;R@lc*%H~c=>pxc-44~c<p$-c*A&;
zc=LFxc-we~c;|Sxc+YsBc>nmI_|W)>_~`h!_{8{>`1JU!_}ut{_~Q7o_{#X2`1<&!
z_}2K2`0n_=_`mVP@ni9m@iX!B@k{Zm@f-2m@q6)y@h9=;@mKM;@elFO@o({;@jr1m
zfhLFqo!}BeLQaGedLo*z6R|`*ks^^gkuH%TkvWkqku#Aekv~x=Q8ZB^Q94mBQ87^^
zQ9V&BQ8&>b(Kyj8(K68{(LT{B(KXQ{(L2#EF)%SCF+4FUF*Y$FF*z|UF*7kIF+Z^=
zu{5zFu{yCXu`#hFu|2UXu{UubaVT*#aUyX#aV~K&aV2p*aVv2*@gVUy@htH&@h0&;
z@hR~&@gwm&0VR<no}`j&l21xWC8;Hiq?L4&elnR%l}wvVpUjlZn#_^Roy?akm@JYk
zo-CCto2-zmoUE3tnXHqnpKO$DnrxA5ootuvnCz15p6r$En;eiFoE(-MnH-ZGpPZDO
znw*iGot&3km|T)vo?Mk&o7|AxoZOb&ncS0vzyJXt00e*l5C8%|00;m9AOHk_01yBI
zKmZ5;0U!VbfWZHHfxlpY01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2mk>f
z00e*l5C8%|00;m9AOHk_01yBIKmZ5;0U!VbfB+Bx0zd!=00AKI|3Tn?{ih3XK_CDG
qfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00{gq6Zk)YaHA&x

literal 0
HcmV?d00001

diff --git a/examples/roaring_bitmap/host_bulk_example.cu b/examples/roaring_bitmap/host_bulk_example.cu
new file mode 100644
index 000000000..85870f74b
--- /dev/null
+++ b/examples/roaring_bitmap/host_bulk_example.cu
@@ -0,0 +1,75 @@
+#include <cuco/detail/error.hpp>
+#include <cuco/roaring_bitmap.cuh>
+
+#include <cuda/std/span>
+#include <thrust/logical.h>
+#include <thrust/universal_vector.h>
+
+#include <cuda_runtime.h>
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+// Loads the serialized Roaring bitmap named on the command line and probes it on the device
+int main(int argc, char* argv[])
+{
+  if (argc != 2) {
+    std::cerr << "Usage: " << argv[0] << " <bitmap_file_path>" << std::endl;
+    return -1;
+  }
+
+  // Open the file positioned at its end so that tellg() reports the file size
+  std::ifstream file(argv[1], std::ios::binary | std::ios::ate);
+  if (!file.is_open()) {
+    std::cerr << "Failed to open " << argv[1] << std::endl;
+    return -1;
+  }
+  std::streamsize const file_size = file.tellg();
+  if (file_size <= 0) {
+    std::cerr << "Failed to determine size of " << argv[1] << std::endl;
+    return -1;
+  }
+  file.seekg(0, std::ios::beg);
+
+  // Allocate pinned host memory using cudaMallocHost
+  char* buffer;
+  CUCO_CUDA_TRY(cudaMallocHost(&buffer, file_size));
+
+  // Read file into memory; a short read would leave part of the buffer uninitialized
+  if (!file.read(buffer, file_size)) {
+    std::cerr << "Failed to read " << argv[1] << std::endl;
+    CUCO_CUDA_TRY(cudaFreeHost(buffer));
+    return -1;
+  }
+  file.close();
+
+  cuda::std::span<cuda::std::byte const> bitmap(reinterpret_cast<cuda::std::byte const*>(buffer),
+                                                file_size);
+  cuco::roaring_bitmap<cuda::std::uint32_t> roaring_bitmap(bitmap);
+
+  // Probe keys; the checks below expect the bitmap to contain every one of them
+  std::vector<cuda::std::uint32_t> keys;
+  for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) { keys.push_back(k); }
+  for (cuda::std::uint32_t k = 100000; k < 200000; ++k) { keys.push_back(3 * k); }
+  for (cuda::std::uint32_t k = 700000; k < 800000; ++k) { keys.push_back(k); }
+
+  thrust::universal_vector<cuda::std::uint32_t> keys_d(keys.begin(), keys.end());
+  thrust::universal_vector<bool> contained(keys.size(), false);
+
+  roaring_bitmap.contains(keys_d.begin(), keys_d.end(), contained.begin());
+
+  // Report each missing key and remember whether any lookup failed
+  bool all_contained = true;
+  for (size_t i = 0; i < keys.size(); i++) {
+    if (not contained[i]) {
+      std::cout << "Error: " << keys[i] << " is not contained" << std::endl;
+      all_contained = false;
+    }
+  }
+  std::cout << "all_contained: " << all_contained << std::endl;
+
+  // Free the allocated memory
+  CUCO_CUDA_TRY(cudaFreeHost(buffer));
+  return all_contained ? 0 : -1;
+}
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
new file mode 100644
index 000000000..3a17a82d0
--- /dev/null
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuco/detail/error.hpp>
+#include <cuco/detail/storage/storage_base.cuh>
+#include <cuco/utility/cuda_thread_scope.cuh>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/span>
+#include <cuda/stream_ref>
+
+namespace cuco {
+
+template <class T, cuda::thread_scope Scope, class Allocator>
+__host__ roaring_bitmap<T, Scope, Allocator>::roaring_bitmap(
+  cuda::std::span<cuda::std::byte const> compressed_bitmap,
+  cuda_thread_scope<Scope> scope,
+  Allocator const& alloc,
+  cuda::stream_ref stream)
+  : allocator_{alloc},
+    data_{allocator_.allocate(compressed_bitmap.size()),
+          detail::custom_deleter<cuda::std::size_t, allocator_type>{compressed_bitmap.size(),
+                                                                    allocator_}},
+    ref_{compressed_bitmap,
+         cuda::std::span<cuda::std::byte const>(data_.get(), compressed_bitmap.size()),
+         scope}  // TODO move after memcpy?
+{
+  CUCO_CUDA_TRY(cudaMemcpyAsync(data_.get(),
+                                compressed_bitmap.data(),
+                                compressed_bitmap.size(),
+                                cudaMemcpyHostToDevice,
+                                stream.get()));
+  stream.wait();  // TODO check if this is necessary
+}
+
+template <class T, cuda::thread_scope Scope, class Allocator>
+template <class InputIt, class OutputIt>
+__host__ void roaring_bitmap<T, Scope, Allocator>::contains(InputIt first,
+                                                            InputIt last,
+                                                            OutputIt output,
+                                                            cuda::stream_ref stream) const
+{
+  ref_.contains(first, last, output, stream);
+}
+
+template <class T, cuda::thread_scope Scope, class Allocator>
+template <class InputIt, class OutputIt>
+__host__ void roaring_bitmap<T, Scope, Allocator>::contains_async(
+  InputIt first, InputIt last, OutputIt output, cuda::stream_ref stream) const noexcept
+{
+  ref_.contains_async(first, last, output, stream);
+}
+
+template <class T, cuda::thread_scope Scope, class Allocator>
+__host__ cuda::std::size_t roaring_bitmap<T, Scope, Allocator>::size() const noexcept
+{
+  return ref_.size();
+}
+
+template <class T, cuda::thread_scope Scope, class Allocator>
+__host__ cuda::std::span<cuda::std::byte const> roaring_bitmap<T, Scope, Allocator>::data()
+  const noexcept
+{
+  return ref_.data();
+}
+
+template <class T, cuda::thread_scope Scope, class Allocator>
+__host__ typename roaring_bitmap<T, Scope, Allocator>::allocator_type
+roaring_bitmap<T, Scope, Allocator>::allocator() const noexcept
+{
+  return allocator_;
+}
+
+template <class T, cuda::thread_scope Scope, class Allocator>
+__host__ typename roaring_bitmap<T, Scope, Allocator>::ref_type<>
+roaring_bitmap<T, Scope, Allocator>::ref() const noexcept
+{
+  return ref_;
+}
+}  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
new file mode 100644
index 000000000..248428b69
--- /dev/null
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
@@ -0,0 +1,279 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuco/detail/error.hpp>
+#include <cuco/utility/cuda_thread_scope.cuh>
+#include <cuco/utility/traits.hpp>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/cstdint>
+#include <cuda/std/functional>
+#include <cuda/std/iterator>
+#include <cuda/std/span>
+#include <cuda/stream_ref>
+#include <thrust/execution_policy.h>
+#include <thrust/fill.h>
+#include <thrust/transform.h>
+
+namespace cuco::detail {
+
+// primary template
+template <class T, cuda::thread_scope Scope>
+class roaring_bitmap_impl {
+  static_assert(cuco::dependent_false<T>, "T must be either uint32_t or uint64_t");
+};
+
+template <cuda::thread_scope Scope>
+class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
+  // Constants from the Roaring format spec
+  static constexpr cuda::std::uint32_t serial_cookie_no_runcontainer = 12346;
+  static constexpr cuda::std::uint32_t serial_cookie                 = 12347;
+  static constexpr cuda::std::uint32_t frozen_cookie                 = 13766;
+  static constexpr cuda::std::int32_t no_offset_threshold            = 4;
+
+ public:
+  static constexpr auto thread_scope = Scope;
+
+  __host__ roaring_bitmap_impl(cuda::std::span<cuda::std::byte const> compressed_bitmap_h,
+                               cuda::std::span<cuda::std::byte const> compressed_bitmap_d,
+                               cuda_thread_scope<Scope> /* scope */)
+    : data_{compressed_bitmap_d}
+  {
+    bool success = this->read_header(compressed_bitmap_h);
+    CUCO_EXPECTS(success, "Failed to read compressed bitmap");
+  }
+
+  __device__ roaring_bitmap_impl(cuda::std::span<cuda::std::byte const> compressed_bitmap,
+                                 cuda_thread_scope<Scope> /* scope */)
+    : data_{compressed_bitmap}
+  {
+    this->read_header(compressed_bitmap);  // TODO error handling
+  }
+
+  template <class InputIt, class OutputIt>
+  __host__ void contains(InputIt first,
+                         InputIt last,
+                         OutputIt contained,
+                         cuda::stream_ref stream = {}) const
+  {
+    this->contains_async(first, last, contained, stream);
+    stream.wait();
+  }
+
+  template <class InputIt, class OutputIt>
+  __host__ void contains_async(InputIt first,
+                               InputIt last,
+                               OutputIt contained,
+                               cuda::stream_ref stream = {}) const noexcept
+  {
+    auto nosync_exec_policy = thrust::cuda::par_nosync.on(stream.get());
+    if (this->empty()) {
+      thrust::fill(
+        nosync_exec_policy, contained, contained + cuda::std::distance(first, last), false);
+    } else {
+      thrust::transform(nosync_exec_policy,
+                        first,
+                        last,
+                        contained,
+                        cuda::proclaim_return_type<bool>(
+                          [*this] __device__(auto key) { return this->contains(key); }));
+    }
+  }
+
+  __device__ bool contains(cuda::std::uint32_t value) const
+  {
+    // the high 16 bits select a container, the low 16 bits are looked up inside it
+    auto const upper = static_cast<cuda::std::uint16_t>(value >> 16);
+    auto const lower = static_cast<cuda::std::uint16_t>(value & 0xFFFF);
+
+    // TODO binary search on key_cards_
+    for (cuda::std::int32_t i = 0; i < num_containers_; i++) {
+      if (key_cards_[i * 2] != upper) { continue; }
+
+      // key matched: the stored 16-bit value plus one is the container's cardinality
+      cuda::std::uint32_t const card = key_cards_[i * 2 + 1] + 1;
+      auto const* container =
+        reinterpret_cast<cuda::std::uint16_t const*>(data_.data() + this->container_offset(i));
+
+      if (this->is_run_container(i)) {
+        return this->contains_run_container(container, lower, card);
+      }
+      // TODO check if this is correct
+      if (card <= 4096) { return this->contains_array_container(container, lower, card); }
+      return this->contains_bitset_container(container, lower, card);
+    }
+    return false;
+  }
+
+  [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept { return size_; }
+
+  [[nodiscard]] __host__ __device__ bool empty() const noexcept { return size_ == 0; }
+
+  [[nodiscard]] __host__ __device__ cuda::std::span<cuda::std::byte const> data() const noexcept
+  {
+    return data_;
+  }
+
+ private:
+  __device__ bool is_run_container(cuda::std::int32_t i) const
+  {
+    // flag for container `i` is bit `i % 8` of byte `i / 8`; never consulted without `has_run_`
+    return has_run_ and (run_container_bitmap_[i / 8] & (1 << (i % 8))) != 0;
+  }
+
+  __device__ bool contains_array_container(cuda::std::uint16_t const* container,
+                                           cuda::std::uint16_t lower,
+                                           cuda::std::uint32_t card) const
+  {
+    // linear scan for now; TODO binary search on container, linear only if (card < 256)
+    auto const* const last = container + card;
+    for (auto const* it = container; it != last; ++it) {
+      if (*it == lower) { return true; }
+    }
+    return false;
+  }
+
+  __device__ bool contains_bitset_container(cuda::std::uint16_t const* container,
+                                            cuda::std::uint16_t lower,
+                                            cuda::std::uint32_t card) const
+  {
+    // test bit `lower % 16` of the 16-bit word at index `lower / 16`
+    return ((container[lower >> 4] >> (lower & 15)) & 1) != 0;
+  }
+
+  __device__ bool contains_run_container(cuda::std::uint16_t const* container,
+                                         cuda::std::uint16_t lower,
+                                         cuda::std::uint32_t card) const
+  {
+    // TODO implement
+    return false;
+  }
+
+  __device__ cuda::std::uint32_t container_offset(cuda::std::int32_t i) const
+  {
+    // entry `i` of the 32-bit offset table, copied out of the byte buffer
+    cuda::std::uint32_t result;
+    cuda::std::memcpy(&result, offsets_ + i * sizeof(result), sizeof(result));
+    return result;
+  }
+
+  /**
+   * @brief Parses the header of a serialized Roaring bitmap and caches pointers to its sections
+   *
+   * On success the layout members (`num_containers_`, `has_run_`, `run_container_bitmap_`,
+   * `key_cards_`, `offsets_`) describe `compressed_bitmap` and `size_` holds the sum of all
+   * container cardinalities.
+   *
+   * @note The cached pointers alias `compressed_bitmap`, so they can only be dereferenced where
+   * that buffer is accessible. `offsets_` and `run_container_bitmap_` are null if the
+   * corresponding section is absent.
+   *
+   * @param compressed_bitmap Bytes of the serialized bitmap
+   *
+   * @return `false` if the cookie is unknown, the container count is outside [0, 65536] or the
+   * buffer is too short for the header sections, `true` otherwise
+   */
+  __host__ __device__ bool read_header(cuda::std::span<cuda::std::byte const> compressed_bitmap)
+  {
+    cuda::std::size_t const length = compressed_bitmap.size();
+    cuda::std::byte const* buf     = compressed_bitmap.data();
+    cuda::std::size_t readbytes    = 0;
+
+    // Optional sections stay null when absent instead of holding an indeterminate pointer
+    run_container_bitmap_ = nullptr;
+    offsets_              = nullptr;
+
+    // cookie: `serial_cookie` in the low 16 bits (run containers may be present) or exactly
+    // `serial_cookie_no_runcontainer`
+    if (length < sizeof(cuda::std::uint32_t)) { return false; }
+    cuda::std::uint32_t cookie;
+    cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t));
+    readbytes += sizeof(cuda::std::uint32_t);
+    buf += sizeof(cuda::std::uint32_t);
+
+    has_run_ = (cookie & 0xFFFF) == serial_cookie;
+    if (not has_run_ and cookie != serial_cookie_no_runcontainer) { return false; }
+
+    // num_containers: the cookie's high 16 bits plus one when `has_run_`, otherwise a separate
+    // 32-bit field
+    if (has_run_) {
+      num_containers_ = (cookie >> 16) + 1;
+    } else {
+      readbytes += sizeof(cuda::std::uint32_t);
+      if (readbytes > length) { return false; }
+      cuda::std::memcpy(&num_containers_, buf, sizeof(cuda::std::uint32_t));
+      buf += sizeof(cuda::std::uint32_t);
+    }
+    if (num_containers_ < 0 or num_containers_ > (1 << 16)) { return false; }
+    auto const num_containers = static_cast<cuda::std::size_t>(num_containers_);
+
+    // one flag bit per container, tested by `is_run_container()`
+    if (has_run_) {
+      cuda::std::size_t const flag_bytes = (num_containers + 7) / 8;
+      readbytes += flag_bytes;
+      if (readbytes > length) { return false; }
+      run_container_bitmap_ = reinterpret_cast<cuda::std::uint8_t const*>(buf);
+      buf += flag_bytes;
+    }
+
+    // interleaved 16-bit (key, cardinality - 1) pairs, one pair per container
+    cuda::std::size_t const key_card_bytes = num_containers * 2 * sizeof(cuda::std::uint16_t);
+    key_cards_                             = reinterpret_cast<cuda::std::uint16_t const*>(buf);
+    readbytes += key_card_bytes;
+    if (readbytes > length) { return false; }
+    buf += key_card_bytes;
+
+    // 32-bit byte offset per container, read by `container_offset()`; this section is absent
+    // (and `offsets_` stays null) when `has_run_` is set and there are fewer than
+    // `no_offset_threshold` containers
+    if ((not has_run_) or (num_containers_ >= no_offset_threshold)) {
+      cuda::std::size_t const offset_bytes = num_containers * sizeof(cuda::std::uint32_t);
+      readbytes += offset_bytes;
+      if (readbytes > length) { return false; }
+      offsets_ = buf;
+      buf += offset_bytes;
+    }
+
+    // No minimum payload size is enforced beyond this point: a container may be as small as
+    // two bytes (an array holding a single 16-bit value), so demanding another four bytes per
+    // container here would reject valid bitmaps.
+
+    // total cardinality; the stored 16-bit value is the container's cardinality minus one
+    size_ = 0;
+    for (cuda::std::size_t i = 0; i < num_containers; i++) {
+      size_ += static_cast<cuda::std::size_t>(key_cards_[i * 2 + 1]) + 1;
+    }
+
+    return true;
+  }
+
+  cuda::std::span<cuda::std::byte const> data_;
+  cuda::std::size_t size_;
+  cuda::std::int32_t num_containers_;
+  cuda::std::uint8_t const* run_container_bitmap_;
+  cuda::std::uint16_t const* key_cards_;
+  cuda::std::byte const* offsets_;
+  bool has_run_;
+};
+
+template <cuda::thread_scope Scope>
+class roaring_bitmap_impl<cuda::std::uint64_t, Scope> {
+  using bucket_type = roaring_bitmap_impl<cuda::std::uint32_t, Scope>;
+  // TODO implement
+};
+
+}  // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
new file mode 100644
index 000000000..b66ea9e31
--- /dev/null
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuco/utility/cuda_thread_scope.cuh>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/span>
+#include <cuda/stream_ref>
+
+namespace cuco {
+
+template <class T, cuda::thread_scope Scope>
+__host__ roaring_bitmap_ref<T, Scope>::roaring_bitmap_ref(
+  cuda::std::span<cuda::std::byte const> compressed_bitmap_h,  // host-resident copy (_h)
+  cuda::std::span<cuda::std::byte const> compressed_bitmap_d,  // device-resident copy (_d)
+  cuda_thread_scope<Scope> scope)
+  : impl_{compressed_bitmap_h, compressed_bitmap_d, scope}  // host ctor only forwards to impl_
+{
+}
+
+template <class T, cuda::thread_scope Scope>
+__device__ roaring_bitmap_ref<T, Scope>::roaring_bitmap_ref(
+  cuda::std::span<cuda::std::byte const> compressed_bitmap, cuda_thread_scope<Scope> scope)
+  : impl_{compressed_bitmap, scope}  // device ctor: a single span is handed straight to impl_
+{
+}
+
+template <class T, cuda::thread_scope Scope>
+template <class InputIt, class OutputIt>
+__host__ void roaring_bitmap_ref<T, Scope>::contains(InputIt first,
+                                                     InputIt last,
+                                                     OutputIt output,
+                                                     cuda::stream_ref stream) const
+{
+  impl_.contains(first, last, output, stream);  // host bulk query: forwarded unchanged to impl_
+}
+
+template <class T, cuda::thread_scope Scope>
+template <class InputIt, class OutputIt>
+__host__ void roaring_bitmap_ref<T, Scope>::contains_async(InputIt first,
+                                                           InputIt last,
+                                                           OutputIt output,
+                                                           cuda::stream_ref stream) const noexcept
+{
+  impl_.contains_async(first, last, output, stream);  // forwarded unchanged to impl_
+}
+
+template <class T, cuda::thread_scope Scope>
+__device__ bool roaring_bitmap_ref<T, Scope>::contains(T value) const
+{
+  return impl_.contains(value);  // device-side single-value query, answered by impl_
+}
+
+template <class T, cuda::thread_scope Scope>
+__host__ __device__ cuda::std::size_t roaring_bitmap_ref<T, Scope>::size() const noexcept
+{
+  return impl_.size();  // whatever impl_ reports; the ref keeps no count of its own
+}
+
+template <class T, cuda::thread_scope Scope>
+__host__ __device__ cuda::std::span<cuda::std::byte const> roaring_bitmap_ref<T, Scope>::data()
+  const noexcept
+{
+  return impl_.data();  // the byte span is obtained from impl_
+}
+}  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh
new file mode 100644
index 000000000..b850431a7
--- /dev/null
+++ b/include/cuco/roaring_bitmap.cuh
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuco/detail/storage/storage_base.cuh>
+#include <cuco/roaring_bitmap_ref.cuh>
+#include <cuco/utility/allocator.hpp>
+#include <cuco/utility/cuda_thread_scope.cuh>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/span>
+#include <cuda/stream_ref>
+
+#include <memory>
+
+namespace cuco {
+
+template <class T,
+          cuda::thread_scope Scope = cuda::thread_scope_device,
+          class Allocator          = cuco::cuda_allocator<cuda::std::byte>>
+class roaring_bitmap {  // owns a buffer (`data_`) and keeps a `ref_type<>` (`ref_`) next to it
+ public:
+  static constexpr auto thread_scope = Scope;
+
+  using allocator_type = Allocator;  // allocator type backing `data_`
+
+  template <cuda::thread_scope NewScope = thread_scope>
+  using ref_type = roaring_bitmap_ref<T, NewScope>;  // non-owning counterpart of this class
+  // Constructs the container from the serialized bytes in `compressed_bitmap`.
+  __host__ roaring_bitmap(cuda::std::span<const cuda::std::byte> compressed_bitmap,
+                          cuda_thread_scope<Scope> scope = {},
+                          Allocator const& alloc         = {},
+                          cuda::stream_ref stream        = {});
+  // Move-only: `data_` is a std::unique_ptr, so defaulted copies would be deleted anyway.
+  roaring_bitmap(roaring_bitmap const& other)            = delete;
+  roaring_bitmap(roaring_bitmap&& other)                 = default;
+  roaring_bitmap& operator=(roaring_bitmap const& other) = delete;
+  roaring_bitmap& operator=(roaring_bitmap&& other)      = default;
+
+  ~roaring_bitmap() = default;
+  // Bulk query over the keys in [first, last); results are written through `contained`.
+  template <class InputIt, class OutputIt>
+  __host__ void contains(InputIt first,
+                         InputIt last,
+                         OutputIt contained,
+                         cuda::stream_ref stream = {}) const;
+  // Same arguments as `contains`, declared noexcept; presumably non-blocking - TODO confirm.
+  template <class InputIt, class OutputIt>
+  __host__ void contains_async(InputIt first,
+                               InputIt last,
+                               OutputIt contained,
+                               cuda::stream_ref stream = {}) const noexcept;
+
+  // TODO contains_if, contains_if_async, empty
+
+  [[nodiscard]] __host__ cuda::std::size_t size() const noexcept;
+
+  [[nodiscard]] __host__ cuda::std::span<cuda::std::byte const> data() const noexcept;
+
+  [[nodiscard]] __host__ allocator_type allocator() const noexcept;
+
+  [[nodiscard]] __host__ ref_type<> ref() const noexcept;
+
+ private:
+  allocator_type allocator_;  // declared before `data_`, whose deleter is built from it
+  std::unique_ptr<cuda::std::byte, detail::custom_deleter<cuda::std::size_t, allocator_type>> data_;
+  ref_type<> ref_;  // non-owning ref object stored alongside the owned buffer
+};
+
+}  // namespace cuco
+
+#include <cuco/detail/roaring_bitmap/roaring_bitmap.inl>
\ No newline at end of file
diff --git a/include/cuco/roaring_bitmap_ref.cuh b/include/cuco/roaring_bitmap_ref.cuh
new file mode 100644
index 000000000..a26474cd9
--- /dev/null
+++ b/include/cuco/roaring_bitmap_ref.cuh
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh>
+#include <cuco/utility/cuda_thread_scope.cuh>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/span>
+#include <cuda/stream_ref>
+
+namespace cuco {
+
+template <class T, cuda::thread_scope Scope = cuda::thread_scope_device>
+class roaring_bitmap_ref {  // thin facade: the member definitions all forward to `impl_`
+  using impl_type = detail::roaring_bitmap_impl<T, Scope>;
+
+ public:
+  static constexpr auto thread_scope = impl_type::thread_scope;
+
+  // Two spans: one span alone cannot tell whether its bytes reside in host or device memory.
+  __host__ roaring_bitmap_ref(cuda::std::span<cuda::std::byte const> compressed_bitmap_h,
+                              cuda::std::span<cuda::std::byte const> compressed_bitmap_d,
+                              cuda_thread_scope<Scope> scope = {});
+
+  __device__ roaring_bitmap_ref(cuda::std::span<cuda::std::byte const> compressed_bitmap,
+                                cuda_thread_scope<Scope> scope = {});
+
+  template <class InputIt, class OutputIt>  // host bulk query over [first, last)
+  __host__ void contains(InputIt first,
+                         InputIt last,
+                         OutputIt contained,
+                         cuda::stream_ref stream = {}) const;
+
+  template <class InputIt, class OutputIt>  // same arguments as `contains`, declared noexcept
+  __host__ void contains_async(InputIt first,
+                               InputIt last,
+                               OutputIt contained,
+                               cuda::stream_ref stream = {}) const noexcept;
+
+  __device__ bool contains(T value) const;  // device-side query for a single value
+
+  [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept;
+
+  [[nodiscard]] __host__ __device__ cuda::std::span<cuda::std::byte const> data() const noexcept;
+
+ private:
+  impl_type impl_;  // implementation object that every call is delegated to
+};
+
+}  // namespace cuco
+
+#include <cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl>
\ No newline at end of file

From 7ff8399905de735b0e9219d89a1a9a5294c8e5e4 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 4 Jul 2025 17:10:42 -0700
Subject: [PATCH 02/24] Preliminary benchmark

---
 benchmarks/CMakeLists.txt                   |  5 ++
 benchmarks/roaring_bitmap/contains_bench.cu | 94 +++++++++++++++++++++
 examples/CMakeLists.txt                     | 28 +++---
 3 files changed, 113 insertions(+), 14 deletions(-)
 create mode 100644 benchmarks/roaring_bitmap/contains_bench.cu

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 17b5b21c1..ebab3e888 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -100,3 +100,8 @@ ConfigureBench(HYPERLOGLOG_BENCH
 ConfigureBench(BLOOM_FILTER_BENCH
   bloom_filter/add_bench.cu
   bloom_filter/contains_bench.cu)
+
+###################################################################################################
+# - roaring_bitmap benchmarks ---------------------------------------------------------------------
+ConfigureBench(ROARING_BITMAP_BENCH
+  roaring_bitmap/contains_bench.cu)
\ No newline at end of file
diff --git a/benchmarks/roaring_bitmap/contains_bench.cu b/benchmarks/roaring_bitmap/contains_bench.cu
new file mode 100644
index 000000000..f8a0fdcfd
--- /dev/null
+++ b/benchmarks/roaring_bitmap/contains_bench.cu
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <benchmark_defaults.hpp>
+#include <benchmark_utils.hpp>
+
+#include <cuco/roaring_bitmap.cuh>
+
+#include <nvbench/nvbench.cuh>
+
+#include <thrust/device_vector.h>
+
+#include <filesystem>
+#include <fstream>
+#include <iostream>
+#include <vector>
+
+// Benchmarks bulk `contains_async` queries on a roaring bitmap deserialized from a file on disk.
+void roaring_bitmap_contains(nvbench::state& state)
+{
+  namespace fs = std::filesystem;
+
+  // Locate the serialized bitmap relative to the directory of this source file
+  fs::path source_dir = fs::path{__FILE__}.parent_path();
+
+  fs::path path      = source_dir / "../../examples/roaring_bitmap/bitmapwithoutruns.bin";
+  fs::path full_path = path.lexically_normal();
+
+  // Open file; a skipped state must not execute the rest of the benchmark, hence the return
+  std::ifstream file(full_path, std::ios::binary);
+  if (!file.is_open()) { state.skip("Failed to open bitmap file"); return; }
+
+  // Get file size
+  file.seekg(0, std::ios::end);
+  std::streamsize file_size = file.tellg();
+  file.seekg(0, std::ios::beg);
+
+  // Allocate pinned host memory using cudaMallocHost
+  char* buffer;
+  CUCO_CUDA_TRY(cudaMallocHost(&buffer, file_size));
+
+  // Read file into memory
+  file.read(buffer, file_size);
+  file.close();
+
+  cuda::std::span<cuda::std::byte const> bitmap(reinterpret_cast<cuda::std::byte const*>(buffer),
+                                                file_size);
+  cuco::roaring_bitmap<cuda::std::uint32_t> roaring_bitmap(bitmap);
+
+  std::vector<cuda::std::uint32_t> keys;
+  for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) {
+    keys.push_back(k);
+  }
+  for (cuda::std::uint32_t k = 100000; k < 200000; ++k) {
+    keys.push_back(3 * k);
+  }
+  for (cuda::std::uint32_t k = 700000; k < 800000; ++k) {
+    keys.push_back(k);
+  }
+
+  // repeat the keys 2^13 times; appended by index since self-range vector::insert is undefined
+  std::size_t const base = keys.size();
+  keys.reserve(base << 13);  // no reallocation below, so keys[i - base] stays a valid reference
+  for (std::size_t i = base; i < (base << 13); ++i) { keys.push_back(keys[i - base]); }
+
+  thrust::device_vector<cuda::std::uint32_t> keys_d(keys.begin(), keys.end());
+  thrust::device_vector<bool> contained(keys.size(), false);
+
+  state.add_element_count(keys.size());
+  state.add_global_memory_reads<cuda::std::uint32_t>(keys.size(), "InputSize");
+
+  state.exec([&](nvbench::launch& launch) {
+    roaring_bitmap.contains_async(
+      keys_d.begin(), keys_d.end(), contained.begin(), {launch.get_stream()});
+  });
+
+  CUCO_CUDA_TRY(cudaFreeHost(buffer));
+}
+
+NVBENCH_BENCH(roaring_bitmap_contains)
+  .set_name("roaring_bitmap_contains")
+  .set_max_noise(cuco::benchmark::defaults::MAX_NOISE);
\ No newline at end of file
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index e2328b496..08bf51197 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -33,18 +33,18 @@ endfunction(ConfigureExample)
 ### Example sources ###############################################################################
 ###################################################################################################
 
-# ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu")
-# ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu")
-# ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu")
-# ConfigureExample(STATIC_SET_SHARED_MEMORY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/shared_memory_example.cu")
-# ConfigureExample(STATIC_SET_MAPPING_TABLE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/mapping_table_example.cu")
-# ConfigureExample(STATIC_MULTISET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multiset/host_bulk_example.cu")
-# ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu")
-# ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_ref_example.cu")
-# ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")
-# ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu")
-# ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu")
-# ConfigureExample(HYPERLOGLOG_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/host_bulk_example.cu")
-# ConfigureExample(HYPERLOGLOG_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/device_ref_example.cu")
-# ConfigureExample(BLOOM_FILTER_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/bloom_filter/host_bulk_example.cu")
+ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu")
+ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu")
+ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu")
+ConfigureExample(STATIC_SET_SHARED_MEMORY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/shared_memory_example.cu")
+ConfigureExample(STATIC_SET_MAPPING_TABLE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/mapping_table_example.cu")
+ConfigureExample(STATIC_MULTISET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multiset/host_bulk_example.cu")
+ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu")
+ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_ref_example.cu")
+ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")
+ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu")
+ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu")
+ConfigureExample(HYPERLOGLOG_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/host_bulk_example.cu")
+ConfigureExample(HYPERLOGLOG_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/device_ref_example.cu")
+ConfigureExample(BLOOM_FILTER_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/bloom_filter/host_bulk_example.cu")
 ConfigureExample(ROARING_BITMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/roaring_bitmap/host_bulk_example.cu")

From 77a4c1d387f4430941303e8bfdef4e84b78f1942 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 4 Jul 2025 18:19:44 -0700
Subject: [PATCH 03/24] Optimizations

---
 benchmarks/roaring_bitmap/contains_bench.cu   |  6 +-
 .../roaring_bitmap/roaring_bitmap_impl.cuh    | 87 ++++++++++++++-----
 2 files changed, 69 insertions(+), 24 deletions(-)

diff --git a/benchmarks/roaring_bitmap/contains_bench.cu b/benchmarks/roaring_bitmap/contains_bench.cu
index f8a0fdcfd..443136315 100644
--- a/benchmarks/roaring_bitmap/contains_bench.cu
+++ b/benchmarks/roaring_bitmap/contains_bench.cu
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 #include <benchmark_defaults.hpp>
 #include <benchmark_utils.hpp>
 
@@ -38,7 +39,6 @@ void roaring_bitmap_contains(nvbench::state& state)
   fs::path path      = source_dir / "../../examples/roaring_bitmap/bitmapwithoutruns.bin";
   fs::path full_path = path.lexically_normal();
 
-  // Open file
   std::ifstream file(full_path, std::ios::binary);
   if (!file.is_open()) { state.skip("Failed to open bitmap file"); }
 
@@ -47,11 +47,9 @@ void roaring_bitmap_contains(nvbench::state& state)
   std::streamsize file_size = file.tellg();
   file.seekg(0, std::ios::beg);
 
-  // Allocate pinned host memory using cudaMallocHost
   char* buffer;
   CUCO_CUDA_TRY(cudaMallocHost(&buffer, file_size));
 
-  // Read file into memory
   file.read(buffer, file_size);
   file.close();
 
@@ -70,7 +68,7 @@ void roaring_bitmap_contains(nvbench::state& state)
     keys.push_back(k);
   }
 
-  // multiply the keys for the benchmark
+  // multiply the keys for more accurate benchmark numbers
   for (int i = 0; i < 13; i++) {
     keys.insert(keys.end(), keys.begin(), keys.end());
   }
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
index 248428b69..7910d485b 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 #pragma once
 
 #include <cuco/detail/error.hpp>
@@ -44,6 +45,7 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
   static constexpr cuda::std::uint32_t serial_cookie                 = 12347;
   static constexpr cuda::std::uint32_t frozen_cookie                 = 13766;
   static constexpr cuda::std::int32_t no_offset_threshold            = 4;
+  static constexpr cuda::std::uint32_t binary_search_threshold = 8;  // TODO determine optimal value
 
  public:
   static constexpr auto thread_scope = Scope;
@@ -99,20 +101,25 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
     cuda::std::uint16_t upper = value >> 16;
     cuda::std::uint16_t lower = value & 0xFFFF;
 
-    // TODO binary search on key_cards_
-    for (cuda::std::int32_t i = 0; i < num_containers_; i++) {
-      if (key_cards_[i * 2] == upper) {
-        cuda::std::uint32_t card = key_cards_[i * 2 + 1] + 1;
-        cuda::std::uint16_t const* container =
-          reinterpret_cast<cuda::std::uint16_t const*>(data_.data() + this->container_offset(i));
-        if (this->is_run_container(i)) {
-          return this->contains_run_container(container, lower, card);
+    // Binary search on key_cards_ to find container with matching upper key
+    cuda::std::uint32_t left  = 0;
+    cuda::std::uint32_t right = num_containers_;
+
+    if (num_containers_ < binary_search_threshold) {
+      for (cuda::std::uint32_t i = 0; i < num_containers_; i++) {
+        if (key_cards_[i * 2] == upper) { return this->contains_container(lower, i); }
+      }
+    } else {
+      while (left < right) {
+        cuda::std::uint32_t mid     = left + (right - left) / 2;
+        cuda::std::uint16_t mid_key = key_cards_[mid * 2];
+
+        if (mid_key == upper) {
+          return this->contains_container(lower, mid);
+        } else if (mid_key < upper) {
+          left = mid + 1;
         } else {
-          if (card <= 4096) {  // TODO check if this is correct
-            return this->contains_array_container(container, lower, card);
-          } else {
-            return this->contains_bitset_container(container, lower, card);
-          }
+          right = mid;
         }
       }
     }
@@ -135,16 +142,48 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
     return run_container_bitmap_[i / 8] & (1 << (i % 8));
   }
 
+  // Looks up `lower` in container `index`, choosing the routine that matches its layout.
+  __device__ bool contains_container(cuda::std::uint16_t lower, cuda::std::uint32_t index) const
+  {
+    auto const* const container =
+      reinterpret_cast<cuda::std::uint16_t const*>(data_.data() + this->container_offset(index));
+    // cardinality is the stored 16-bit value plus one
+    cuda::std::uint32_t const card = key_cards_[index * 2 + 1] + 1;
+
+    if (this->is_run_container(index)) {
+      return this->contains_run_container(container, lower, card);
+    }
+    // TODO check if this threshold is correct
+    if (card <= 4096) { return this->contains_array_container(container, lower, card); }
+    return this->contains_bitset_container(container, lower, card);
+  }
+
   __device__ bool contains_array_container(cuda::std::uint16_t const* container,
                                            cuda::std::uint16_t lower,
                                            cuda::std::uint32_t card) const
   {
-    // TODO binary search on container
-    // if (card < 256) -> linear search
-    for (cuda::std::uint32_t i = 0; i < card; i++) {
-      if (container[i] == lower) { return true; }
+    // Linear scan when card < binary_search_threshold, binary search over the entries otherwise
+    if (card < binary_search_threshold) {
+      for (cuda::std::uint32_t i = 0; i < card; i++) {
+        if (container[i] == lower) { return true; }
+      }
+      return false;
+    } else {  // binary search; presumes the entries are sorted in ascending order
+      cuda::std::uint32_t left  = 0;  // search window is the half-open range [left, right)
+      cuda::std::uint32_t right = card;
+
+      while (left < right) {
+        cuda::std::uint32_t mid = left + (right - left) / 2;  // midpoint of the window
+        if (container[mid] == lower) {
+          return true;
+        } else if (container[mid] < lower) {
+          left = mid + 1;
+        } else {
+          right = mid;
+        }
+      }
+      return false;  // window became empty without a match
     }
-    return false;
   }
 
   __device__ bool contains_bitset_container(cuda::std::uint16_t const* container,
@@ -166,8 +205,13 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
   __device__ cuda::std::uint32_t container_offset(cuda::std::int32_t i) const
   {
     cuda::std::uint32_t offset;
-    cuda::std::memcpy(
-      &offset, offsets_ + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t));
+    if (offsets_aligned_) {  // table start is 4-byte aligned: read entry i with one 32-bit load
+      offset =
+        *reinterpret_cast<cuda::std::uint32_t const*>(offsets_ + i * sizeof(cuda::std::uint32_t));
+    } else {  // unaligned table: copy the four bytes of entry i instead of dereferencing
+      cuda::std::memcpy(
+        &offset, offsets_ + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t));
+    }
     return offset;
   }
 
@@ -241,6 +285,8 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
         return false;
       }
       offsets_ = buf;
+      offsets_aligned_ =
+        (reinterpret_cast<cuda::std::uintptr_t>(offsets_) % sizeof(cuda::std::uint32_t)) == 0;
       buf += num_containers_ * 4;
     }
 
@@ -267,6 +313,7 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
   cuda::std::uint8_t const* run_container_bitmap_;
   cuda::std::uint16_t const* key_cards_;
   cuda::std::byte const* offsets_;
+  bool offsets_aligned_;
   bool has_run_;
 };
 

From 142ac06b758a44fe935bdc2a79cff63a6ad2b8b3 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 7 Jul 2025 17:54:35 -0700
Subject: [PATCH 04/24] v2

---
 benchmarks/roaring_bitmap/contains_bench.cu   |   5 +-
 examples/roaring_bitmap/host_bulk_example.cu  |  14 +-
 .../detail/roaring_bitmap/roaring_bitmap.inl  |  29 +-
 .../roaring_bitmap/roaring_bitmap_impl.cuh    | 256 ++++++++++--------
 .../roaring_bitmap/roaring_bitmap_ref.inl     |  24 +-
 include/cuco/roaring_bitmap.cuh               |   3 +-
 include/cuco/roaring_bitmap_ref.cuh           |  13 +-
 7 files changed, 190 insertions(+), 154 deletions(-)

diff --git a/benchmarks/roaring_bitmap/contains_bench.cu b/benchmarks/roaring_bitmap/contains_bench.cu
index 443136315..2f727c541 100644
--- a/benchmarks/roaring_bitmap/contains_bench.cu
+++ b/benchmarks/roaring_bitmap/contains_bench.cu
@@ -53,9 +53,8 @@ void roaring_bitmap_contains(nvbench::state& state)
   file.read(buffer, file_size);
   file.close();
 
-  cuda::std::span<cuda::std::byte const> bitmap(reinterpret_cast<cuda::std::byte const*>(buffer),
-                                                file_size);
-  cuco::roaring_bitmap<cuda::std::uint32_t> roaring_bitmap(bitmap);
+  cuco::roaring_bitmap<cuda::std::uint32_t> roaring_bitmap(
+    reinterpret_cast<cuda::std::byte const*>(buffer));
 
   std::vector<cuda::std::uint32_t> keys;
   for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) {
diff --git a/examples/roaring_bitmap/host_bulk_example.cu b/examples/roaring_bitmap/host_bulk_example.cu
index 85870f74b..bbbbe6005 100644
--- a/examples/roaring_bitmap/host_bulk_example.cu
+++ b/examples/roaring_bitmap/host_bulk_example.cu
@@ -38,9 +38,8 @@ int main(int argc, char* argv[])
   file.read(buffer, file_size);
   file.close();
 
-  cuda::std::span<cuda::std::byte const> bitmap(reinterpret_cast<cuda::std::byte const*>(buffer),
-                                                file_size);
-  cuco::roaring_bitmap<cuda::std::uint32_t> roaring_bitmap(bitmap);
+  cuco::roaring_bitmap<cuda::std::uint32_t> roaring_bitmap(
+    reinterpret_cast<cuda::std::byte const*>(buffer));
 
   std::vector<cuda::std::uint32_t> keys;
   for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) {
@@ -58,13 +57,18 @@ int main(int argc, char* argv[])
 
   roaring_bitmap.contains(keys_d.begin(), keys_d.end(), contained.begin());
 
+  size_t num_errors = 0;
   for (size_t i = 0; i < keys.size(); i++) {
     if (not contained[i]) {
-      std::cout << "Error: " << keys_d[i] << " is not contained" << std::endl;
+      if (num_errors <= 10) {
+        std::cout << "Error: " << keys_d[i] << " is not contained" << std::endl;
+      }
+      num_errors++;
     }
   }
+  if (num_errors > 0) { std::cout << "num_errors: " << num_errors << std::endl; }
 
-  // check if all elements are contained
+  // check if all elements are contained and written to output
   bool all_contained = thrust::all_of(contained.begin(), contained.end(), ::cuda::std::identity{});
   std::cout << "all_contained: " << all_contained << std::endl;
 
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
index 3a17a82d0..efb64c448 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
@@ -26,25 +26,20 @@
 namespace cuco {
 
 template <class T, cuda::thread_scope Scope, class Allocator>
-__host__ roaring_bitmap<T, Scope, Allocator>::roaring_bitmap(
-  cuda::std::span<cuda::std::byte const> compressed_bitmap,
-  cuda_thread_scope<Scope> scope,
-  Allocator const& alloc,
-  cuda::stream_ref stream)
+__host__ roaring_bitmap<T, Scope, Allocator>::roaring_bitmap(cuda::std::byte const* bitmap,
+                                                             cuda_thread_scope<Scope> scope,
+                                                             Allocator const& alloc,
+                                                             cuda::stream_ref stream)
   : allocator_{alloc},
-    data_{allocator_.allocate(compressed_bitmap.size()),
-          detail::custom_deleter<cuda::std::size_t, allocator_type>{compressed_bitmap.size(),
-                                                                    allocator_}},
-    ref_{compressed_bitmap,
-         cuda::std::span<cuda::std::byte const>(data_.get(), compressed_bitmap.size()),
-         scope}  // TODO move after memcpy?
+    metadata_{ref_type<>::read_metadata(bitmap)},
+    data_{
+      allocator_.allocate(metadata_.size_bytes),
+      detail::custom_deleter<cuda::std::size_t, allocator_type>{metadata_.size_bytes, allocator_}},
+    ref_{data_.get(), metadata_, scope}
 {
-  CUCO_CUDA_TRY(cudaMemcpyAsync(data_.get(),
-                                compressed_bitmap.data(),
-                                compressed_bitmap.size(),
-                                cudaMemcpyHostToDevice,
-                                stream.get()));
-  stream.wait();  // TODO check if this is necessary
+  CUCO_CUDA_TRY(cudaMemcpyAsync(
+    data_.get(), bitmap, metadata_.size_bytes, cudaMemcpyHostToDevice, stream.get()));
+  // stream.wait();  // TODO check if this is necessary
 }
 
 template <class T, cuda::thread_scope Scope, class Allocator>
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
index 7910d485b..8dc6fe633 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
@@ -30,8 +30,30 @@
 #include <thrust/fill.h>
 #include <thrust/transform.h>
 
+#include <nv/target>
+
 namespace cuco::detail {
 
+template <class T>
+struct roaring_bitmap_metadata {  // primary template: only the specializations are usable
+  static_assert(cuco::dependent_false<T>, "T must be either uint32_t or uint64_t");
+};
+
+template <>
+struct roaring_bitmap_metadata<cuda::std::uint32_t> {  // layout description of a 32-bit bitmap
+  cuda::std::size_t size_bytes           = 0;  // byte length of the serialized bitmap
+  cuda::std::size_t num_keys             = 0;  // becomes the impl's size()
+  cuda::std::size_t run_container_bitmap = 0;  // byte offset from the bitmap start
+  cuda::std::size_t key_cards            = 0;  // byte offset from the bitmap start
+  cuda::std::size_t container_offsets    = 0;  // byte offset from the bitmap start
+  cuda::std::int32_t num_containers      = 0;  // number of containers
+  bool has_run                           = false;  // copied to the impl's has_run_ flag
+  bool offsets_aligned                   = false;  // copied to the impl's offsets_aligned_ flag
+  bool valid                             = false;  // impl only adopts the fields when true
+};
+
+// TODO implement roaring_bitmap_metadata<cuda::std::uint64_t>
+
 // primary template
 template <class T, cuda::thread_scope Scope>
 class roaring_bitmap_impl {
@@ -48,22 +70,33 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
   static constexpr cuda::std::uint32_t binary_search_threshold = 8;  // TODO determine optimal value
 
  public:
+  using metadata_type                = roaring_bitmap_metadata<cuda::std::uint32_t>;
   static constexpr auto thread_scope = Scope;
 
-  __host__ roaring_bitmap_impl(cuda::std::span<cuda::std::byte const> compressed_bitmap_h,
-                               cuda::std::span<cuda::std::byte const> compressed_bitmap_d,
-                               cuda_thread_scope<Scope> /* scope */)
-    : data_{compressed_bitmap_d}
+  __host__ __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap,
+                                          metadata_type metadata,
+                                          cuda_thread_scope<Scope> /* scope */)
   {
-    bool success = this->read_header(compressed_bitmap_h);
-    CUCO_EXPECTS(success, "Failed to read compressed bitmap");
+    NV_IF_TARGET(
+      NV_IS_HOST,
+      CUCO_EXPECTS(metadata.valid, "Invalid bitmap format");)  // TODO device error handling
+
+    if (metadata.valid) {
+      data_           = cuda::std::span<cuda::std::byte const>{bitmap, metadata.size_bytes};
+      size_           = metadata.num_keys;
+      num_containers_ = metadata.num_containers;
+      run_container_bitmap_ =
+        reinterpret_cast<cuda::std::uint8_t const*>(bitmap + metadata.run_container_bitmap);
+      key_cards_ = reinterpret_cast<cuda::std::uint16_t const*>(bitmap + metadata.key_cards);
+      offsets_   = reinterpret_cast<cuda::std::byte const*>(bitmap + metadata.container_offsets);
+      offsets_aligned_ = metadata.offsets_aligned;
+      has_run_         = metadata.has_run;
+    }
   }
 
-  __device__ roaring_bitmap_impl(cuda::std::span<cuda::std::byte const> compressed_bitmap,
-                                 cuda_thread_scope<Scope> /* scope */)
-    : data_{compressed_bitmap}
+  __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap, cuda_thread_scope<Scope> scope)
+    : roaring_bitmap_impl(bitmap, read_metadata(bitmap), scope)
   {
-    this->read_header(compressed_bitmap);  // TODO error handling
   }
 
   template <class InputIt, class OutputIt>
@@ -135,19 +168,102 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
     return data_;
   }
 
+  __host__ __device__ static metadata_type const read_metadata(
+    cuda::std::byte const* bitmap) noexcept
+  {
+    // Parses the serialized header at `bitmap`; `valid` stays false for an unknown cookie, an
+    // out-of-range container count, or a bitmap with run containers (not supported yet).
+    cuda::std::byte const* buf = bitmap;
+    metadata_type metadata;  // `valid` defaults to false, so every early return reports failure
+
+    cuda::std::uint32_t cookie;
+    cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t));
+    buf += sizeof(cuda::std::uint32_t);
+    if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) {
+      return metadata;
+    }
+
+    // A run-container cookie packs (count - 1) into its upper 16 bits; else the count follows.
+    if ((cookie & 0xFFFF) == serial_cookie)
+      metadata.num_containers = (cookie >> 16) + 1;
+    else {
+      cuda::std::memcpy(&metadata.num_containers, buf, sizeof(cuda::std::uint32_t));
+      buf += sizeof(cuda::std::uint32_t);
+    }
+    if (metadata.num_containers < 0 || metadata.num_containers > (1 << 16)) { return metadata; }
+
+    metadata.has_run = (cookie & 0xFFFF) == serial_cookie;
+    if (metadata.has_run) {
+      // TODO run containers are not supported yet. Once they are, record the run bitmap offset
+      // (distance from `bitmap` to `buf`) and skip its (num_containers + 7) / 8 bytes here.
+      return metadata;
+    }
+
+    metadata.key_cards = cuda::std::distance(bitmap, buf);
+    buf += metadata.num_containers * 2 * sizeof(cuda::std::uint16_t);
+
+    if ((!metadata.has_run) || (metadata.num_containers >= no_offset_threshold)) {
+      metadata.container_offsets = cuda::std::distance(bitmap, buf);
+      metadata.offsets_aligned =
+        (reinterpret_cast<cuda::std::uintptr_t>(bitmap + metadata.container_offsets) %
+         sizeof(cuda::std::uint32_t)) == 0;
+      buf += metadata.num_containers * 4;
+    }
+
+    // An empty bitmap is just the header: there is no last container to measure, and indexing
+    // container `num_containers - 1` below would read in front of the offset table.
+    if (metadata.num_containers == 0) {
+      metadata.size_bytes = static_cast<cuda::std::size_t>(cuda::std::distance(bitmap, buf));
+      metadata.valid      = true;
+      return metadata;
+    }
+
+    // Each entry is a (key, cardinality - 1) pair of 16-bit values.
+    cuda::std::uint16_t const* key_cards =
+      reinterpret_cast<cuda::std::uint16_t const*>(bitmap + metadata.key_cards);
+    cuda::std::uint32_t card = 0;
+    for (cuda::std::int32_t i = 0; i < metadata.num_containers; i++) {
+      card = key_cards[i * 2 + 1] + 1;
+      metadata.num_keys += card;
+    }
+
+    // The bitmap ends where the last container ends; `card` still holds its cardinality.
+    cuda::std::byte const* end = bitmap + container_offset(bitmap + metadata.container_offsets,
+                                                           metadata.offsets_aligned,
+                                                           metadata.num_containers - 1);
+    if (is_run_container(
+          reinterpret_cast<cuda::std::uint8_t const*>(bitmap + metadata.run_container_bitmap),
+          metadata.has_run,
+          metadata.num_containers - 1)) {
+      // TODO implement
+    } else {
+      if (card <= 4096) {  // array container: one 16-bit value per element
+        end += card * sizeof(cuda::std::uint16_t);
+      } else {
+        end += 8192;  // fixed size bitset container
+      }
+    }
+
+    metadata.size_bytes = static_cast<cuda::std::size_t>(cuda::std::distance(bitmap, end));
+    metadata.valid      = true;
+    return metadata;
+  }
+
  private:
-  __device__ bool is_run_container(cuda::std::int32_t i) const
+  __host__ __device__ static bool is_run_container(cuda::std::uint8_t const* run_container_bitmap,
+                                                   bool has_run,
+                                                   cuda::std::int32_t i)
   {
-    if (not has_run_) return false;
-    return run_container_bitmap_[i / 8] & (1 << (i % 8));
+    if (not has_run) return false;
+    return run_container_bitmap[i / 8] & (1 << (i % 8));
   }
 
   __device__ bool contains_container(cuda::std::uint16_t lower, cuda::std::uint32_t index) const
   {
-    cuda::std::uint32_t card = key_cards_[index * 2 + 1] + 1;
-    cuda::std::uint16_t const* container =
-      reinterpret_cast<cuda::std::uint16_t const*>(data_.data() + this->container_offset(index));
-    if (this->is_run_container(index)) {
+    cuda::std::uint32_t card             = key_cards_[index * 2 + 1] + 1;
+    cuda::std::uint16_t const* container = reinterpret_cast<cuda::std::uint16_t const*>(
+      data_.data() + container_offset(offsets_, offsets_aligned_, index));
+    if (is_run_container(run_container_bitmap_, has_run_, index)) {
       return this->contains_run_container(container, lower, card);
     } else {
       if (card <= 4096) {  // TODO check if this is correct
@@ -202,116 +318,26 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
     return false;
   }
 
-  __device__ cuda::std::uint32_t container_offset(cuda::std::int32_t i) const
+  __host__ __device__ static cuda::std::uint32_t container_offset(cuda::std::byte const* offsets,
+                                                                  bool offsets_aligned,
+                                                                  cuda::std::int32_t i)
   {
-    cuda::std::uint32_t offset;
-    if (offsets_aligned_) {
+    cuda::std::uint32_t offset = 0;
+    if (offsets_aligned) {
       offset =
-        *reinterpret_cast<cuda::std::uint32_t const*>(offsets_ + i * sizeof(cuda::std::uint32_t));
+        *reinterpret_cast<cuda::std::uint32_t const*>(offsets + i * sizeof(cuda::std::uint32_t));
     } else {
       cuda::std::memcpy(
-        &offset, offsets_ + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t));
+        &offset, offsets + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t));
     }
     return offset;
   }
 
-  __host__ __device__ bool read_header(cuda::std::span<cuda::std::byte const> compressed_bitmap)
-  {
-    cuda::std::size_t length                     = compressed_bitmap.size();
-    cuda::std::byte const* buf                   = compressed_bitmap.data();
-    [[maybe_unused]] cuda::std::size_t readbytes = 0;
-
-    // cookie and num_containers
-    if (length < 4) {
-      // printf("length is less than 4\n");
-      return false;
-    }
-
-    cuda::std::uint32_t cookie;
-    cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t));
-    readbytes += sizeof(cuda::std::uint32_t);
-    buf += sizeof(cuda::std::uint32_t);
-    if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) {
-      // printf("cookie is not serial cookie or serial cookie no runcontainer\n");
-      return false;
-    }
-
-    if ((cookie & 0xFFFF) == serial_cookie)
-      num_containers_ = (cookie >> 16) + 1;
-    else {
-      readbytes += sizeof(cuda::std::uint32_t);
-      if (readbytes > length) {
-        // printf("readbytes is greater than length\n");
-        return false;
-      }
-      cuda::std::memcpy(&num_containers_, buf, sizeof(cuda::std::uint32_t));
-      buf += sizeof(cuda::std::uint32_t);
-    }
-    if (num_containers_ < 0) {
-      // printf("num_containers_ is less than 0\n");
-      return false;
-    }
-    if (num_containers_ > (1 << 16)) {
-      // printf("num_containers_ is greater than 65536\n");
-      return false;
-    }
-    // printf("num_containers_: %d\n", num_containers_);
-
-    has_run_ = (cookie & 0xFFFF) == serial_cookie;
-    if (has_run_) {
-      cuda::std::size_t s = (num_containers_ + 7) / 8;
-      readbytes += s;
-      if (readbytes > length) {
-        // printf("readbytes is greater than length\n");
-        return false;
-      }
-      run_container_bitmap_ = reinterpret_cast<cuda::std::uint8_t const*>(buf);
-      buf += s;
-    }
-    // printf("has_run: %d\n", has_run_);
-
-    key_cards_ = reinterpret_cast<cuda::std::uint16_t const*>(buf);
-    readbytes += num_containers_ * 2 * sizeof(cuda::std::uint16_t);
-    if (readbytes > length) {
-      // printf("readbytes is greater than length\n");
-      return false;
-    }
-    buf += num_containers_ * 2 * sizeof(cuda::std::uint16_t);
-
-    if ((!has_run_) || (num_containers_ >= no_offset_threshold)) {
-      readbytes += num_containers_ * 4;
-      if (readbytes > length) {
-        // printf("readbytes is greater than length\n");
-        return false;
-      }
-      offsets_ = buf;
-      offsets_aligned_ =
-        (reinterpret_cast<cuda::std::uintptr_t>(offsets_) % sizeof(cuda::std::uint32_t)) == 0;
-      buf += num_containers_ * 4;
-    }
-
-    readbytes += num_containers_ * 4;
-    if (readbytes > length) {
-      // printf("readbytes is greater than length\n");
-      return false;
-    }
-
-    size_ = 0;
-    for (cuda::std::int32_t i = 0; i < num_containers_; i++) {
-      // cuda::std::uint16_t key  = key_cards_[i * 2];
-      cuda::std::uint32_t card = key_cards_[i * 2 + 1] + 1;
-      size_ += card;
-      // printf("key: %d, card: %d\n", key, card);
-    }
-
-    return true;
-  }
-
   cuda::std::span<cuda::std::byte const> data_;
   cuda::std::size_t size_;
   cuda::std::int32_t num_containers_;
   cuda::std::uint8_t const* run_container_bitmap_;
-  cuda::std::uint16_t const* key_cards_;
+  cuda::std::uint16_t const* key_cards_;  // TODO uint8?
   cuda::std::byte const* offsets_;
   bool offsets_aligned_;
   bool has_run_;
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
index b66ea9e31..08465f215 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include <cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cuda/std/cstddef>
@@ -24,18 +25,17 @@
 namespace cuco {
 
 template <class T, cuda::thread_scope Scope>
-__host__ roaring_bitmap_ref<T, Scope>::roaring_bitmap_ref(
-  cuda::std::span<cuda::std::byte const> compressed_bitmap_h,
-  cuda::std::span<cuda::std::byte const> compressed_bitmap_d,
-  cuda_thread_scope<Scope> scope)
-  : impl_{compressed_bitmap_h, compressed_bitmap_d, scope}
+__host__ __device__ roaring_bitmap_ref<T, Scope>::roaring_bitmap_ref(cuda::std::byte const* bitmap,
+                                                                     metadata_type const metadata,
+                                                                     cuda_thread_scope<Scope> scope)
+  : impl_{bitmap, metadata, scope}
 {
 }
 
 template <class T, cuda::thread_scope Scope>
-__device__ roaring_bitmap_ref<T, Scope>::roaring_bitmap_ref(
-  cuda::std::span<cuda::std::byte const> compressed_bitmap, cuda_thread_scope<Scope> scope)
-  : impl_{compressed_bitmap, scope}
+__device__ roaring_bitmap_ref<T, Scope>::roaring_bitmap_ref(cuda::std::byte const* bitmap,
+                                                            cuda_thread_scope<Scope> scope)
+  : impl_{bitmap, scope}
 {
 }
 
@@ -77,4 +77,12 @@ __host__ __device__ cuda::std::span<cuda::std::byte const> roaring_bitmap_ref<T,
 {
   return impl_.data();
 }
+
+template <class T, cuda::thread_scope Scope>
+__host__ __device__ typename roaring_bitmap_ref<T, Scope>::metadata_type const
+roaring_bitmap_ref<T, Scope>::read_metadata(cuda::std::byte const* bitmap) noexcept
+{
+  return impl_type::read_metadata(bitmap);
+}
+
 }  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh
index b850431a7..8b36ba880 100644
--- a/include/cuco/roaring_bitmap.cuh
+++ b/include/cuco/roaring_bitmap.cuh
@@ -40,7 +40,7 @@ class roaring_bitmap {
   template <cuda::thread_scope NewScope = thread_scope>
   using ref_type = roaring_bitmap_ref<T, NewScope>;
 
-  __host__ roaring_bitmap(cuda::std::span<const cuda::std::byte> compressed_bitmap,
+  __host__ roaring_bitmap(cuda::std::byte const* bitmap,
                           cuda_thread_scope<Scope> scope = {},
                           Allocator const& alloc         = {},
                           cuda::stream_ref stream        = {});
@@ -76,6 +76,7 @@ class roaring_bitmap {
 
  private:
   allocator_type allocator_;
+  typename ref_type<>::metadata_type metadata_;
   std::unique_ptr<cuda::std::byte, detail::custom_deleter<cuda::std::size_t, allocator_type>> data_;
   ref_type<> ref_;
 };
diff --git a/include/cuco/roaring_bitmap_ref.cuh b/include/cuco/roaring_bitmap_ref.cuh
index a26474cd9..d06757fe2 100644
--- a/include/cuco/roaring_bitmap_ref.cuh
+++ b/include/cuco/roaring_bitmap_ref.cuh
@@ -29,15 +29,15 @@ class roaring_bitmap_ref {
   using impl_type = detail::roaring_bitmap_impl<T, Scope>;
 
  public:
+  using metadata_type                = typename impl_type::metadata_type;
   static constexpr auto thread_scope = impl_type::thread_scope;
 
   // This is tricky as it is not clear if compressed_bitmap resides in host or device memory.
-  __host__ roaring_bitmap_ref(cuda::std::span<cuda::std::byte const> compressed_bitmap_h,
-                              cuda::std::span<cuda::std::byte const> compressed_bitmap_d,
-                              cuda_thread_scope<Scope> scope = {});
+  __host__ __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap,
+                                         metadata_type const metadata,
+                                         cuda_thread_scope<Scope> scope = {});
 
-  __device__ roaring_bitmap_ref(cuda::std::span<cuda::std::byte const> compressed_bitmap,
-                                cuda_thread_scope<Scope> scope = {});
+  __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap, cuda_thread_scope<Scope> scope = {});
 
   template <class InputIt, class OutputIt>
   __host__ void contains(InputIt first,
@@ -57,6 +57,9 @@ class roaring_bitmap_ref {
 
   [[nodiscard]] __host__ __device__ cuda::std::span<cuda::std::byte const> data() const noexcept;
 
+  [[nodiscard]] __host__ __device__ static metadata_type const read_metadata(
+    cuda::std::byte const* bitmap) noexcept;
+
  private:
   impl_type impl_;
 };

From 5977bead161c47f9f5e19d54bfee5ac8f5ba7239 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 8 Jul 2025 17:34:00 -0700
Subject: [PATCH 05/24] Get rid of span and scope

---
 .../detail/roaring_bitmap/roaring_bitmap.inl  | 61 +++++++++--------
 .../roaring_bitmap/roaring_bitmap_impl.cuh    | 38 +++++------
 .../roaring_bitmap/roaring_bitmap_ref.inl     | 65 ++++++++++---------
 include/cuco/roaring_bitmap.cuh               | 26 +++-----
 include/cuco/roaring_bitmap_ref.cuh           | 21 +++---
 5 files changed, 106 insertions(+), 105 deletions(-)

diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
index efb64c448..9c36c2e90 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
@@ -17,72 +17,77 @@
 
 #include <cuco/detail/error.hpp>
 #include <cuco/detail/storage/storage_base.cuh>
-#include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cuda/std/cstddef>
-#include <cuda/std/span>
+#include <cuda/std/type_traits>
 #include <cuda/stream_ref>
 
 namespace cuco {
 
-template <class T, cuda::thread_scope Scope, class Allocator>
-__host__ roaring_bitmap<T, Scope, Allocator>::roaring_bitmap(cuda::std::byte const* bitmap,
-                                                             cuda_thread_scope<Scope> scope,
-                                                             Allocator const& alloc,
-                                                             cuda::stream_ref stream)
+template <class T, class Allocator>
+__host__ roaring_bitmap<T, Allocator>::roaring_bitmap(cuda::std::byte const* bitmap,
+                                                      Allocator const& alloc,
+                                                      cuda::stream_ref stream)
   : allocator_{alloc},
-    metadata_{ref_type<>::read_metadata(bitmap)},
+    metadata_{ref_type::read_metadata(bitmap)},
     data_{
       allocator_.allocate(metadata_.size_bytes),
       detail::custom_deleter<cuda::std::size_t, allocator_type>{metadata_.size_bytes, allocator_}},
-    ref_{data_.get(), metadata_, scope}
+    ref_{data_.get(), metadata_}
 {
   CUCO_CUDA_TRY(cudaMemcpyAsync(
     data_.get(), bitmap, metadata_.size_bytes, cudaMemcpyHostToDevice, stream.get()));
   // stream.wait();  // TODO check if this is necessary
 }
 
-template <class T, cuda::thread_scope Scope, class Allocator>
+template <class T, class Allocator>
 template <class InputIt, class OutputIt>
-__host__ void roaring_bitmap<T, Scope, Allocator>::contains(InputIt first,
-                                                            InputIt last,
-                                                            OutputIt output,
-                                                            cuda::stream_ref stream) const
+__host__ void roaring_bitmap<T, Allocator>::contains(InputIt first,
+                                                     InputIt last,
+                                                     OutputIt output,
+                                                     cuda::stream_ref stream) const
 {
   ref_.contains(first, last, output, stream);
 }
 
-template <class T, cuda::thread_scope Scope, class Allocator>
+template <class T, class Allocator>
 template <class InputIt, class OutputIt>
-__host__ void roaring_bitmap<T, Scope, Allocator>::contains_async(
-  InputIt first, InputIt last, OutputIt output, cuda::stream_ref stream) const noexcept
+__host__ void roaring_bitmap<T, Allocator>::contains_async(InputIt first,
+                                                           InputIt last,
+                                                           OutputIt output,
+                                                           cuda::stream_ref stream) const noexcept
 {
   ref_.contains_async(first, last, output, stream);
 }
 
-template <class T, cuda::thread_scope Scope, class Allocator>
-__host__ cuda::std::size_t roaring_bitmap<T, Scope, Allocator>::size() const noexcept
+template <class T, class Allocator>
+__host__ cuda::std::size_t roaring_bitmap<T, Allocator>::size() const noexcept
 {
   return ref_.size();
 }
 
-template <class T, cuda::thread_scope Scope, class Allocator>
-__host__ cuda::std::span<cuda::std::byte const> roaring_bitmap<T, Scope, Allocator>::data()
-  const noexcept
+template <class T, class Allocator>
+__host__ cuda::std::byte const* roaring_bitmap<T, Allocator>::data() const noexcept
 {
   return ref_.data();
 }
 
-template <class T, cuda::thread_scope Scope, class Allocator>
-__host__ typename roaring_bitmap<T, Scope, Allocator>::allocator_type
-roaring_bitmap<T, Scope, Allocator>::allocator() const noexcept
+template <class T, class Allocator>
+__host__ cuda::std::size_t roaring_bitmap<T, Allocator>::size_bytes() const noexcept
+{
+  return ref_.size_bytes();
+}
+
+template <class T, class Allocator>
+__host__ typename roaring_bitmap<T, Allocator>::allocator_type
+roaring_bitmap<T, Allocator>::allocator() const noexcept
 {
   return allocator_;
 }
 
-template <class T, cuda::thread_scope Scope, class Allocator>
-__host__ typename roaring_bitmap<T, Scope, Allocator>::ref_type<>
-roaring_bitmap<T, Scope, Allocator>::ref() const noexcept
+template <class T, class Allocator>
+__host__ typename roaring_bitmap<T, Allocator>::ref_type roaring_bitmap<T, Allocator>::ref()
+  const noexcept
 {
   return ref_;
 }
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
index 8dc6fe633..2e527047e 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
@@ -17,14 +17,12 @@
 #pragma once
 
 #include <cuco/detail/error.hpp>
-#include <cuco/utility/cuda_thread_scope.cuh>
 #include <cuco/utility/traits.hpp>
 
 #include <cuda/std/cstddef>
 #include <cuda/std/cstdint>
 #include <cuda/std/functional>
 #include <cuda/std/iterator>
-#include <cuda/std/span>
 #include <cuda/stream_ref>
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
@@ -55,13 +53,13 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
 // TODO implement roaring_bitmap_metadata<cuda::std::uint64_t>
 
 // primary template
-template <class T, cuda::thread_scope Scope>
+template <class T>
 class roaring_bitmap_impl {
   static_assert(cuco::dependent_false<T>, "T must be either uint32_t or uint64_t");
 };
 
-template <cuda::thread_scope Scope>
-class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
+template <>
+class roaring_bitmap_impl<cuda::std::uint32_t> {
   // Constants from the Roaring format spec
   static constexpr cuda::std::uint32_t serial_cookie_no_runcontainer = 12346;
   static constexpr cuda::std::uint32_t serial_cookie                 = 12347;
@@ -70,19 +68,18 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
   static constexpr cuda::std::uint32_t binary_search_threshold = 8;  // TODO determine optimal value
 
  public:
-  using metadata_type                = roaring_bitmap_metadata<cuda::std::uint32_t>;
-  static constexpr auto thread_scope = Scope;
+  using metadata_type = roaring_bitmap_metadata<cuda::std::uint32_t>;
 
   __host__ __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap,
-                                          metadata_type metadata,
-                                          cuda_thread_scope<Scope> /* scope */)
+                                          metadata_type const& metadata)
   {
     NV_IF_TARGET(
       NV_IS_HOST,
       CUCO_EXPECTS(metadata.valid, "Invalid bitmap format");)  // TODO device error handling
 
     if (metadata.valid) {
-      data_           = cuda::std::span<cuda::std::byte const>{bitmap, metadata.size_bytes};
+      data_           = bitmap;
+      size_bytes_     = metadata.size_bytes;
       size_           = metadata.num_keys;
       num_containers_ = metadata.num_containers;
       run_container_bitmap_ =
@@ -94,8 +91,8 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
     }
   }
 
-  __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap, cuda_thread_scope<Scope> scope)
-    : roaring_bitmap_impl(bitmap, read_metadata(bitmap), scope)
+  __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap)
+    : roaring_bitmap_impl{bitmap, read_metadata(bitmap)}
   {
   }
 
@@ -163,9 +160,11 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
 
   [[nodiscard]] __host__ __device__ bool empty() const noexcept { return size_ == 0; }
 
-  [[nodiscard]] __host__ __device__ cuda::std::span<cuda::std::byte const> data() const noexcept
+  [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept { return data_; }
+
+  [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept
   {
-    return data_;
+    return size_bytes_;
   }
 
   __host__ __device__ static metadata_type const read_metadata(
@@ -262,7 +261,7 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
   {
     cuda::std::uint32_t card             = key_cards_[index * 2 + 1] + 1;
     cuda::std::uint16_t const* container = reinterpret_cast<cuda::std::uint16_t const*>(
-      data_.data() + container_offset(offsets_, offsets_aligned_, index));
+      data_ + container_offset(offsets_, offsets_aligned_, index));
     if (is_run_container(run_container_bitmap_, has_run_, index)) {
       return this->contains_run_container(container, lower, card);
     } else {
@@ -333,7 +332,8 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
     return offset;
   }
 
-  cuda::std::span<cuda::std::byte const> data_;
+  cuda::std::byte const* data_;
+  cuda::std::size_t size_bytes_;
   cuda::std::size_t size_;
   cuda::std::int32_t num_containers_;
   cuda::std::uint8_t const* run_container_bitmap_;
@@ -343,9 +343,9 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
   bool has_run_;
 };
 
-template <cuda::thread_scope Scope>
-class roaring_bitmap_impl<cuda::std::uint64_t, Scope> {
-  using bucket_type = roaring_bitmap_impl<cuda::std::uint32_t, Scope>;
+template <>
+class roaring_bitmap_impl<cuda::std::uint64_t> {
+  using bucket_type = roaring_bitmap_impl<cuda::std::uint32_t>;
   // TODO implement
 };
 
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
index 08465f215..a4f252104 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
@@ -16,71 +16,74 @@
 #pragma once
 
 #include <cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh>
-#include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cuda/std/cstddef>
-#include <cuda/std/span>
+#include <cuda/std/type_traits>
 #include <cuda/stream_ref>
 
 namespace cuco {
 
-template <class T, cuda::thread_scope Scope>
-__host__ __device__ roaring_bitmap_ref<T, Scope>::roaring_bitmap_ref(cuda::std::byte const* bitmap,
-                                                                     metadata_type const metadata,
-                                                                     cuda_thread_scope<Scope> scope)
-  : impl_{bitmap, metadata, scope}
+template <class T>
+__host__ __device__ roaring_bitmap_ref<T>::roaring_bitmap_ref(cuda::std::byte const* bitmap,
+                                                              metadata_type const& metadata)
+  : impl_{bitmap, metadata}
 {
 }
 
-template <class T, cuda::thread_scope Scope>
-__device__ roaring_bitmap_ref<T, Scope>::roaring_bitmap_ref(cuda::std::byte const* bitmap,
-                                                            cuda_thread_scope<Scope> scope)
-  : impl_{bitmap, scope}
+template <class T>
+template <class U /* = T */,
+          class /* = cuda::std::enable_if_t<cuda::std::is_same_v<U, cuda::std::uint32_t>> */>
+__device__ roaring_bitmap_ref<T>::roaring_bitmap_ref(cuda::std::byte const* bitmap) : impl_{bitmap}
 {
 }
 
-template <class T, cuda::thread_scope Scope>
+template <class T>
 template <class InputIt, class OutputIt>
-__host__ void roaring_bitmap_ref<T, Scope>::contains(InputIt first,
-                                                     InputIt last,
-                                                     OutputIt output,
-                                                     cuda::stream_ref stream) const
+__host__ void roaring_bitmap_ref<T>::contains(InputIt first,
+                                              InputIt last,
+                                              OutputIt output,
+                                              cuda::stream_ref stream) const
 {
   impl_.contains(first, last, output, stream);
 }
 
-template <class T, cuda::thread_scope Scope>
+template <class T>
 template <class InputIt, class OutputIt>
-__host__ void roaring_bitmap_ref<T, Scope>::contains_async(InputIt first,
-                                                           InputIt last,
-                                                           OutputIt output,
-                                                           cuda::stream_ref stream) const noexcept
+__host__ void roaring_bitmap_ref<T>::contains_async(InputIt first,
+                                                    InputIt last,
+                                                    OutputIt output,
+                                                    cuda::stream_ref stream) const noexcept
 {
   impl_.contains_async(first, last, output, stream);
 }
 
-template <class T, cuda::thread_scope Scope>
-__device__ bool roaring_bitmap_ref<T, Scope>::contains(T value) const
+template <class T>
+__device__ bool roaring_bitmap_ref<T>::contains(T value) const
 {
   return impl_.contains(value);
 }
 
-template <class T, cuda::thread_scope Scope>
-__host__ __device__ cuda::std::size_t roaring_bitmap_ref<T, Scope>::size() const noexcept
+template <class T>
+__host__ __device__ cuda::std::size_t roaring_bitmap_ref<T>::size() const noexcept
 {
   return impl_.size();
 }
 
-template <class T, cuda::thread_scope Scope>
-__host__ __device__ cuda::std::span<cuda::std::byte const> roaring_bitmap_ref<T, Scope>::data()
-  const noexcept
+template <class T>
+__host__ __device__ cuda::std::byte const* roaring_bitmap_ref<T>::data() const noexcept
 {
   return impl_.data();
 }
 
-template <class T, cuda::thread_scope Scope>
-__host__ __device__ typename roaring_bitmap_ref<T, Scope>::metadata_type const
-roaring_bitmap_ref<T, Scope>::read_metadata(cuda::std::byte const* bitmap) noexcept
+template <class T>
+__host__ __device__ cuda::std::size_t roaring_bitmap_ref<T>::size_bytes() const noexcept
+{
+  return impl_.size_bytes();
+}
+
+template <class T>
+__host__ __device__ typename roaring_bitmap_ref<T>::metadata_type const
+roaring_bitmap_ref<T>::read_metadata(cuda::std::byte const* bitmap) noexcept
 {
   return impl_type::read_metadata(bitmap);
 }
diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh
index 8b36ba880..3f3aa071a 100644
--- a/include/cuco/roaring_bitmap.cuh
+++ b/include/cuco/roaring_bitmap.cuh
@@ -18,32 +18,24 @@
 #include <cuco/detail/storage/storage_base.cuh>
 #include <cuco/roaring_bitmap_ref.cuh>
 #include <cuco/utility/allocator.hpp>
-#include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cuda/std/cstddef>
-#include <cuda/std/span>
 #include <cuda/stream_ref>
 
 #include <memory>
 
 namespace cuco {
 
-template <class T,
-          cuda::thread_scope Scope = cuda::thread_scope_device,
-          class Allocator          = cuco::cuda_allocator<cuda::std::byte>>
+template <class T, class Allocator = cuco::cuda_allocator<cuda::std::byte>>
 class roaring_bitmap {
  public:
-  static constexpr auto thread_scope = Scope;
-
   using allocator_type = Allocator;
 
-  template <cuda::thread_scope NewScope = thread_scope>
-  using ref_type = roaring_bitmap_ref<T, NewScope>;
+  using ref_type = roaring_bitmap_ref<T>;
 
   __host__ roaring_bitmap(cuda::std::byte const* bitmap,
-                          cuda_thread_scope<Scope> scope = {},
-                          Allocator const& alloc         = {},
-                          cuda::stream_ref stream        = {});
+                          Allocator const& alloc  = {},
+                          cuda::stream_ref stream = {});
 
   roaring_bitmap(roaring_bitmap const& other)            = default;
   roaring_bitmap(roaring_bitmap&& other)                 = default;
@@ -68,17 +60,19 @@ class roaring_bitmap {
 
   [[nodiscard]] __host__ cuda::std::size_t size() const noexcept;
 
-  [[nodiscard]] __host__ cuda::std::span<cuda::std::byte const> data() const noexcept;
+  [[nodiscard]] __host__ cuda::std::byte const* data() const noexcept;
+
+  [[nodiscard]] __host__ cuda::std::size_t size_bytes() const noexcept;
 
   [[nodiscard]] __host__ allocator_type allocator() const noexcept;
 
-  [[nodiscard]] __host__ ref_type<> ref() const noexcept;
+  [[nodiscard]] __host__ ref_type ref() const noexcept;
 
  private:
   allocator_type allocator_;
-  typename ref_type<>::metadata_type metadata_;
+  typename ref_type::metadata_type metadata_;
   std::unique_ptr<cuda::std::byte, detail::custom_deleter<cuda::std::size_t, allocator_type>> data_;
-  ref_type<> ref_;
+  ref_type ref_;
 };
 
 }  // namespace cuco
diff --git a/include/cuco/roaring_bitmap_ref.cuh b/include/cuco/roaring_bitmap_ref.cuh
index d06757fe2..156877c0f 100644
--- a/include/cuco/roaring_bitmap_ref.cuh
+++ b/include/cuco/roaring_bitmap_ref.cuh
@@ -16,28 +16,25 @@
 #pragma once
 
 #include <cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh>
-#include <cuco/utility/cuda_thread_scope.cuh>
 
 #include <cuda/std/cstddef>
-#include <cuda/std/span>
 #include <cuda/stream_ref>
 
 namespace cuco {
 
-template <class T, cuda::thread_scope Scope = cuda::thread_scope_device>
+template <class T>
 class roaring_bitmap_ref {
-  using impl_type = detail::roaring_bitmap_impl<T, Scope>;
+  using impl_type = detail::roaring_bitmap_impl<T>;
 
  public:
-  using metadata_type                = typename impl_type::metadata_type;
-  static constexpr auto thread_scope = impl_type::thread_scope;
+  using metadata_type = typename impl_type::metadata_type;
 
-  // This is tricky as it is not clear if compressed_bitmap resides in host or device memory.
   __host__ __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap,
-                                         metadata_type const metadata,
-                                         cuda_thread_scope<Scope> scope = {});
+                                         metadata_type const& metadata);
 
-  __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap, cuda_thread_scope<Scope> scope = {});
+  template <typename U = T,
+            typename   = cuda::std::enable_if_t<cuda::std::is_same_v<U, cuda::std::uint32_t>>>
+  __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap);
 
   template <class InputIt, class OutputIt>
   __host__ void contains(InputIt first,
@@ -55,7 +52,9 @@ class roaring_bitmap_ref {
 
   [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept;
 
-  [[nodiscard]] __host__ __device__ cuda::std::span<cuda::std::byte const> data() const noexcept;
+  [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept;
+
+  [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept;
 
   [[nodiscard]] __host__ __device__ static metadata_type const read_metadata(
     cuda::std::byte const* bitmap) noexcept;

From 18acbed0c7b60ecad3f96303ab5d49f99a5ffb0c Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 8 Jul 2025 17:39:01 -0700
Subject: [PATCH 06/24] Add empty()

---
 .../detail/roaring_bitmap/roaring_bitmap.inl  | 42 +++++++++++--------
 .../roaring_bitmap/roaring_bitmap_ref.inl     |  7 ++++
 include/cuco/roaring_bitmap.cuh               | 35 +++++++++-------
 include/cuco/roaring_bitmap_ref.cuh           |  7 +++-
 4 files changed, 55 insertions(+), 36 deletions(-)

diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
index 9c36c2e90..fcc4fbd81 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 #pragma once
 
 #include <cuco/detail/error.hpp>
@@ -25,9 +26,9 @@
 namespace cuco {
 
 template <class T, class Allocator>
-__host__ roaring_bitmap<T, Allocator>::roaring_bitmap(cuda::std::byte const* bitmap,
-                                                      Allocator const& alloc,
-                                                      cuda::stream_ref stream)
+roaring_bitmap<T, Allocator>::roaring_bitmap(cuda::std::byte const* bitmap,
+                                             Allocator const& alloc,
+                                             cuda::stream_ref stream)
   : allocator_{alloc},
     metadata_{ref_type::read_metadata(bitmap)},
     data_{
@@ -42,52 +43,57 @@ __host__ roaring_bitmap<T, Allocator>::roaring_bitmap(cuda::std::byte const* bit
 
 template <class T, class Allocator>
 template <class InputIt, class OutputIt>
-__host__ void roaring_bitmap<T, Allocator>::contains(InputIt first,
-                                                     InputIt last,
-                                                     OutputIt output,
-                                                     cuda::stream_ref stream) const
+void roaring_bitmap<T, Allocator>::contains(InputIt first,
+                                            InputIt last,
+                                            OutputIt output,
+                                            cuda::stream_ref stream) const
 {
   ref_.contains(first, last, output, stream);
 }
 
 template <class T, class Allocator>
 template <class InputIt, class OutputIt>
-__host__ void roaring_bitmap<T, Allocator>::contains_async(InputIt first,
-                                                           InputIt last,
-                                                           OutputIt output,
-                                                           cuda::stream_ref stream) const noexcept
+void roaring_bitmap<T, Allocator>::contains_async(InputIt first,
+                                                  InputIt last,
+                                                  OutputIt output,
+                                                  cuda::stream_ref stream) const noexcept
 {
   ref_.contains_async(first, last, output, stream);
 }
 
 template <class T, class Allocator>
-__host__ cuda::std::size_t roaring_bitmap<T, Allocator>::size() const noexcept
+cuda::std::size_t roaring_bitmap<T, Allocator>::size() const noexcept
 {
   return ref_.size();
 }
 
 template <class T, class Allocator>
-__host__ cuda::std::byte const* roaring_bitmap<T, Allocator>::data() const noexcept
+bool roaring_bitmap<T, Allocator>::empty() const noexcept
+{
+  return ref_.empty();
+}
+
+template <class T, class Allocator>
+cuda::std::byte const* roaring_bitmap<T, Allocator>::data() const noexcept
 {
   return ref_.data();
 }
 
 template <class T, class Allocator>
-__host__ cuda::std::size_t roaring_bitmap<T, Allocator>::size_bytes() const noexcept
+cuda::std::size_t roaring_bitmap<T, Allocator>::size_bytes() const noexcept
 {
   return ref_.size_bytes();
 }
 
 template <class T, class Allocator>
-__host__ typename roaring_bitmap<T, Allocator>::allocator_type
-roaring_bitmap<T, Allocator>::allocator() const noexcept
+typename roaring_bitmap<T, Allocator>::allocator_type roaring_bitmap<T, Allocator>::allocator()
+  const noexcept
 {
   return allocator_;
 }
 
 template <class T, class Allocator>
-__host__ typename roaring_bitmap<T, Allocator>::ref_type roaring_bitmap<T, Allocator>::ref()
-  const noexcept
+typename roaring_bitmap<T, Allocator>::ref_type roaring_bitmap<T, Allocator>::ref() const noexcept
 {
   return ref_;
 }
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
index a4f252104..088e7e7b4 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 #pragma once
 
 #include <cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh>
@@ -69,6 +70,12 @@ __host__ __device__ cuda::std::size_t roaring_bitmap_ref<T>::size() const noexce
   return impl_.size();
 }
 
+template <class T>
+__host__ __device__ bool roaring_bitmap_ref<T>::empty() const noexcept
+{
+  return impl_.empty();
+}
+
 template <class T>
 __host__ __device__ cuda::std::byte const* roaring_bitmap_ref<T>::data() const noexcept
 {
diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh
index 3f3aa071a..d8269662b 100644
--- a/include/cuco/roaring_bitmap.cuh
+++ b/include/cuco/roaring_bitmap.cuh
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 #pragma once
 
 #include <cuco/detail/storage/storage_base.cuh>
@@ -33,9 +34,9 @@ class roaring_bitmap {
 
   using ref_type = roaring_bitmap_ref<T>;
 
-  __host__ roaring_bitmap(cuda::std::byte const* bitmap,
-                          Allocator const& alloc  = {},
-                          cuda::stream_ref stream = {});
+  roaring_bitmap(cuda::std::byte const* bitmap,
+                 Allocator const& alloc  = {},
+                 cuda::stream_ref stream = {});
 
   roaring_bitmap(roaring_bitmap const& other)            = default;
   roaring_bitmap(roaring_bitmap&& other)                 = default;
@@ -45,28 +46,30 @@ class roaring_bitmap {
   ~roaring_bitmap() = default;
 
   template <class InputIt, class OutputIt>
-  __host__ void contains(InputIt first,
-                         InputIt last,
-                         OutputIt contained,
-                         cuda::stream_ref stream = {}) const;
+  void contains(InputIt first,
+                InputIt last,
+                OutputIt contained,
+                cuda::stream_ref stream = {}) const;
 
   template <class InputIt, class OutputIt>
-  __host__ void contains_async(InputIt first,
-                               InputIt last,
-                               OutputIt contained,
-                               cuda::stream_ref stream = {}) const noexcept;
+  void contains_async(InputIt first,
+                      InputIt last,
+                      OutputIt contained,
+                      cuda::stream_ref stream = {}) const noexcept;
 
   // TODO contains_if, contains_if_async, empty
 
-  [[nodiscard]] __host__ cuda::std::size_t size() const noexcept;
+  [[nodiscard]] cuda::std::size_t size() const noexcept;
+
+  [[nodiscard]] bool empty() const noexcept;
 
-  [[nodiscard]] __host__ cuda::std::byte const* data() const noexcept;
+  [[nodiscard]] cuda::std::byte const* data() const noexcept;
 
-  [[nodiscard]] __host__ cuda::std::size_t size_bytes() const noexcept;
+  [[nodiscard]] cuda::std::size_t size_bytes() const noexcept;
 
-  [[nodiscard]] __host__ allocator_type allocator() const noexcept;
+  [[nodiscard]] allocator_type allocator() const noexcept;
 
-  [[nodiscard]] __host__ ref_type ref() const noexcept;
+  [[nodiscard]] ref_type ref() const noexcept;
 
  private:
   allocator_type allocator_;
diff --git a/include/cuco/roaring_bitmap_ref.cuh b/include/cuco/roaring_bitmap_ref.cuh
index 156877c0f..4e9939c05 100644
--- a/include/cuco/roaring_bitmap_ref.cuh
+++ b/include/cuco/roaring_bitmap_ref.cuh
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 #pragma once
 
 #include <cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh>
@@ -32,8 +33,8 @@ class roaring_bitmap_ref {
   __host__ __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap,
                                          metadata_type const& metadata);
 
-  template <typename U = T,
-            typename   = cuda::std::enable_if_t<cuda::std::is_same_v<U, cuda::std::uint32_t>>>
+  template <class U = T,
+            class   = cuda::std::enable_if_t<cuda::std::is_same_v<U, cuda::std::uint32_t>>>
   __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap);
 
   template <class InputIt, class OutputIt>
@@ -52,6 +53,8 @@ class roaring_bitmap_ref {
 
   [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept;
 
+  [[nodiscard]] __host__ __device__ bool empty() const noexcept;
+
   [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept;
 
   [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept;

From 64bf0f3b36216845bf17be3ac18d025184ee6fad Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 9 Jul 2025 16:05:01 -0700
Subject: [PATCH 07/24] Add storage class

---
 .../detail/roaring_bitmap/roaring_bitmap.inl  |  16 +-
 .../roaring_bitmap/roaring_bitmap_impl.cuh    | 190 +++---------------
 .../roaring_bitmap/roaring_bitmap_ref.inl     |  12 +-
 .../roaring_bitmap/roaring_bitmap_storage.cuh | 107 ++++++++++
 include/cuco/detail/roaring_bitmap/util.cuh   | 160 +++++++++++++++
 include/cuco/roaring_bitmap.cuh               |  14 +-
 include/cuco/roaring_bitmap_ref.cuh           |   8 +-
 7 files changed, 309 insertions(+), 198 deletions(-)
 create mode 100644 include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh
 create mode 100644 include/cuco/detail/roaring_bitmap/util.cuh

diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
index fcc4fbd81..964f74f45 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
@@ -16,11 +16,7 @@
 
 #pragma once
 
-#include <cuco/detail/error.hpp>
-#include <cuco/detail/storage/storage_base.cuh>
-
 #include <cuda/std/cstddef>
-#include <cuda/std/type_traits>
 #include <cuda/stream_ref>
 
 namespace cuco {
@@ -29,16 +25,8 @@ template <class T, class Allocator>
 roaring_bitmap<T, Allocator>::roaring_bitmap(cuda::std::byte const* bitmap,
                                              Allocator const& alloc,
                                              cuda::stream_ref stream)
-  : allocator_{alloc},
-    metadata_{ref_type::read_metadata(bitmap)},
-    data_{
-      allocator_.allocate(metadata_.size_bytes),
-      detail::custom_deleter<cuda::std::size_t, allocator_type>{metadata_.size_bytes, allocator_}},
-    ref_{data_.get(), metadata_}
+  : storage_{bitmap, alloc, stream}, ref_{storage_.ref()}
 {
-  CUCO_CUDA_TRY(cudaMemcpyAsync(
-    data_.get(), bitmap, metadata_.size_bytes, cudaMemcpyHostToDevice, stream.get()));
-  // stream.wait();  // TODO check if this is necessary
 }
 
 template <class T, class Allocator>
@@ -89,7 +77,7 @@ template <class T, class Allocator>
 typename roaring_bitmap<T, Allocator>::allocator_type roaring_bitmap<T, Allocator>::allocator()
   const noexcept
 {
-  return allocator_;
+  return storage_.allocator();
 }
 
 template <class T, class Allocator>
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
index 2e527047e..29c70b343 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
@@ -17,6 +17,8 @@
 #pragma once
 
 #include <cuco/detail/error.hpp>
+#include <cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh>
+#include <cuco/detail/roaring_bitmap/util.cuh>
 #include <cuco/utility/traits.hpp>
 
 #include <cuda/std/cstddef>
@@ -28,30 +30,8 @@
 #include <thrust/fill.h>
 #include <thrust/transform.h>
 
-#include <nv/target>
-
 namespace cuco::detail {
 
-template <class T>
-struct roaring_bitmap_metadata {
-  static_assert(cuco::dependent_false<T>, "T must be either uint32_t or uint64_t");
-};
-
-template <>
-struct roaring_bitmap_metadata<cuda::std::uint32_t> {
-  cuda::std::size_t size_bytes           = 0;
-  cuda::std::size_t num_keys             = 0;
-  cuda::std::size_t run_container_bitmap = 0;
-  cuda::std::size_t key_cards            = 0;
-  cuda::std::size_t container_offsets    = 0;
-  cuda::std::int32_t num_containers      = 0;
-  bool has_run                           = false;
-  bool offsets_aligned                   = false;
-  bool valid                             = false;
-};
-
-// TODO implement roaring_bitmap_metadata<cuda::std::uint64_t>
-
 // primary template
 template <class T>
 class roaring_bitmap_impl {
@@ -60,39 +40,32 @@ class roaring_bitmap_impl {
 
 template <>
 class roaring_bitmap_impl<cuda::std::uint32_t> {
-  // Constants from the Roaring format spec
-  static constexpr cuda::std::uint32_t serial_cookie_no_runcontainer = 12346;
-  static constexpr cuda::std::uint32_t serial_cookie                 = 12347;
-  static constexpr cuda::std::uint32_t frozen_cookie                 = 13766;
-  static constexpr cuda::std::int32_t no_offset_threshold            = 4;
-  static constexpr cuda::std::uint32_t binary_search_threshold = 8;  // TODO determine optimal value
-
  public:
-  using metadata_type = roaring_bitmap_metadata<cuda::std::uint32_t>;
+  using storage_ref_type = roaring_bitmap_storage_ref<cuda::std::uint32_t>;
 
-  __host__ __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap,
-                                          metadata_type const& metadata)
-  {
-    NV_IF_TARGET(
-      NV_IS_HOST,
-      CUCO_EXPECTS(metadata.valid, "Invalid bitmap format");)  // TODO device error handling
+  static constexpr cuda::std::uint32_t binary_search_threshold = 8;  // TODO determine optimal value
 
-    if (metadata.valid) {
-      data_           = bitmap;
-      size_bytes_     = metadata.size_bytes;
-      size_           = metadata.num_keys;
-      num_containers_ = metadata.num_containers;
+  __host__ __device__ roaring_bitmap_impl(storage_ref_type const& storage_ref)
+  {
+    auto const& meta = storage_ref.metadata();
+    if (meta.valid) {
+      data_           = storage_ref.data();
+      size_bytes_     = meta.size_bytes;
+      size_           = meta.num_keys;
+      num_containers_ = meta.num_containers;
       run_container_bitmap_ =
-        reinterpret_cast<cuda::std::uint8_t const*>(bitmap + metadata.run_container_bitmap);
-      key_cards_ = reinterpret_cast<cuda::std::uint16_t const*>(bitmap + metadata.key_cards);
-      offsets_   = reinterpret_cast<cuda::std::byte const*>(bitmap + metadata.container_offsets);
-      offsets_aligned_ = metadata.offsets_aligned;
-      has_run_         = metadata.has_run;
+        reinterpret_cast<cuda::std::uint8_t const*>(storage_ref.data() + meta.run_container_bitmap);
+      key_cards_ =
+        reinterpret_cast<cuda::std::uint16_t const*>(storage_ref.data() + meta.key_cards);
+      offsets_ =
+        reinterpret_cast<cuda::std::byte const*>(storage_ref.data() + meta.container_offsets);
+      offsets_aligned_ = meta.offsets_aligned;
+      has_run_         = meta.has_run;
     }
   }
 
   __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap)
-    : roaring_bitmap_impl{bitmap, read_metadata(bitmap)}
+    : roaring_bitmap_impl{storage_ref_type{bitmap}}
   {
   }
 
@@ -128,18 +101,21 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
 
   __device__ bool contains(cuda::std::uint32_t value) const
   {
-    cuda::std::uint16_t upper = value >> 16;
-    cuda::std::uint16_t lower = value & 0xFFFF;
-
-    // Binary search on key_cards_ to find container with matching upper key
-    cuda::std::uint32_t left  = 0;
-    cuda::std::uint32_t right = num_containers_;
+    cuda::std::uint16_t const upper = value >> 16;
+    cuda::std::uint16_t const lower = value & 0xFFFF;
 
     if (num_containers_ < binary_search_threshold) {
+// linear search
+#pragma unroll
       for (cuda::std::uint32_t i = 0; i < num_containers_; i++) {
-        if (key_cards_[i * 2] == upper) { return this->contains_container(lower, i); }
+        cuda::std::uint16_t const key = key_cards_[i * 2];
+        if (key == upper) { return this->contains_container(lower, i); }
+        if (key > upper) { return false; }
       }
     } else {
+      // binary search
+      cuda::std::uint32_t left  = 0;
+      cuda::std::uint32_t right = num_containers_;
       while (left < right) {
         cuda::std::uint32_t mid     = left + (right - left) / 2;
         cuda::std::uint16_t mid_key = key_cards_[mid * 2];
@@ -167,96 +143,7 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
     return size_bytes_;
   }
 
-  __host__ __device__ static metadata_type const read_metadata(
-    cuda::std::byte const* bitmap) noexcept
-  {
-    cuda::std::byte const* buf = bitmap;
-    metadata_type metadata;
-
-    cuda::std::uint32_t cookie;
-    cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t));
-    buf += sizeof(cuda::std::uint32_t);
-    if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) {
-      metadata.valid = false;
-      return metadata;
-    }
-
-    if ((cookie & 0xFFFF) == serial_cookie)
-      metadata.num_containers = (cookie >> 16) + 1;
-    else {
-      cuda::std::memcpy(&metadata.num_containers, buf, sizeof(cuda::std::uint32_t));
-      buf += sizeof(cuda::std::uint32_t);
-    }
-    if (metadata.num_containers < 0) {
-      metadata.valid = false;
-      return metadata;
-    }
-    if (metadata.num_containers > (1 << 16)) {
-      metadata.valid = false;
-      return metadata;
-    }
-
-    metadata.has_run = (cookie & 0xFFFF) == serial_cookie;
-    if (metadata.has_run) {
-      metadata.valid = false;
-      return metadata;  // TODO run container bitmap is not supported yet
-      cuda::std::size_t s           = (metadata.num_containers + 7) / 8;
-      metadata.run_container_bitmap = cuda::std::distance(bitmap, buf);
-      buf += s;
-    }
-
-    metadata.key_cards = cuda::std::distance(bitmap, buf);
-    buf += metadata.num_containers * 2 * sizeof(cuda::std::uint16_t);
-
-    if ((!metadata.has_run) || (metadata.num_containers >= no_offset_threshold)) {
-      metadata.container_offsets = cuda::std::distance(bitmap, buf);
-      metadata.offsets_aligned =
-        (reinterpret_cast<cuda::std::uintptr_t>(bitmap + metadata.container_offsets) %
-         sizeof(cuda::std::uint32_t)) == 0;
-      buf += metadata.num_containers * 4;
-    }
-
-    metadata.num_keys = 0;
-    cuda::std::uint16_t const* key_cards =
-      reinterpret_cast<cuda::std::uint16_t const*>(bitmap + metadata.key_cards);
-    cuda::std::uint32_t card = 0;
-    for (cuda::std::int32_t i = 0; i < metadata.num_containers; i++) {
-      // cuda::std::uint16_t key  = key_cards[i * 2];
-      card = key_cards[i * 2 + 1] + 1;
-      metadata.num_keys += card;
-    }
-
-    // find end of roaring bitmap
-    cuda::std::byte const* end = bitmap + container_offset(bitmap + metadata.container_offsets,
-                                                           metadata.offsets_aligned,
-                                                           metadata.num_containers - 1);
-    if (is_run_container(
-          reinterpret_cast<cuda::std::uint8_t const*>(bitmap + metadata.run_container_bitmap),
-          metadata.has_run,
-          metadata.num_containers - 1)) {
-      // TODO implement
-    } else {
-      if (card <= 4096) {  // TODO check if this is correct
-        end += card * sizeof(cuda::std::uint16_t);
-      } else {
-        end += 8192;  // fixed size bitset container
-      }
-    }
-
-    metadata.size_bytes = static_cast<cuda::std::size_t>(cuda::std::distance(bitmap, end));
-    metadata.valid      = true;
-    return metadata;
-  }
-
  private:
-  __host__ __device__ static bool is_run_container(cuda::std::uint8_t const* run_container_bitmap,
-                                                   bool has_run,
-                                                   cuda::std::int32_t i)
-  {
-    if (not has_run) return false;
-    return run_container_bitmap[i / 8] & (1 << (i % 8));
-  }
-
   __device__ bool contains_container(cuda::std::uint16_t lower, cuda::std::uint32_t index) const
   {
     cuda::std::uint32_t card             = key_cards_[index * 2 + 1] + 1;
@@ -313,25 +200,10 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
                                          cuda::std::uint16_t lower,
                                          cuda::std::uint32_t card) const
   {
-    // TODO implement
+    // TODO implement linear search
     return false;
   }
 
-  __host__ __device__ static cuda::std::uint32_t container_offset(cuda::std::byte const* offsets,
-                                                                  bool offsets_aligned,
-                                                                  cuda::std::int32_t i)
-  {
-    cuda::std::uint32_t offset = 0;
-    if (offsets_aligned) {
-      offset =
-        *reinterpret_cast<cuda::std::uint32_t const*>(offsets + i * sizeof(cuda::std::uint32_t));
-    } else {
-      cuda::std::memcpy(
-        &offset, offsets + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t));
-    }
-    return offset;
-  }
-
   cuda::std::byte const* data_;
   cuda::std::size_t size_bytes_;
   cuda::std::size_t size_;
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
index 088e7e7b4..9536bb79f 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
@@ -25,9 +25,8 @@
 namespace cuco {
 
 template <class T>
-__host__ __device__ roaring_bitmap_ref<T>::roaring_bitmap_ref(cuda::std::byte const* bitmap,
-                                                              metadata_type const& metadata)
-  : impl_{bitmap, metadata}
+__host__ __device__ roaring_bitmap_ref<T>::roaring_bitmap_ref(storage_ref_type const& storage_ref)
+  : impl_{storage_ref}
 {
 }
 
@@ -88,11 +87,4 @@ __host__ __device__ cuda::std::size_t roaring_bitmap_ref<T>::size_bytes() const
   return impl_.size_bytes();
 }
 
-template <class T>
-__host__ __device__ typename roaring_bitmap_ref<T>::metadata_type const
-roaring_bitmap_ref<T>::read_metadata(cuda::std::byte const* bitmap) noexcept
-{
-  return impl_type::read_metadata(bitmap);
-}
-
 }  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh
new file mode 100644
index 000000000..49805afb8
--- /dev/null
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/detail/error.hpp>
+#include <cuco/detail/roaring_bitmap/util.cuh>
+#include <cuco/detail/storage/storage_base.cuh>
+#include <cuco/utility/traits.hpp>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/cstdint>
+#include <cuda/stream_ref>
+
+#include <memory>
+#include <nv/target>
+
+namespace cuco::detail {
+
+/**
+ * @brief Non-owning view of a roaring bitmap byte buffer plus its metadata.
+ *
+ * The primary template exists only to reject unsupported key types.
+ *
+ * @tparam T Key type of the bitmap
+ */
+template <class T>
+class roaring_bitmap_storage_ref {
+  static_assert(cuco::dependent_false<T>, "T must be either uint32_t or uint64_t");
+};
+
+/**
+ * @brief Storage reference for bitmaps with 32-bit keys.
+ *
+ * Keeps the pointer it is given (no ownership is taken) and a copy of the metadata.
+ */
+template <>
+class roaring_bitmap_storage_ref<cuda::std::uint32_t> {
+ public:
+  using metadata_type = roaring_bitmap_metadata<cuda::std::uint32_t>;
+
+  /**
+   * @brief Wraps `bitmap` together with already computed metadata.
+   *
+   * @param bitmap Pointer to the first byte of the bitmap
+   * @param metadata Metadata describing `bitmap`; copied into the reference
+   */
+  __host__ __device__ roaring_bitmap_storage_ref(cuda::std::byte const* bitmap,
+                                                 metadata_type const& metadata)
+    : data_{bitmap}, metadata_{metadata}
+  {
+  }
+
+  /**
+   * @brief Wraps `bitmap` and constructs the metadata from it (device only).
+   *
+   * @param bitmap Pointer to the first byte of the bitmap
+   */
+  __device__ roaring_bitmap_storage_ref(cuda::std::byte const* bitmap)
+    : data_{bitmap}, metadata_{metadata_type{bitmap}}
+  {
+  }
+
+  /// @brief Returns the metadata held by this reference
+  __host__ __device__ metadata_type const& metadata() const noexcept { return metadata_; }
+
+  /// @brief Returns the wrapped bitmap pointer
+  __host__ __device__ cuda::std::byte const* data() const noexcept { return data_; }
+
+ private:
+  cuda::std::byte const* data_;  ///< Bitmap bytes (not owned)
+  metadata_type metadata_;       ///< Metadata describing `data_`
+};
+
+/**
+ * @brief Owning storage holding a copy of a roaring bitmap allocated via `Allocator`.
+ *
+ * The primary template exists only to reject unsupported key types.
+ *
+ * @tparam T Key type of the bitmap
+ * @tparam Allocator Allocator type used for the copy
+ */
+template <class T, class Allocator>
+class roaring_bitmap_storage {
+  static_assert(cuco::dependent_false<T>, "T must be either uint32_t or uint64_t");
+};
+
+/**
+ * @brief Owning storage for bitmaps with 32-bit keys.
+ *
+ * @tparam Allocator Allocator type; rebound to `cuda::std::byte` for the allocation
+ */
+template <class Allocator>
+class roaring_bitmap_storage<cuda::std::uint32_t, Allocator> {
+ public:
+  using allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<cuda::std::byte>;
+  using ref_type = roaring_bitmap_storage_ref<cuda::std::uint32_t>;
+
+  // NOTE(review): `data_` is a `std::unique_ptr`, so the defaulted copy operations below end up
+  // implicitly deleted; only the move operations are usable.
+  roaring_bitmap_storage(roaring_bitmap_storage const& other)            = default;
+  roaring_bitmap_storage(roaring_bitmap_storage&& other)                 = default;
+  roaring_bitmap_storage& operator=(roaring_bitmap_storage const& other) = default;
+  roaring_bitmap_storage& operator=(roaring_bitmap_storage&& other)      = default;
+
+  ~roaring_bitmap_storage() = default;
+
+  /**
+   * @brief Builds the metadata from `bitmap`, allocates `metadata_.size_bytes` bytes and
+   * enqueues a copy of the bitmap into that allocation.
+   *
+   * The copy uses `cudaMemcpyAsync` with `cudaMemcpyHostToDevice` and is not waited on here.
+   * Invalid metadata or a failing CUDA call is reported via `CUCO_EXPECTS`/`CUCO_CUDA_TRY`.
+   *
+   * @param bitmap Pointer to the bitmap; used as the host-side source of the copy
+   * @param alloc Allocator used to allocate the copy
+   * @param stream Stream on which the copy is enqueued
+   */
+  roaring_bitmap_storage(cuda::std::byte const* bitmap,
+                         Allocator const& alloc,
+                         cuda::stream_ref stream)
+    : allocator_{alloc},
+      metadata_{bitmap},
+      data_{allocator_.allocate(metadata_.size_bytes),
+            detail::custom_deleter<cuda::std::size_t, allocator_type>{metadata_.size_bytes,
+                                                                      allocator_}},
+      ref_{data_.get(), metadata_}
+  {
+    // Fail loudly on malformed input: roaring_bitmap_impl only fills in its members when the
+    // metadata is valid, so an invalid bitmap must not reach the lookup code.
+    CUCO_EXPECTS(metadata_.valid, "Invalid bitmap format");
+    CUCO_CUDA_TRY(cudaMemcpyAsync(
+      data_.get(), bitmap, metadata_.size_bytes, cudaMemcpyHostToDevice, stream.get()));
+    // stream.wait();  // TODO check if this is necessary
+  }
+
+  /// @brief Returns a non-owning reference to the stored bitmap
+  ref_type ref() const noexcept { return ref_; }
+
+  /// @brief Returns a copy of the allocator passed at construction
+  allocator_type allocator() const noexcept { return allocator_; }
+
+ private:
+  // Declaration order matters: the constructor's initializers read earlier members.
+  allocator_type allocator_;                   ///< Allocator used for `data_`
+  typename ref_type::metadata_type metadata_;  ///< Metadata built from the input bitmap
+  std::unique_ptr<cuda::std::byte, custom_deleter<cuda::std::size_t, allocator_type>> data_;
+  ref_type ref_;  ///< View over `data_` and `metadata_`
+};
+
+// TODO implement the cuda::std::uint64_t specializations of the storage classes above
+
+}  // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/util.cuh b/include/cuco/detail/roaring_bitmap/util.cuh
new file mode 100644
index 000000000..a9510800b
--- /dev/null
+++ b/include/cuco/detail/roaring_bitmap/util.cuh
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/utility/traits.hpp>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/cstdint>
+#include <cuda/std/iterator>
+
+#include <nv/target>
+
+namespace cuco::detail {
+
+// Returns the i-th 32-bit entry of the offset table at `offsets`. Declared `inline` because this
+// header-defined function would otherwise be multiply defined across translation units.
+__host__ __device__ inline cuda::std::uint32_t container_offset(cuda::std::byte const* offsets,
+                                                                bool offsets_aligned,
+                                                                cuda::std::int32_t i)
+{
+  cuda::std::byte const* entry = offsets + i * sizeof(cuda::std::uint32_t);
+  // Aligned tables can be read with a plain 32-bit load.
+  if (offsets_aligned) { return *reinterpret_cast<cuda::std::uint32_t const*>(entry); }
+  // Otherwise copy bytewise to avoid a misaligned access.
+  cuda::std::uint32_t offset = 0;
+  cuda::std::memcpy(&offset, entry, sizeof(cuda::std::uint32_t));
+  return offset;
+}
+
+// True iff bit `i` of the run bitset is set; always false without runs. `inline` for ODR safety.
+__host__ __device__ inline bool is_run_container(cuda::std::uint8_t const* run_container_bitmap,
+                                                 bool has_run,
+                                                 cuda::std::int32_t i)
+{
+  return has_run and (run_container_bitmap[i / 8] & (1 << (i % 8))) != 0;  // bit i%8 of byte i/8
+}
+
+// Primary template: rejects unsupported key types at compile time.
+template <class T>
+struct roaring_bitmap_metadata {
+  static_assert(cuco::dependent_false<T>, "T must be either uint32_t or uint64_t");
+};
+
+// Layout information parsed from the header of a serialized 32-bit roaring bitmap.
+template <>
+struct roaring_bitmap_metadata<cuda::std::uint32_t> {
+  cuda::std::size_t size_bytes           = 0;      // total size of the serialized bitmap in bytes
+  cuda::std::size_t num_keys             = 0;      // sum of all container cardinalities
+  cuda::std::size_t run_container_bitmap = 0;      // byte offset of run bitset (not set yet)
+  cuda::std::size_t key_cards            = 0;      // byte offset of the (key, cardinality-1) pairs
+  cuda::std::size_t container_offsets    = 0;      // byte offset of the container offset table
+  cuda::std::int32_t num_containers      = 0;      // number of containers
+  bool has_run                           = false;  // cookie announces run containers
+  bool offsets_aligned                   = false;  // offset table address is 4-byte aligned
+  bool valid                             = false;  // set once parsing succeeded
+
+  // Parses the header at `bitmap` (serialized roaring bitmap without run containers).
+  // On unsupported or malformed input `valid` stays false and host code invokes CUCO_FAIL.
+  // NOTE(review): the buffer length is not known here, so reads are not bounds-checked.
+  __host__ __device__ roaring_bitmap_metadata(cuda::std::byte const* bitmap)
+  {
+    constexpr cuda::std::uint32_t serial_cookie_no_runcontainer = 12346;
+    constexpr cuda::std::uint32_t serial_cookie                 = 12347;
+    // constexpr cuda::std::uint32_t frozen_cookie                 = 13766;
+    constexpr cuda::std::int32_t no_offset_threshold = 4;
+
+    cuda::std::byte const* buf = bitmap;
+
+    cuda::std::uint32_t cookie;
+    cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t));
+    buf += sizeof(cuda::std::uint32_t);
+    if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) {
+      NV_IF_TARGET(NV_IS_HOST,
+                   CUCO_FAIL("Invalid bitmap format");)  // TODO device error handling
+      return;
+    }
+
+    // With run containers the count lives in the cookie's upper half, otherwise in its own word.
+    has_run = (cookie & 0xFFFF) == serial_cookie;
+    if (has_run) {
+      num_containers = (cookie >> 16) + 1;
+    } else {
+      cuda::std::memcpy(&num_containers, buf, sizeof(cuda::std::uint32_t));
+      buf += sizeof(cuda::std::uint32_t);
+    }
+    if (num_containers < 0 || num_containers > (1 << 16)) {
+      NV_IF_TARGET(NV_IS_HOST,
+                   CUCO_FAIL("Invalid bitmap format");)  // TODO device error handling
+      return;
+    }
+    if (has_run) {
+      // TODO run containers are not supported yet. Once they are, record the offset of the run
+      // bitset ((num_containers + 7) / 8 bytes) in `run_container_bitmap` and skip over it here.
+      NV_IF_TARGET(NV_IS_HOST,
+                   CUCO_FAIL("Invalid bitmap format");)  // TODO device error handling
+      return;
+    }
+
+    key_cards = cuda::std::distance(bitmap, buf);
+    buf += num_containers * 2 * sizeof(cuda::std::uint16_t);
+
+    if ((!has_run) || (num_containers >= no_offset_threshold)) {
+      container_offsets = cuda::std::distance(bitmap, buf);
+      offsets_aligned   = (reinterpret_cast<cuda::std::uintptr_t>(bitmap + container_offsets) %
+                         sizeof(cuda::std::uint32_t)) == 0;
+      buf += num_containers * 4;
+    }
+
+    // An empty bitmap is just the header; there is no last container whose offset could be read.
+    if (num_containers == 0) {
+      size_bytes = static_cast<cuda::std::size_t>(cuda::std::distance(bitmap, buf));
+      valid      = true;
+      return;
+    }
+
+    // Sum up the cardinalities (stored as cardinality - 1). The values are read via memcpy since
+    // `bitmap` is not guaranteed to be 2-byte aligned.
+    num_keys                 = 0;
+    cuda::std::uint32_t card = 0;
+    for (cuda::std::int32_t i = 0; i < num_containers; i++) {
+      cuda::std::uint16_t card_minus_one = 0;
+      cuda::std::memcpy(&card_minus_one,
+                        bitmap + key_cards + (2 * i + 1) * sizeof(cuda::std::uint16_t),
+                        sizeof(cuda::std::uint16_t));
+      card = static_cast<cuda::std::uint32_t>(card_minus_one) + 1;
+      num_keys += card;
+    }
+
+    // find end of roaring bitmap: start of the last container plus its size
+    cuda::std::byte const* end =
+      bitmap + container_offset(bitmap + container_offsets, offsets_aligned, num_containers - 1);
+    // Runs were rejected above, so the last container is an array (<= 4096 values) or a bitset.
+    if (card <= 4096) {
+      end += card * sizeof(cuda::std::uint16_t);
+    } else {
+      end += 8192;  // fixed size bitset container
+    }
+
+    size_bytes = static_cast<cuda::std::size_t>(cuda::std::distance(bitmap, end));
+    valid      = true;
+  }
+};
+
+// TODO implement roaring_bitmap_metadata<cuda::std::uint64_t>
+
+}  // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh
index d8269662b..69ad93ae8 100644
--- a/include/cuco/roaring_bitmap.cuh
+++ b/include/cuco/roaring_bitmap.cuh
@@ -16,23 +16,21 @@
 
 #pragma once
 
-#include <cuco/detail/storage/storage_base.cuh>
+#include <cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh>
 #include <cuco/roaring_bitmap_ref.cuh>
 #include <cuco/utility/allocator.hpp>
 
 #include <cuda/std/cstddef>
 #include <cuda/stream_ref>
 
-#include <memory>
-
 namespace cuco {
 
 template <class T, class Allocator = cuco::cuda_allocator<cuda::std::byte>>
 class roaring_bitmap {
  public:
-  using allocator_type = Allocator;
-
-  using ref_type = roaring_bitmap_ref<T>;
+  using storage_type   = detail::roaring_bitmap_storage<T, Allocator>;
+  using allocator_type = typename storage_type::allocator_type;
+  using ref_type       = roaring_bitmap_ref<T>;
 
   roaring_bitmap(cuda::std::byte const* bitmap,
                  Allocator const& alloc  = {},
@@ -72,9 +70,7 @@ class roaring_bitmap {
   [[nodiscard]] ref_type ref() const noexcept;
 
  private:
-  allocator_type allocator_;
-  typename ref_type::metadata_type metadata_;
-  std::unique_ptr<cuda::std::byte, detail::custom_deleter<cuda::std::size_t, allocator_type>> data_;
+  storage_type storage_;
   ref_type ref_;
 };
 
diff --git a/include/cuco/roaring_bitmap_ref.cuh b/include/cuco/roaring_bitmap_ref.cuh
index 4e9939c05..41994099f 100644
--- a/include/cuco/roaring_bitmap_ref.cuh
+++ b/include/cuco/roaring_bitmap_ref.cuh
@@ -28,10 +28,9 @@ class roaring_bitmap_ref {
   using impl_type = detail::roaring_bitmap_impl<T>;
 
  public:
-  using metadata_type = typename impl_type::metadata_type;
+  using storage_ref_type = typename impl_type::storage_ref_type;
 
-  __host__ __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap,
-                                         metadata_type const& metadata);
+  __host__ __device__ roaring_bitmap_ref(storage_ref_type const& storage_ref);
 
   template <class U = T,
             class   = cuda::std::enable_if_t<cuda::std::is_same_v<U, cuda::std::uint32_t>>>
@@ -59,9 +58,6 @@ class roaring_bitmap_ref {
 
   [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept;
 
-  [[nodiscard]] __host__ __device__ static metadata_type const read_metadata(
-    cuda::std::byte const* bitmap) noexcept;
-
  private:
   impl_type impl_;
 };

From 26e23da628d1741a9820e319d046d2a3b98c83f3 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 9 Jul 2025 17:36:49 -0700
Subject: [PATCH 08/24] Improve member order to reduce struct size

---
 .../cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
index 29c70b343..74839b1b5 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
@@ -49,16 +49,16 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
   {
     auto const& meta = storage_ref.metadata();
     if (meta.valid) {
-      data_           = storage_ref.data();
-      size_bytes_     = meta.size_bytes;
-      size_           = meta.num_keys;
-      num_containers_ = meta.num_containers;
+      data_       = storage_ref.data();
+      size_bytes_ = meta.size_bytes;
+      size_       = meta.num_keys;
       run_container_bitmap_ =
         reinterpret_cast<cuda::std::uint8_t const*>(storage_ref.data() + meta.run_container_bitmap);
       key_cards_ =
         reinterpret_cast<cuda::std::uint16_t const*>(storage_ref.data() + meta.key_cards);
       offsets_ =
         reinterpret_cast<cuda::std::byte const*>(storage_ref.data() + meta.container_offsets);
+      num_containers_  = meta.num_containers;
       offsets_aligned_ = meta.offsets_aligned;
       has_run_         = meta.has_run;
     }
@@ -207,10 +207,10 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
   cuda::std::byte const* data_;
   cuda::std::size_t size_bytes_;
   cuda::std::size_t size_;
-  cuda::std::int32_t num_containers_;
   cuda::std::uint8_t const* run_container_bitmap_;
   cuda::std::uint16_t const* key_cards_;  // TODO uint8?
   cuda::std::byte const* offsets_;
+  cuda::std::int32_t num_containers_;
   bool offsets_aligned_;
   bool has_run_;
 };

From 90b6fc56bc492a18858575aeb11d651c780f9a32 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 13 Aug 2025 10:37:26 -0700
Subject: [PATCH 09/24] 64-bit roaring bitmap

---
 examples/roaring_bitmap/bitmapwithruns.bin    | Bin 0 -> 48056 bytes
 examples/roaring_bitmap/host_bulk_example.cu  | 144 ++++++---
 examples/roaring_bitmap/portable_bitmap64.bin | Bin 0 -> 16506 bytes
 .../detail/roaring_bitmap/roaring_bitmap.inl  |  16 +-
 .../roaring_bitmap/roaring_bitmap_impl.cuh    | 273 ++++++++++++++----
 .../roaring_bitmap/roaring_bitmap_storage.cuh | 133 ++++++++-
 include/cuco/detail/roaring_bitmap/util.cuh   | 182 ++++++++----
 include/cuco/roaring_bitmap.cuh               | 110 ++++++-
 include/cuco/roaring_bitmap_ref.cuh           |  87 +++++-
 9 files changed, 749 insertions(+), 196 deletions(-)
 create mode 100644 examples/roaring_bitmap/bitmapwithruns.bin
 create mode 100644 examples/roaring_bitmap/portable_bitmap64.bin

diff --git a/examples/roaring_bitmap/bitmapwithruns.bin b/examples/roaring_bitmap/bitmapwithruns.bin
new file mode 100644
index 0000000000000000000000000000000000000000..5ed243753e169295a32d6251db66180f23ceac06
GIT binary patch
literal 48056
zcmeIuQyb)3w5Z{vVpUYBB$bL&skUv~wr$(CZQHhO+qPG)cJFo0f7tsf^BLb5*YnMt
zfdl{y015&iU;{8fdI(UZDhgDsiv4%-|000=3<=DyNCAKU(!k#=^#A~P^j|*sU-o|$
z{I7=pHSoU{0>HA}0I=r2HvJa>ynrrXJE#Th2(g1)Ln~mWaJl5SR1eaf$h0Bbj9dfq
zwJ21fSdLOwIlR(`syAvLsJo)!gr+@OHfUR-V}`CuPoVFM0T+hs7_nf?hzT90)R<9V
zPKpKGl5fi&tUj@R!{!;=2kdUKzrx`h$F0-&`45+FTsv@U#k~oSdOU0Js>HhtpOmk_
zj~Bl$fIE-{#0aJXUqdROrqDV2oA)5ai8LEB%*ZkzM~gfa3gjqamB7n<D1W2!f$A%2
zPpIFcaf9Y1T4!jlbOyS==yjprjzRzLC&3E@0)apv5C{YUfj}S-2m}IwKp+qZ1OkCT
zAP@)y{x<|lPV#~Y1OkCTAP@)y0)apv5C{YUfj}S-2m}IwKp+qZ1OkCTAP@)y0)apv
z5C{YUfj}S-2m}IwKp+qZ1OkCTAP@)y0)apv5C{YUfj}S-2m}IwKp+qZ1OkCTAP@)y
z0)apv5C{YUfj}S-2m}IwKp+qZ1OkCTAP@)y0)apv5C{YUfj}S-2m}IwKp+qZ1OkCT
zAP@)y0)apv5C{YUfj}S-2m}IwKp+qZ1OkCTAP@)y0)apv5C{YUfj}S-2m}IwKp+qZ
z1OkCTAP@)y0)apv5C{YUfj}S-2m}IwKp+qZ1OkCTAP@)y0)apv5C{YUfk5E@OaK5N
z00jtu23SA<BtQW)zyNH(1Cl^0AT5vq$P8oyasqjP{6Ha~C{O|@4U_{a0#$(OKrNsy
z&;V!*Gy_@!ZGiSbC!j0P1LzI(0|o*^fZ@O>U@R~Jm<&t<W&(47`M@GzDX;=q4Xgt;
z0$YIXz%F1fZ~!<A90N`QXMpp-CEzM>1Go*`10DiTfaky~;4Sb0_zZjnz5~C2zd#5?
zKpdpNFvx=uPzGb54qBiK`d~^h4VWIx1ZD+ufVshZU_r15SR5<`mIW(-mBDIYO|TAF
zA8Z6R1zUiv!FFIrunX87>;?7(2Y`dYVc<w`3^*Q~1WpBKfV07Q;6iW-xEx#st_3%M
zo55}1PH+#nA3Ove1y6vd!E@k6@CtYxyanC`AApa+XW&cl4fr1X1pWhl1Al^lKrjS{
zun-wyLR?4;MMG*R9x_8tC=p5#N*ziU${5NL${xxU${Q*WDjX^nDj6yhDj%v8sv4>h
zsvW8qY8Yw~Y94A8Y8&bh>Ky78>KW=2>K_^u8X6iA8XX!Jni!fAnjV@Jnj2aWS{zyy
zS{YgsS|8dJ+8Wvs+8x>#Iv6?<IvzR|IvctWx*WO|x*56?x*vKJdK!8WdL4Qf`WX5W
z`Zx3=^g9GV5QIVmL_;hjKoX=t8e~8=<UvU&6_ggr0A+@<K{=s3P=2ToR1_)!m4?be
z6`?9nb*L6p7is`ChMGYwp*B!^s1wu`>H+nJ`auJsA<%GW6f_o^08NIbK{KH_(0phS
zv=mwat%lY?8=)=Gc4!y07dikPhK@lep)=5V=n`}lx&hsW?m-WsC(v`~74#PR0DXqO
zLf@fZ&|fG7BQOqAa2V#{2rR=fScfgxg?%_BoCZ!0XM(fBIpExIKDZ!U1TGGjg3H1c
z;L30{xF%c&t`9eYo5C&N)^IzxBise<4)=ol!UN#J@Gy8JJO&;QPlBhyGvL|qJa{3z
z1YQoWg4e<u;LY$hcqhCE-VYywkHRP5)9^X?B76nD4&Q?B!Vlob@H6-&{04pxe}eyk
zzrjD@KQM^E2!@adgK&t5L=hE<BPQY?2_yxQ8cBy_M6w{+kz7b#qySPFDTb6p${^*D
zN=Q|t22vZThcrZ*AkC3hNL!=>(i!Q7^hEj~{gFY)P-Fx$8X1R7M5Z9qky*%GWC5}m
zS%$1c)*$PVO~_Vc2eKR4ha5zXAjgqY$XVn9av8aX+(hmm_mM})Q{)Bm8hM9&M7|*Z
zB0rGd2!KK;iV`S|vZ#PcsDf&!f!e5tCec)AS~LTi8O?^~MDw8e(L!iZv;<ljEr(V_
ztDx1<T4-Ig0ooXChPFi8pzYC4Xjilc+8gbM4n&8b!_iUbSabq98J&jCMCYLM(M9M|
zbOpK^U59Q&x1ih6UFcr)0D2fbhMq*vpy$y`=vDLvdK<lmK183O&(T-tTl53^8U2cW
zM}MJz(GZ4UI7VS%jK?CFjKwe=voII)v6NUEEIpP9%ZlZ|a%1_hf>;r(I93WPi&elX
zW7V*lSRJfB)(C5gwZK|q?XZqm7pyzh3+sywzy@Q(u#wmpY&<pzn~KfAW@GcPh1e2o
zIkpO0i*3L*W81Ku*dA;@b_hF)oxo0G=dg>|73?~83%iRwz#e1Iu$R~y>^=4g`v?1m
z{lxxYAP(agPT~yC;UXTzRXmQHxPvF~6nJVp9i9=-f@jBb;d$`_cwxL4UJ@^Zm&YsN
zRq+~lZM+`d5O0Dv$6MiT@eX)ryc^yV?}PWp2jN5U5%_3)96k}Bf=|b1;dAi?_+oq+
zz7k)9ug5pxTk#$EZhRkp5I=$+$4}vB@eBB6{2G1}zk}b$AK_2&7x-)Z9sUvjg8z&E
zz<=Wa0TC!c5H!IO0wEC!p%DgQ6CRNyQW0s13`Axk8<CU9L*yq45k-j-L}{WNQIV)Z
zR3~Z?b%_Q<W1<<+l4wJ;Cpr;bi5^66q8~Ak7(xsuMiFC)3B+V#8ZncYL(C@@5le{`
z#A;$4v60w9Y$tXRdx-<YVd5BZk~l-0CoU0Ji5tXi;vVskctSiUUJ-AJ55#BUEAgH9
zMf@c~BtqgOMTSY9jF2)JBX!auUD790l4;2FWF|5znS;zt<|7M|Mabf0DY7hCfvikc
zBWsd%$oga>vMJeuY)!T!JCa?<?qn~rFFAl5Ob#PQl4Hp6<Ro$`IfI-{&LbC+OUUKq
zDsnBkf!s`PBX^Q}$o=FY@+f(NJWZY>FOpZt>*Ou+F8P3bOg<xDl5fcO<R|hU@*DY+
z{6m5iOkosBF%(CMRFqPwIAu}}m7r2ksi|~SMk))HoytY!r3z4msbW+~sti@0szg<#
zYEZSQdQ?NI3DulxMYW|mP@So6R8Oi8)t?$f4W&j<qp5M!L~05(otj0>r4~?&sb$nk
zY7MoX+C*)oc2K*iebhnf2z8t~MV+NCP?xD|)J^IRb)R}fJ*8eyuc>#`N9qgpFZF}^
zO#w7SqclO&G)oJ#L@TsL8?;S(bdpX*r=>H{ndxkFPC5^rpDsifrAyGI>2h>Mx(Z#L
zu0_|S8_<pEW^_xs4c(sZM0cfo(7ow?^gwzDJ)9mzkEJKjlj&*nOnMGIpI$^SrB~3a
z>2>r*dJDas-bL@F5739{WAsV-41J!yL|>(E(6{M(^h5dy{hWS9zokFWpXsmkclsCo
zmku!qgEJHpW_TvT$V`mU8H;flpGnE2VbU|1n5;|=CO4ChDaaIIiZi8{vP=c0GE<GI
z$<$%$GmV(0Obezp(~jxLbYZ$Py_mkt0A?^Vj2X#{Va79)n5oPRW;QdAS;#D5mNToE
zwaf-)Gqa7^$?ReFGl!U?%n9Z+bB?*lTw$&=x0t)k1LiUFjCsktVcs*Jn17gW%unVI
z1BT%+7AC_?m<x;HXjl!$!)DkCC&DShsl(~Q8N*q^*~7WQdBX+5g~P?dCBtRH<-?W2
zRl_yHwZrwo4Z}^s&BLw2ZNnYHox|P2J;Qy%{lkO8L&GD&qr>CE6T?%&)5EjEbHfY5
zi^I#pE5mET>%*JETf;lTyTkj!2g66g$HS+>XTul5m&4b>H^X<r_rs6EPs1<5ufy-c
zAH!e5|Av2re}@4UVo{c0X_jRLR$>)aV-413JvPavV$-r2*vxD;HYb~h&CeEMi?Suy
z(rh`lB3p&6&eme<vJKeAY%{hc+lFn=c4E7-J=orCKXxEHgdNU~V#l%**vafPb|yQA
zozE^}m$EC^)$BTUBfEv&&hBFOvIp41>@oHvdxkyFUShAZH`v?kJ@z5{gniDwV&AeK
z*w5@&_B;EF{mX_pgu^+C3v)ad;bbnx>72#6oX@4?(s1dyOk7qj2bY`6#}(v?aK*V&
zTv@IHSDCBE)#U1M^|?k|Q?3Qqnrp{(<hpR(xn5jfZU8r!8^(>~#&F}gN!(O!1~;3V
z$1UWRaLc(>+*)n}x0&0<?d0}w`?*8hQSJnHnmfl`<gRenxm(;_?g96hd&a%w-f-`^
zPuxG;H|{6*hXZ+-$9R%wc#aqOD6jHy-sBxV!KdI;^Xd4Ed=@@CpNr4S7vKx?#rTqZ
z8NNJUiLc7n;A`{s_=bEFzB%8DZ_9V!JM-Q6o_rs^KR<{c%8%ej^W*r5{1kpVKZ~Es
zFW?vR%lMW28h$;$iQmfa;CJ);_=Efr{y2Y%Kg(a>FZ0*<oBSRAKL3b+%D>=W^Y8eN
z{1^UT{s;e?2Lwn!1wx<&RuBY9Py|gd1Y7Wgq>xHTD`XHd3)zI6LLMQ%P)H~$ln_b_
z<%Eht6`{INOQ<U}5E={3gqA`Zp}o*a=qmIOdJFx8fx-}BxG+i>D@+h33)6&|!W?0~
zut-=ctPoZU>x7NM7Gb-vOV}$M5Dp8+gp<M<;k<B3xGLNbZVUH>hr$!#x$sJOD|`?>
z3txrr!Y|>k5E2m)7b!6;@?u1k#h9pzmgtJUm{Lq5rWZ4bS;ZV;ZZV%&P%I)A7fXp{
z#R_6&v6@&@tRvPJ8;MQD7Gi6$o!C+AB6b&hiG9TZ;$U%@I8q!Vju$71Q^gtLY;m5r
zP+TG|7gvdE#SP+SahteP+#~K64~a*`6XI#{oOn^ZB3>77iFd^Z;$!id_)>f$z861<
z|A^njpW+`8jKC2rLPnSf7ZD@Th#HAU%!m_7L{dajN76+yMzTb*M{-5-MhZj<M~X#C
zM#@CWM=C|CMruTAN9siyMw&#LM_NVNMmj_~N4iCNM*2khM+QZPMn*(NN5(}aMy5oj
zM`lIlMixXCN0voaM%F~uM>a*aMs`GYNA^VyMvg>|M@~h~MlM7yN3KO~M(#xJM;=9<
zMqWf-N8Uv~M!rP;jr@rGjsOxQp%Nj{5-SOkBq@?68Imn|Qc_AKrIj*BnWbz}PAQL+
zUn(ROl}bpZrE*e5sftuxswLHx8c2<$W>QP3jnrQ1Bz2W~NWG<g(m-j5G+Y`bjg=-y
zlcj0WOlgiZUs@zBl~zcrrFGIqX^XU7+9mCk4oHWkW70|KjC5YQBwdwmNVlbX(nINq
z^jvx+y_G&lpQW$Tcj=e(R|-XuC?2Jv;V2)CMCE8Ksz<G;8}*|pqiLe)qnV;vqdB6v
zqxqr*qeY^{qotx{qZOi+qt&7{qjjS7qm80Xqb;JXqwS&{qg|riqrIYiqXVLYqr;*j
zqhq4uqm!ajqcftjqw}H*qf4U8qpPB8qZ^`|quZi8qkE$JqlcnLqbH)LqvxU*qgSHW
zqqm}WqYt8wqtBu*qi>?`qo1PxM88FUM*l=X8J01blo^?mML8;~a$GiLM^4Bo<kWIH
zIis9K&MxPY^U4L}!g4XWq+CWWFISSQ$~ENLay_}B+(d3Jx02h+9puh(H@T<WNA52V
zl84G8<k9jtd7?Z;o-WUl=gJG@#qu(FrMyO7FK?2!$~)xU@;>>Xd_+DjpOVkY7v#(G
zHTkA|N4_sVlAp>i<k#{$`J?<r{#X7X|CRv-Qc#6ZXoXb-MN$++Qw+scJSC~5Qqn3J
zl*~#tC8v@{$*&YriYg_P(n>j{qEbbvuGCWMDh-szN;9RU(ne{obW*x1J(S)`KV_gY
zL>aD(QpPG1l*!68Wu`JmnXfETmMSZh)yg_$qq0TWuIy6wDhHIq$}#1naz;6?TvDzo
zH<a7TJ>{YDM0u{fQr;>bl+Vgn<-77r`KyFfM8#D~4XeBwQDrrz>Z+x>s;{P0)2Qjy
zOlnp&hnic>rxsL;sKwP%YFV{{T3M~8)>P}L_0>jdQ?-TKT5YFxRJ*9%)m~~}b$~ip
z9j1;{$Ef4gN$ON}hB{lFr!G{NsLRz=>RNS!x>?<(?o{`v`_)70QT2p+T0N&;RIjMl
z)m!Ra^?~|WeWt!t->C1^PwGGFH}$9bM+IYW42zL5CdS3YSTv@_;xRMk#1gR-vDC42
zv5c`SvFx#2vAnSYvBI%pv68VevGTD>v8u5evD&eEv4*iGvF5Q>v9_@evCgq>v7WI$
zvHr0^v7xaMvC*+{v5B!MvFWi{vAMAYvBj}vv6ZnkvGuV{v8}NkvE8wKv4gQAvE#8*
zv9qxYvCFY*v750wvHP({v8S;YvDdM8v5&DYv43MfV!va625G29Xtc&^f+lH-rfG&|
zYo3<WQfX<m3|eL_o0e0{qvh8MX+^aXT4}AER#B^>Ro7~1b+rasW38FiQfs5N*E(rk
zwH{h;t)Dhf8=?)@MrmWU3EE_Bnl@9Lqs`YAX-l;g+G=f`wo%)nZP#{bd$j}FVeOc9
zQahua*Dh&SwHw-P?Vk2fd!jwpUTJT&587w#tM*;{rTx`HaU_n%sdzZf$0Kn$9*gU7
zEAGbqc*=O1c=~vzc-DB1c<y+<c)@s)c=33tc-eS`c;$Gtc+Gg7c>Q>zc++@`c<Xq(
zc*l5`c=vd(c;EPd_~7`k_{jK}`1tsw_|*7}`0V(+_`>*-`11Iw_}ch}_~!Vw_|EvA
z`2P5z_|f=@`04n$_{I2@`1Sa$_}%z}_~ZDq_{;d4`1|;$_&@P)@t^TOaZrbKOeb|l
z=X6nz>Z%^sP2JHGdI~+Yo=(rGXVJ6kx%9kx0llzZOfRXI(aY<V^s0Idy|!LYZ>TrX
zo9nIgwt5G>v))bbsrS+Q>x1;6`UrirK2D#gPtm9Av-G+80)4T*Okb(5(bwyn^sV|1
zeYd_(Kd2wkkL#!Ov-$=7vVKj!so&A>>yPxO`V0NF{!ag>f6@Qdf9St;z<><YAPm}I
z4Z)BM#n24Huno^h8mWx5Mg}9Zk<G|y<T3Iag^Z#`38S=8&ZuZqF{&H2jJiexqp{J<
zXlb-D+8dpWu0{`|x6#iSXbdri8>5V|#sp)sG0m81%rWL0i;Shl3S+gg&e&*dF}54K
zjJ?JI<FIkeIBA?Q&KsAEtHurEwsFsRXgo2V8?TJF#s}lG@zwZl{4)L;ArmoilQP35
zZ$?bnjG4M=nXc)ZDa|xydNY%m)y!e$HuISU%_3%Tvy@rZtYB6)tC=;;I%a*dk=fL2
zVYW8gnH|k8W_Pof+1DIk4mO9GBh4}9cyp3D)tq6@Hs_fO%_Zh?bCtQ)++c1tx0yT5
zJ?4J%ka^TRVV*Y6nHS9~=5_OydDnbkJ~p43FU>dRd-Id|kNM5~Y5p-m3$`$ev>1!C
zL@R2kR@^cz$4Xc!tkhOIE2EXg%5LSd@>&I~!d5Y>q*cZ$Z&k9YS~aZNRz0hs)x>IU
zwX)h;9jwk)H>;=B$LenlvW8kCtkKpuYoay9nr_Xq=2{D^#nv)wrM1RdZ*8)+T05-W
z);{Z?b;LSuowCka7p%+HHS4Bz$GUGlvYuKmtk>2%>!bC>`q%nl{k8xbvQe9`X`8hL
zTe1~fvklv}Jv(WqveVib?96sHJExt;&Tkj8i`pgZ(snt!qFu$VZr8Hw+70Z+b~C%B
z-NtTjce1<MJ?!3gKYO4(#2#*svd7vJ?8){td!{|do^LO*m)a}r)%H4jqrJu6Ztt@9
z+6U~z_A&dUea1d-U$U>-H|*Q?J^P{k#C~qSvftVt?9cXB`@8+i{%eOE#K9fP2|K(K
zabzdv=#J&Mj_;&&(m3gzOiorOhm+gM=M;2`IK`b(PFbgdQ`xEJ)O6}N^_@meQ>TT~
z+G*!>bh<d*onB5~XMi)<8Rm?1#yI1hNzPPfhBMol=PY!VILn<?&RS=Kv)S3^>~!`x
z`<+A1QRjqn+BxT3bgnqpom<Xb=YjLsdFH%y-Z<}_PtHHiH|MAG#{pf~#az;5T+S8U
zsH?hh*K{2>;ihm?yXoAFZWcGYo6F7X7H|u@#oUr^8MnM!$*t<vaBI8u+=gxwx4GNO
zZR>V$JG<T7o^BtvzdOhs>W*+nyW`x6?i6>rJIkHxE^rsS%iNXj8h5?B$=&MiaCf`=
z+=K2B_qcn?J?ma@FT2;=o9-R=zWd01>b`JayYJkO?icr8_lNu21w6<@J;I|s))PF*
zQ#{QxJlpfUq?gJ|>t*mVd)d64ULG&MSI8^smGDY?<-Ces6|cHi%d6`(@EUu~yp~=Y
zuf5mF>+1FJdVBr6f!+{rxHrlh>rL<`d(*s`-W+efx5!)St?*WR>%5KL7H_+^%iHT6
z@D6*&yp!G;@4R=(yXxKWZhQB<hu#zKx%bL@>wWM(dtbfp-Y@U37fK)rJV7PG2|f`?
z$cb1&Pgn^z;U`ii(j?L+G9|JmawKvm@+Ar;iX@6BN+rrBDkLf=swHYB>LltX8YP-0
zS|nO0+9f(Bx+J<MdL{ZM1|$Y2h9yQO#w5lkCMBjOW+Y}O<|P&;mL!%ZRwdRZHY7GD
zwk38Z_9XTv4keBzP9#nz&Lu7;t|YD}ZYAy}9wZ(oo+Vx;-Xz{9J|+H1d`tXH{7Ha5
z>|;LZGd|~we$-d}xNrK7pYT)osr__*Mn8+6-OuIc^$Ylg{bGJezl>krujE(tYxuSO
zdVWK{iQn9B<+t@a_?`W3eow!T-`^kP5A{d*qy2IIM1P7u-Jj*p^%wYy{bl}2e~rK1
z-{f!gclf*gef~lJh=1Ha<)8I0_?P`_{!Ramf8T%PKlNYuul;xaNB@ieum8jU?E^_D
zi6)68on(_jQc5aGEomg}q?b%4Qzg?TGbA%7vn6vT^Ca^p3nhytOC(Dt%Oxu&t0b!@
zYbEO@8zdVin<ZN&+a%j3J0-g&dn9`&`y~e^ha`t5M<vH5CnP5)rzK}5=OpJR7bTY_
kS0q;_*CjV5w<NbGcO~~GL7>69`4vF`0RH~{|2xb70P7F}00000

literal 0
HcmV?d00001

diff --git a/examples/roaring_bitmap/host_bulk_example.cu b/examples/roaring_bitmap/host_bulk_example.cu
index bbbbe6005..3309881a2 100644
--- a/examples/roaring_bitmap/host_bulk_example.cu
+++ b/examples/roaring_bitmap/host_bulk_example.cu
@@ -1,28 +1,92 @@
-#include <cuco/detail/error.hpp>
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 #include <cuco/roaring_bitmap.cuh>
+#include <cuco/utility/traits.hpp>
 
-#include <cuda/std/span>
+#include <cuda/std/type_traits>
+#include <thrust/device_vector.h>
 #include <thrust/logical.h>
 #include <thrust/universal_vector.h>
 
-#include <cuda_runtime.h>
-
 #include <fstream>
 #include <iostream>
+#include <string>
 #include <vector>
 
-int main(int argc, char* argv[])
+/**
+ * @file host_bulk_example.cu
+ * @brief Demonstrates usage of the roaring_bitmap "bulk" lookup host APIs.
+ *
+ * In this example we load two 32-bit bitmaps and one 64-bit bitmap (portable format) from the
+ * [RoaringBitmapFormatSpec](https://github.com/RoaringBitmap/RoaringFormatSpec) repository and
+ * check if the bulk lookup API returns the correct results. Namely, we test the following files:
+ * -
+ * [examples/roaring_bitmap/bitmapwithoutruns.bin](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata/bitmapwithoutruns.bin)
+ * -
+ * [examples/roaring_bitmap/bitmapwithruns.bin](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata/bitmapwithruns.bin)
+ * -
+ * [examples/roaring_bitmap/portable_bitmap64.bin](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata64/portable_bitmap64.bin)
+ *
+ */
+
+template <typename KeyType>
+bool check(std::string const& bitmap_file_path)
 {
-  if (argc != 2) {
-    std::cerr << "Usage: " << argv[0] << " <bitmap_file_path>" << std::endl;
-    return -1;
-  }
+  auto generate_keys = []() -> thrust::device_vector<KeyType> {
+    if constexpr (cuda::std::is_same_v<KeyType, cuda::std::uint32_t>) {
+      // reference:
+      // https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata/README.md#test-data
+      std::vector<cuda::std::uint32_t> keys;
+      for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) {
+        keys.push_back(k);
+      }
+      for (int k = 100000; k < 200000; ++k) {
+        keys.push_back(3 * k);
+      }
+      for (int k = 700000; k < 800000; ++k) {
+        keys.push_back(k);
+      }
+      return thrust::device_vector<cuda::std::uint32_t>(keys.begin(), keys.end());
+    } else if constexpr (cuda::std::is_same_v<KeyType, cuda::std::uint64_t>) {
+      // reference:
+      // https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata64/README.md#portable_bitmap64bin
+      std::vector<cuda::std::uint64_t> keys;
+      for (cuda::std::uint64_t k = 0x00000ull; k < 0x09000ull; ++k) {
+        keys.push_back(k);
+      }
+      for (cuda::std::uint64_t k = 0x0A000ull; k < 0x10000ull; ++k) {
+        keys.push_back(k);
+      }
+      keys.push_back(0x20000ull);
+      keys.push_back(0x20005ull);
+      for (cuda::std::uint64_t i = 0; i < 0x10000ull; i += 2ull) {
+        keys.push_back(0x80000ull + i);
+      }
+      return thrust::device_vector<cuda::std::uint64_t>(keys.begin(), keys.end());
+    } else {
+      static_assert(cuco::dependent_false<KeyType>, "KeyType must be uint32_t or uint64_t");
+      return {};
+    }
+  };
 
   // Open file
-  std::ifstream file(argv[1], std::ios::binary);
+  std::ifstream file(bitmap_file_path, std::ios::binary);
   if (!file.is_open()) {
-    std::cerr << "Failed to open " << argv[1] << std::endl;
-    return -1;
+    std::cerr << "Failed to open " << bitmap_file_path << std::endl;
+    return false;
   }
 
   // Get file size
@@ -30,50 +94,36 @@ int main(int argc, char* argv[])
   std::streamsize file_size = file.tellg();
   file.seekg(0, std::ios::beg);
 
-  // Allocate pinned host memory using cudaMallocHost
-  char* buffer;
-  CUCO_CUDA_TRY(cudaMallocHost(&buffer, file_size));
+  thrust::universal_host_pinned_vector<cuda::std::byte> buffer(file_size);
 
   // Read file into memory
-  file.read(buffer, file_size);
+  file.read(reinterpret_cast<char*>(thrust::raw_pointer_cast(buffer.data())), file_size);
   file.close();
 
-  cuco::roaring_bitmap<cuda::std::uint32_t> roaring_bitmap(
-    reinterpret_cast<cuda::std::byte const*>(buffer));
+  cuco::roaring_bitmap<KeyType> roaring_bitmap(thrust::raw_pointer_cast(buffer.data()));
 
-  std::vector<cuda::std::uint32_t> keys;
-  for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) {
-    keys.push_back(k);
-  }
-  for (int k = 100000; k < 200000; ++k) {
-    keys.push_back(3 * k);
-  }
-  for (int k = 700000; k < 800000; ++k) {
-    keys.push_back(k);
-  }
+  auto keys = generate_keys();
+  thrust::device_vector<bool> contained(keys.size(), false);
 
-  thrust::universal_vector<cuda::std::uint32_t> keys_d(keys.begin(), keys.end());
-  thrust::universal_vector<bool> contained(keys.size(), false);
+  roaring_bitmap.contains(keys.begin(), keys.end(), contained.begin());
 
-  roaring_bitmap.contains(keys_d.begin(), keys_d.end(), contained.begin());
+  bool all_contained = thrust::all_of(contained.begin(), contained.end(), ::cuda::std::identity{});
+  return all_contained;
+}
 
-  size_t num_errors = 0;
-  for (size_t i = 0; i < keys.size(); i++) {
-    if (not contained[i]) {
-      if (num_errors <= 10) {
-        std::cout << "Error: " << keys_d[i] << " is not contained" << std::endl;
-      }
-      num_errors++;
-    }
-  }
-  if (num_errors > 0) { std::cout << "num_errors: " << num_errors << std::endl; }
+int main()
+{
+  auto data_dir_prefix = []() -> std::string {
+    std::string source_path = __FILE__;
+    auto pos                = source_path.find_last_of("/\\");
+    return (pos == std::string::npos) ? std::string(".") : source_path.substr(0, pos);
+  };
 
-  // check if all elements are contained and written to output
-  bool all_contained = thrust::all_of(contained.begin(), contained.end(), ::cuda::std::identity{});
-  std::cout << "all_contained: " << all_contained << std::endl;
+  bool success = check<cuda::std::uint32_t>(data_dir_prefix() + "/bitmapwithoutruns.bin");
+  success &= check<cuda::std::uint32_t>(data_dir_prefix() + "/bitmapwithruns.bin");
+  success &= check<cuda::std::uint64_t>(data_dir_prefix() + "/portable_bitmap64.bin");
 
-  // Free the allocated memory
-  CUCO_CUDA_TRY(cudaFreeHost(buffer));
+  std::cout << "success: " << (success ? "true" : "false") << std::endl;
 
-  return 0;
+  return success ? 0 : 1;
 }
\ No newline at end of file
diff --git a/examples/roaring_bitmap/portable_bitmap64.bin b/examples/roaring_bitmap/portable_bitmap64.bin
new file mode 100644
index 0000000000000000000000000000000000000000..acd0f9007d6902f2fa29b82d8f5ee6662a4291d2
GIT binary patch
literal 16506
zcmeI&F%Cdb3;@s~5*IOJFgb_WQCz_h9MKIZi`^!9P1@i8x4yr&j5nsfiXyMaUCL~m
zIM+7&E_28npZ6?V?B|ka)G-SJ1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ
z;P(Reu7JgX-+!Y42oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+z&C*l;jK-t

literal 0
HcmV?d00001

diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
index 964f74f45..7159cc6ae 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
@@ -25,7 +25,7 @@ template <class T, class Allocator>
 roaring_bitmap<T, Allocator>::roaring_bitmap(cuda::std::byte const* bitmap,
                                              Allocator const& alloc,
                                              cuda::stream_ref stream)
-  : storage_{bitmap, alloc, stream}, ref_{storage_.ref()}
+  : storage_{bitmap, alloc, stream}
 {
 }
 
@@ -36,7 +36,7 @@ void roaring_bitmap<T, Allocator>::contains(InputIt first,
                                             OutputIt output,
                                             cuda::stream_ref stream) const
 {
-  ref_.contains(first, last, output, stream);
+  ref_type{storage_.ref()}.contains(first, last, output, stream);
 }
 
 template <class T, class Allocator>
@@ -46,31 +46,31 @@ void roaring_bitmap<T, Allocator>::contains_async(InputIt first,
                                                   OutputIt output,
                                                   cuda::stream_ref stream) const noexcept
 {
-  ref_.contains_async(first, last, output, stream);
+  ref_type{storage_.ref()}.contains_async(first, last, output, stream);
 }
 
 template <class T, class Allocator>
 cuda::std::size_t roaring_bitmap<T, Allocator>::size() const noexcept
 {
-  return ref_.size();
+  return ref_type{storage_.ref()}.size();
 }
 
 template <class T, class Allocator>
 bool roaring_bitmap<T, Allocator>::empty() const noexcept
 {
-  return ref_.empty();
+  return ref_type{storage_.ref()}.empty();
 }
 
 template <class T, class Allocator>
 cuda::std::byte const* roaring_bitmap<T, Allocator>::data() const noexcept
 {
-  return ref_.data();
+  return ref_type{storage_.ref()}.data();
 }
 
 template <class T, class Allocator>
 cuda::std::size_t roaring_bitmap<T, Allocator>::size_bytes() const noexcept
 {
-  return ref_.size_bytes();
+  return ref_type{storage_.ref()}.size_bytes();
 }
 
 template <class T, class Allocator>
@@ -83,6 +83,6 @@ typename roaring_bitmap<T, Allocator>::allocator_type roaring_bitmap<T, Allocato
 template <class T, class Allocator>
 typename roaring_bitmap<T, Allocator>::ref_type roaring_bitmap<T, Allocator>::ref() const noexcept
 {
-  return ref_;
+  return ref_type{storage_.ref()};
 }
 }  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
index 74839b1b5..42752f2d6 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
@@ -46,26 +46,15 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
   static constexpr cuda::std::uint32_t binary_search_threshold = 8;  // TODO determine optimal value
 
   __host__ __device__ roaring_bitmap_impl(storage_ref_type const& storage_ref)
-  {
-    auto const& meta = storage_ref.metadata();
-    if (meta.valid) {
-      data_       = storage_ref.data();
-      size_bytes_ = meta.size_bytes;
-      size_       = meta.num_keys;
-      run_container_bitmap_ =
-        reinterpret_cast<cuda::std::uint8_t const*>(storage_ref.data() + meta.run_container_bitmap);
-      key_cards_ =
-        reinterpret_cast<cuda::std::uint16_t const*>(storage_ref.data() + meta.key_cards);
-      offsets_ =
-        reinterpret_cast<cuda::std::byte const*>(storage_ref.data() + meta.container_offsets);
-      num_containers_  = meta.num_containers;
-      offsets_aligned_ = meta.offsets_aligned;
-      has_run_         = meta.has_run;
-    }
-  }
-
-  __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap)
-    : roaring_bitmap_impl{storage_ref_type{bitmap}}
+    : storage_ref_{storage_ref},
+      offsets_aligned_{(reinterpret_cast<cuda::std::uintptr_t>(
+                         storage_ref_.data() + storage_ref_.metadata().container_offsets)) %
+                         sizeof(cuda::std::uint32_t) ==
+                       0},
+      aligned_16_{(reinterpret_cast<cuda::std::uintptr_t>(storage_ref_.data() +
+                                                          storage_ref_.metadata().key_cards)) %
+                    sizeof(cuda::std::uint16_t) ==
+                  0}  // if base address of key_cards is aligned, then all containers are aligned
   {
   }
 
@@ -100,29 +89,54 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
   }
 
   __device__ bool contains(cuda::std::uint32_t value) const
+  {
+    if (storage_ref_.metadata().num_keys == 0) { return false; }
+
+    if (aligned_16_) {
+      return this->dispatch_contains<true>(value);
+    } else {
+      return this->dispatch_contains<false>(value);
+    }
+  }
+
+  template <bool Aligned>
+  __device__ bool dispatch_contains(cuda::std::uint32_t value) const
   {
     cuda::std::uint16_t const upper = value >> 16;
     cuda::std::uint16_t const lower = value & 0xFFFF;
+    cuda::std::uint16_t key;
 
-    if (num_containers_ < binary_search_threshold) {
+    if (storage_ref_.metadata().num_containers < binary_search_threshold) {
 // linear search
 #pragma unroll
-      for (cuda::std::uint32_t i = 0; i < num_containers_; i++) {
-        cuda::std::uint16_t const key = key_cards_[i * 2];
-        if (key == upper) { return this->contains_container(lower, i); }
+      for (cuda::std::uint32_t i = 0; i < storage_ref_.metadata().num_containers; i++) {
+        if constexpr (Aligned) {
+          key = aligned_load<cuda::std::uint16_t>(storage_ref_.key_cards() +
+                                                  (i * 2) * sizeof(cuda::std::uint16_t));
+        } else {
+          key = misaligned_load<cuda::std::uint16_t>(storage_ref_.key_cards() +
+                                                     (i * 2) * sizeof(cuda::std::uint16_t));
+        }
+        if (key == upper) { return this->contains_container<Aligned>(lower, i); }
         if (key > upper) { return false; }
       }
     } else {
       // binary search
       cuda::std::uint32_t left  = 0;
-      cuda::std::uint32_t right = num_containers_;
+      cuda::std::uint32_t right = storage_ref_.metadata().num_containers;
       while (left < right) {
-        cuda::std::uint32_t mid     = left + (right - left) / 2;
-        cuda::std::uint16_t mid_key = key_cards_[mid * 2];
+        cuda::std::uint32_t mid = left + (right - left) / 2;
+        if constexpr (Aligned) {
+          key = aligned_load<cuda::std::uint16_t>(storage_ref_.key_cards() +
+                                                  (mid * 2) * sizeof(cuda::std::uint16_t));
+        } else {
+          key = misaligned_load<cuda::std::uint16_t>(storage_ref_.key_cards() +
+                                                     (mid * 2) * sizeof(cuda::std::uint16_t));
+        }
 
-        if (mid_key == upper) {
-          return this->contains_container(lower, mid);
-        } else if (mid_key < upper) {
+        if (key == upper) {
+          return this->contains_container<Aligned>(lower, mid);
+        } else if (key < upper) {
           left = mid + 1;
         } else {
           right = mid;
@@ -132,42 +146,70 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
     return false;
   }
 
-  [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept { return size_; }
+  [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept
+  {
+    return storage_ref_.metadata().num_keys;
+  }
 
-  [[nodiscard]] __host__ __device__ bool empty() const noexcept { return size_ == 0; }
+  [[nodiscard]] __host__ __device__ bool empty() const noexcept { return this->size() == 0; }
 
-  [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept { return data_; }
+  [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept
+  {
+    return storage_ref_.data();
+  }
 
   [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept
   {
-    return size_bytes_;
+    return storage_ref_.metadata().size_bytes;
   }
 
- private:
+  template <bool Aligned>
   __device__ bool contains_container(cuda::std::uint16_t lower, cuda::std::uint32_t index) const
   {
-    cuda::std::uint32_t card             = key_cards_[index * 2 + 1] + 1;
-    cuda::std::uint16_t const* container = reinterpret_cast<cuda::std::uint16_t const*>(
-      data_ + container_offset(offsets_, offsets_aligned_, index));
-    if (is_run_container(run_container_bitmap_, has_run_, index)) {
-      return this->contains_run_container(container, lower, card);
+    cuda::std::uint32_t offset;
+    if (offsets_aligned_) {
+      offset = aligned_load<cuda::std::uint32_t>(storage_ref_.container_offsets() +
+                                                 index * sizeof(cuda::std::uint32_t));
+    } else {
+      offset = misaligned_load<cuda::std::uint32_t>(storage_ref_.container_offsets() +
+                                                    index * sizeof(cuda::std::uint32_t));
+    }
+    cuda::std::byte const* container = storage_ref_.data() + offset;
+    if (storage_ref_.metadata().has_run and
+        (storage_ref_.run_container_bitmap()[index / 8] & (1 << (index % 8)))) {
+      return this->contains_run_container<Aligned>(container, lower);
     } else {
-      if (card <= 4096) {  // TODO check if this is correct
-        return this->contains_array_container(container, lower, card);
+      cuda::std::uint32_t card;
+      if constexpr (Aligned) {
+        card = 1u + aligned_load<cuda::std::uint16_t>(
+                      storage_ref_.key_cards() + (index * 2 + 1) * sizeof(cuda::std::uint16_t));
+      } else {
+        card = 1u + misaligned_load<cuda::std::uint16_t>(
+                      storage_ref_.key_cards() + (index * 2 + 1) * sizeof(cuda::std::uint16_t));
+      }
+      if (card <= 4096) {
+        return this->contains_array_container<Aligned>(container, lower, card);
       } else {
         return this->contains_bitset_container(container, lower, card);
       }
     }
   }
 
-  __device__ bool contains_array_container(cuda::std::uint16_t const* container,
+  template <bool Aligned>
+  __device__ bool contains_array_container(cuda::std::byte const* container,
                                            cuda::std::uint16_t lower,
                                            cuda::std::uint32_t card) const
   {
+    cuda::std::uint16_t elem;
     // Use linear search for small arrays, binary search for larger ones
     if (card < binary_search_threshold) {
       for (cuda::std::uint32_t i = 0; i < card; i++) {
-        if (container[i] == lower) { return true; }
+        if constexpr (Aligned) {
+          elem = aligned_load<cuda::std::uint16_t>(container + i * sizeof(cuda::std::uint16_t));
+        } else {
+          elem = misaligned_load<cuda::std::uint16_t>(container + i * sizeof(cuda::std::uint16_t));
+        }
+        if (elem == lower) { return true; }
       }
       return false;
     } else {
@@ -176,9 +218,15 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
 
       while (left < right) {
         cuda::std::uint32_t mid = left + (right - left) / 2;
-        if (container[mid] == lower) {
+        if constexpr (Aligned) {
+          elem = aligned_load<cuda::std::uint16_t>(container + mid * sizeof(cuda::std::uint16_t));
+        } else {
+          elem =
+            misaligned_load<cuda::std::uint16_t>(container + mid * sizeof(cuda::std::uint16_t));
+        }
+        if (elem == lower) {
           return true;
-        } else if (container[mid] < lower) {
+        } else if (elem < lower) {
           left = mid + 1;
         } else {
           right = mid;
@@ -188,37 +236,136 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
     }
   }
 
-  __device__ bool contains_bitset_container(cuda::std::uint16_t const* container,
+  __device__ bool contains_bitset_container(cuda::std::byte const* container,
                                             cuda::std::uint16_t lower,
                                             cuda::std::uint32_t card) const
   {
-    // check if bit at position lower is set
-    return container[lower / 16] & (1 << (lower % 16));
+    return static_cast<cuda::std::uint8_t>(container[lower / 8]) &
+           (cuda::std::uint8_t(1) << (lower % 8));
   }
 
-  __device__ bool contains_run_container(cuda::std::uint16_t const* container,
-                                         cuda::std::uint16_t lower,
-                                         cuda::std::uint32_t card) const
+  template <bool Aligned>
+  __device__ bool contains_run_container(cuda::std::byte const* container,
+                                         cuda::std::uint16_t lower) const
   {
-    // TODO implement linear search
+    // TODO implement binary search
+    cuda::std::uint16_t num_runs;
+    if constexpr (Aligned) {
+      num_runs = aligned_load<cuda::std::uint16_t>(container);
+    } else {
+      num_runs = misaligned_load<cuda::std::uint16_t>(container);
+    }
+
+    cuda::std::uint16_t start;
+    cuda::std::uint32_t end;
+
+    for (cuda::std::uint32_t i = 0; i < num_runs; i++) {
+      // TODO load start+end in one instruction
+      if constexpr (Aligned) {
+        start =
+          aligned_load<cuda::std::uint16_t>(container + (i * 2 + 1) * sizeof(cuda::std::uint16_t));
+        end =
+          static_cast<cuda::std::uint32_t>(start) +
+          aligned_load<cuda::std::uint16_t>(container + (i * 2 + 2) * sizeof(cuda::std::uint16_t));
+      } else {
+        start = misaligned_load<cuda::std::uint16_t>(container +
+                                                     (i * 2 + 1) * sizeof(cuda::std::uint16_t));
+        end   = static_cast<cuda::std::uint32_t>(start) +
+              misaligned_load<cuda::std::uint16_t>(container +
+                                                   (i * 2 + 2) * sizeof(cuda::std::uint16_t));
+      }
+      if (start <= lower && end >= lower) { return true; }
+      if (start > lower) { break; }
+    }
     return false;
   }
 
-  cuda::std::byte const* data_;
-  cuda::std::size_t size_bytes_;
-  cuda::std::size_t size_;
-  cuda::std::uint8_t const* run_container_bitmap_;
-  cuda::std::uint16_t const* key_cards_;  // TODO uint8?
-  cuda::std::byte const* offsets_;
-  cuda::std::int32_t num_containers_;
+  storage_ref_type storage_ref_;
   bool offsets_aligned_;
-  bool has_run_;
+  bool aligned_16_;
 };
 
 template <>
 class roaring_bitmap_impl<cuda::std::uint64_t> {
-  using bucket_type = roaring_bitmap_impl<cuda::std::uint32_t>;
-  // TODO implement
+ public:
+  using bucket_type      = roaring_bitmap_impl<cuda::std::uint32_t>;
+  using storage_ref_type = roaring_bitmap_storage_ref<cuda::std::uint64_t>;
+
+  __host__ __device__ roaring_bitmap_impl(storage_ref_type const& storage_ref)
+    : storage_ref_{storage_ref}
+  {
+  }
+
+  template <class InputIt, class OutputIt>
+  __host__ void contains(InputIt first,
+                         InputIt last,
+                         OutputIt contained,
+                         cuda::stream_ref stream = {}) const
+  {
+    this->contains_async(first, last, contained, stream);
+    stream.wait();
+  }
+
+  template <class InputIt, class OutputIt>
+  __host__ void contains_async(InputIt first,
+                               InputIt last,
+                               OutputIt contained,
+                               cuda::stream_ref stream = {}) const noexcept
+  {
+    auto nosync_exec_policy = thrust::cuda::par_nosync.on(stream.get());
+    if (this->empty()) {
+      thrust::fill(
+        nosync_exec_policy, contained, contained + cuda::std::distance(first, last), false);
+    } else {
+      thrust::transform(nosync_exec_policy,
+                        first,
+                        last,
+                        contained,
+                        cuda::proclaim_return_type<bool>(
+                          [*this] __device__(auto key) { return this->contains(key); }));
+    }
+  }
+
+  // Upper 32 bits of `value` select the bucket; the lower 32 bits are looked up inside it.
+  __device__ bool contains(cuda::std::uint64_t value) const
+  {
+    auto const upper    = static_cast<cuda::std::uint32_t>(value >> 32);
+    auto const lower    = static_cast<cuda::std::uint32_t>(value & 0xFFFFFFFF);
+    auto const* buckets = storage_ref_.buckets();
+    // Binary search (requires buckets sorted by key); size_t indices match `num_buckets`.
+    cuda::std::size_t left  = 0;
+    cuda::std::size_t right = storage_ref_.metadata().num_buckets;
+    while (left < right) {
+      cuda::std::size_t const mid = left + (right - left) / 2;
+      auto const mid_key          = buckets[mid].first;  // read the key once per step
+      if (mid_key == upper) { return bucket_type{buckets[mid].second}.contains(lower); }
+      if (mid_key < upper) {
+        left = mid + 1;
+      } else {
+        right = mid;
+      }
+    }
+    return false;
+  }
+
+  [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept
+  {
+    return storage_ref_.metadata().num_keys;
+  }
+
+  [[nodiscard]] __host__ __device__ bool empty() const noexcept { return this->size() == 0; }
+
+  [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept
+  {
+    return storage_ref_.data();
+  }
+
+  [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept
+  {
+    return storage_ref_.metadata().size_bytes;
+  }
+
+  storage_ref_type storage_ref_;
 };
 
 }  // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh
index 49805afb8..349f1bb83 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh
@@ -27,6 +27,8 @@
 
 #include <memory>
 #include <nv/target>
+#include <utility>
+#include <vector>
 
 namespace cuco::detail {
 
@@ -41,12 +43,18 @@ class roaring_bitmap_storage_ref<cuda::std::uint32_t> {
   using metadata_type = roaring_bitmap_metadata<cuda::std::uint32_t>;
   __host__ __device__ roaring_bitmap_storage_ref(cuda::std::byte const* bitmap,
                                                  metadata_type const& metadata)
-    : data_{bitmap}, metadata_{metadata}
+    : metadata_{metadata},
+      data_{bitmap},
+      run_container_bitmap_{
+        reinterpret_cast<cuda::std::uint8_t const*>(bitmap + metadata.run_container_bitmap)},
+      key_cards_{bitmap + metadata.key_cards},
+      container_offsets_{bitmap + metadata.container_offsets}
   {
+    assert(metadata.valid);
   }
 
   __device__ roaring_bitmap_storage_ref(cuda::std::byte const* bitmap)
-    : data_{bitmap}, metadata_{metadata_type{bitmap}}
+    : roaring_bitmap_storage_ref{bitmap, metadata_type{bitmap}}
   {
   }
 
@@ -54,9 +62,58 @@ class roaring_bitmap_storage_ref<cuda::std::uint32_t> {
 
   __host__ __device__ cuda::std::byte const* data() const noexcept { return data_; }
 
+  __host__ __device__ cuda::std::size_t size_bytes() const noexcept { return metadata_.size_bytes; }
+
+  __host__ __device__ cuda::std::uint8_t const* run_container_bitmap() const noexcept
+  {
+    return run_container_bitmap_;
+  }
+
+  __host__ __device__ cuda::std::byte const* key_cards() const noexcept { return key_cards_; }
+
+  __host__ __device__ cuda::std::byte const* container_offsets() const noexcept
+  {
+    return container_offsets_;
+  }
+
  private:
+  metadata_type metadata_;
   cuda::std::byte const* data_;
+  cuda::std::uint8_t const* run_container_bitmap_;
+  cuda::std::byte const* key_cards_;
+  cuda::std::byte const* container_offsets_;
+};
+
+template <>
+class roaring_bitmap_storage_ref<cuda::std::uint64_t> {
+ public:
+  using metadata_type = roaring_bitmap_metadata<cuda::std::uint64_t>;
+
+  __host__ __device__ roaring_bitmap_storage_ref(
+    cuda::std::byte const* bitmap,
+    metadata_type const& metadata,
+    cuda::std::pair<cuda::std::uint32_t, roaring_bitmap_storage_ref<cuda::std::uint32_t>>* buckets)
+    : metadata_{metadata}, data_{bitmap}, buckets_{buckets}
+  {
+  }
+
+  __host__ __device__ metadata_type const& metadata() const noexcept { return metadata_; }
+
+  __host__ __device__ cuda::std::byte const* data() const noexcept { return data_; }
+
+  __host__ __device__ cuda::std::size_t size_bytes() const noexcept { return metadata_.size_bytes; }
+
+  __host__ __device__
+    cuda::std::pair<cuda::std::uint32_t, roaring_bitmap_storage_ref<cuda::std::uint32_t>>*
+    buckets() const noexcept
+  {
+    return buckets_;
+  }
+
+ private:
   metadata_type metadata_;
+  cuda::std::byte const* data_;
+  cuda::std::pair<cuda::std::uint32_t, roaring_bitmap_storage_ref<cuda::std::uint32_t>>* buckets_;
 };
 
 template <class T, class Allocator>
@@ -90,7 +147,6 @@ class roaring_bitmap_storage<cuda::std::uint32_t, Allocator> {
   {
     CUCO_CUDA_TRY(cudaMemcpyAsync(
       data_.get(), bitmap, metadata_.size_bytes, cudaMemcpyHostToDevice, stream.get()));
-    // stream.wait();  // TODO check if this is necessary
   }
 
   ref_type ref() const noexcept { return ref_; }
@@ -102,6 +158,75 @@ class roaring_bitmap_storage<cuda::std::uint32_t, Allocator> {
   ref_type ref_;
 };
 
-// TODO implement roaring_bitmap_metadata<cuda::std::uint64_t>
+template <class Allocator>
+class roaring_bitmap_storage<cuda::std::uint64_t, Allocator> {
+ public:
+  using allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<cuda::std::byte>;
+  using ref_type              = roaring_bitmap_storage_ref<cuda::std::uint64_t>;
+  using bucket_ref_type       = roaring_bitmap_storage_ref<cuda::std::uint32_t>;
+  using bucket_allocator_type = typename std::allocator_traits<Allocator>::template rebind_alloc<
+    cuda::std::pair<cuda::std::uint32_t, bucket_ref_type>>;
+
+  roaring_bitmap_storage(roaring_bitmap_storage const& other)            = default;
+  roaring_bitmap_storage(roaring_bitmap_storage&& other)                 = default;
+  roaring_bitmap_storage& operator=(roaring_bitmap_storage const& other) = default;
+  roaring_bitmap_storage& operator=(roaring_bitmap_storage&& other)      = default;
+
+  ~roaring_bitmap_storage() = default;
+
+  roaring_bitmap_storage(cuda::std::byte const* bitmap,
+                         Allocator const& alloc,
+                         cuda::stream_ref stream)
+    : allocator_{alloc},
+      bucket_allocator_{alloc},
+      bucket_metadata_{},
+      buckets_h_{},
+      metadata_{
+        [bitmap](std::vector<typename ref_type::metadata_type::bucket_metadata>& bucket_metadata) {
+          return typename ref_type::metadata_type{bitmap, bucket_metadata};
+        }(bucket_metadata_)},
+      data_{allocator_.allocate(metadata_.size_bytes),
+            detail::custom_deleter<cuda::std::size_t, allocator_type>{metadata_.size_bytes,
+                                                                      allocator_}},
+      buckets_{bucket_allocator_.allocate(metadata_.num_buckets),
+               detail::custom_deleter<cuda::std::size_t, bucket_allocator_type>{
+                 metadata_.num_buckets, bucket_allocator_}},
+      ref_{data_.get(), metadata_, buckets_.get()}
+  {
+    assert(metadata_.valid);
+    buckets_h_.reserve(bucket_metadata_.size());
+    for (auto const& meta : bucket_metadata_) {
+      buckets_h_.emplace_back(meta.key,
+                              bucket_ref_type{data_.get() + meta.byte_offset, meta.metadata});
+    }
+    CUCO_CUDA_TRY(cudaMemcpyAsync(
+      data_.get(), bitmap, metadata_.size_bytes, cudaMemcpyHostToDevice, stream.get()));
+    CUCO_CUDA_TRY(cudaMemcpyAsync(
+      buckets_.get(),
+      buckets_h_.data(),
+      metadata_.num_buckets * sizeof(cuda::std::pair<cuda::std::uint32_t, bucket_ref_type>),
+      cudaMemcpyHostToDevice,
+      stream.get()));
+    // Both copies are only enqueued on `stream`; this constructor does not wait for them.
+    // The host-side staging vectors (`bucket_metadata_`, `buckets_h_`) are not cleared here and
+    // stay alive as members of this object.
+    // TODO(review): confirm whether they can be released after a `stream.wait()`.
+  }
+
+  ref_type ref() const noexcept { return ref_; }
+
+ private:
+  allocator_type allocator_;
+  bucket_allocator_type bucket_allocator_;
+  std::vector<typename ref_type::metadata_type::bucket_metadata> bucket_metadata_;
+  std::vector<cuda::std::pair<cuda::std::uint32_t, bucket_ref_type>> buckets_h_;
+  typename ref_type::metadata_type metadata_;
+  std::unique_ptr<cuda::std::byte, custom_deleter<cuda::std::size_t, allocator_type>> data_;
+  std::unique_ptr<cuda::std::pair<cuda::std::uint32_t, bucket_ref_type>,
+                  custom_deleter<cuda::std::size_t, bucket_allocator_type>>
+    buckets_;
+  ref_type ref_;
+};
 
 }  // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/util.cuh b/include/cuco/detail/roaring_bitmap/util.cuh
index a9510800b..01892e73a 100644
--- a/include/cuco/detail/roaring_bitmap/util.cuh
+++ b/include/cuco/detail/roaring_bitmap/util.cuh
@@ -21,32 +21,25 @@
 #include <cuda/std/cstddef>
 #include <cuda/std/cstdint>
 #include <cuda/std/iterator>
+#include <cuda/std/memory>
 
 #include <nv/target>
+#include <vector>
 
 namespace cuco::detail {
 
-__host__ __device__ cuda::std::uint32_t container_offset(cuda::std::byte const* offsets,
-                                                         bool offsets_aligned,
-                                                         cuda::std::int32_t i)
+template <class T>
+__host__ __device__ __forceinline__ T aligned_load(cuda::std::byte const* ptr)
 {
-  cuda::std::uint32_t offset = 0;
-  if (offsets_aligned) {
-    offset =
-      *reinterpret_cast<cuda::std::uint32_t const*>(offsets + i * sizeof(cuda::std::uint32_t));
-  } else {
-    cuda::std::memcpy(
-      &offset, offsets + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t));
-  }
-  return offset;
+  return *reinterpret_cast<T const*>(cuda::std::assume_aligned<alignof(T)>(ptr));
 }
 
-__host__ __device__ bool is_run_container(cuda::std::uint8_t const* run_container_bitmap,
-                                          bool has_run,
-                                          cuda::std::int32_t i)
+template <class T>
+__host__ __device__ __forceinline__ T misaligned_load(cuda::std::byte const* ptr)
 {
-  if (not has_run) return false;
-  return run_container_bitmap[i / 8] & (1 << (i % 8));
+  T value;
+  cuda::std::memcpy(&value, ptr, sizeof(T));
+  return value;
 }
 
 template <class T>
@@ -56,21 +49,20 @@ struct roaring_bitmap_metadata {
 
 template <>
 struct roaring_bitmap_metadata<cuda::std::uint32_t> {
-  cuda::std::size_t size_bytes           = 0;
-  cuda::std::size_t num_keys             = 0;
-  cuda::std::size_t run_container_bitmap = 0;
-  cuda::std::size_t key_cards            = 0;
-  cuda::std::size_t container_offsets    = 0;
-  cuda::std::int32_t num_containers      = 0;
-  bool has_run                           = false;
-  bool offsets_aligned                   = false;
-  bool valid                             = false;
+  cuda::std::size_t size_bytes             = 0;
+  cuda::std::size_t num_keys               = 0;  // may reach 2^32, which overflows 32 bits
+  cuda::std::uint32_t run_container_bitmap = 0;
+  cuda::std::uint32_t key_cards            = 0;
+  cuda::std::uint32_t container_offsets    = 0;
+  cuda::std::int32_t num_containers        = 0;
+  bool has_run                             = false;
+  bool valid                               = false;
 
   __host__ __device__ roaring_bitmap_metadata(cuda::std::byte const* bitmap)
   {
     constexpr cuda::std::uint32_t serial_cookie_no_runcontainer = 12346;
     constexpr cuda::std::uint32_t serial_cookie                 = 12347;
-    // constexpr cuda::std::uint32_t frozen_cookie                 = 13766;
+    // constexpr cuda::std::uint32_t frozen_cookie                 = 13766; // not implemented
     constexpr cuda::std::int32_t no_offset_threshold = 4;
 
     cuda::std::byte const* buf = bitmap;
@@ -80,8 +72,11 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
     buf += sizeof(cuda::std::uint32_t);
     if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) {
       valid = false;
-      NV_IF_TARGET(NV_IS_HOST,
-                   CUCO_FAIL("Invalid bitmap format");)  // TODO device error handling
+      NV_IF_TARGET(
+        NV_IS_HOST,
+        CUCO_FAIL(
+          "Invalid bitmap format: cookie type invalid or not supported");)  // TODO device error
+                                                                            // handling
       return;
     }
 
@@ -91,57 +86,61 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
       cuda::std::memcpy(&num_containers, buf, sizeof(cuda::std::uint32_t));
       buf += sizeof(cuda::std::uint32_t);
     }
-    if (num_containers < 0) {
+    if (num_containers < 0 or num_containers > (1 << 16)) {
       valid = false;
-      NV_IF_TARGET(NV_IS_HOST,
-                   CUCO_FAIL("Invalid bitmap format");)  // TODO device error handling
-      return;
-    }
-    if (num_containers > (1 << 16)) {
-      valid = false;
-      NV_IF_TARGET(NV_IS_HOST,
-                   CUCO_FAIL("Invalid bitmap format");)  // TODO device error handling
+      NV_IF_TARGET(
+        NV_IS_HOST,
+        CUCO_FAIL(
+          "Invalid bitmap format: num_containers out of range");)  // TODO device error handling
       return;
     }
 
     has_run = (cookie & 0xFFFF) == serial_cookie;
     if (has_run) {
-      valid = false;  // TODO run container bitmap is not supported yet
-      NV_IF_TARGET(NV_IS_HOST,
-                   CUCO_FAIL("Invalid bitmap format");)  // TODO device error handling
-      return;
       cuda::std::size_t s  = (num_containers + 7) / 8;
       run_container_bitmap = cuda::std::distance(bitmap, buf);
       buf += s;
     }
 
-    key_cards = cuda::std::distance(bitmap, buf);
+    key_cards             = cuda::std::distance(bitmap, buf);
+    bool const aligned_16 = (reinterpret_cast<cuda::std::uintptr_t>(bitmap + key_cards) %
+                             sizeof(cuda::std::uint16_t)) == 0;
     buf += num_containers * 2 * sizeof(cuda::std::uint16_t);
 
     if ((!has_run) || (num_containers >= no_offset_threshold)) {
       container_offsets = cuda::std::distance(bitmap, buf);
-      offsets_aligned   = (reinterpret_cast<cuda::std::uintptr_t>(bitmap + container_offsets) %
-                         sizeof(cuda::std::uint32_t)) == 0;
-      buf += num_containers * 4;
+      buf += num_containers * sizeof(cuda::std::uint32_t);
+    } else {
+      valid = false;
+      NV_IF_TARGET(
+        NV_IS_HOST,
+        CUCO_FAIL("Invalid bitmap format: not implemented");)  // TODO device error handling
+      return;
     }
 
-    num_keys = 0;
-    cuda::std::uint16_t const* cards =
-      reinterpret_cast<cuda::std::uint16_t const*>(bitmap + key_cards);
     cuda::std::uint32_t card = 0;
     for (cuda::std::int32_t i = 0; i < num_containers; i++) {
-      // cuda::std::uint16_t key  = key_cards[i * 2];
-      card = cards[i * 2 + 1] + 1;
+      if (aligned_16) {
+        card = aligned_load<cuda::std::uint16_t>(bitmap + key_cards +
+                                                 (i * 2 + 1) * sizeof(cuda::std::uint16_t)) +
+               1u;
+      } else {
+        card = misaligned_load<cuda::std::uint16_t>(bitmap + key_cards +
+                                                    (i * 2 + 1) * sizeof(cuda::std::uint16_t)) +
+               1u;
+      }
       num_keys += card;
     }
 
-    // find end of roaring bitmap
+    // find end of roaring bitmap (re-use card from last container)
     cuda::std::byte const* end =
-      bitmap + container_offset(bitmap + container_offsets, offsets_aligned, num_containers - 1);
-    if (is_run_container(reinterpret_cast<cuda::std::uint8_t const*>(bitmap + run_container_bitmap),
-                         has_run,
-                         num_containers - 1)) {
-      // TODO implement
+      bitmap + misaligned_load<cuda::std::uint32_t>(
+                 bitmap + container_offsets + (num_containers - 1) * sizeof(cuda::std::uint32_t));
+    if (has_run and (static_cast<cuda::std::uint8_t>(
+                       (bitmap + run_container_bitmap)[(num_containers - 1) / 8]) &
+                     (cuda::std::uint8_t(1) << ((num_containers - 1) % 8)))) {
+      cuda::std::uint16_t const num_runs = misaligned_load<cuda::std::uint16_t>(end);
+      end += sizeof(cuda::std::uint16_t) + num_runs * 2 * sizeof(cuda::std::uint16_t);
     } else {
       if (card <= 4096) {  // TODO check if this is correct
         end += card * sizeof(cuda::std::uint16_t);
@@ -155,6 +154,73 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
   }
 };
 
-// TODO implement roaring_bitmap_metadata<cuda::std::uint64_t>
+template <>
+struct roaring_bitmap_metadata<cuda::std::uint64_t> {
+  cuda::std::size_t num_buckets = 0;
+  cuda::std::size_t size_bytes  = 0;
+  cuda::std::size_t num_keys    = 0;
+  bool valid                    = false;
+
+  struct bucket_metadata {
+    cuda::std::size_t byte_offset;
+    cuda::std::uint32_t key;
+    roaring_bitmap_metadata<cuda::std::uint32_t> metadata;
+
+    bucket_metadata(cuda::std::size_t offset,
+                    cuda::std::uint32_t k,
+                    roaring_bitmap_metadata<cuda::std::uint32_t> const& meta)
+      : byte_offset{offset}, key{k}, metadata{meta}
+    {
+    }
+  };
 
+  __host__ roaring_bitmap_metadata(cuda::std::byte const* bitmap,
+                                   std::vector<bucket_metadata>& bucket_metadata)
+  {
+    cuda::std::size_t byte_offset     = 0;
+    cuda::std::byte const* bitmap_ptr = bitmap;
+    cuda::std::memcpy(&num_buckets, bitmap_ptr, sizeof(cuda::std::uint64_t));
+    byte_offset += sizeof(cuda::std::uint64_t);  // skip num_buckets
+
+    bucket_metadata.clear();
+    bucket_metadata.reserve(num_buckets);
+
+    for (cuda::std::size_t i = 0; i < num_buckets; ++i) {
+      cuda::std::uint32_t bucket_key;
+      cuda::std::memcpy(&bucket_key, bitmap_ptr + byte_offset, sizeof(cuda::std::uint32_t));
+      byte_offset += sizeof(cuda::std::uint32_t);  // skip bucket key
+      roaring_bitmap_metadata<cuda::std::uint32_t> bucket_meta{bitmap_ptr + byte_offset};
+      if (!bucket_meta.valid) {
+        valid = false;
+        return;
+      }
+      bucket_metadata.emplace_back(byte_offset, bucket_key, bucket_meta);
+      num_keys += bucket_meta.num_keys;
+      byte_offset += bucket_meta.size_bytes;  // skip bucket
+    }
+    size_bytes = byte_offset;
+    valid      = true;
+  }
+
+  __host__ __device__ roaring_bitmap_metadata(cuda::std::byte const* bitmap)
+  {
+    cuda::std::size_t byte_offset     = 0;
+    cuda::std::byte const* bitmap_ptr = bitmap;
+    cuda::std::memcpy(&num_buckets, bitmap_ptr, sizeof(cuda::std::uint64_t));
+    byte_offset += sizeof(cuda::std::uint64_t);  // skip num_buckets
+
+    for (cuda::std::size_t i = 0; i < num_buckets; ++i) {
+      byte_offset += sizeof(cuda::std::uint32_t);  // skip bucket key
+      roaring_bitmap_metadata<cuda::std::uint32_t> bucket_meta{bitmap_ptr + byte_offset};
+      if (!bucket_meta.valid) {
+        valid = false;
+        return;
+      }
+      num_keys += bucket_meta.num_keys;
+      byte_offset += bucket_meta.size_bytes;  // skip bucket
+    }
+    size_bytes = byte_offset;
+    valid      = true;
+  }
+};
 }  // namespace cuco::detail
\ No newline at end of file
diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh
index 69ad93ae8..4ca3fb8a2 100644
--- a/include/cuco/roaring_bitmap.cuh
+++ b/include/cuco/roaring_bitmap.cuh
@@ -25,53 +25,135 @@
 
 namespace cuco {
 
+/**
+ * @brief GPU-accelerated container that owns a serialized Roaring bitmap.
+ *
+ * The `roaring_bitmap` provides host-side bulk membership queries over a bitmap stored in the
+ * [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec).
+ * The serialized bytes are copied to device-accessible storage upon construction, and queries are
+ * executed on the GPU.
+ *
+ * In addition to bulk host APIs such as `contains`/`contains_async`, this container exposes a
+ * non-owning reference object via `ref()` that can be used for device-side per-thread queries.
+ *
+ * @tparam T Key type. Must be `cuda::std::uint32_t` or `cuda::std::uint64_t`.
+ * @tparam Allocator Allocator type used to manage device-accessible storage for the serialized
+ *                   bytes.
+ */
 template <class T, class Allocator = cuco::cuda_allocator<cuda::std::byte>>
 class roaring_bitmap {
  public:
-  using storage_type   = detail::roaring_bitmap_storage<T, Allocator>;
-  using allocator_type = typename storage_type::allocator_type;
-  using ref_type       = roaring_bitmap_ref<T>;
-
+  using value_type     = T;                                             ///< Key type
+  using storage_type   = detail::roaring_bitmap_storage<T, Allocator>;  ///< Storage implementation
+  using allocator_type = typename storage_type::allocator_type;         ///< Allocator type
+  using ref_type       = roaring_bitmap_ref<value_type>;  ///< Non-owning reference type
+
+  /**
+   * @brief Constructs a `roaring_bitmap` by copying the serialized bytes to device-accessible
+   *        storage.
+   *
+   * @param bitmap Pointer to the beginning of the serialized bitmap in host memory
+   * @param alloc Allocator used to allocate device-accessible storage
+   * @param stream CUDA stream used for device memory operations during construction
+   */
   roaring_bitmap(cuda::std::byte const* bitmap,
                  Allocator const& alloc  = {},
                  cuda::stream_ref stream = {});
 
-  roaring_bitmap(roaring_bitmap const& other)            = default;
-  roaring_bitmap(roaring_bitmap&& other)                 = default;
-  roaring_bitmap& operator=(roaring_bitmap const& other) = default;
-  roaring_bitmap& operator=(roaring_bitmap&& other)      = default;
-
-  ~roaring_bitmap() = default;
-
+  roaring_bitmap(roaring_bitmap const& other)            = default;  ///< Copy constructor
+  roaring_bitmap(roaring_bitmap&& other)                 = default;  ///< Move constructor
+  roaring_bitmap& operator=(roaring_bitmap const& other) = default;  ///< Copy assignment
+  roaring_bitmap& operator=(roaring_bitmap&& other)      = default;  ///< Move assignment
+
+  ~roaring_bitmap() = default;  ///< Destructor
+
+  /**
+   * @brief Bulk membership query for keys in `[first, last)`.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   *       `contains_async`.
+   *
+   * @tparam InputIt  Device-accessible random access input iterator of keys convertible to `T`
+   * @tparam OutputIt Device-accessible random access output iterator to `bool`
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last  End of the sequence of keys
+   * @param contained Output iterator where results are written; `true` iff the corresponding key
+   *                  is present in the bitmap
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
   template <class InputIt, class OutputIt>
   void contains(InputIt first,
                 InputIt last,
                 OutputIt contained,
                 cuda::stream_ref stream = {}) const;
 
+  /**
+   * @brief Asynchronously performs a bulk membership query for keys in `[first, last)`.
+   *
+   * @tparam InputIt  Device-accessible random access input iterator of keys convertible to `T`
+   * @tparam OutputIt Device-accessible random access output iterator to `bool`
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last  End of the sequence of keys
+   * @param contained Output iterator where results are written; `true` iff the corresponding key
+   *                  is present in the bitmap
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
   template <class InputIt, class OutputIt>
   void contains_async(InputIt first,
                       InputIt last,
                       OutputIt contained,
                       cuda::stream_ref stream = {}) const noexcept;
 
-  // TODO contains_if, contains_if_async, empty
-
+  /**
+   * @brief Number of keys stored in the bitmap.
+   *
+   * @return Count of keys in the bitmap
+   */
   [[nodiscard]] cuda::std::size_t size() const noexcept;
 
+  /**
+   * @brief Checks whether the bitmap contains no keys.
+   *
+   * @return `true` iff `size() == 0`
+   */
   [[nodiscard]] bool empty() const noexcept;
 
+  /**
+   * @brief Returns a pointer to the beginning of the serialized bitmap bytes in device-accessible
+   *        storage.
+   *
+   * @return Pointer to the serialized storage
+   */
   [[nodiscard]] cuda::std::byte const* data() const noexcept;
 
+  /**
+   * @brief Size in bytes of the serialized bitmap storage.
+   *
+   * @return Number of bytes occupied by the serialized bitmap
+   */
   [[nodiscard]] cuda::std::size_t size_bytes() const noexcept;
 
+  /**
+   * @brief Returns the allocator used to manage device-accessible storage.
+   *
+   * @return Allocator instance
+   */
   [[nodiscard]] allocator_type allocator() const noexcept;
 
+  /**
+   * @brief Returns a non-owning reference to the underlying bitmap suitable for device-side use.
+   *
+   * The returned reference type provides device functions such as `contains(T)` for per-thread
+   * membership testing.
+   *
+   * @return Non-owning reference to the underlying bitmap
+   */
   [[nodiscard]] ref_type ref() const noexcept;
 
  private:
   storage_type storage_;
-  ref_type ref_;
 };
 
 }  // namespace cuco
diff --git a/include/cuco/roaring_bitmap_ref.cuh b/include/cuco/roaring_bitmap_ref.cuh
index 41994099f..88b704c28 100644
--- a/include/cuco/roaring_bitmap_ref.cuh
+++ b/include/cuco/roaring_bitmap_ref.cuh
@@ -23,39 +23,122 @@
 
 namespace cuco {
 
+/**
+ * @brief Non-owning reference to a Roaring bitmap stored in its serialized format.
+ *
+ * A `roaring_bitmap_ref` provides device and host APIs to query membership against a bitmap that
+ * is laid out according to the [Roaring bitmap format
+ * specification](https://github.com/RoaringBitmap/RoaringFormatSpec). The object does not own the
+ * underlying storage; it simply provides algorithms over the referenced bytes.
+ *
+ * @note The reference reads directly from the serialized representation without deserializing.
+ *       It supports 32-bit and 64-bit key types. For 32-bit bitmaps the layout follows the
+ *       "Standard 32-bit Roaring Bitmap" format; for 64-bit bitmaps, the "portable" format is
+ *       supported.
+ *
+ * @tparam T Key type stored in the bitmap. Must be `cuda::std::uint32_t` or `cuda::std::uint64_t`.
+ */
 template <class T>
 class roaring_bitmap_ref {
   using impl_type = detail::roaring_bitmap_impl<T>;
 
  public:
-  using storage_ref_type = typename impl_type::storage_ref_type;
-
+  using value_type       = T;                                     ///< Key type stored in the bitmap
+  using storage_ref_type = typename impl_type::storage_ref_type;  ///< Implementation storage ref
+
+  /**
+   * @brief Constructs a non-owning reference from an implementation-specific storage reference.
+   *
+   * @param storage_ref Reference to the underlying serialized bitmap storage
+   */
   __host__ __device__ roaring_bitmap_ref(storage_ref_type const& storage_ref);
 
+  /**
+   * @brief Constructs a device-side reference from a raw pointer to a 32-bit Roaring bitmap.
+   *
+   * @note This constructor is only available when `T == cuda::std::uint32_t` and can be used in
+   *       device code to create a lightweight view over device-resident serialized bytes.
+   *
+   * @param bitmap Pointer to the beginning of the serialized bitmap in device memory
+   */
   template <class U = T,
             class   = cuda::std::enable_if_t<cuda::std::is_same_v<U, cuda::std::uint32_t>>>
   __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap);
 
+  /**
+   * @brief Bulk membership query for keys in `[first, last)`.
+   *
+   * @note This function synchronizes the given stream. For asynchronous execution use
+   *       `contains_async`.
+   *
+   * @tparam InputIt  Device-accessible random access input iterator of keys convertible to `T`
+   * @tparam OutputIt Device-accessible random access output iterator to `bool`
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last  End of the sequence of keys
+   * @param contained Output iterator where results are written; `true` iff the corresponding key
+   *                  is present in the bitmap
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
   template <class InputIt, class OutputIt>
   __host__ void contains(InputIt first,
                          InputIt last,
                          OutputIt contained,
                          cuda::stream_ref stream = {}) const;
 
+  /**
+   * @brief Asynchronously performs a bulk membership query for keys in `[first, last)`.
+   *
+   * @tparam InputIt  Device-accessible random access input iterator of keys convertible to `T`
+   * @tparam OutputIt Device-accessible random access output iterator to `bool`
+   *
+   * @param first Beginning of the sequence of keys
+   * @param last  End of the sequence of keys
+   * @param contained Output iterator where results are written; `true` iff the corresponding key
+   *                  is present in the bitmap
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   */
   template <class InputIt, class OutputIt>
   __host__ void contains_async(InputIt first,
                                InputIt last,
                                OutputIt contained,
                                cuda::stream_ref stream = {}) const noexcept;
 
+  /**
+   * @brief Device-side membership query for a single key.
+   *
+   * @param value Key to test for membership
+   *
+   * @return `true` iff `value` is contained in the bitmap
+   */
   __device__ bool contains(T value) const;
 
+  /**
+   * @brief Number of keys stored in the bitmap.
+   *
+   * @return Count of keys in the bitmap
+   */
   [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept;
 
+  /**
+   * @brief Checks whether the bitmap contains no keys.
+   *
+   * @return `true` iff `size() == 0`
+   */
   [[nodiscard]] __host__ __device__ bool empty() const noexcept;
 
+  /**
+   * @brief Returns a pointer to the beginning of the serialized bitmap bytes.
+   *
+   * @return Pointer to the serialized storage
+   */
   [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept;
 
+  /**
+   * @brief Size in bytes of the serialized bitmap storage.
+   *
+   * @return Number of bytes occupied by the serialized bitmap
+   */
   [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept;
 
  private:

From a56e3a9392aed3ef6ec9c41daa97660ef773000c Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 13 Aug 2025 10:54:52 -0700
Subject: [PATCH 10/24] Update readme

---
 README.md | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c66f76f30..546ebf99a 100644
--- a/README.md
+++ b/README.md
@@ -259,4 +259,11 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 `cuco::bloom_filter` implements a Blocked Bloom Filter for approximate set membership queries.
 
 #### Examples:
-- [Host-bulk APIs (Default fingerprinting policy)](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJydVmtvGjkU_StXsx8WmuEVbVUJQiSapLtoK5IF2qpaVsjj8TBWBnvqBwRF-e977ZmBgZBqtVRqwL6Pc889vvZzoJnWXAod9P9-Dngc9HthkBGxsmTFgn5AbUyCMNDSKup-d94tBLyDG5nvFF-lBhq0CZfdy99CmHwd345HcHM_fbifjubj-0nb2Xr7z5wyoVkMVsRMgUkZjHJC8U-5E8JXphwQuGx3oeEMFkG5twiaAx9lJy2syQ6ENGA1wzBcQ8IzBuyJstwAF0DlOs84EZTBlpvUpyrjeDjwvQwiI0PQnqBHjr-SuiUQs4fuPqkxeb_T2W63beJht6VadbLCWHc-j2_uJrO7FkLfu30RGTILiv2wXGHh0Q5IjsgoiRBvRrYgFZCVYrhnpEO-VdxwsQpBy8RsiWI-Tsy1UTyy5oi8CifWXzdA-ohA4kYzGM8WAXwczcaz0Mf5Np7_cf9lDt9G0-loMh_fzeB-is2a3I5dq_DXJxhNvsOf48ltCAypw1TsKVeuCoTKHa0sLjicMXYEI5EFLJ0zyhNOoVIQrOSGKYFlQc7UmhdaQ5Cxj5PxNTfE-LVXxflUnYVYiF-4oJmNGVxRS2UnyqRcL7Hvhqk2ten1sY1JldWmQ6UVpu02X23FbIMplhtGjVTnTdgTo9YBW-YSm7Y7b6Wxuwyl1j7FwCU2hZG1X-bCoOK4aGwkj5sL8YyFgVukWLdxHIOw6-Uj22kntiH0ur92u90B7D-dTucKfmeCKWJYuQ3O_nwkkxduw0Pcd9Btvx-UkcbIrjKe64QrbSAlWeLjuWCy3PD0vpFAvErQKjMPXntoG5Xd0uhx6SvzOPCr22xVmygR8M0tszvynKmDBmaXF0I7tgCcBE5eaLL0JkOXfbB3neVk60553auYDDFLiM0MFA12mjwF5Kt3kuv365q7qnJdl_GeaxW-lKm1ift91KCBqys8kR9t9ojAPO-e47fxJFgOU7lyJK6Knru5WMDsY6yFgNoHw_tcTMRZmbtQZ79_pPMaatewRtU5P1v3LpWiG26rHbEVyrYZeo82ZnDfe80yDbFOK_nSWyHvdZdB3QAdvVj2thd1sRRWYh-mcDjaqgIcYBwafF7M5Tz3Ds6wlDOJ40aFIiwzNX_KWiRldu0scRBidxoF8BASkml2zN1ZR1F3PDoptRhlKX9Zpna107efqziFN1xane1KDeGs35eGjnN3EepU2iyGIh34q80oy1q51HixbBj44YHEzB-mw16dFTyq7i7Ur6gJD4VXjf0vcH2ZJiUG8Bbzt7W7RpjwqE6Gizui3N3W9QOhvWOuZEQyvN7wQomJIahzZamxGCushSmjsKeUR9yga8HrSd2fHqbXXTxjOZbl5oUskGArIkSOrHhfXtVWq8oNhmPTlOiUafcEif3MwnrP0yn2dIqSTvEGnUkmkS7HtgM7LI64X2xU8vIXWuNYbGcaVF8qT6yTQbMJnTJgIb9Cu0Xe5H_kPa2kvvSTvKJ5fkA6UeKLBb9XJPhl17dyPTmsnw48xVAVArr48wVfq-4NiK9CdXjUBmJDae_yve3htsxN8eINWhhoSC8ueh-gRRRNh3q9_NCFVgvvLYP_GczB4lZG1pF_Bmc8qsWklGa4uCkerriA9YrH4CWs9vHmONpH7oKXf_y_fwHeCexw))
\ No newline at end of file
+- [Host-bulk APIs (Default fingerprinting policy)](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJydVmtvGjkU_StXsx8WmuEVbVUJQiSapLtoK5IF2qpaVsjj8TBWBnvqBwRF-e977ZmBgZBqtVRqwL6Pc889vvZzoJnWXAod9P9-Dngc9HthkBGxsmTFgn5AbUyCMNDSKup-d94tBLyDG5nvFF-lBhq0CZfdy99CmHwd345HcHM_fbifjubj-0nb2Xr7z5wyoVkMVsRMgUkZjHJC8U-5E8JXphwQuGx3oeEMFkG5twiaAx9lJy2syQ6ENGA1wzBcQ8IzBuyJstwAF0DlOs84EZTBlpvUpyrjeDjwvQwiI0PQnqBHjr-SuiUQs4fuPqkxeb_T2W63beJht6VadbLCWHc-j2_uJrO7FkLfu30RGTILiv2wXGHh0Q5IjsgoiRBvRrYgFZCVYrhnpEO-VdxwsQpBy8RsiWI-Tsy1UTyy5oi8CifWXzdA-ohA4kYzGM8WAXwczcaz0Mf5Np7_cf9lDt9G0-loMh_fzeB-is2a3I5dq_DXJxhNvsOf48ltCAypw1TsKVeuCoTKHa0sLjicMXYEI5EFLJ0zyhNOoVIQrOSGKYFlQc7UmhdaQ5Cxj5PxNTfE-LVXxflUnYVYiF-4oJmNGVxRS2UnyqRcL7Hvhqk2ten1sY1JldWmQ6UVpu02X23FbIMplhtGjVTnTdgTo9YBW-YSm7Y7b6Wxuwyl1j7FwCU2hZG1X-bCoOK4aGwkj5sL8YyFgVukWLdxHIOw6-Uj22kntiH0ur92u90B7D-dTucKfmeCKWJYuQ3O_nwkkxduw0Pcd9Btvx-UkcbIrjKe64QrbSAlWeLjuWCy3PD0vpFAvErQKjMPXntoG5Xd0uhx6SvzOPCr22xVmygR8M0tszvynKmDBmaXF0I7tgCcBE5eaLL0JkOXfbB3neVk60553auYDDFLiM0MFA12mjwF5Kt3kuv365q7qnJdl_GeaxW-lKm1ift91KCBqys8kR9t9ojAPO-e47fxJFgOU7lyJK6Knru5WMDsY6yFgNoHw_tcTMRZmbtQZ79_pPMaatewRtU5P1v3LpWiG26rHbEVyrYZeo82ZnDfe80yDbFOK_nSWyHvdZdB3QAdvVj2thd1sRRWYh-mcDjaqgIcYBwafF7M5Tz3Ds6wlDOJ40aFIiwzNX_KWiRldu0scRBidxoF8BASkml2zN1ZR1F3PDoptRhlKX9Zpna107efqziFN1xane1KDeGs35eGjnN3EepU2iyGIh34q80oy1q51HixbBj44YHEzB-mw16dFTyq7i7Ur6gJD4VXjf0vcH2ZJiUG8Bbzt7W7RpjwqE6Gizui3N3W9QOhvWOuZEQyvN7wQomJIahzZamxGCushSmjsKeUR9yga8HrSd2fHqbXXTxjOZbl5oUskGArIkSOrHhfXtVWq8oNhmPTlOiUafcEif3MwnrP0yn2dIqSTvEGnUkmkS7HtgM7LI64X2xU8vIXWuNYbGcaVF8qT6yTQbMJnTJgIb9Cu0Xe5H_kPa2kvvSTvKJ5fkA6UeKLBb9XJPhl17dyPTmsnw48xVAVArr48wVfq-4NiK9CdXjUBmJDae_yve3htsxN8eINWhhoSC8ueh-gRRRNh3q9_NCFVgvvLYP_GczB4lZG1pF_Bmc8qsWklGa4uCkerriA9YrH4CWs9vHmONpH7oKXf_y_fwHeCexw))
+
+### roaring_bitmap
+
+`cuco::roaring_bitmap` implements a Roaring bitmap following the [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec).
+
+#### Examples:
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WAtv2zYQ_is3DSjkxrac9Dk3zuY16WasS4Yk21A0hUBJtE1YEjWSiuMF-e-7IyVbcp31uTlAJJHH7x787njSrae51kLm2hu-vfVE4g33u17K8lnJZtwbenGZMK_raVmqmJ6Dh1c5PISXslgpMZsb8OMOHAwOnsDpH5PjyRhenp3_dnY-vpycnfZJ1Iq_FjHPNU-gzBOuwMw5jAsW46Wa6cIfXJEdcNAfgE8CV141d-V1XliUlSwhYyvIpYFSc4QRGqYi5cBvYl4YEDnEMitSwfKYw1KYuVVV4Vhz4E0FIiPDUJ7higKfpk1JYGZtOv3mxhTDIFgul31mze5LNQtSJ6yD15OXJ6cXJz00fb3s9zzFwILif5VCoePRCliBlsUsQntTtgSpgM0UxzkjyfKlEkbksy5oOTVLprjFSYQ2SkSlaQWvthP9bwpg-FiOgRtfwOTiyoMfxxeTi67F-XNy-fPZ75fw5_j8fHx6OTm5gLNz3KzT4wltFT69gvHpG_hlcnrcBY6hQ1X8plDkBZoqKKw8cTG84LxlxlQ6s3TBYzEVMdQEgpm85ipHt6DgKhOOamhkYnFSkQnDjB17zzmrKrjKvxV5nJYJh8O4jGWgJFOIF0bCZKzox-X86D2Z0ohUmFVgFBNG9-dFgTJtqYQF2iSBWRU8dGItGDNXpTZBwq_RmPCax0aq_nyXSCpnuKnp7skyF-i_ZmkToik3xc3jLGutFXLHIG1yPmsNOUgLGDx0rPvBZsMcAcKoTBchv2G4bxyj5KYjJfgUjnmGEUevDce4a9qniv_t4CKTCAWJlEq5KAsLDOPfJnqT2ZPcpWGlCZZIbsmQ00sJjw56CAQOzO46MpTD08eNYfALqYxNCmRRxkwHpkpmZI3Ff3vuTPrRSr-yIhdIs3c-JaXGrJwhV8uoj4kftGTrp82aDqZjIbXAqK3WHMRsjhcgnP_kbu0s-onyplRITpqLpVIYcRzTZYqsglOW8XTVJZcxkMYKTWWayiXxnXZCD62KnvOkCpHeonDgLlStZGlUmet-JPLP8S-IUhkFGdOGq4BMSphhu-E7n2bYf2nVJ5lUk6V6fvr4q1r19PFuBZ2a7gElm-FoGuYOZjlWjxxZAL_w1SXeYy5GUqaOUz5iDocub5E8mHAPKs6HxI2wYGaOwLeIC6zEM2DGc045GS74SsMI3r7zO9A7AldLhsNWMTqsVYIFAGKwVUI1Gw9lLHCkHC0QOtRoY3hdL-lCY7YUuXl0EJqjTg0EEARI8ilXHE_RYWPwa-39-cn4-NeTfpZ8S0M9Gqu1WJsqD3dZCRSbF7U0nTv-DjFYYPgGL_ByCPsD-tH93sg-NBwFC9cvSj0PI4Zbtuisse9aShDYgm7QDrHrcfd7e4t_w3yEfP4g7rMG7vOPw92F6erVPYzZFU_fgkZ8JnK_03UqeJ74nRr8DnhKjcbns-vp4_-VXZjDG369l82YzB9BNWvyh6lGYo5qN3bLyjR1O4jP39XPn7GLH9I1buvaH3yBri2Rwc1BhbaR3SnypCVyr8XCJaLYMlRQMh4QxL-YO7h5Xi2APRBfyHa7pR_L9tsNR7A7jUOmNVfGp96S1GDRT3huwilD4XUh7mK7VN1DhgZBxGFdkDA-tRXV60zTg9u7Wj9d6IGumBNnqMp2EvTs8q3qGO2ov32edCshqYdDpDpTK6cL09f_hqT6mK8SQdHl2ku7IuZKweEhuvCKoZh9JyE5HKDhbT00ZtdhJNLK9soXGxQ7dLd24ydu3HuaFn9bV6wpmvPFzB80bUY4Z3B9dqKntMYuCO3dyC02PE1nvhO-Dw
03ulPFsubIpiG3TXIh8pwnOygTrQwerVE5xULlr5V3NjtzzrHJtT7htkrIsKVWq7UxaHfiK45TXOErlAljLFKH8Zyph0d-bYtiy7CQVsbO-05dn6oY7g8StKW4go5TqblfW-Io2W6SNq1Be_yjFVfYtimpepFWb1KFfXfeUQN0RIcFvV7zxKUc-UAZZ8lR42-_ybkl-t4k7W5Q6-kaynZdLE3DtQTaXNtH43Lqv7e4iVcpwERoHG2U5PgaeXvn_K0I3lKDE5bmdI5njFCbHR0FNEyECpEDU3HT6OmavWEzD6sh97nFpdoIwvDV5PVJGFaJZqHxPQa2fqPmsv5U5EmY4u6S71decEW_dempfPEJZjRq6h4OcxzswPfNQQTo42IYtlToMsJpyjla8aJZu-yG6DKO6fPByDXEuxuQrRhhePaw6Ox-a6ntr4EffBnyp8K6M-Qe2J0vDg55XdBi9MVV2UrVsKqvfq36exxAdzkO05zNFwr8Vr1t0HGzcoBL9omQXtejz2BYLdTms56XX8fx_sGTch-nZWHcNz-vh7CjeG9v_xn0mIrnI52FzwbQ62GnZ_CfobMu6aUsi-yHwFREDcw4jlMcvHbf7nCAtmrh3XXreSxPrXmsFd7dO_v3DxFG5ro=))
\ No newline at end of file

From 078ea6758b51af06fc1003a34aad251105cef118 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 13 Aug 2025 17:26:17 -0700
Subject: [PATCH 11/24] Download roaring bitmap .bin files at build time

---
 CMakeLists.txt                                |   6 ++
 README.md                                     |   2 +-
 benchmarks/roaring_bitmap/contains_bench.cu   |  88 ++++++++++--------
 cmake/roaring_testdata.cmake                  |  39 ++++++++
 examples/roaring_bitmap/bitmapwithoutruns.bin | Bin 72616 -> 0 bytes
 examples/roaring_bitmap/bitmapwithruns.bin    | Bin 48056 -> 0 bytes
 examples/roaring_bitmap/host_bulk_example.cu  |  48 ++++++----
 examples/roaring_bitmap/portable_bitmap64.bin | Bin 16506 -> 0 bytes
 8 files changed, 125 insertions(+), 58 deletions(-)
 create mode 100644 cmake/roaring_testdata.cmake
 delete mode 100644 examples/roaring_bitmap/bitmapwithoutruns.bin
 delete mode 100644 examples/roaring_bitmap/bitmapwithruns.bin
 delete mode 100644 examples/roaring_bitmap/portable_bitmap64.bin

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a00476b4e..2bf5f58e3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -88,6 +88,12 @@ target_include_directories(cuco INTERFACE
 target_link_libraries(cuco INTERFACE CCCL::CCCL CUDA::toolkit)
 target_compile_features(cuco INTERFACE cxx_std_17 cuda_std_17)
 
+###################################################################################################
+# - Optionally download RoaringFormatSpec test data -----------------------------------------------
+
+option(CUCO_DOWNLOAD_ROARING_TESTDATA "Download RoaringFormatSpec test data" ON)
+include(${CMAKE_CURRENT_LIST_DIR}/cmake/roaring_testdata.cmake)
+
 ###################################################################################################
 # - optionally build tests ------------------------------------------------------------------------
 
diff --git a/README.md b/README.md
index 546ebf99a..f9cc3efc0 100644
--- a/README.md
+++ b/README.md
@@ -266,4 +266,4 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 `cuco::roaring_bitmap` implements a Roaring bitmap following the [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec).
 
 #### Examples:
-- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WAtv2zYQ_is3DSjkxrac9Dk3zuY16WasS4Yk21A0hUBJtE1YEjWSiuMF-e-7IyVbcp31uTlAJJHH7x787njSrae51kLm2hu-vfVE4g33u17K8lnJZtwbenGZMK_raVmqmJ6Dh1c5PISXslgpMZsb8OMOHAwOnsDpH5PjyRhenp3_dnY-vpycnfZJ1Iq_FjHPNU-gzBOuwMw5jAsW46Wa6cIfXJEdcNAfgE8CV141d-V1XliUlSwhYyvIpYFSc4QRGqYi5cBvYl4YEDnEMitSwfKYw1KYuVVV4Vhz4E0FIiPDUJ7higKfpk1JYGZtOv3mxhTDIFgul31mze5LNQtSJ6yD15OXJ6cXJz00fb3s9zzFwILif5VCoePRCliBlsUsQntTtgSpgM0UxzkjyfKlEkbksy5oOTVLprjFSYQ2SkSlaQWvthP9bwpg-FiOgRtfwOTiyoMfxxeTi67F-XNy-fPZ75fw5_j8fHx6OTm5gLNz3KzT4wltFT69gvHpG_hlcnrcBY6hQ1X8plDkBZoqKKw8cTG84LxlxlQ6s3TBYzEVMdQEgpm85ipHt6DgKhOOamhkYnFSkQnDjB17zzmrKrjKvxV5nJYJh8O4jGWgJFOIF0bCZKzox-X86D2Z0ohUmFVgFBNG9-dFgTJtqYQF2iSBWRU8dGItGDNXpTZBwq_RmPCax0aq_nyXSCpnuKnp7skyF-i_ZmkToik3xc3jLGutFXLHIG1yPmsNOUgLGDx0rPvBZsMcAcKoTBchv2G4bxyj5KYjJfgUjnmGEUevDce4a9qniv_t4CKTCAWJlEq5KAsLDOPfJnqT2ZPcpWGlCZZIbsmQ00sJjw56CAQOzO46MpTD08eNYfALqYxNCmRRxkwHpkpmZI3Ff3vuTPrRSr-yIhdIs3c-JaXGrJwhV8uoj4kftGTrp82aDqZjIbXAqK3WHMRsjhcgnP_kbu0s-onyplRITpqLpVIYcRzTZYqsglOW8XTVJZcxkMYKTWWayiXxnXZCD62KnvOkCpHeonDgLlStZGlUmet-JPLP8S-IUhkFGdOGq4BMSphhu-E7n2bYf2nVJ5lUk6V6fvr4q1r19PFuBZ2a7gElm-FoGuYOZjlWjxxZAL_w1SXeYy5GUqaOUz5iDocub5E8mHAPKs6HxI2wYGaOwLeIC6zEM2DGc045GS74SsMI3r7zO9A7AldLhsNWMTqsVYIFAGKwVUI1Gw9lLHCkHC0QOtRoY3hdL-lCY7YUuXl0EJqjTg0EEARI8ilXHE_RYWPwa-39-cn4-NeTfpZ8S0M9Gqu1WJsqD3dZCRSbF7U0nTv-DjFYYPgGL_ByCPsD-tH93sg-NBwFC9cvSj0PI4Zbtuisse9aShDYgm7QDrHrcfd7e4t_w3yEfP4g7rMG7vOPw92F6erVPYzZFU_fgkZ8JnK_03UqeJ74nRr8DnhKjcbns-vp4_-VXZjDG369l82YzB9BNWvyh6lGYo5qN3bLyjR1O4jP39XPn7GLH9I1buvaH3yBri2Rwc1BhbaR3SnypCVyr8XCJaLYMlRQMh4QxL-YO7h5Xi2APRBfyHa7pR_L9tsNR7A7jUOmNVfGp96S1GDRT3huwilD4XUh7mK7VN1DhgZBxGFdkDA-tRXV60zTg9u7Wj9d6IGumBNnqMp2EvTs8q3qGO2ov32edCshqYdDpDpTK6cL09f_hqT6mK8SQdHl2ku7IuZKweEhuvCKoZh9JyE5HKDhbT00ZtdhJNLK9soXGxQ7dLd24ydu3HuaFn9bV6wpmvPFzB80bUY4Z3B9dqKntMYuCO3dyC02PE1nvhO-Dw
03ulPFsubIpiG3TXIh8pwnOygTrQwerVE5xULlr5V3NjtzzrHJtT7htkrIsKVWq7UxaHfiK45TXOErlAljLFKH8Zyph0d-bYtiy7CQVsbO-05dn6oY7g8StKW4go5TqblfW-Io2W6SNq1Be_yjFVfYtimpepFWb1KFfXfeUQN0RIcFvV7zxKUc-UAZZ8lR42-_ybkl-t4k7W5Q6-kaynZdLE3DtQTaXNtH43Lqv7e4iVcpwERoHG2U5PgaeXvn_K0I3lKDE5bmdI5njFCbHR0FNEyECpEDU3HT6OmavWEzD6sh97nFpdoIwvDV5PVJGFaJZqHxPQa2fqPmsv5U5EmY4u6S71decEW_dempfPEJZjRq6h4OcxzswPfNQQTo42IYtlToMsJpyjla8aJZu-yG6DKO6fPByDXEuxuQrRhhePaw6Ox-a6ntr4EffBnyp8K6M-Qe2J0vDg55XdBi9MVV2UrVsKqvfq36exxAdzkO05zNFwr8Vr1t0HGzcoBL9omQXtejz2BYLdTms56XX8fx_sGTch-nZWHcNz-vh7CjeG9v_xn0mIrnI52FzwbQ62GnZ_CfobMu6aUsi-yHwFREDcw4jlMcvHbf7nCAtmrh3XXreSxPrXmsFd7dO_v3DxFG5ro=))
\ No newline at end of file
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WAtv4zYS_itzWqCQN7blZB9pncfVG2d7Rrd2YXu7KDYLgZJom7AsqiQVxxfkv3dISpbkKN1He-cAsUUOv3nwm-FQ946kUjKeSKf_8d5hkdM_bjsxSZYZWVKn74RZRJy2I3kmQv3sPb9J4Dlc8XQn2HKlwA1bcNI7eQXj30bD0QCuJtNfJ9PBfDQZd7WoEX_HQppIGkGWRFSAWlEYpCTEr3ymDb9Roe2Ak24PXC1w4-RzN07rzKDseAYbsoOEK8gkRRgmYcFiCvQupKkClkDIN2nMSBJS2DK1MqpyHGMO_J6D8EARlCe4IsWnRVUSiNqbrj8rpdK-52232y4xZne5WHqxFZbeu9HV9Xh23UHT98veJzEGFgT9I2MCHQ92QFK0LCQB2huTLXABZCkozimuLd8KpliybIPkC7UlghqciEklWJCpWvAKO9H_qgCGjyQYuMEMRrMbB94MZqNZ2-B8GM3_M3k_hw-D6XQwno-uZzCZ4maNhyO9Vfj0Fgbj3-Hn0XjYBoqhQ1X0LhXaCzSV6bDSyMZwRmnNjAW3ZsmUhmzBQigIBEt-S0WCbkFKxYZZqqGRkcGJ2YYposzYI-eMKu8muUmesSSMs4jCeZiF3BOcCET0A6Y2JO2G2erykUymWMzUzlOCMCW7qzS9PESKiCdV5IX4L6KLyycnWaKaJ9Uupb5VUBNQK5FJ5UX0Fh3xb2mouOiumkRivkRCxM2TWcIwdpLEVYiq3AI3npJNbS3jDYOaIMmyNmQhDaD33DL2R5NJKwTwgyxe-_SO4J5TjK-dDgSjCxjSDe4Weq0o7pnUe5znTn1bkIUaBUkYc77OUgMMg19HsqwKo8SmcK4JtpgYnGA-bDm8OOkgEFgwwxhkN4XXLyvD4KZcKJNQyMANUS1YCL7R1hj8j1Nr0hsj_daIzJCin1yd0BIzeok8z4IuFg2vJls8lWtamMoplwyjttvzFytBuAZm_dfuFs6inyivMoHE1nMhFwIjjmMyi5GPMCYbGu_a2mUMpDJCCx7HfKtzRe-E7BsVHetJHiJ5QH7PfulKxzMlskR2A5aYJa6NYOtbnPWCmAfeq-PTUxL94GkDI6JIs7LW15n5_7PxqwwsiJQ_v35Z2mgp9w_a-Ppls7rWPjF-xBOOwryaG_lBktNpQ9aYd6munNAZXr2_mvjDyYfxu8lg6E8ng-lo_JM_v57Nh4P54GIy1idMgOWZqn3ymcKqKGJjJmPNwVqWICfhZ7qb42-sDAHnsWW4i3b3-7aKIJUx_b_LM9DXTPVTolZo_D3iAslQ15ImVFcIf013Ei7g4ye3BZ1LsJWt36-VxvNCJRgA0PlklOjTB9sLLLdaOVrApC_RRv-2WNKGymyGhfrFia8uWwUQgOfBFZZD9PCPjGLiGnuK06qkyj3y-aHgS5l9BcQ_Rs7p9WD4y3V3Ez3TQx09VqgxLuQBaXLKmH5WSGsX3AYxWGO0e2f4dQ7HPf3Rv48uzEMlLmDgumkmV35AcIfXrT32Q00JAhvQEu0c2z37--ho_VeYL5DJn8U9reB-_2W4TZi22D5BsKZ4ugY0oEuWuK22VUGTyG0V4A9AY91hfTsZX7_8CjI2VgRDxf8FE7EElVx8pNqWvs_R0rj3eVpqMUvLO7O9WRzb3cbnH4rnb9jxz-ka1HUd9_6GrgOR3t1JjlbKNoq8qok8aTGzScsODGU6cU80xF-Y27v7Pl8AR8D-ZmaYLf3SzLgvOYItfOgTKalQrm6_tRo8TyKaKH9BUHhf49vYF-a_YYMG6WNpX7wwPoUV-Z2v6sH9Q6Fff-kH_Y1JMUFVJlP0s8
3NvDU2o-7hUdXOhbjs95HqROysLkx1919aqou5zREUXS68NCtCKgScn6MLbwmKmYublsMBPXyoR4-ZdRiJOLc998UExQw97N34iSp7mZXsv8YVY4qkdL10e1WbEc4aXBzL6KleYxb45teFXaxoHC9dK_wUGm50K49lwZHy5mFuAylLEho1UCbYKTy1g2yxoMLdK2-VOzOl2M0bn3BbOWzw7iB2e2PQ7sgVFKeowHum8kMi1Xm4IuL5pVvYIsjWT7mRMfOuVdfVVQz3BwlaU5xDhzGX1K1YkhfevOsr7g_FbWHPHkveem9Y9if18S828ayyxbYvqh4ALsHUldhRx5FOBjxs9HsJJBdLKl1Ka99b5S1VrcVq8JSA3S_NUYnf9sKe3z-qm10vCLrpuyyNsLVAB1eXAsPaiqo3eOPp5DeeikdkiWvzO421Xssf3tqtCvlktWmXVhTTVS-Lm5eOnlZkNYsnA6jXmY4WF_ilzMU-DnqcL9xHSqt25IZhJaj0AbrKMbW7f7D8yzO8pgYnTJ7rpmdDNKrplp-xRYRXatO-F1277tj94Whaze992w2aUn7EBNrduOps76bMwlC_uql_LmxD39wR7cGPsKA9cbPMy3KB_t03AX4tmj2W6mjNFzanoIit12i5rde5hn5eqd1C479xAJ2jOKznDMER5LByV_a1XNnDJcc4-UyfhqXO_RnRfH1r3Lb8foZkMKTEAsKwGpj3l7Wr3o1T9gH558b5gvtf63Dho5Mp9876k0RsgYx12o5-m4qlUZRvh53kNgyPT15lxzhtzcJJp4NwF-HR0fEpdIgIVxdy45_2oNPBvlnhP6W7gagTk01g3ifHLKhghmEY4-CtfQWMA5pwa-ehXcxjWa7NY9FyHj6Zvz8Bdgyhgg==))
\ No newline at end of file
diff --git a/benchmarks/roaring_bitmap/contains_bench.cu b/benchmarks/roaring_bitmap/contains_bench.cu
index 2f727c541..da66bea4d 100644
--- a/benchmarks/roaring_bitmap/contains_bench.cu
+++ b/benchmarks/roaring_bitmap/contains_bench.cu
@@ -18,74 +18,80 @@
 #include <benchmark_utils.hpp>
 
 #include <cuco/roaring_bitmap.cuh>
+#include <cuco/utility/key_generator.cuh>
 
 #include <nvbench/nvbench.cuh>
 
+#include <cuda/std/cstddef>
+#include <cuda/std/cstdint>
 #include <thrust/device_vector.h>
+#include <thrust/universal_vector.h>
 
-#include <filesystem>
 #include <fstream>
-#include <iostream>
-#include <vector>
+#include <string>
 
-void roaring_bitmap_contains(nvbench::state& state)
-{
-  namespace fs = std::filesystem;
-
-  // Get the path of the current source file
-  fs::path source_file_path = __FILE__;
-  fs::path source_dir       = source_file_path.parent_path();
+using namespace cuco::benchmark;  // defaults
+using namespace cuco::utility;    // key_generator, distribution
 
-  fs::path path      = source_dir / "../../examples/roaring_bitmap/bitmapwithoutruns.bin";
-  fs::path full_path = path.lexically_normal();
+template <typename T>
+void roaring_bitmap_contains(nvbench::state& state, nvbench::type_list<T>)
+{  // Benchmarks bulk `contains_async` on a bitmap loaded from the file named by the "BitmapFile" axis
+  auto const num_items   = state.get_int64("NumInputs");
+  auto const bitmap_file = state.get_string_or_default("BitmapFile", {});
 
-  std::ifstream file(full_path, std::ios::binary);
-  if (!file.is_open()) { state.skip("Failed to open bitmap file"); }
+  std::ifstream file(bitmap_file, std::ios::binary);
+  if (!file.is_open()) { state.skip("Bitmap file not found"); return; }  // skip() only flags the state; leave before touching the unopened stream
 
   // Get file size
   file.seekg(0, std::ios::end);
   std::streamsize file_size = file.tellg();
   file.seekg(0, std::ios::beg);
 
-  char* buffer;
-  CUCO_CUDA_TRY(cudaMallocHost(&buffer, file_size));
+  thrust::universal_host_pinned_vector<cuda::std::byte> buffer(file_size);  // owning staging buffer for the serialized bytes
 
-  file.read(buffer, file_size);
+  file.read(reinterpret_cast<char*>(thrust::raw_pointer_cast(buffer.data())), file_size);
   file.close();
 
-  cuco::roaring_bitmap<cuda::std::uint32_t> roaring_bitmap(
-    reinterpret_cast<cuda::std::byte const*>(buffer));
+  cuco::roaring_bitmap<T> roaring_bitmap(thrust::raw_pointer_cast(buffer.data()));
 
-  std::vector<cuda::std::uint32_t> keys;
-  for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) {
-    keys.push_back(k);
-  }
-  for (cuda::std::uint32_t k = 100000; k < 200000; ++k) {
-    keys.push_back(3 * k);
-  }
-  for (cuda::std::uint32_t k = 700000; k < 800000; ++k) {
-    keys.push_back(k);
-  }
+  thrust::device_vector<T> items(num_items);  // query keys, filled by the generator below
 
-  // multiply the keys for more accurate benchmark numbers
-  for (int i = 0; i < 13; i++) {
-    keys.insert(keys.end(), keys.begin(), keys.end());
-  }
+  key_generator gen{};
+  gen.generate(distribution::unique{}, items.begin(), items.end());
 
-  thrust::device_vector<cuda::std::uint32_t> keys_d(keys.begin(), keys.end());
-  thrust::device_vector<bool> contained(keys.size(), false);
+  thrust::device_vector<bool> contained(items.size(), false);  // one result flag per query key
 
-  state.add_element_count(keys.size());
-  state.add_global_memory_reads<cuda::std::uint32_t>(keys.size(), "InputSize");
+  state.add_element_count(items.size());
+  state.add_global_memory_reads<T>(items.size(), "InputSize");
+
+  auto& summ = state.add_summary("BitmapSizeMB");  // report the input file size alongside the timings
+  summ.set_string("hint", "BitmapSize");
+  summ.set_string("short_name", "BitmapSizeMB");
+  summ.set_string("description", "Bitmap size in MB");
+  summ.set_float64("value", static_cast<double>(file_size) / (1024 * 1024));  // bytes / 2^20
 
   state.exec([&](nvbench::launch& launch) {
     roaring_bitmap.contains_async(
-      keys_d.begin(), keys_d.end(), contained.begin(), {launch.get_stream()});
+      items.begin(), items.end(), contained.begin(), {launch.get_stream()});
   });
-
-  CUCO_CUDA_TRY(cudaFreeHost(buffer));
 }
 
-NVBENCH_BENCH(roaring_bitmap_contains)
+NVBENCH_BENCH_TYPES(roaring_bitmap_contains,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::uint32_t>))
+  .set_name("roaring_bitmap_contains")
+  .add_int64_power_of_two_axis("NumInputs", {32})
+// Default benchmark is only available if the Roaring bitmap testdata has been downloaded
+#ifdef CUCO_ROARING_DATA_DIR
+  .add_string_axis("BitmapFile", {std::string(CUCO_ROARING_DATA_DIR) + "/bitmapwithruns.bin"})
+#endif
+  .set_max_noise(cuco::benchmark::defaults::MAX_NOISE);
+
+NVBENCH_BENCH_TYPES(roaring_bitmap_contains,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::uint64_t>))
   .set_name("roaring_bitmap_contains")
+  .add_int64_power_of_two_axis("NumInputs", {31})
+// Default benchmark is only available if the Roaring bitmap testdata has been downloaded
+#ifdef CUCO_ROARING_DATA_DIR
+  .add_string_axis("BitmapFile", {std::string(CUCO_ROARING_DATA_DIR) + "/portable_bitmap64.bin"})
+#endif
   .set_max_noise(cuco::benchmark::defaults::MAX_NOISE);
\ No newline at end of file
diff --git a/cmake/roaring_testdata.cmake b/cmake/roaring_testdata.cmake
new file mode 100644
index 000000000..8dded834c
--- /dev/null
+++ b/cmake/roaring_testdata.cmake
@@ -0,0 +1,39 @@
+# =============================================================================
+# Copyright (c) 2025, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# Only act if enabled
+if(NOT CUCO_DOWNLOAD_ROARING_TESTDATA)
+  return()
+endif()
+
+set(CUCO_ROARING_DATA_DIR "${CMAKE_BINARY_DIR}/data/roaring_bitmap")
+
+file(MAKE_DIRECTORY "${CUCO_ROARING_DATA_DIR}")
+
+set(ROARING_FORMATSPEC_BASE "https://raw.githubusercontent.com/RoaringBitmap/RoaringFormatSpec/5177ad9")
+
+rapids_cmake_download_with_retry("${ROARING_FORMATSPEC_BASE}/testdata/bitmapwithoutruns.bin"
+                                 "${CUCO_ROARING_DATA_DIR}/bitmapwithoutruns.bin"
+                                 "d719ae2e0150a362ef7cf51c361527585891f01460b1a92bcfb6a7257282a442")
+
+rapids_cmake_download_with_retry("${ROARING_FORMATSPEC_BASE}/testdata/bitmapwithruns.bin"
+                                 "${CUCO_ROARING_DATA_DIR}/bitmapwithruns.bin"
+                                 "1f1909bfdd354fa2f0694fe88b8076833ca5383ad9fc3f68f2709c84a2ab70e3")
+
+rapids_cmake_download_with_retry("${ROARING_FORMATSPEC_BASE}/testdata64/portable_bitmap64.bin"
+                                 "${CUCO_ROARING_DATA_DIR}/portable_bitmap64.bin"
+                                 "b5a553a759167f5f9ccb3fa21552d943b4c73235635b753376f4faf62067d178")
+
+# Define macro only when data is available
+add_compile_definitions(CUCO_ROARING_DATA_DIR="${CUCO_ROARING_DATA_DIR}")
\ No newline at end of file
diff --git a/examples/roaring_bitmap/bitmapwithoutruns.bin b/examples/roaring_bitmap/bitmapwithoutruns.bin
deleted file mode 100644
index a99fd50aff79b98fa93b3219fc6226fa238d72e2..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 72616
zcmeI((;Mqb-01P7u^Kf^(llz)c6l$`wr$(CZQHi(wU=$%wte>dN1Susi}^kCedcB^
zW<K|mGd%<`(Em&U2!>n;g=imxDp$dwDpiR8cJjX{C?7{d^C~dV-@h#MH*@{}PCxpu
z6!3q!|5f0>8vNIQ|5^Y+%Xj_P&HwuO-y!H3a*o)}%;$&8?bYf=emjLHB)_G)m+n}`
z^;xIq?4P%J!SY43mvqXK72j34R%3si<qgI+>E5D#o8lePchP!4eV+9{H+cK-`J;!A
zZ$G*E^!&3^%oi4aTXt{Nv32V=P2bjkSMz=44`x5=oFvb_yKwEw{u|5hjDOJmN&Oea
z-=zPbeSv;F`+W}Hj?JfrbM3|IN`5_sDY)PKdnt~kS)YD-=Kk56=PsW=dts+IS^8bM
zYnAp_TV88?z3z?bH!I#MeLJla)a_Z%bA7iD{GXSA0Rlh(2mk>f00e*l5C8%|00;m9
zAOHk_01yBI|2G6mO!UD50zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_
z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_
z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_
z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_
z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2>d@2fFK0IAqrw49+Dsh(jWt}
zAQ$qXB$NtD3#ErLL0O?3P;Mw6R1hix6^BYeWuXdCWvCid6RHE%hZ;dmp%zeUs2$W1
z>H>9#dO>}m0nlJ*7&H<Z1C56!K~td_&}?WPv=CYXEr(V?YoQI$W@sC<6WRmqhYmtV
zpySXf=qz*rx(r=|ZbEmU`_LokDf9w*4ZVXtLSLZo&@bpO6o4_9gc&#pi*N{5;RtNP
z4(!1RI3=70{s+zoXMwZBx!}BT0k|++3@!<mfy={{;Hq#9xHeo5ZU{Glo5QW(wr~fy
zGu#dC3HO2f!-L?V@CbM`JPw`+Pl2byv*5Y#0(ddJ3|<MZf!D*E;H~fucsINc{ue$B
zAA?WAXW;YjCHN|Q1HKL4gCD|A;OFow_$~Ya{tSPEf5LxYIDiI-03F~0LO>3L19~7D
zumiC`Jdh%gI*=}qA&@zcEs!&iCy+l-C{Q#|B2YR|E>JO0B~U$3D^NGkAka9_EYLF0
zCeS|6DbO|0BhWk0FEB7LBrrTMDlj%MAuu^GEif}MCon&-D6llJBCtBJF0e7MC9plP
zE3h|kAaE#fG;kttI&dy<F>obtJ#Z^<H}D|vIPfg+GVmtwKJY2<HSi<wI{+aFf+G~d
zB0M4?3ZfwfVj(W#BS|C`k`_siWJ0ncIgs2)KBOR01SyV`Ldqf)kjh9kq$W}asgE>5
znj$Tb)<`>~Bhm%wj`TwMA_I`Y$S`CiG6orsOhTq2GmzQHJY*rV1X+%(Le?T1kj=<8
zWGAu**^eAVjv&X8Q^;B50&*F-hTKH%Aor0+$W!D6@)~)Ed_=w=-;rO)UnGEHD2Xy?
z5Eao7s-h9pL><&a6KF~_4f+q75zT^TM{}Wh(E?~;v=~|vErXUvE1^}<8fa~_9@-FX
zf;LB6p>5F)XlJw=+7s=A_D2VyL(viFXmlJp5uJigM`xjP(FN#YbQ!u5U4yPiH=$e6
z9q4X!ANntP7(IrbM9-k-(M#x6^agqxy@x(TpP<jtSLj>x1Ns^LhW<qVpfHAF1V&>V
zCSWoa#&j%-*;ovVV=1uISUM~NmKn>2<;3z}`LRM+QLF@38Y_oY#HwJ`v07MNtO3>-
zYlgMN+F<RmPFPp02i6<whYiGrV8gLd*jQ`=HW{0S&BW$l^RY$PQfvjb8e4~L#I|7D
zv0d0+>;QHMJBppaPGje=i`W(HI(7@Yi#@;|W6!Xc*c<FU_6hrn{lI=>5RTwDPT?%h
z;}Wjm8gAeg?&3b4#8csE@$`5mJS(09&yDB93*tra;&>^%EM5Vxj90^J;&t%)cq6<i
z-U4rpx5GQ)UGVOBFT5{403VDG!$;y{@bUO0d@4QzpN-GM7vf9s<@hRmExrNYjBmqt
z;(PG@_(A*#ejGoApT#fWm+@=(P5cgiAAf{D#b4mB@pt$~{0sga|Aqg>0|Z8p1VaP~
zkq8kg5g|;%Av_{Mq$JW1{}368EJSu97m=4JKolm55haN-M0uhTQI)7c)F$c?4T&a1
zbD|Z|mgqoqCb|(li9SSsVh}Nu7(t9C#t{>VDa3SQ7BQDtKrAMf5i5x`#Cl>Av6a|C
z>?ZaR{}P9ZW5h|~3~`>gL|i3q5VwhY#6#i<@tk-?yd^#mpNVh8PvQ>&lPF1$G|7<y
zDU)GRC!?fI#>hCCf=o@OBQubh$!ug!G7p)bEJPM1OOU0>a%4ra3R#`3Mb;%7kd4V^
zWJ|IQ*`Dk~b|rg|y~%#$KynB<oE$}tB`1)R$!X+Fat=A4TtqG<SCFg8b>v2J3%Q-#
zMeZdJkcY^l<O%XLd5*kDULmiOx5&HX1M)HXjC@JHA>Wgq$gkuN@;3=l2!&G=#Zo*a
zQ3|C|24zt$<x@#26_u7sPi3OAQaPyHR6eR8RfH-|m7>a06{yNoHL506hpJCCqMA}I
zsMb_Fsw35f>Q42d`cea^!PGEnBsGQ_PfenxQZuO8)I4e-wS-zut)kXa8>r3HHfkre
zhuTjaq>fO>sZ-Qh>H>9{x<=ik?ojusN7Pg51@)SGM}4HeP~WLv)L$w<V>C%KbdVP5
z5UtV?+N2%YqZ4#WIt~2~osrH$XQy+~dFcXlVY(Pyk}gA+rz_D_=^Auxx*pw-ZbCPw
zThVRl4s>U_8{L!cL-(f#(L?DG^k{k<J&~S5Pp4<mbLj>2VtN_9l3qivr#I1C=^gZL
zdLR8SeV9H*pQO*w=jluIRr&^fo4!Xsq@U2w=~why`UCx${zm_#|IjdlG6X|293wC?
z6J~TK%GgYdi8Cpf)J!@i1CyD_#^hx3F!`B6Oi`u;Q<^EqRAj0!)tOpMU8Vukm}$ne
zWZE$8nNCbsrU%oT>BkIYhA_jKQOsCo0yCMJ#>`~qF!Py3%u;3rvzl4QY-F}D+nHU=
zUgiLEh&jrfU`{jVn2XF6<~nnWxyw9Y9y8CFm&_aHJ@bkA%KTt{GZ2fgI7_iC%d--z
zuo`Qy7VEM;n`Be5Y1#B_CN?XZgU!w6V+*oH*y3y{wk%tLt;|+qYqE9N`fMY%Dcgc=
z&9-AZvR&BjY%jJiJAfU`4r52MW7zTRBz7u0gPqOJV;8bZ*yZdhb}hSs-OO%dcd~of
z{p>;Z2z#78#hzs^u$S3u>`nF#d!K#8K4o99ui1C(NA?T*o&ClBWdj_>ksQMXIgtx-
zDi`5Q&fz>R!KLKVaQ|=_xh!0EE*F=VE5H@zig6{mGF*AC5?7V0!PVyKaSgd9Tyw4!
z*Ou$Rb>_NpJ-I$ye{K*rlpDc~=EiXoxhdRqZWcF}Tfi;mmT@b&HQah`6StMy!R_Yu
zasP6MxntZ(?hJRHyTo1PZg97`d)!0r3HO|P#l7V|aG$wv+)wTg2M5t05u}4$PzcJw
za8M6MgLW_$j0aN$QwP%pGXygSvjuYo^91t;3k8b?O9V>?%LOY2s|2eDYX$2D8w48%
zn+012+XUMOI|aK2djxw2`vnIEhXjWQM+L_QCj=)4rv+yQ=LF{m7X_CFR|HoF*9A8Q
zw*<EbcLnzb4+IYdj|NW!PY2HhF9xp!uLo}h?*<<P9|xZWUk2X<-v>VhzXpE<e+MBR
z;c=egS)S)5Ug0&~;4R+eeLl&j;?wf!`AmFPJ_nzh&&L<!i}1zyQhZsy0$-W0#@FQQ
z@b&pdd{e#!-<og7cjUY9-T7X8Uw!~Tm><TE<j3&i`APg#eg;3ApT{rcm+;H^Rs33h
z1HYNy#_#0!@ca3L{1N^*e~LfLU*IqE*Z7<K9sWN5h=0nz;9v9a_>cS-{yYDR|H}si
zOdthD2nwPQ5>z1~n1UmCLPAI>q!IoRG74FQ>_RRfuTVfJEEE$;3T1@yLM5T9P(!FK
z)Ds#CO@!t`E1|8>LFg=W6M71Lg#N-HVW==d7%hwwCJIx8>B1~wuCPE@EG!dN3TuS*
z!X{y>utV4_>=XVK4hzSGlfoI{yl_dlD%=om3-^SF!V}@S@Je_qd=Neh--MsS9|0Co
zkq~K-69rKg!=f%mMO%!CaWRFMT1+Qq5HpL}#GGOtF~3+yEGm`|ON-^iieeS9x>!rB
zD>e`ti_OHAVjHo&*h%av_7HoE{ltOd5OKITN*pUr5GRY%#F^q8alW`nTq>>*SBvY!
zjp7z@ySPi-D;^LJiATi~;%V`mcu~9}UKekPcf|+dWAT~zQhX!67e9$##UJ8t5t0xI
zmneyqcuA5JNs|o8l3dA`l2R%ut(0EMBxRLyNV%naQbDPRR9q@0m6a+;m8EJ@O{tDl
zUuq;Zm0C!xrFK$Bsf*NI>LvA+21tXYVbVxxj5J=FBu$lONVBDR(n4v8v|L&xt(7)N
zo26~iPHB&{Upgoqk&a8Jq_ffm>9TZ9x+&d}?n{rPr_u}Qwe(K<D1DK>OTVPQQb5LJ
zQfB0!EXpBSl_Rn#JF+Jy<dkw6`5!r>oJGzq=aTcv1?0kVF}b8%MlLT`lB>!!<l1sQ
zxuM)dZZ5Zy+sYl}&T==or`$*GFAtK3$|K~_@;G^-JVl-^&ywfL3*^P}GI^!EMqV#(
zlDEn`<lXW<`Cs|4d`vzmpOMeYm*lJR4f(cwPktyrk)O-2<hSw%`Lp~@{we>F;Sd@k
zLUf1=2_ZQY4(Xw2$PUFq@lc9T>QK5+hEV2EwouMco>2Z!p-|CKiBRcKxlqMWl~DCi
ztx(-igHYp8vrx-Wn^5~ur%=~Wk5KPWztF(YkkIhZsL<HZgwW*Bw9w4ZoY4HxqR`UN
ziqPuNy3oeZmeBUluF&4lfzYAQ(a?#|>Cm~*#n6?|_0X-*-Oz*3<IuCv%g~$9`_QM*
z*U*p9?+~OQ3a(HJtMH1XD2k>Silw-UuOyXJN?Ikol1a&`<WO=e`ILf65v90NN-3*U
zP%10cl$uH%rM}WgX{xkPS}X08j!GA$yV6VPs|-*EE5nqL${1z5GD(@L%ur@4^OS|k
z5@orvN?EIHP&O;ul%2{RWxsMzIieg_PAO-V3(95XnsQUQquf^>DNmIb%4_AF@=^Js
zd{=%cf0aNO3zK0c91M%$P*@E|!e-bBd*MVlWjIavpK!)-mT>lPu5jLPfpFn)v2e+7
znQ-}VrEt}7jd1O7y>P>DlW_BJt8m+Jhj8a`w{XvJpK$;1pzzS}i16s}xbVdAl<@TM
ztnl3Mg7D(-vhd3An(+GYrtsGAj_~gAzVN@{!{KA$li@Sr^WjV3tKl2r+u?iRhv6sT
z=iyi3x8V=r&*5+3pW#1YSVdJrrBzN9R9OwHx*An&HKxYZ6l!WUoti<-tY%Yls(IA>
zY9Y0#T0$+YmQyRLRn+QgEw!%NKy9oxQ(LNS)b?s8wX51g?XC7x2dYEV;p!-LtU5uR
ztWHyBs&mx&>LPWix<Xy8u2VOvTh#69E_JVZKs}@$RZpm=)pP1a^@@63y`|n&AE=Mj
zXX;D!jrv~wq<&R@sJ~T6Lo{5YG*;s^NmDdUGc-$cHD60=skF3OdM%TdRm-8}*79iu
zwIW(^t&~<)tDsfZs%bT~I$C|Lk=9gep|#f9X&tpLT6e9N)>j*#4c3NfBegNwcx{q4
zRhyyB*5+vowI$keZI!lG+n{aMwrM-HJ=%Wlpmsz%uAS1(Y8SN2+BNN_c1OFfJ<^_P
zFSOU%JME+PMf<M((*9}z9n(pj(Sy3Ehjdks=%()Io}SQC>S^?U^o)8IJ-ePu&#M>E
z3+u)7l6o1vyk1GKs@KqK>-F@8dK105-b!z)chEcQ-SnP%AHBamNFS<?&`0az^ojZu
zeY!qNpQ|s>7wgOPmHHZey}n7`s_)Qu>-+S7^~3rx{iJ?IKd)cXuj)7S+xk8Iq5edF
zuD{aX>L2va`ZxWj{zr!+XoQH+5iTM`<VZN8N1_os5{twmDI%#O=^_~-nIqXEIU{)@
z`6GoQMI$95r6c7c6(dz5)g!ecbt4TTjU&w>EhB9r?IWEcT_Zgry(9f110zEs!y}_2
zV<QtHlOxk2Gb3{%^COEQOCu{Ht0U_o8zWmH+atRodm{%ThayKKCnBdK=OPy)S0dLV
zw<32V4<e5v&mu1)ZzAs_pCVr)KO(;)kbxMuK^d&U8<L?Inqe50;Tpb?G*TI9jr2w)
zBdd|a$Zg~^3K~U>;zlW>tWm+JY*aI98g-2NMkAxC(ZXnLv@<#yU5xHVFQcz9z!+={
zGe#O?jPb@KW2!O3m~G5678*;8<;E&wt+Bz_Y-}@j8hecW#zEtVaojj%oHZ^OmyK)2
zP2-Mn-*{v^HC`C6jd#XJ<BRd#_+|Vx0#PhVMww_ZDn>(5H5!SUQ77s}6Va5>G|_*e
z8KYUE*`v9ld7}lQg`>ryC8K4c<)f9NRiibcwWIZ-4Wmt>&7-ZNZKEBcoul2NJ)?c1
z{iB1TL!%?2qod=Z6Qfh2)1$MZbE6BQi=)e;E2C?o>!X{ZTcbOoyQBM}|3(i-k3~;L
z&qU8hFGa6LZ$xiL??oR*pG2QWUq#<WKSV!AzeRsW|3qOEH3^e8Ia4rYGi>T+)U?f*
z88=gysm*j|1~apn&CF@$G4q>+%%Wxqv$R>xtY}s-tDCjVx@H5jvDwUQX|^%jo1M(A
zW)HKs+0Ptk4l#$Dqs+191aq=E&75h@G3T3$%%$cEbG5n7+-Pnwx0}1nz2*V)ka^TR
zVV*Y6nHS9~=5_OydDnbkJ~p43FU>dRd-Id|)%;=pHX#eKaEr27i?<|8u{6uDEX%cg
zD`};&(pu@QOjcGahn3sPXBD)HSjDYUR#~fpRoSX$)wJqZ^{qx$Q>%s5+G=NYw7OW`
ztzK4NYk)P_8fJ~O##rO6N!C<rhBe!oXDzgrSj(+d)>><Wwb|Nc?X>n-`>li45$m{h
z$~tRZur6EItee&y>%R5KdTPC}UR&?1kJcCKyY<WZYXxk~CT+$J+M*q@RXbvvwqtvC
z!cJ+YvH!6%+F9)Eb}l=wUBE7E7qd&+W$f~HCA+F!!>(=Dvm4q??B;eWyRF^9?re9n
zd)j^M{`Meys6E0SZI81j+EeW5_AGm@y}({<FSA$LYwY#*CVQ*B!`^N0v;VaZ+sEva
z_8I%UeaXIR->`4n_w0xE6Z^US%6@Bqus_@1?4R}@8+K5KaA=2f1V?tlj_yPq+le`G
zCxw&RN#|s6GCSFvoK7Amzf;I5>XdLwJLQ~;P8FxRQ_HFAG;kU_&777_8>hX~$?59!
zaC$raoPo{|XSg%U8S6}NCOgxdna&(%zO%?#>a1{9JL{Z{&K768v&-4*9B>XfN1YSS
zY3H1C(YfMWcWya%od?ci=b7`;dE>lyJ~>~VAI@(FauFAIDVKG5S8^3sa}C#WUDtP$
zZYnpeo8HajW_5G8x!rtjLAQuo+%4snbt|})-D+-4w~kxiZR9p}Tez*=c5X+vi`(7p
z<@R+4xP#qc?nrlxJKmk-PIYIvv)y^_LU)O~++F3abvL-1-EHnpcaOW@J?I{BkGrSb
zv+f1=vU|<F>E3bgyN}$b?hE&|`_BF7esRCMzudoWAcn=r7!wP|#8@b%#v(B@=ES^M
zB9=0iCiYJ(V=PN7dn{KhZ>&J9aI9FYWUNfAe5_KeYOF@AcC22kVXR54d8}2eZLCAA
zbF5peXRJ@Ge{4`}Xlz7mbZlH~Vr)umdTdr~Zfrqpaco&^Wo%7seQZ-~YivhscWhtm
z-`L^UvDnGjnb`T*rP$Tjjo9tjz1YLpli2gvtJvGvhuG)Xx7g3vpBU_+9^ug*=Lw$d
zg+1MidbSty;$8|bwU^Gz;AQr*c{#m2UVg8TSJW%vmG;Vc6}>86b+49J*K6Q4_L_Mu
zy*6HZuano+>*4kG`gsGrA>MFrlsDF!;7#_Xc{9B^-h6M7x71tVt@hS=8@(;wc5j!r
z*E`@H@{W2Zywlz}@1l3byYAic?s^Zr$KEsVrT4~r?|t&VdOy719^@lF?o&SN^S<OO
zzUCXg<-5M`C;e1@T0gy?$<ONN@N@h5{DOWFzqnt@FY8zEEBn>_ntmO>zTe1i>bLM)
z`|bRWeiy&H-^=go5AX;3!~Bu{7=OG!$)D=a@Mrt;{DuA!f4RTPU+Zu1H~ZWCo&Fww
zzkkp_;ve@<`DgtL{$>A~f78F?-}fK+PyHAEYyX}9(f{Iq_ka0+{XiUxlW`^<jEnJ5
zT#ZNKX55K;@kBgjJWc$cc*b~^c=mX%c;0w{c;R@lc*%H~c=>pxc-44~c<p$-c*A&;
zc=LFxc-we~c;|Sxc+YsBc>nmI_|W)>_~`h!_{8{>`1JU!_}ut{_~Q7o_{#X2`1<&!
z_}2K2`0n_=_`mVP@ni9m@iX!B@k{Zm@f-2m@q6)y@h9=;@mKM;@elFO@o({;@jr1m
zfhLFqo!}BeLQaGedLo*z6R|`*ks^^gkuH%TkvWkqku#Aekv~x=Q8ZB^Q94mBQ87^^
zQ9V&BQ8&>b(Kyj8(K68{(LT{B(KXQ{(L2#EF)%SCF+4FUF*Y$FF*z|UF*7kIF+Z^=
zu{5zFu{yCXu`#hFu|2UXu{UubaVT*#aUyX#aV~K&aV2p*aVv2*@gVUy@htH&@h0&;
z@hR~&@gwm&0VR<no}`j&l21xWC8;Hiq?L4&elnR%l}wvVpUjlZn#_^Roy?akm@JYk
zo-CCto2-zmoUE3tnXHqnpKO$DnrxA5ootuvnCz15p6r$En;eiFoE(-MnH-ZGpPZDO
znw*iGot&3km|T)vo?Mk&o7|AxoZOb&ncS0vzyJXt00e*l5C8%|00;m9AOHk_01yBI
zKmZ5;0U!VbfWZHHfxlpY01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2mk>f
z00e*l5C8%|00;m9AOHk_01yBIKmZ5;0U!VbfB+Bx0zd!=00AKI|3Tn?{ih3XK_CDG
qfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00{gq6Zk)YaHA&x

diff --git a/examples/roaring_bitmap/bitmapwithruns.bin b/examples/roaring_bitmap/bitmapwithruns.bin
deleted file mode 100644
index 5ed243753e169295a32d6251db66180f23ceac06..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 48056
zcmeIuQyb)3w5Z{vVpUYBB$bL&skUv~wr$(CZQHhO+qPG)cJFo0f7tsf^BLb5*YnMt
zfdl{y015&iU;{8fdI(UZDhgDsiv4%-|000=3<=DyNCAKU(!k#=^#A~P^j|*sU-o|$
z{I7=pHSoU{0>HA}0I=r2HvJa>ynrrXJE#Th2(g1)Ln~mWaJl5SR1eaf$h0Bbj9dfq
zwJ21fSdLOwIlR(`syAvLsJo)!gr+@OHfUR-V}`CuPoVFM0T+hs7_nf?hzT90)R<9V
zPKpKGl5fi&tUj@R!{!;=2kdUKzrx`h$F0-&`45+FTsv@U#k~oSdOU0Js>HhtpOmk_
zj~Bl$fIE-{#0aJXUqdROrqDV2oA)5ai8LEB%*ZkzM~gfa3gjqamB7n<D1W2!f$A%2
zPpIFcaf9Y1T4!jlbOyS==yjprjzRzLC&3E@0)apv5C{YUfj}S-2m}IwKp+qZ1OkCT
zAP@)y{x<|lPV#~Y1OkCTAP@)y0)apv5C{YUfj}S-2m}IwKp+qZ1OkCTAP@)y0)apv
z5C{YUfj}S-2m}IwKp+qZ1OkCTAP@)y0)apv5C{YUfj}S-2m}IwKp+qZ1OkCTAP@)y
z0)apv5C{YUfj}S-2m}IwKp+qZ1OkCTAP@)y0)apv5C{YUfj}S-2m}IwKp+qZ1OkCT
zAP@)y0)apv5C{YUfj}S-2m}IwKp+qZ1OkCTAP@)y0)apv5C{YUfj}S-2m}IwKp+qZ
z1OkCTAP@)y0)apv5C{YUfj}S-2m}IwKp+qZ1OkCTAP@)y0)apv5C{YUfk5E@OaK5N
z00jtu23SA<BtQW)zyNH(1Cl^0AT5vq$P8oyasqjP{6Ha~C{O|@4U_{a0#$(OKrNsy
z&;V!*Gy_@!ZGiSbC!j0P1LzI(0|o*^fZ@O>U@R~Jm<&t<W&(47`M@GzDX;=q4Xgt;
z0$YIXz%F1fZ~!<A90N`QXMpp-CEzM>1Go*`10DiTfaky~;4Sb0_zZjnz5~C2zd#5?
zKpdpNFvx=uPzGb54qBiK`d~^h4VWIx1ZD+ufVshZU_r15SR5<`mIW(-mBDIYO|TAF
zA8Z6R1zUiv!FFIrunX87>;?7(2Y`dYVc<w`3^*Q~1WpBKfV07Q;6iW-xEx#st_3%M
zo55}1PH+#nA3Ove1y6vd!E@k6@CtYxyanC`AApa+XW&cl4fr1X1pWhl1Al^lKrjS{
zun-wyLR?4;MMG*R9x_8tC=p5#N*ziU${5NL${xxU${Q*WDjX^nDj6yhDj%v8sv4>h
zsvW8qY8Yw~Y94A8Y8&bh>Ky78>KW=2>K_^u8X6iA8XX!Jni!fAnjV@Jnj2aWS{zyy
zS{YgsS|8dJ+8Wvs+8x>#Iv6?<IvzR|IvctWx*WO|x*56?x*vKJdK!8WdL4Qf`WX5W
z`Zx3=^g9GV5QIVmL_;hjKoX=t8e~8=<UvU&6_ggr0A+@<K{=s3P=2ToR1_)!m4?be
z6`?9nb*L6p7is`ChMGYwp*B!^s1wu`>H+nJ`auJsA<%GW6f_o^08NIbK{KH_(0phS
zv=mwat%lY?8=)=Gc4!y07dikPhK@lep)=5V=n`}lx&hsW?m-WsC(v`~74#PR0DXqO
zLf@fZ&|fG7BQOqAa2V#{2rR=fScfgxg?%_BoCZ!0XM(fBIpExIKDZ!U1TGGjg3H1c
z;L30{xF%c&t`9eYo5C&N)^IzxBise<4)=ol!UN#J@Gy8JJO&;QPlBhyGvL|qJa{3z
z1YQoWg4e<u;LY$hcqhCE-VYywkHRP5)9^X?B76nD4&Q?B!Vlob@H6-&{04pxe}eyk
zzrjD@KQM^E2!@adgK&t5L=hE<BPQY?2_yxQ8cBy_M6w{+kz7b#qySPFDTb6p${^*D
zN=Q|t22vZThcrZ*AkC3hNL!=>(i!Q7^hEj~{gFY)P-Fx$8X1R7M5Z9qky*%GWC5}m
zS%$1c)*$PVO~_Vc2eKR4ha5zXAjgqY$XVn9av8aX+(hmm_mM})Q{)Bm8hM9&M7|*Z
zB0rGd2!KK;iV`S|vZ#PcsDf&!f!e5tCec)AS~LTi8O?^~MDw8e(L!iZv;<ljEr(V_
ztDx1<T4-Ig0ooXChPFi8pzYC4Xjilc+8gbM4n&8b!_iUbSabq98J&jCMCYLM(M9M|
zbOpK^U59Q&x1ih6UFcr)0D2fbhMq*vpy$y`=vDLvdK<lmK183O&(T-tTl53^8U2cW
zM}MJz(GZ4UI7VS%jK?CFjKwe=voII)v6NUEEIpP9%ZlZ|a%1_hf>;r(I93WPi&elX
zW7V*lSRJfB)(C5gwZK|q?XZqm7pyzh3+sywzy@Q(u#wmpY&<pzn~KfAW@GcPh1e2o
zIkpO0i*3L*W81Ku*dA;@b_hF)oxo0G=dg>|73?~83%iRwz#e1Iu$R~y>^=4g`v?1m
z{lxxYAP(agPT~yC;UXTzRXmQHxPvF~6nJVp9i9=-f@jBb;d$`_cwxL4UJ@^Zm&YsN
zRq+~lZM+`d5O0Dv$6MiT@eX)ryc^yV?}PWp2jN5U5%_3)96k}Bf=|b1;dAi?_+oq+
zz7k)9ug5pxTk#$EZhRkp5I=$+$4}vB@eBB6{2G1}zk}b$AK_2&7x-)Z9sUvjg8z&E
zz<=Wa0TC!c5H!IO0wEC!p%DgQ6CRNyQW0s13`Axk8<CU9L*yq45k-j-L}{WNQIV)Z
zR3~Z?b%_Q<W1<<+l4wJ;Cpr;bi5^66q8~Ak7(xsuMiFC)3B+V#8ZncYL(C@@5le{`
z#A;$4v60w9Y$tXRdx-<YVd5BZk~l-0CoU0Ji5tXi;vVskctSiUUJ-AJ55#BUEAgH9
zMf@c~BtqgOMTSY9jF2)JBX!auUD790l4;2FWF|5znS;zt<|7M|Mabf0DY7hCfvikc
zBWsd%$oga>vMJeuY)!T!JCa?<?qn~rFFAl5Ob#PQl4Hp6<Ro$`IfI-{&LbC+OUUKq
zDsnBkf!s`PBX^Q}$o=FY@+f(NJWZY>FOpZt>*Ou+F8P3bOg<xDl5fcO<R|hU@*DY+
z{6m5iOkosBF%(CMRFqPwIAu}}m7r2ksi|~SMk))HoytY!r3z4msbW+~sti@0szg<#
zYEZSQdQ?NI3DulxMYW|mP@So6R8Oi8)t?$f4W&j<qp5M!L~05(otj0>r4~?&sb$nk
zY7MoX+C*)oc2K*iebhnf2z8t~MV+NCP?xD|)J^IRb)R}fJ*8eyuc>#`N9qgpFZF}^
zO#w7SqclO&G)oJ#L@TsL8?;S(bdpX*r=>H{ndxkFPC5^rpDsifrAyGI>2h>Mx(Z#L
zu0_|S8_<pEW^_xs4c(sZM0cfo(7ow?^gwzDJ)9mzkEJKjlj&*nOnMGIpI$^SrB~3a
z>2>r*dJDas-bL@F5739{WAsV-41J!yL|>(E(6{M(^h5dy{hWS9zokFWpXsmkclsCo
zmku!qgEJHpW_TvT$V`mU8H;flpGnE2VbU|1n5;|=CO4ChDaaIIiZi8{vP=c0GE<GI
z$<$%$GmV(0Obezp(~jxLbYZ$Py_mkt0A?^Vj2X#{Va79)n5oPRW;QdAS;#D5mNToE
zwaf-)Gqa7^$?ReFGl!U?%n9Z+bB?*lTw$&=x0t)k1LiUFjCsktVcs*Jn17gW%unVI
z1BT%+7AC_?m<x;HXjl!$!)DkCC&DShsl(~Q8N*q^*~7WQdBX+5g~P?dCBtRH<-?W2
zRl_yHwZrwo4Z}^s&BLw2ZNnYHox|P2J;Qy%{lkO8L&GD&qr>CE6T?%&)5EjEbHfY5
zi^I#pE5mET>%*JETf;lTyTkj!2g66g$HS+>XTul5m&4b>H^X<r_rs6EPs1<5ufy-c
zAH!e5|Av2re}@4UVo{c0X_jRLR$>)aV-413JvPavV$-r2*vxD;HYb~h&CeEMi?Suy
z(rh`lB3p&6&eme<vJKeAY%{hc+lFn=c4E7-J=orCKXxEHgdNU~V#l%**vafPb|yQA
zozE^}m$EC^)$BTUBfEv&&hBFOvIp41>@oHvdxkyFUShAZH`v?kJ@z5{gniDwV&AeK
z*w5@&_B;EF{mX_pgu^+C3v)ad;bbnx>72#6oX@4?(s1dyOk7qj2bY`6#}(v?aK*V&
zTv@IHSDCBE)#U1M^|?k|Q?3Qqnrp{(<hpR(xn5jfZU8r!8^(>~#&F}gN!(O!1~;3V
z$1UWRaLc(>+*)n}x0&0<?d0}w`?*8hQSJnHnmfl`<gRenxm(;_?g96hd&a%w-f-`^
zPuxG;H|{6*hXZ+-$9R%wc#aqOD6jHy-sBxV!KdI;^Xd4Ed=@@CpNr4S7vKx?#rTqZ
z8NNJUiLc7n;A`{s_=bEFzB%8DZ_9V!JM-Q6o_rs^KR<{c%8%ej^W*r5{1kpVKZ~Es
zFW?vR%lMW28h$;$iQmfa;CJ);_=Efr{y2Y%Kg(a>FZ0*<oBSRAKL3b+%D>=W^Y8eN
z{1^UT{s;e?2Lwn!1wx<&RuBY9Py|gd1Y7Wgq>xHTD`XHd3)zI6LLMQ%P)H~$ln_b_
z<%Eht6`{INOQ<U}5E={3gqA`Zp}o*a=qmIOdJFx8fx-}BxG+i>D@+h33)6&|!W?0~
zut-=ctPoZU>x7NM7Gb-vOV}$M5Dp8+gp<M<;k<B3xGLNbZVUH>hr$!#x$sJOD|`?>
z3txrr!Y|>k5E2m)7b!6;@?u1k#h9pzmgtJUm{Lq5rWZ4bS;ZV;ZZV%&P%I)A7fXp{
z#R_6&v6@&@tRvPJ8;MQD7Gi6$o!C+AB6b&hiG9TZ;$U%@I8q!Vju$71Q^gtLY;m5r
zP+TG|7gvdE#SP+SahteP+#~K64~a*`6XI#{oOn^ZB3>77iFd^Z;$!id_)>f$z861<
z|A^njpW+`8jKC2rLPnSf7ZD@Th#HAU%!m_7L{dajN76+yMzTb*M{-5-MhZj<M~X#C
zM#@CWM=C|CMruTAN9siyMw&#LM_NVNMmj_~N4iCNM*2khM+QZPMn*(NN5(}aMy5oj
zM`lIlMixXCN0voaM%F~uM>a*aMs`GYNA^VyMvg>|M@~h~MlM7yN3KO~M(#xJM;=9<
zMqWf-N8Uv~M!rP;jr@rGjsOxQp%Nj{5-SOkBq@?68Imn|Qc_AKrIj*BnWbz}PAQL+
zUn(ROl}bpZrE*e5sftuxswLHx8c2<$W>QP3jnrQ1Bz2W~NWG<g(m-j5G+Y`bjg=-y
zlcj0WOlgiZUs@zBl~zcrrFGIqX^XU7+9mCk4oHWkW70|KjC5YQBwdwmNVlbX(nINq
z^jvx+y_G&lpQW$Tcj=e(R|-XuC?2Jv;V2)CMCE8Ksz<G;8}*|pqiLe)qnV;vqdB6v
zqxqr*qeY^{qotx{qZOi+qt&7{qjjS7qm80Xqb;JXqwS&{qg|riqrIYiqXVLYqr;*j
zqhq4uqm!ajqcftjqw}H*qf4U8qpPB8qZ^`|quZi8qkE$JqlcnLqbH)LqvxU*qgSHW
zqqm}WqYt8wqtBu*qi>?`qo1PxM88FUM*l=X8J01blo^?mML8;~a$GiLM^4Bo<kWIH
zIis9K&MxPY^U4L}!g4XWq+CWWFISSQ$~ENLay_}B+(d3Jx02h+9puh(H@T<WNA52V
zl84G8<k9jtd7?Z;o-WUl=gJG@#qu(FrMyO7FK?2!$~)xU@;>>Xd_+DjpOVkY7v#(G
zHTkA|N4_sVlAp>i<k#{$`J?<r{#X7X|CRv-Qc#6ZXoXb-MN$++Qw+scJSC~5Qqn3J
zl*~#tC8v@{$*&YriYg_P(n>j{qEbbvuGCWMDh-szN;9RU(ne{obW*x1J(S)`KV_gY
zL>aD(QpPG1l*!68Wu`JmnXfETmMSZh)yg_$qq0TWuIy6wDhHIq$}#1naz;6?TvDzo
zH<a7TJ>{YDM0u{fQr;>bl+Vgn<-77r`KyFfM8#D~4XeBwQDrrz>Z+x>s;{P0)2Qjy
zOlnp&hnic>rxsL;sKwP%YFV{{T3M~8)>P}L_0>jdQ?-TKT5YFxRJ*9%)m~~}b$~ip
z9j1;{$Ef4gN$ON}hB{lFr!G{NsLRz=>RNS!x>?<(?o{`v`_)70QT2p+T0N&;RIjMl
z)m!Ra^?~|WeWt!t->C1^PwGGFH}$9bM+IYW42zL5CdS3YSTv@_;xRMk#1gR-vDC42
zv5c`SvFx#2vAnSYvBI%pv68VevGTD>v8u5evD&eEv4*iGvF5Q>v9_@evCgq>v7WI$
zvHr0^v7xaMvC*+{v5B!MvFWi{vAMAYvBj}vv6ZnkvGuV{v8}NkvE8wKv4gQAvE#8*
zv9qxYvCFY*v750wvHP({v8S;YvDdM8v5&DYv43MfV!va625G29Xtc&^f+lH-rfG&|
zYo3<WQfX<m3|eL_o0e0{qvh8MX+^aXT4}AER#B^>Ro7~1b+rasW38FiQfs5N*E(rk
zwH{h;t)Dhf8=?)@MrmWU3EE_Bnl@9Lqs`YAX-l;g+G=f`wo%)nZP#{bd$j}FVeOc9
zQahua*Dh&SwHw-P?Vk2fd!jwpUTJT&587w#tM*;{rTx`HaU_n%sdzZf$0Kn$9*gU7
zEAGbqc*=O1c=~vzc-DB1c<y+<c)@s)c=33tc-eS`c;$Gtc+Gg7c>Q>zc++@`c<Xq(
zc*l5`c=vd(c;EPd_~7`k_{jK}`1tsw_|*7}`0V(+_`>*-`11Iw_}ch}_~!Vw_|EvA
z`2P5z_|f=@`04n$_{I2@`1Sa$_}%z}_~ZDq_{;d4`1|;$_&@P)@t^TOaZrbKOeb|l
z=X6nz>Z%^sP2JHGdI~+Yo=(rGXVJ6kx%9kx0llzZOfRXI(aY<V^s0Idy|!LYZ>TrX
zo9nIgwt5G>v))bbsrS+Q>x1;6`UrirK2D#gPtm9Av-G+80)4T*Okb(5(bwyn^sV|1
zeYd_(Kd2wkkL#!Ov-$=7vVKj!so&A>>yPxO`V0NF{!ag>f6@Qdf9St;z<><YAPm}I
z4Z)BM#n24Huno^h8mWx5Mg}9Zk<G|y<T3Iag^Z#`38S=8&ZuZqF{&H2jJiexqp{J<
zXlb-D+8dpWu0{`|x6#iSXbdri8>5V|#sp)sG0m81%rWL0i;Shl3S+gg&e&*dF}54K
zjJ?JI<FIkeIBA?Q&KsAEtHurEwsFsRXgo2V8?TJF#s}lG@zwZl{4)L;ArmoilQP35
zZ$?bnjG4M=nXc)ZDa|xydNY%m)y!e$HuISU%_3%Tvy@rZtYB6)tC=;;I%a*dk=fL2
zVYW8gnH|k8W_Pof+1DIk4mO9GBh4}9cyp3D)tq6@Hs_fO%_Zh?bCtQ)++c1tx0yT5
zJ?4J%ka^TRVV*Y6nHS9~=5_OydDnbkJ~p43FU>dRd-Id|kNM5~Y5p-m3$`$ev>1!C
zL@R2kR@^cz$4Xc!tkhOIE2EXg%5LSd@>&I~!d5Y>q*cZ$Z&k9YS~aZNRz0hs)x>IU
zwX)h;9jwk)H>;=B$LenlvW8kCtkKpuYoay9nr_Xq=2{D^#nv)wrM1RdZ*8)+T05-W
z);{Z?b;LSuowCka7p%+HHS4Bz$GUGlvYuKmtk>2%>!bC>`q%nl{k8xbvQe9`X`8hL
zTe1~fvklv}Jv(WqveVib?96sHJExt;&Tkj8i`pgZ(snt!qFu$VZr8Hw+70Z+b~C%B
z-NtTjce1<MJ?!3gKYO4(#2#*svd7vJ?8){td!{|do^LO*m)a}r)%H4jqrJu6Ztt@9
z+6U~z_A&dUea1d-U$U>-H|*Q?J^P{k#C~qSvftVt?9cXB`@8+i{%eOE#K9fP2|K(K
zabzdv=#J&Mj_;&&(m3gzOiorOhm+gM=M;2`IK`b(PFbgdQ`xEJ)O6}N^_@meQ>TT~
z+G*!>bh<d*onB5~XMi)<8Rm?1#yI1hNzPPfhBMol=PY!VILn<?&RS=Kv)S3^>~!`x
z`<+A1QRjqn+BxT3bgnqpom<Xb=YjLsdFH%y-Z<}_PtHHiH|MAG#{pf~#az;5T+S8U
zsH?hh*K{2>;ihm?yXoAFZWcGYo6F7X7H|u@#oUr^8MnM!$*t<vaBI8u+=gxwx4GNO
zZR>V$JG<T7o^BtvzdOhs>W*+nyW`x6?i6>rJIkHxE^rsS%iNXj8h5?B$=&MiaCf`=
z+=K2B_qcn?J?ma@FT2;=o9-R=zWd01>b`JayYJkO?icr8_lNu21w6<@J;I|s))PF*
zQ#{QxJlpfUq?gJ|>t*mVd)d64ULG&MSI8^smGDY?<-Ces6|cHi%d6`(@EUu~yp~=Y
zuf5mF>+1FJdVBr6f!+{rxHrlh>rL<`d(*s`-W+efx5!)St?*WR>%5KL7H_+^%iHT6
z@D6*&yp!G;@4R=(yXxKWZhQB<hu#zKx%bL@>wWM(dtbfp-Y@U37fK)rJV7PG2|f`?
z$cb1&Pgn^z;U`ii(j?L+G9|JmawKvm@+Ar;iX@6BN+rrBDkLf=swHYB>LltX8YP-0
zS|nO0+9f(Bx+J<MdL{ZM1|$Y2h9yQO#w5lkCMBjOW+Y}O<|P&;mL!%ZRwdRZHY7GD
zwk38Z_9XTv4keBzP9#nz&Lu7;t|YD}ZYAy}9wZ(oo+Vx;-Xz{9J|+H1d`tXH{7Ha5
z>|;LZGd|~we$-d}xNrK7pYT)osr__*Mn8+6-OuIc^$Ylg{bGJezl>krujE(tYxuSO
zdVWK{iQn9B<+t@a_?`W3eow!T-`^kP5A{d*qy2IIM1P7u-Jj*p^%wYy{bl}2e~rK1
z-{f!gclf*gef~lJh=1Ha<)8I0_?P`_{!Ramf8T%PKlNYuul;xaNB@ieum8jU?E^_D
zi6)68on(_jQc5aGEomg}q?b%4Qzg?TGbA%7vn6vT^Ca^p3nhytOC(Dt%Oxu&t0b!@
zYbEO@8zdVin<ZN&+a%j3J0-g&dn9`&`y~e^ha`t5M<vH5CnP5)rzK}5=OpJR7bTY_
kS0q;_*CjV5w<NbGcO~~GL7>69`4vF`0RH~{|2xb70P7F}00000

diff --git a/examples/roaring_bitmap/host_bulk_example.cu b/examples/roaring_bitmap/host_bulk_example.cu
index 3309881a2..33ca281bf 100644
--- a/examples/roaring_bitmap/host_bulk_example.cu
+++ b/examples/roaring_bitmap/host_bulk_example.cu
@@ -13,9 +13,12 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 #include <cuco/roaring_bitmap.cuh>
 #include <cuco/utility/traits.hpp>
 
+#include <cuda/std/cstddef>
+#include <cuda/std/cstdint>
 #include <cuda/std/type_traits>
 #include <thrust/device_vector.h>
 #include <thrust/logical.h>
@@ -34,11 +37,16 @@
  * [RoaringBitmapFormatSpec](https://github.com/RoaringBitmap/RoaringFormatSpec) repository and
  * check if the bulk lookup API returns the correct results. Namely, we test the following files:
  * -
- * [examples/roaring_bitmap/bitmapwithoutruns.bin](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata/bitmapwithoutruns.bin)
+ * [bitmapwithoutruns.bin
+ * (32-bit)](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata/bitmapwithoutruns.bin)
  * -
- * [examples/roaring_bitmap/bitmapwithruns.bin](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata/bitmapwithruns.bin)
+ * [bitmapwithruns.bin
+ * (32-bit)](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata/bitmapwithruns.bin)
  * -
- * [examples/roaring_bitmap/portable_bitmap64.bin](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata64/portable_bitmap64.bin)
+ * [portable_bitmap64.bin
+ * (64-bit)](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata64/portable_bitmap64.bin)
+ *
+ * @note This example requires the cmake option -DCUCO_DOWNLOAD_ROARING_TESTDATA=ON to be set.
  *
  */
 
@@ -47,8 +55,8 @@ bool check(std::string const& bitmap_file_path)
 {
   auto generate_keys = []() -> thrust::device_vector<KeyType> {
     if constexpr (cuda::std::is_same_v<KeyType, cuda::std::uint32_t>) {
-      // reference:
-      // https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata/README.md#test-data
+      // Create query keys for the bitmapwith{out}runs.bin files:
+      // https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata/README.md#test-data
       std::vector<cuda::std::uint32_t> keys;
       for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) {
         keys.push_back(k);
@@ -61,8 +69,8 @@ bool check(std::string const& bitmap_file_path)
       }
       return thrust::device_vector<cuda::std::uint32_t>(keys.begin(), keys.end());
     } else if constexpr (cuda::std::is_same_v<KeyType, cuda::std::uint64_t>) {
-      // reference:
-      // https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata64/README.md#portable_bitmap64bin
+      // Create query keys for the portable_bitmap64.bin file:
+      // https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata64/README.md#portable_bitmap64bin
       std::vector<cuda::std::uint64_t> keys;
       for (cuda::std::uint64_t k = 0x00000ull; k < 0x09000ull; ++k) {
         keys.push_back(k);
@@ -100,30 +108,38 @@ bool check(std::string const& bitmap_file_path)
   file.read(reinterpret_cast<char*>(thrust::raw_pointer_cast(buffer.data())), file_size);
   file.close();
 
+  // Create roaring bitmap from the file
   cuco::roaring_bitmap<KeyType> roaring_bitmap(thrust::raw_pointer_cast(buffer.data()));
 
+  // Generate query keys (all should be contained in the bitmap)
   auto keys = generate_keys();
+
+  // Create a vector to store the results
   thrust::device_vector<bool> contained(keys.size(), false);
 
+  // Bulk-lookup query keys against the bitmap
   roaring_bitmap.contains(keys.begin(), keys.end(), contained.begin());
 
+  // Check if all the keys are contained in the bitmap
   bool all_contained = thrust::all_of(contained.begin(), contained.end(), ::cuda::std::identity{});
   return all_contained;
 }
 
 int main()
 {
-  auto data_dir_prefix = []() -> std::string {
-    std::string source_path = __FILE__;
-    auto pos                = source_path.find_last_of("/\\");
-    return (pos == std::string::npos) ? std::string(".") : source_path.substr(0, pos);
-  };
-
-  bool success = check<cuda::std::uint32_t>(data_dir_prefix() + "/bitmapwithoutruns.bin");
-  success &= check<cuda::std::uint32_t>(data_dir_prefix() + "/bitmapwithruns.bin");
-  success &= check<cuda::std::uint64_t>(data_dir_prefix() + "/portable_bitmap64.bin");
+#ifdef CUCO_ROARING_DATA_DIR
+  std::string const data_dir = CUCO_ROARING_DATA_DIR;
+  bool success               = check<cuda::std::uint32_t>(data_dir + "/bitmapwithoutruns.bin");
+  success &= check<cuda::std::uint32_t>(data_dir + "/bitmapwithruns.bin");
+  success &= check<cuda::std::uint64_t>(data_dir + "/portable_bitmap64.bin");
 
   std::cout << "success: " << (success ? "true" : "false") << std::endl;
 
   return success ? 0 : 1;
+#else
+  std::cerr << "This example requires CUCO_ROARING_DATA_DIR to be defined (build with cmake option "
+               "-DCUCO_DOWNLOAD_ROARING_TESTDATA=ON)"
+            << std::endl;
+  return 1;
+#endif
 }
\ No newline at end of file
diff --git a/examples/roaring_bitmap/portable_bitmap64.bin b/examples/roaring_bitmap/portable_bitmap64.bin
deleted file mode 100644
index acd0f9007d6902f2fa29b82d8f5ee6662a4291d2..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16506
zcmeI&F%Cdb3;@s~5*IOJFgb_WQCz_h9MKIZi`^!9P1@i8x4yr&j5nsfiXyMaUCL~m
zIM+7&E_28npZ6?V?B|ka)G-SJ1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ
z;P(Reu7JgX-+!Y42oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+z&C*l;jK-t


From 6cd8413fb7fc00c39696c5ed0855bd3f811a559f Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 13 Aug 2025 17:27:12 -0700
Subject: [PATCH 12/24] Allow build script to handle extra cmake args

---
 ci/build.sh | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/ci/build.sh b/ci/build.sh
index 3d244f334..7ac9029e3 100755
--- a/ci/build.sh
+++ b/ci/build.sh
@@ -51,6 +51,8 @@ HOST_COMPILER=${CXX:-g++} # $CXX if set, otherwise `g++`
 CUDA_ARCHS=native # detect system's GPU architectures
 CXX_STANDARD=17
 
+EXTRA_CMAKE_OPTIONS=()
+
 function usage {
     echo "cuCollections build script"
     echo "Usage: $0 [OPTIONS]"
@@ -62,9 +64,9 @@ function usage {
     echo "  --prefix: Build directory prefix (Defaults to <repo_root>/build)"
     echo "  -i/--infix: Build directory infix (Defaults to local)"
     echo "  -d/--debug: Debug build"
-    echo "  -p/--parallel: Build parallelism (Defaults to \$PARALLEL_LEVEL if set, otherwise the system's number of CPU cores)"
-    echo "  --cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)"
-    echo "  --cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)"
+    echo "  -p/--parallel: Build parallelism (Defaults to \$PARALLEL_LEVEL if set, otherwise the system's number of CPU cores)"
+    echo "  --cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)"
+    echo "  --cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)"
     echo "  --arch: Target CUDA arches, e.g. \"60-real;70;80-virtual\" (Defaults to the system's native GPU archs)"
     echo "  --std: CUDA/C++ standard (Defaults to 17)"
     echo "  -v/-verbose/--verbose: Enable shell echo for debugging"
@@ -103,6 +105,9 @@ function usage {
     echo "    Enables verbose mode for detailed output and builds with C++17 standard."
     echo "    Build files will be written to <repo_root>/build/local and symlinked to <repo_root>/build/latest."
     echo
+    echo "Pass-through:"
+    echo "  -- [CMake args...]  Anything after -- is forwarded to CMake"
+    echo
     exit 1
 }
 
@@ -126,6 +131,7 @@ while [ "${#args[@]}" -ne 0 ]; do
     --arch) CUDA_ARCHS="${args[1]}";    args=("${args[@]:2}");;
     --std)  CXX_STANDARD="${args[1]}";  args=("${args[@]:2}");;
     -v | -verbose | --verbose) VERBOSE=1; args=("${args[@]:1}");;
+    --) EXTRA_CMAKE_OPTIONS+=("${args[@]:1}"); break;;
     -h | -help | --help) usage ;;
     *) echo "Unrecognized option: ${args[0]}"; usage ;;
     esac
@@ -200,8 +206,14 @@ echo "-- BUILD_TESTS: ${BUILD_TESTS}"
 echo "-- BUILD_EXAMPLES: ${BUILD_EXAMPLES}"
 echo "-- BUILD_BENCHMARKS: ${BUILD_BENCHMARKS}"
 
+if [ ${#EXTRA_CMAKE_OPTIONS[@]} -gt 0 ]; then
+    echo "-- EXTRA_CMAKE_OPTIONS: ${EXTRA_CMAKE_OPTIONS[*]}"
+else
+    echo "-- EXTRA_CMAKE_OPTIONS: (none)"
+fi
+
 # configure
-cmake -S .. -B $BUILD_DIR $CMAKE_OPTIONS
+cmake -S .. -B $BUILD_DIR $CMAKE_OPTIONS "${EXTRA_CMAKE_OPTIONS[@]}"
 echo "========================================"
 
 if command -v sccache >/dev/null; then

From 42e5d01b2be42efd9594d93227c91e7f1a750647 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Wed, 13 Aug 2025 17:46:53 -0700
Subject: [PATCH 13/24] Add unit test

---
 tests/CMakeLists.txt                  |   5 +
 tests/roaring_bitmap/contains_test.cu | 134 ++++++++++++++++++++++++++
 2 files changed, 139 insertions(+)
 create mode 100644 tests/roaring_bitmap/contains_test.cu

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 21828b360..23258d445 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -152,3 +152,8 @@ ConfigureTest(BLOOM_FILTER_TEST
     bloom_filter/unique_sequence_test.cu
     bloom_filter/arrow_policy_test.cu
     bloom_filter/variable_cg_test.cu)
+
+###################################################################################################
+# - roaring_bitmap ---------------------------------------------------------------------------------
+ConfigureTest(ROARING_BITMAP_TEST
+    roaring_bitmap/contains_test.cu)
diff --git a/tests/roaring_bitmap/contains_test.cu b/tests/roaring_bitmap/contains_test.cu
new file mode 100644
index 000000000..db3b9cd33
--- /dev/null
+++ b/tests/roaring_bitmap/contains_test.cu
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2025 NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuco/roaring_bitmap.cuh>
+#include <cuco/utility/traits.hpp>
+
+#include <cuda/std/cstddef>
+#include <cuda/std/cstdint>
+#include <cuda/std/type_traits>
+#include <thrust/device_vector.h>
+#include <thrust/logical.h>
+#include <thrust/universal_vector.h>
+
+#include <catch2/catch_test_macros.hpp>
+
+#include <fstream>
+#include <string>
+#include <vector>
+
+namespace {
+/**
+ * @brief Loads a serialized Roaring bitmap from disk and bulk-queries a fixed set of keys.
+ *
+ * @tparam KeyType Key type of the bitmap; must be `uint32_t` or `uint64_t`
+ * @param bitmap_file_path Path to the serialized bitmap file
+ * @return `true` iff the file was fully read and every generated key is reported as contained
+ */
+template <typename KeyType>
+bool check(std::string const& bitmap_file_path)
+{
+  // Hard-coded query keys per key type; the check below expects all of them to be contained.
+  auto generate_keys = []() -> thrust::device_vector<KeyType> {
+    if constexpr (cuda::std::is_same_v<KeyType, cuda::std::uint32_t>) {
+      std::vector<cuda::std::uint32_t> keys;
+      for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) { keys.push_back(k); }
+      for (int k = 100000; k < 200000; ++k) { keys.push_back(3 * k); }
+      for (int k = 700000; k < 800000; ++k) { keys.push_back(k); }
+      return thrust::device_vector<cuda::std::uint32_t>(keys.begin(), keys.end());
+    } else if constexpr (cuda::std::is_same_v<KeyType, cuda::std::uint64_t>) {
+      std::vector<cuda::std::uint64_t> keys;
+      for (cuda::std::uint64_t k = 0x00000ull; k < 0x09000ull; ++k) { keys.push_back(k); }
+      for (cuda::std::uint64_t k = 0x0A000ull; k < 0x10000ull; ++k) { keys.push_back(k); }
+      keys.push_back(0x20000ull);
+      keys.push_back(0x20005ull);
+      for (cuda::std::uint64_t i = 0; i < 0x10000ull; i += 2ull) {
+        keys.push_back(0x80000ull + i);
+      }
+      return thrust::device_vector<cuda::std::uint64_t>(keys.begin(), keys.end());
+    } else {
+      static_assert(cuco::dependent_false<KeyType>, "KeyType must be uint32_t or uint64_t");
+      return {};
+    }
+  };
+
+  std::ifstream file(bitmap_file_path, std::ios::binary);
+  if (!file.is_open()) { return false; }
+
+  // `tellg` reports -1 on failure; an empty file has nothing to deserialize either.
+  file.seekg(0, std::ios::end);
+  std::streamsize const file_size = file.tellg();
+  file.seekg(0, std::ios::beg);
+  if (!file || file_size <= 0) { return false; }
+
+  thrust::universal_host_pinned_vector<cuda::std::byte> buffer(file_size);
+
+  // A short or failed read would hand a partially filled buffer to the bitmap; bail out instead.
+  file.read(reinterpret_cast<char*>(thrust::raw_pointer_cast(buffer.data())), file_size);
+  if (!file || file.gcount() != file_size) { return false; }
+  file.close();
+
+  cuco::roaring_bitmap<KeyType> roaring_bitmap(thrust::raw_pointer_cast(buffer.data()));
+
+  auto keys = generate_keys();
+  thrust::device_vector<bool> contained(keys.size(), false);
+
+  roaring_bitmap.contains(keys.begin(), keys.end(), contained.begin());
+
+  return thrust::all_of(contained.begin(), contained.end(), ::cuda::std::identity{});
+}
+}  // namespace
+
+TEST_CASE("roaring_bitmap bulk contains from RoaringFormatSpec testdata", "[roaring_bitmap]")
+{
+#ifndef CUCO_ROARING_DATA_DIR
+  SKIP(
+    "CUCO_ROARING_DATA_DIR is not defined. Configure with -DCUCO_DOWNLOAD_ROARING_TESTDATA=ON to "
+    "run this test.");
+#else
+  auto const data_dir = std::string{CUCO_ROARING_DATA_DIR};
+  // Each section is skipped (not failed) when its data file cannot be opened.
+  SECTION("32-bit: bitmapwithoutruns.bin")
+  {
+    auto const bitmap_path = data_dir + "/bitmapwithoutruns.bin";
+    if (std::ifstream probe(bitmap_path); !probe.good()) {
+      auto const skip_reason = "Missing file: " + bitmap_path;
+      SKIP(skip_reason.c_str());
+    }
+    REQUIRE(check<cuda::std::uint32_t>(bitmap_path));
+  }
+
+  SECTION("32-bit: bitmapwithruns.bin")
+  {
+    auto const bitmap_path = data_dir + "/bitmapwithruns.bin";
+    if (std::ifstream probe(bitmap_path); !probe.good()) {
+      auto const skip_reason = "Missing file: " + bitmap_path;
+      SKIP(skip_reason.c_str());
+    }
+    REQUIRE(check<cuda::std::uint32_t>(bitmap_path));
+  }
+
+  SECTION("64-bit: portable_bitmap64.bin")
+  {
+    auto const bitmap_path = data_dir + "/portable_bitmap64.bin";
+    if (std::ifstream probe(bitmap_path); !probe.good()) {
+      auto const skip_reason = "Missing file: " + bitmap_path;
+      SKIP(skip_reason.c_str());
+    }
+    REQUIRE(check<cuda::std::uint64_t>(bitmap_path));
+  }
+#endif
+}
\ No newline at end of file

From 20dc8168ebc408657981cb441fe971eaaae9553e Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 14 Aug 2025 05:36:47 -0700
Subject: [PATCH 14/24] Minor doc fix

---
 README.md                                    | 2 +-
 examples/roaring_bitmap/host_bulk_example.cu | 9 +++------
 2 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index f9cc3efc0..052636255 100644
--- a/README.md
+++ b/README.md
@@ -266,4 +266,4 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 `cuco::roaring_bitmap` implements a Roaring bitmap following the [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec).
 
 #### Examples:
-- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WAtv4zYS_itzWqCQN7blZB9pncfVG2d7Rrd2YXu7KDYLgZJom7AsqiQVxxfkv3dISpbkKN1He-cAsUUOv3nwm-FQ946kUjKeSKf_8d5hkdM_bjsxSZYZWVKn74RZRJy2I3kmQv3sPb9J4Dlc8XQn2HKlwA1bcNI7eQXj30bD0QCuJtNfJ9PBfDQZd7WoEX_HQppIGkGWRFSAWlEYpCTEr3ymDb9Roe2Ak24PXC1w4-RzN07rzKDseAYbsoOEK8gkRRgmYcFiCvQupKkClkDIN2nMSBJS2DK1MqpyHGMO_J6D8EARlCe4IsWnRVUSiNqbrj8rpdK-52232y4xZne5WHqxFZbeu9HV9Xh23UHT98veJzEGFgT9I2MCHQ92QFK0LCQB2huTLXABZCkozimuLd8KpliybIPkC7UlghqciEklWJCpWvAKO9H_qgCGjyQYuMEMRrMbB94MZqNZ2-B8GM3_M3k_hw-D6XQwno-uZzCZ4maNhyO9Vfj0Fgbj3-Hn0XjYBoqhQ1X0LhXaCzSV6bDSyMZwRmnNjAW3ZsmUhmzBQigIBEt-S0WCbkFKxYZZqqGRkcGJ2YYposzYI-eMKu8muUmesSSMs4jCeZiF3BOcCET0A6Y2JO2G2erykUymWMzUzlOCMCW7qzS9PESKiCdV5IX4L6KLyycnWaKaJ9Uupb5VUBNQK5FJ5UX0Fh3xb2mouOiumkRivkRCxM2TWcIwdpLEVYiq3AI3npJNbS3jDYOaIMmyNmQhDaD33DL2R5NJKwTwgyxe-_SO4J5TjK-dDgSjCxjSDe4Weq0o7pnUe5znTn1bkIUaBUkYc77OUgMMg19HsqwKo8SmcK4JtpgYnGA-bDm8OOkgEFgwwxhkN4XXLyvD4KZcKJNQyMANUS1YCL7R1hj8j1Nr0hsj_daIzJCin1yd0BIzeok8z4IuFg2vJls8lWtamMoplwyjttvzFytBuAZm_dfuFs6inyivMoHE1nMhFwIjjmMyi5GPMCYbGu_a2mUMpDJCCx7HfKtzRe-E7BsVHetJHiJ5QH7PfulKxzMlskR2A5aYJa6NYOtbnPWCmAfeq-PTUxL94GkDI6JIs7LW15n5_7PxqwwsiJQ_v35Z2mgp9w_a-Ppls7rWPjF-xBOOwryaG_lBktNpQ9aYd6munNAZXr2_mvjDyYfxu8lg6E8ng-lo_JM_v57Nh4P54GIy1idMgOWZqn3ymcKqKGJjJmPNwVqWICfhZ7qb42-sDAHnsWW4i3b3-7aKIJUx_b_LM9DXTPVTolZo_D3iAslQ15ImVFcIf013Ei7g4ye3BZ1LsJWt36-VxvNCJRgA0PlklOjTB9sLLLdaOVrApC_RRv-2WNKGymyGhfrFia8uWwUQgOfBFZZD9PCPjGLiGnuK06qkyj3y-aHgS5l9BcQ_Rs7p9WD4y3V3Ez3TQx09VqgxLuQBaXLKmH5WSGsX3AYxWGO0e2f4dQ7HPf3Rv48uzEMlLmDgumkmV35AcIfXrT32Q00JAhvQEu0c2z37--ho_VeYL5DJn8U9reB-_2W4TZi22D5BsKZ4ugY0oEuWuK22VUGTyG0V4A9AY91hfTsZX7_8CjI2VgRDxf8FE7EElVx8pNqWvs_R0rj3eVpqMUvLO7O9WRzb3cbnH4rnb9jxz-ka1HUd9_6GrgOR3t1JjlbKNoq8qok8aTGzScsODGU6cU80xF-Y27v7Pl8AR8D-ZmaYLf3SzLgvOYItfOgTKalQrm6_tRo8TyKaKH9BUHhf49vYF-a_YYMG6WNpX7wwPoUV-Z2v6sH9Q6Fff-kH_Y1JMUFVJlP0s8
3NvDU2o-7hUdXOhbjs95HqROysLkx1919aqou5zREUXS68NCtCKgScn6MLbwmKmYublsMBPXyoR4-ZdRiJOLc998UExQw97N34iSp7mZXsv8YVY4qkdL10e1WbEc4aXBzL6KleYxb45teFXaxoHC9dK_wUGm50K49lwZHy5mFuAylLEho1UCbYKTy1g2yxoMLdK2-VOzOl2M0bn3BbOWzw7iB2e2PQ7sgVFKeowHum8kMi1Xm4IuL5pVvYIsjWT7mRMfOuVdfVVQz3BwlaU5xDhzGX1K1YkhfevOsr7g_FbWHPHkveem9Y9if18S828ayyxbYvqh4ALsHUldhRx5FOBjxs9HsJJBdLKl1Ka99b5S1VrcVq8JSA3S_NUYnf9sKe3z-qm10vCLrpuyyNsLVAB1eXAsPaiqo3eOPp5DeeikdkiWvzO421Xssf3tqtCvlktWmXVhTTVS-Lm5eOnlZkNYsnA6jXmY4WF_ilzMU-DnqcL9xHSqt25IZhJaj0AbrKMbW7f7D8yzO8pgYnTJ7rpmdDNKrplp-xRYRXatO-F1277tj94Whaze992w2aUn7EBNrduOps76bMwlC_uql_LmxD39wR7cGPsKA9cbPMy3KB_t03AX4tmj2W6mjNFzanoIit12i5rde5hn5eqd1C479xAJ2jOKznDMER5LByV_a1XNnDJcc4-UyfhqXO_RnRfH1r3Lb8foZkMKTEAsKwGpj3l7Wr3o1T9gH558b5gvtf63Dho5Mp9876k0RsgYx12o5-m4qlUZRvh53kNgyPT15lxzhtzcJJp4NwF-HR0fEpdIgIVxdy45_2oNPBvlnhP6W7gagTk01g3ifHLKhghmEY4-CtfQWMA5pwa-ehXcxjWa7NY9FyHj6Zvz8Bdgyhgg==))
\ No newline at end of file
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WA1v2zYT_iv3qsAgN7blpB_ZnI_NjdPNWF97sNMVQ1MIlETbhGVRI6k4XpD__h5JfSbK2nV75wCxRR6fu-M9dzzqzpFUSsYT6Qw_3jkscoaHXScmySojK-oMnTCLiNN1JM9EqJ-959cJPIcLnu4FW60VuGEHjgZHr2D662Q8GcHFbP7LbD66msymfS1qxN-xkCaSRpAlERWg1hRGKQnxK5_pwq9UaDvgqD8AVwtcO_nctdM5MSh7nsGW7CHhCjJJEYZJWLKYAr0NaaqAJRDybRozkoQUdkytjaocx5gDv-UgPFAE5QmuSPFpWZcEokrT9WetVDr0vN1u1yfG7D4XKy-2wtJ7N7m4nC4ue2h6uex9EuPGgqC_Z0yg48EeSIqWhSRAe2OyAy6ArATFOcW15TvBFEtWXZB8qXZEUIMTMakECzLV2LzCTvS_LoDbRxLcuNECJotrB96MFpNF1-B8mFz9NHt_BR9G8_loejW5XMBsjsGajic6VPj0FkbT3-DnyXTcBYpbh6robSq0F2gq09tKI7uHC0obZiy5NUumNGRLFkJBIFjxGyoSdAtSKrbMUg2NjAxOzLZMEWXGHjlnVHnXyXXyjCVhnEUUTsMs5J7gRCCiHzC1JWk_zNbnj2QyxWKm9p4ShCnZX6fp-UOkiHhSRV6I_yK6PH9ykiWqfVLtU-pbBQ0BtRaZVF5Eb9AR_4aGiov-uk0k5iskRNw-mSUM906SuA5Rl1ti4CnZNtYy3jKoCZKsGkMW0gB6zy1jfzCZtEYAP8jijU9vCcac4v7a6UAwuoQx3WK00GtFMWZSxzjPnWZYkIUaBUkYc77JUgMMo18msqoKk8SmcK4JdpgYnGA-7Di8OOohEFgwwxhkN4XXL2vD4KZcKJNQyMAtUR1YCr7V1hj8j3Nr0hsj_daILJCin1yd0BIzeoU8z4I-Fg2vIVs8VWs6mMoplwx3bV_yFytBuAFm_dfuFs6inyivMoHE1nMhFwJ3HMdkFiMfYUq2NN53tcu4kcoILXkc853OFR0JOTQqevDROqurGc-UyBLZD1hiJl27S52vccgLYh54rw6Pj0n0naeNiIgiXquyzmNT_j07HhlRBD1n2uuXlR2WHv-gHa9feq3qOiWJf8DTiMJVncd50c9DvyUbzJFUVznojS_eX8z88ezD9N1sNPbns9F8Mv3Rv7pcXI1HV6Oz2VSfBgGWUqrKRDFFUFHExqzD-oB1J0H-wM90f4W_MYsDzmPLRhftHg5txiPtMFW_ybPF16zyU6LWaPwd4gLJUNeKJlRns7-hewln8PGT24HeOdgqNBw2ythpoRIMAGjuGyX6pMBWAEujVo4WMOlLtNG_KZZ0oTabYVF9ceSr804BBOB5cIGlCz38PaOYZMae4mSp6HCHvLwvOFFlSgHxjxFwfjka__eyv42e6aGeHivUGBfyDWlzyph-UkhrF9wWMdjgbg9O8OsUDgf6o38fnJmH2r6AgeunmVz7AcEIbzol9n1DCQIb0ArtFFsz-_vgYPNnmC-QyZ_FPa7hfvtluG2YtjA-QbC2_XQNaEBXLHE7XauCJpHbKcDvgca6G_p6Mr5--RfI2FoRDBX_H0zEElRx8ZFqW_o-R0vj3udpqcUsLW9NeLM4ttHG5--K56-I-Od0jZq6Dgd_Q9cDkcHtUY5WybaKvGqIPGkxs0nLHhjKdOIeaYg_MXdw-22-AA6A_c3MMCH90sy4qziC7XboEympUK5ulbUaPE8imih_SVC4rPFd7OHy37BFg_SxVBYv3J_Civx-Vvfg7r7Qr7_0g_7GpJihKpMp-tnmZt7Gml
H34VHVzYW4HA6R6kTsrS5Mdfc_WqqPuc0RFF0uvDQrQioEnJ6iC28JiplLlpbDAT38UI8eM-twJ-Lc9twXsylm6L5040eq7MVTsj-MK8YUSelm5Q7qNiOcNbg4ltFTvcYs8M2vM7tY0TheuVb4KTQMdCffy4Ij1S3BdO4pSxIatVAm2Cs8tYNsuaTCLZV3qsjMKXbexicMK4ct9vliXxqDdkeuoDhFBd4JlR8SqU7DNRHPz93CFkF2fsqNjJl3rbq-rmIYHyRoQ3EOHcZcUrdmSV548_tE0esXnX3JHkve5q2j6k-a419s4kktxLYvqh8ALsHUldgZx5FOBjxs9DsEJBdLal1Kp-yt8paq0WK1eErAxktzVOK3vVznd4V6sJsFQTd955URthbozdWlwLC2puoN3k56-e2k5hFZ4dr8_mGt1_IPb9hWhXyy2nQrK4rpupfFLUnvnlZkNYsnN1CvMx0tLvArmbNyH_Q4X7qPlNbtyA3DSlDrA3SVY2p_d2_5l2d4Qw1OmDzXTc-WaFTTLT9jywivv6Z9L7p23bH748m8nt9l2w2aUn7EBNrduuqkdFNmYahfszQ_Z7ahb--ISvADLGjtl7aiLBfo33wV4F9Fs8dSE639wuYUFLH1Gi239TrXMMwrtVto_B4H0DmKw3rOEBxBHlbuWlyrlQNccoiTz_RpWOksz4j261tr2PL7GZLBkBILCMNqYN41Nq56107VB-Sfa-cL7n-dhwsfnUy5d9afJGJLZKzTdfSbTyyNonqT6yQ3YXh49Co7xGlrFk46PYQ7Cw8ODo-hR0S4PpNb_3gAvR72zQr_Kd0NRL2YbAPz7jdmQQ0zDMMYB2_s61oc0ITbOPfdYh7LcmMei5Zz_8n8_Q-9NYGl))
\ No newline at end of file
diff --git a/examples/roaring_bitmap/host_bulk_example.cu b/examples/roaring_bitmap/host_bulk_example.cu
index 33ca281bf..4e371eaa1 100644
--- a/examples/roaring_bitmap/host_bulk_example.cu
+++ b/examples/roaring_bitmap/host_bulk_example.cu
@@ -36,14 +36,11 @@
  * In this example we load two 32-bit bitmaps and one 64-bit bitmap (portable format) from the
  * [RoaringBitmapFormatSpec](https://github.com/RoaringBitmap/RoaringFormatSpec) repository and
  * check if the bulk lookup API returns the correct results. Namely, we test the following files:
- * -
- * [examples/roaring_bitmap/bitmapwithoutruns.bin
+ * - [bitmapwithoutruns.bin
  * (32-bit)](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata/bitmapwithoutruns.bin)
- * -
- * [examples/roaring_bitmap/bitmapwithruns.bin
+ * - [bitmapwithruns.bin
  * (32-bit)](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata/bitmapwithruns.bin)
- * -
- * [examples/roaring_bitmap/portable_bitmap64.bin
+ * - [portable_bitmap64.bin
  * (64-bit)](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata64/portable_bitmap64.bin)
  *
  * @note This example requires the cmake option -DCUCO_DOWNLOAD_ROARING_TESTDATA=ON to be set.

From 5d1b47056d0a6b035d98a5beba40198513c9e57b Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 14 Aug 2025 06:20:14 -0700
Subject: [PATCH 15/24] Compile benchmarks with -lineinfo

---
 benchmarks/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 916d674a3..9940c82b9 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -28,7 +28,7 @@ function(ConfigureBench BENCH_NAME)
     target_include_directories(${BENCH_NAME} PRIVATE
                                              "${CMAKE_CURRENT_SOURCE_DIR}")
     target_compile_options(${BENCH_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra
-      --compiler-options=-Werror -Wno-deprecated-gpu-targets --expt-extended-lambda)
+      --compiler-options=-Werror -Wno-deprecated-gpu-targets --expt-extended-lambda -lineinfo)
     # Add GCC-specific warning suppression only for GCC
     if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
       target_compile_options(${BENCH_NAME} PRIVATE -Xcompiler -Wno-subobject-linkage)

From aa56fd6398a340e78ce0a58dc60d778b9ff16b4a Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 14 Aug 2025 06:21:15 -0700
Subject: [PATCH 16/24] Use cub::DeviceTransform instead of thrust::transform

---
 .../roaring_bitmap/roaring_bitmap_impl.cuh      | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
index 42752f2d6..82762dbe0 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
@@ -21,6 +21,7 @@
 #include <cuco/detail/roaring_bitmap/util.cuh>
 #include <cuco/utility/traits.hpp>
 
+#include <cub/device/device_transform.cuh>
 #include <cuda/std/cstddef>
 #include <cuda/std/cstdint>
 #include <cuda/std/functional>
@@ -28,7 +29,6 @@
 #include <cuda/stream_ref>
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
-#include <thrust/transform.h>
 
 namespace cuco::detail {
 
@@ -74,17 +74,18 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
                                OutputIt contained,
                                cuda::stream_ref stream = {}) const noexcept
   {
-    auto nosync_exec_policy = thrust::cuda::par_nosync.on(stream.get());
     if (this->empty()) {
+      auto nosync_exec_policy = thrust::cuda::par_nosync.on(stream.get());
       thrust::fill(
         nosync_exec_policy, contained, contained + cuda::std::distance(first, last), false);
     } else {
-      thrust::transform(nosync_exec_policy,
-                        first,
-                        last,
-                        contained,
-                        cuda::proclaim_return_type<bool>(
-                          [*this] __device__(auto key) { return this->contains(key); }));
+      cub::DeviceTransform::Transform(
+        first,
+        contained,
+        cuda::std::distance(first, last),
+        cuda::proclaim_return_type<bool>(
+          [*this] __device__(auto key) { return this->contains(key); }),
+        stream.get());
     }
   }
 

From 144be8511a2df41bfb1e5d1dfd3ca4031543ee31 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Thu, 14 Aug 2025 15:23:45 -0700
Subject: [PATCH 17/24] Pass ParentT to CG type and pass CG objects by-value

---
 README.md                                     |  4 +-
 examples/static_set/device_ref_example.cu     |  4 +-
 examples/static_set/device_subsets_example.cu |  4 +-
 include/cuco/bloom_filter_ref.cuh             | 10 +--
 .../detail/bloom_filter/bloom_filter_impl.cuh | 16 ++---
 .../detail/bloom_filter/bloom_filter_ref.inl  |  9 ++-
 include/cuco/detail/bloom_filter/kernels.cuh  | 10 +--
 include/cuco/detail/dynamic_map_kernels.cuh   | 12 ++--
 .../detail/hyperloglog/hyperloglog_impl.cuh   |  8 +--
 .../detail/hyperloglog/hyperloglog_ref.inl    |  4 +-
 .../cuco/detail/open_addressing/kernels.cuh   | 26 ++++---
 .../open_addressing_ref_impl.cuh              | 49 +++++++------
 include/cuco/detail/probe_sequence_impl.cuh   | 20 +++---
 .../probing_scheme/probing_scheme_impl.inl    |  8 +--
 include/cuco/detail/static_map.inl            |  6 +-
 include/cuco/detail/static_map/kernels.cuh    |  8 ++-
 .../cuco/detail/static_map/static_map_ref.inl | 61 ++++++++--------
 include/cuco/detail/static_map_kernels.cuh    | 10 +--
 .../static_multimap/device_view_impl.inl      | 52 ++++++++------
 .../cuco/detail/static_multimap/kernels.cuh   | 20 +++---
 .../static_multimap/static_multimap.inl       | 69 ++++++++++--------
 .../static_multimap/static_multimap_ref.inl   | 34 ++++-----
 .../static_multiset/static_multiset_ref.inl   | 34 ++++-----
 .../cuco/detail/static_set/static_set_ref.inl | 36 +++++-----
 include/cuco/detail/utils.cuh                 |  2 +-
 include/cuco/hyperloglog_ref.cuh              |  5 +-
 include/cuco/operator.hpp                     | 48 ++++++-------
 include/cuco/probing_scheme.cuh               |  8 +--
 include/cuco/static_map.cuh                   | 21 +++---
 include/cuco/static_map_ref.cuh               |  4 +-
 include/cuco/static_multimap.cuh              | 71 +++++++++++--------
 include/cuco/static_multimap_ref.cuh          |  4 +-
 include/cuco/static_multiset_ref.cuh          |  4 +-
 include/cuco/static_set_ref.cuh               |  4 +-
 tests/static_multimap/for_each_test.cu        |  5 +-
 tests/static_multiset/for_each_test.cu        |  5 +-
 tests/utility/probing_scheme_test.cu          |  3 +-
 37 files changed, 369 insertions(+), 329 deletions(-)

diff --git a/README.md b/README.md
index c66f76f30..4c04ac88f 100644
--- a/README.md
+++ b/README.md
@@ -209,8 +209,8 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 
 #### Examples:
 - [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJx9VgtvIjcQ_ivTraqSu-UVKTqJPFSapCq6EzmF3J1OpSLGa1grxqZ-wFHEf--MvcvjHk0kYD3jmW9mvs_rbeaEc9Jol_X-2mayyHrdPFNMzwObi6yX8VCwLM-cCZbTc_vVWMMruDXLjZXz0kODn8F55_y8iR8XOQw_Du4Gfbh9eHz_8Nh_GjwMW7QhbnonudBOFBB0ISz4UkB_yTh-VZYcPgpLaOC81YEGOYyzyjbOzi5jlI0JsGAb0MZDcALDSAczqQSIL1wsPUgN3CyWSjLNBaylL2OqKk6EA5-rIGbqGfoz3LHEp9mxJzC_h05_pffLXru9Xq9bLMJuGTtvq-Ts2u8Gt_fD0X0Toe-3fdAK2wtW_BOkxcKnG2BLRMbZFPEqtgZjgc2tQJs3hHxtpZd6noMzM79mVsQ4hXTeymnwJ82rcWL9xw7YPqaxcf0RDEbjDH7vjwajPMb5NHj68-HDE3zqPz72h0-D-xE8POKwhncDGhU-_QH94Wd4Oxje5SCwdZhKfFlaqgKhSmqrKFIPR0KcwJiZBMstBZczyaGmEczNSliNZcFS2IVMhEOQRYyj5EJ65uPaN8XFVO2xHuufpeYqFAKueOCm7WgLnzjhWzyUN197FAw9ivYsaE6Rmbo5tvvSBufbhVhhlslKcG9sq_yeizJznJb6vtHhXAWSrPV1fmlwHIItTjbFOl30bL9K_Pgt8rZE78k0qJeJ-MKwwwIrSuaplWIGd2KBvfGWeYEdctTRiqmHJuC8KQKOm6JB__3AHYT3hK5kjcuApEKOqA3SbWUSE2fWLGLAuBkHE50CaZWmWhiancHhVWNS8kXAs8T5WP-MxIhZnrnRpCb3HCkIhApxvojNAQpOUmqP0pO6sTKyOBvrLa5jKsrwVmzgGmH5S2oSQLsN94ul34BTxifgVhAZhfZJTfTbrvD3OKNFqYXCFqyYCsK1qG7kZXoCV5qgcBN2TSicOG5ygZdVHl8yTx3Ac0UgWcFwHiwJEo8b-l4GDwXzjCqJODmNhKQBgiBOsMxJDQGLaHYPNQzDYkohUy9I6AgiNU-QBHCMRa_n5L9i4o_i6rCYxA3XcNH5tdPpHCLe4vmGWgdOB5HEBk2Zq5SPvr-AMgwnx4jWtIMFzHkITNZJsmLsTuvi8rsgDtGvk5ELqRp7VO3jOGfH2IirgdPAIgfiCYzdVYJh0Oc66nM11TT6ZheYixSM7UydqvsZm06qR4B7xl_hFG4owbaOmFc--4Fsvx3NbneEFCWKTSSUScf7EW07OXRzOM-h1cpB7mhD0nyvd3JoJBC0Z9-Y-Jrae9ehG1EGUzFH4p_lSRRCF_S7c9S8QWQFdk6phATFYJIymSup2jgqPPQSf34Y9ijmCHHSmUFKRr0E5X9cztQYdYOeeA6fFFR3rBT8BeTsgI80WekeCYh6icfSAWZ9JvxP_TFbbTpKhncBOdskZe7zlWyFZ5kQOm0jV8JcMRbdIuRJNCJv6yrJYGaNk1R15goGvTOIX0h0WRBb_Ga7q-FgyY3T4GewrVRh8Gi4usITaBQ4xxflT_BHzF5jbo0xRnYJuxTKCh-sBlIzruDdii4r-BqwhytYplecd88vQhfNZunT_SxrYr5r_vp19w00meXltVtM3nSg2URVe_zwWIkomootpvHSpuT0KCbnXOHiKt2wcAFFql-yXV7bUTsndmRGtvs7_v8HAzB-mg==))
-- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJztWG1v4zYS_itTFcU5G_klQfdaeJPgfMkWNVokRZJtUdSFSlO0xYssqiJlxw3y3-8ZUrLsTbZogft4CZBYmuEzz7xwOPRTZJW12hQ2Gv_yFOk0Gp_EUS6KZS2WKhpHsk5FFEfW1JXk5-GbWUFv6NKU20ovM0c9eUSno9PTPv68jen6x-nVdEKXN7c_3NxO7qc31wNe4Bd9r6UqrEqpLlJVkcsUTUoh8a-RxPSjqpgNnQ5G1GOFWdTIZtHRO4-yNTWtxJYK46i2CjDa0kLnitSjVKUjXZA0qzLXopCKNtpl3lSD4-nQzw2ImTsBfYEVJZ4W-5ok3I46_2TOlePhcLPZDISnPTDVcpgHZTv8fnr5_vrufR_Ud8s-FDnCS5X6vdYVHJ9vSZRgJsUcfHOxIVORWFYKMmeY-abSThfLmKxZuI2olMdJtXWVntfuIHgtT_i_r4DwiQKBm9zR9G4W0b8nd9O72OP8NL3_9ubDPf00ub2dXN9P39_RzS2SdX015VTh6RuaXP9M302vr2JSCB1MqceyYi9AVXNYVRpieKfUAY2FCbRsqaReaEltGdHSrFVVwC0qVbXSoeBAMvU4uV5pJ5x_98I5b2o4K2bF57qQeZ0qOpO1NEPLS2RilRvIOrv4WCMV0EiHi7qQjCzyi325y6raumGq1rCSrJV0phpkr6nkZols5a8LLfKqUGSDF_aNgacguFbJsjJ1aV-qgF6qFgew2iCLSqy86vBNqKF_-dpuqFZqoSo2mahHgWQoOB-05pVWC7pSK4TRwbRCMC0HvynqLl4NVt9q2Jz8MLXdDt1TKiuzhgLShJ1W9M3G529nnzaZlhlJVNpc8T5sCtjBben8rvOIbFqagncZEruozMrLUOmBBYTpXpKHQ2BYVTmYZRYg_6C2loENXmXCZv59bZmNKXj9LtTkQ-3rUGF_khP2YQZGiBPigaxtS1WIlULlulu1iGn3YlqUtZsydxQCop8ky9zMRZ4ktDY6JYlsm1Wyn9ZAsxegmFN8iOJ5Yxu7dDy2-g-VOCqOZsUTXCW2adFCwH5J5_SyWt5xLIgDZx1vPxI1_JfLhJGwIlgdj5s3jbpXclwtwFyOx_wxTUpROc174KzRvuh5IdpmgjpWIk3muZEPvaOjBgfB_ueX4OvNUw52CXeXlHGX-H-lV4NH5NYvCw9D6qh0ADp9pMOfc-rtrWogpukjHo4psPFPRweIjIl6g2M9hjxDJMkHkjjwgyYV7G5Mb3q-YI7ZuPeItXjV8fm-K17wPCueGf1vl8je-5va_cXa8ZvA_r2KiT_CR3GjRf6_jv7ndTQ3Jm-Iwp-2Y6Uw5E-YNnl_VmML8uJBE4xKFIgFnZ_TCFZC4n7Bgl-BuW_iHZfhXyhSBAMjiy56XFVtBYQ--J3aMmjhGh_RRd-vSrclmxuHDl4pNG4-xFXhwhTCn6s1Ps8ifgkeOSaFtchrZQd0n0HcPJHNTJ2n3OWtynFSYpGtZdbYcZlw3OQxjykc8mSkrCseZDCm8X_ULqXCCW7xnmdXi4opJohk0lKAE_2TzofrejVnyOYMQFGCRNjsikeHg63S4Rb1KvELzunt6B-j0ahDvMRciBkJxxZ2jUaA5sI2ExN0v0DsRUoLwePAbiN0wCxNghTYo8Hbd6-S6NDPg1Aqnfd2rIb7OEf73PjgriUnrDvo_PiKEOdKAHkWtdjIVkhuqID-CQnrD1sfVQ4YtVH1oeeZCTR35_sZcnHBFp5axLjR2aXl6WWCnp_3-KLKEUo-p8MUtEvU0yimk5hOYxoMYtK-vMPENB
4fjFyBBK_ZhSfspla7hfb7bTBXS5T_UexXDFSR8ufRXginYXTgMCx0hXhlIl-0A1A3R_hxSDmfvO74eMVE84SNvpe909cNWoXkpzuLXjnkRjRHAF1-uJpAUHU5-cRUcXZ2dnL6NYJ4-vXFxUWvO-kwfvVCloLinxONP0X7DrHn0ZAnJXSCOnefThE3xovQvQ6S1FZBpuQD9z6R58Fp7jaF2VDXQ9ENmpAP6No4FZrGRlEpLBd7N1R6gci5f255D-yajKLf2h78G4VwYWLfZYGWCuOlyOMm5G5j-LqyMr7ztehm_h_F26stAT6-u9kUl6d2bGVQgTvVwq907STa2rW8Cm2OZfmWr198bVl4X-cqE2vdcbvn22lq0Ef5vsqXPzgGVAJPGe5PTDZUBWt6Pb4v5aI8rJLg_ke1cVgVrdJHddEVQhwy2Qr2Eombt15s2ww0uYQvCi6pIixj1b2DEmoeNPFCdLu2glhgFr0DU63lZtvyDY37EdojDjp0F7d9et4NDjhKD8H5BA291CBNZ2fohHe1lLiWfkbfeOst58EMGJE_VBmqUq6uCuIzAG-iOOKvBnBKV90XHlGxlvLk9G19ArEpXfg2JOrD3rk8Pj75ivqiktm5XSVfjajfx1ng8MfBE5X2c7Ga-69Icj3fw5RS5ni5Dt9n4AVae_EQPcetHPk6kGPXRc-_-t__AoyhE5s=))
-- [One single storage for multiple sets](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_subsets_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJzNWYtOG0kW_ZU7XmllwNiYbDYZA9awQLLWRJANZEajIeopd5ftWtrdnqpuGw_i3_fcerTbj0TJalZaI2PcXXUf5577qOapYaQxKs9Mo_frU0MljV631UhFNi7FWDZ6jbhMRKPVMHmpY_7e2b_PaJ8u8tlSq_GkoGa8R8dHxy8O8etli65_GlwOzuni5sP7mw_nd4Ob6zZvsJveqVhmRiZUZonUVEwknc9EjA9_p0U_Sc3W0HH7iJq84L7h79039k6slGVe0lQsKcsLKo2EGGVopFJJ8jGWs4JURnE-naVKZLGkhSomVpWXY82hX7yQfFgIrBfYMcO3UX0liaIynV-Topj1Op3FYtEW1ux2rsed1C02nXeDi6vr26tDmF5t-5ilgJe0_L1UGo4PlyRmsCwWQ9ibigXlmsRYS9wrcrZ8oVWhsnGLTD4qFkJLKydRptBqWBZr4AU74X99AeATGYA7v6XB7X2D_nF-O7htWTk_D-7-efPxjn4-__Dh_PpucHVLNx8QrOvLAYcK397Q-fUv9OPg-rJFEtBBlXycafYCpiqGVSYOw1sp18wY5c4sM5OxGqmYAo1onM-lzuAWzaSeKkc4GJlYOamaqkIU9tqWc1ZV5z67z_6isjgtE0mncRnnHcNb4sjIItJy1I7LSX9zVSKwKukIrcWyv_PWqMxiVizStfvFRJem6CRyDiOiuYyLXLcnu5YgqGUsd98zscjaW1blOTCA6XMZjXVezszWEpGOc7BgMl23GfYmcrR2TeWIuhTrC7NyKrWKrdDOvuPhDzY_vD-mHAI1E8lHgXBKQOfWDLWSI7qUUwSigInS0CRfMC85zfJM0rBMHwh7yQAQDizuxTCgkDQt00JBGnnpHF2ONkgxteJVlqi5SkqRkkeA4z1Xwpt1aBSsRyjp_P3ArIrGXY71BSjyB5TANr2EOdANelqiiTRd0mIiM8gRKXPMZjxIC_oAZUpEIQir8tjqtIYFa62GiTATNrmFMjDTAoSIYaTJ09KuV9k8T-cAQwKtfMkaBBl8pAEQDwbzfywz6xsWeSBcrtxxkZLM9MwCywzPOOt5k5EzoS2Izr8NY9mv6gKvlyg-KFgZly6p21a41QLC0RB1K54oOXflpmRLrbqUy_VC2qKd5dlhvrAZyYgXy5lcIf4DKqv0FjuKcIVhxUnFDTZlVupZjsoHaqTLNg0KXsZVWWWFRB7bkmYshaB-Cq5a8Z4UIitogQqMNUluVwRlFf9XuS9KZhrr5mJE8TgyzAd-ndHrE3x0Op1TQt_5eIlfqxyjt5xjXNhfQweomhjWOAGu3DAYSBenLR3DMn5AbbF6zqhb6bgup0OUKEjE2rjUWsIRk-agvJNqYbeSncItyde0ekHy0cmWZJnKKcQa1xOM1IUlwu-l1EuGwwX1QS4jjhyEAPEg5Ue5tPEMq2Y6H-IzMmhYU-k3AFjiKtrrIWOk0JFfdeqR5W6x--V2oRAJpFDEqROFInoaDOr3gzF3rI2ZqvPU0MVbGmuRlalAeJefVfHVL6vC1hlnPTkfqemc-pMUBOHztmmDqiV3bvYb1_ZQYTsd5yXCZqtBLVd9YQiR8F9DzBySgWbuXgVhq86__onT8253Bof6EzJ5Ux-uB538kQlAVLel1wsrTsLWakt4BXPXu-7K3G_Fmjtwr-cSBNREvkauDfyXktAcez1MWSKNinxFxG-WtiNbvlnGJu5VNtyic66C5GJ662qebVI7sz3ywERRvQCunLbTTY03130n7Iyeui160SIM5a9a9H2LuvjexYUurnRxqfv9syf
W1XRWLH0Rw8gJI3niQwVyxYz_1txP7ht8EcmVYqyci7SUhrsPbvtvXO7LNOEmZFDEYhZgynji1BQTUXClx-yO9oRKF6N-8tCLkZ4_Z2VhbUfdr2rbymfJRkZ8IxgBHw-7J2tzjpthBhZFWFMDFyUyr4aTqjHWxjGy41jVBa-4gm93EjfuGFKQgvzzEm3AlNf62_Vvdc21xsptfko-eQy9z7lTao479z8bSa4jXmY-_DcQNL4JRtE4zYfgd0TzXAVtzUCy_UositITU5bz3OCcAgzHnL9bg6dFjmjVnajgKRFLx0hN_JlEMLhQtrb7xtBv2puYDSKfvEOUu4fmnj2cgeb0lsdD50ATI8PF2z0e_eTjhiqVPEJT0-6-VNP2I_CxXwbJI74c-OZpv-1RJ7T8us1aLEIh4rQ7o_1mBe0BK3A22bUOrWr5WX1zW0vkfBI5fHJtmq7WuT17XiVcc7Sy8eUOXWB-tayyhx74awdPNDm3kTfx0NTEGlJQiUavCG0eHwcHe_Tk6sq6ZW0fVoa_Zenzq_rk_Hi-z553cP08Tb0M5Jplup8CR3npDlhforOWNhy6BNE2Oc1ADYVxJ0rro280lsYSzF2d4ZDOOPIhL8QXzfkf5cII4fs_y4QNqv-pXP8i0xmLb-M576ix_IM0GO0w62OU5xKpRohy9WxkPbZ-4A9UoygyEzSQBIFh1msrytoGKc2am3SGfEAO-CUuO569iCXmSTerN4NdX5FIMP1f3DLX03HNXrewFhprOCTWQWtbNu1IQI9OnOuE3dlGA1MhmC6zFR5QVuRTFd_o5l-dpy3ad0q_O9tK7885_xnsKpsGzhgPJWJiYH1KR60d2cjtvUpFcrJDDHxIZhpAj5r3jdsyjqUx39Eba_GasHBEad_DxsaeD16oURypKU6nzZCCsHJ1vAldGGFAYXAdNVlPGtvxs3LKB6S_h9ZytTqu2Yd3E4E6JihRo5G0xzB-ziaNHTqQPzskus02u4w7BfG4Vk1R9k--GSFO0N5_OgaI9feLjfffNt4v197P9RTGhKSSoNppdU-W6mr7TceG2uK2H7-aMGgtHdyOekZgxQkdHKiKHHUx_PgCNTAaClQpP8fHwhTr6sOw60rDVDzIyAmRjzjUF6e7BuTtibdZB5opvleRnD8-5zzIMWJmsKsobJhXj2qFCTPpSD0ihlP73CmfYl6UYYudwIq8ECmjEIjGE22l0OPlz-yRsew6OlnhWVtUA7WOIf9GhNYwrss7OKsvh-O-cHgjf1Xs1Sd7mqo21YHhIXmXD5zU9lgJarsZmjiMoShXGDCLNzsZy4sgL3KSzsLqtiVCDd8L9yRv98Ot0vjH1ygCPn8rRWZ1gLXMXh0tnza0P4dUHmRopkDqD_8sLGyvntaP0Z0xiYTjBsdwpQVTUtje3D4WBJfqLKuYWQ0HK7_9E8yaV1-iw1aObR2zaxee1tnQWnOC63FzD4yo2PHp2fMlGLmetCE3g6Ynl6QVAqtz79M2Ks8teqq9a0Y-r-Wme3jNj3hqD79r-FV90lQzyPaIXDv-2AZmr56envIJ9Ph1v99vrsR4HNpjWdSOEW_UZscJwrg9f60oXq9lUerMJjpcbLQaXDnQ3fXqX16NbB7H3eOXZRe3-Wmz_X9Y4xAUOIsPDrqv6FDoeHJmptGrIzo8RC8pDm1BTGRymIrp0P6TLFXDmsw4jlNcnLv_aOECpuzsofHcCvcRvLX7QLzx_Mn-_Af5iUl4))
+- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJztWG1v4zYS_itTFcU5G_klQfdaeJPgfMkWNVokRZJtUdSFSlO0xYssqiJlxw3y3-8ZUrLsTbZov9ynS4DEEofPPPPCmaGfIqus1aaw0fiXp0in0fgkjnJRLGuxVNE4knUqojiypq4kPw_fzAp6Q5em3FZ6mTnqySM6HZ2e9vHnbUzXP06vphO6vLn94eZ2cj-9uR7wBr_pey1VYVVKdZGqilymaFIKiX_NSkw_qorZ0OlgRD0WmEXN2iw6eudRtqamldhSYRzVVgFGW1roXJF6lKp0pAuSZlXmWhRS0Ua7zKtqcDwd-rkBMXMnIC-wo8TTYl-ShNtR55_MuXI8HG42m4HwtAemWg7zIGyH308v31_fve-D-m7bhyKHe6lSv9e6guHzLYkSzKSYg28uNmQqEstKYc0ZZr6ptNPFMiZrFm4jKuVxUm1dpee1O3BeyxP27wvAfaKA4yZ3NL2bRfTvyd30LvY4P03vv735cE8_TW5vJ9f30_d3dHOLYF1fTTlUePqGJtc_03fT66uYFFwHVeqxrNgKUNXsVpUGH94pdUBjYQItWyqpF1pSm0a0NGtVFTCLSlWtdEg4kEw9Tq5X2gnn370wzqsazopZ8bkuZF6nis5kLc3Q8haZWOUGss4uPpZIBSTS4aIuJCOL_GJ_3WVVbd0wVWtoSdZKOlMNstdEcrNEtPLXFy3iqpBkgxf6jYGlILhWybIydWlfioBeqhYHsNogikqsvOjwTcihf_ncbqhWaqEqVpmoR4FgKBgfpOaVVgu6Uiu40UG1gjMtO79J6s5fDVbfauic_DC13QndEyors4YAwoSTVvTNxsdvp582mZYZSWTaXPE5bBLYwWzp_KnziKxamoJPGQK7qMzKryHTAwsspntBHg6BYVXloJZZgPyD2loGNniVCZv597VlNqbg_TtXk3e1z0OF80lO2IcZGMFP8Aeiti1VIVYKmetu1SKm3YtpUdZuytyRCPB-kixzMxd5ktDa6JQkom1WyX5YA81egGJO8SGK541j7NLx2Oo_VOKoOJoVTzCVWKdFCQH7JZ3Ty2x5x74gdpx1fPxI1LBfLhNGwo6gdTxu3jTiXshxtgBzOR7zxzQpReU0n4GzRjoOixkyLU3muZEPF73wRttk_3Xv6KhBhvv_-SUs8IQoB9-E603Kmpb4f6VXg0dE228LD0PqyHUAOn2kw59z6u3taiCm6SMejimw8U9HB4iMiQyEqT2GPINvybuWOBSDJjjsgJje9HwKHbNybxFL8a7j831T_MLzrHhm9L-dNHvvb2r3F7PJHwv793Io_ggf6Y6i-f_M-h9k1tyYvCEKe9qqlkKR70JtOP8s6xbklweNMypRwBd0fk4jaAmh_AUbfgXmvop3nJh_IW3hDIw1uuhxnrU5EWrld2rLoIVrbESlfb8q3ZZsbhyqfKVQ3LnRq8KFSYU_V2t8nkX8EjxyTBNrkdfKDug-w3LzRDYzdZ5yJ7AqRzfFJlvLrNHjMuG4EWBmUxgEyEhZVzzsYJTj_8hmSoUT3AY8zy47FVNM4MmkpQAj-iedDdf1as6QTZ9AmoJEOP6Kx4uDw9PhFvUq8RvO6e3oH6PRqEO8xOyIOQqtDedIw0FzYZupCrJfwPcipYXgkWF3NDpgXk3CKrBHg7fvXiXRoZ-HRal03tuxGu7jHO1z4-ZeSw5Y1wz9iAsX50oAeRa12IhWCG7IgP4JCesbsvcqO4xar3rX81wFmrsZ4AyxuGANTy1i3MjswvL0MkDPz3t8keVwJffyMCntAvU0iukkptOYBoOYtE
_vMFWNxwdjWSDBe3buCaeplW6h_XkbzNUS6X8U-x0DVaT8ebTnwmkYL9gNC13BX5nIF-2Q1M0afmRSzgevayivqGiecND3onf6ukKrEPx0p9ELh9iIpinQ5YerCRaqLiafmDzOzs5OTr-GE0-_vri46HW9DyNaL0QpCP450fhTtO_gex4feZpCJahz9-kQcWG8CNXrIEhtFmRKPnDtE3kejOZqU5gNdTUU1aBx-YCujVOhaGwUlcJysneDp18QOdfPLZ-BXZFR9Ftbg3-j4C5M9bso0FJhBBV53LjcbQxfaVbGV74W3cz_o_h4tSnADb2bX3HBakdbBhW4dy38TtdOq61ey7tQ5ngt3_IVja82C2_rXGVirTtu93yDTQ3qKN9p-YIIw4BK4CnDHYvJhqxgSS_Hd6pclIdZEsz_KDcOs6IV-igvukSIQyTbhb1A4nauF9s2Ak0sYYuCSaoI21h0r1FCzIMmfhHVrs0gXjCL3oGqVnNzbPkWx_UI5RGNDtXFbZ-ed4MDWukhOHfQUEsNwnR2hkp4V0uJq-tn9I3X3nIezIAR-abKUJVydVUQ9wC8ieKIvz5Al666L0WiYi3lyenb-gTLpnThG5OoD33n8vj45Cvqi0pm53aVfDWifh-9wOGPgyUq7ediNfdfo-R6vocppczxch2-88ALlPbiIXqO23XE62Adpy56_tX__hezoiBz))
+- [One single storage for multiple sets](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_subsets_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJzNWYtOG0kW_ZU7XmllwNiYbDYZA9awQLLWRJANZEajIeopd5ftWtrdnqpuGw_i3_fcerTbj0TJalZaI2PcXXUf5577qOapYaQxKs9Mo_frU0MljV631UhFNi7FWDZ6jbhMRKPVMHmpY_7e2b_PaJ8u8tlSq_GkoGa8R8dHxy8O8etli65_GlwOzuni5sP7mw_nd4Ob6zZvsJveqVhmRiZUZonUVEwknc9EjA9_p0U_Sc3W0HH7iJq84L7h79039k6slGVe0lQsKcsLKo2EGGVopFJJ8jGWs4JURnE-naVKZLGkhSomVpWXY82hX7yQfFgIrBfYMcO3UX0liaIynV-Topj1Op3FYtEW1ux2rsed1C02nXeDi6vr26tDmF5t-5ilgJe0_L1UGo4PlyRmsCwWQ9ibigXlmsRYS9wrcrZ8oVWhsnGLTD4qFkJLKydRptBqWBZr4AU74X99AeATGYA7v6XB7X2D_nF-O7htWTk_D-7-efPxjn4-__Dh_PpucHVLNx8QrOvLAYcK397Q-fUv9OPg-rJFEtBBlXycafYCpiqGVSYOw1sp18wY5c4sM5OxGqmYAo1onM-lzuAWzaSeKkc4GJlYOamaqkIU9tqWc1ZV5z67z_6isjgtE0mncRnnHcNb4sjIItJy1I7LSX9zVSKwKukIrcWyv_PWqMxiVizStfvFRJem6CRyDiOiuYyLXLcnu5YgqGUsd98zscjaW1blOTCA6XMZjXVezszWEpGOc7BgMl23GfYmcrR2TeWIuhTrC7NyKrWKrdDOvuPhDzY_vD-mHAI1E8lHgXBKQOfWDLWSI7qUUwSigInS0CRfMC85zfJM0rBMHwh7yQAQDizuxTCgkDQt00JBGnnpHF2ONkgxteJVlqi5SkqRkkeA4z1Xwpt1aBSsRyjp_P3ArIrGXY71BSjyB5TANr2EOdANelqiiTRd0mIiM8gRKXPMZjxIC_oAZUpEIQir8tjqtIYFa62GiTATNrmFMjDTAoSIYaTJ09KuV9k8T-cAQwKtfMkaBBl8pAEQDwbzfywz6xsWeSBcrtxxkZLM9MwCywzPOOt5k5EzoS2Izr8NY9mv6gKvlyg-KFgZly6p21a41QLC0RB1K54oOXflpmRLrbqUy_VC2qKd5dlhvrAZyYgXy5lcIf4DKqv0FjuKcIVhxUnFDTZlVupZjsoHaqTLNg0KXsZVWWWFRB7bkmYshaB-Cq5a8Z4UIitogQqMNUluVwRlFf9XuS9KZhrr5mJE8TgyzAd-ndHrE3x0Op1TQt_5eIlfqxyjt5xjXNhfQweomhjWOAGu3DAYSBenLR3DMn5AbbF6zqhb6bgup0OUKEjE2rjUWsIRk-agvJNqYbeSncItyde0ekHy0cmWZJnKKcQa1xOM1IUlwu-l1EuGwwX1QS4jjhyEAPEg5Ue5tPEMq2Y6H-IzMmhYU-k3AFjiKtrrIWOk0JFfdeqR5W6x--V2oRAJpFDEqROFInoaDOr3gzF3rI2ZqvPU0MVbGmuRlalAeJefVfHVL6vC1hlnPTkfqemc-pMUBOHztmmDqiV3bvYb1_ZQYTsd5yXCZqtBLVd9YQiR8F9DzBySgWbuXgVhq86__onT8253Bof6EzJ5Ux-uB538kQlAVLel1wsrTsLWakt4BXPXu-7K3G_Fmjtwr-cSBNREvkauDfyXktAcez1MWSKNinxFxG-WtiNbvlnGJu5VNtyic66C5GJ662qebVI7sz3ywERRvQCunLbTTY03130n7Iyeui160SIM5a9a9H2LuvjexYUurnRxqfv9syf
W1XRWLH0Rw8gJI3niQwVyxYz_1txP7ht8EcmVYqyci7SUhrsPbvtvXO7LNOEmZFDEYhZgynji1BQTUXClx-yO9oRKF6N-8tCLkZ4_Z2VhbUfdr2rbymfJRkZ8IxgBHw-7J2tzjpthBhZFWFMDFyUyr4aTqjHWxjGy41jVBa-4gm93EjfuGFKQgvzzEm3AlNf62_Vvdc21xsptfko-eQy9z7lTao479z8bSa4jXmY-_DcQNL4JRtE4zYfgd0TzXAVtzUCy_UositITU5bz3OCcAgzHnL9bg6dFjmjVnajgKRFLx0hN_JlEMLhQtraHxuBuurwdotI99JvuijJR_XJzzx7XQHx6ywOjc6mJIeLi7R4Pg_JxQ7lKHqG7aXdfqmn7EYjZL4PkEV8OfDu13_aoE4aAuhdaLEJp4kQ8o_1mBfYBK3A22bUOv2r5WX1zW0tUgSRyiOXaNF31c3v2vEq45ohmI849u8BEa3lmj0Hw146iaHtuI2_iMaqJNaSgEq1fERo_Pg4O9ujJVZp1y9o-0ByQliXUr-qT8-P5Pnvewf7zNPUykH2W-34uHOWlO3J9ieBa2nDoEtTbZDkDNRTGnTGtj771WGJLcHl1qkOC4xCITBFfNOd_lB0jhO__Pjc2yP-nsv-L3Gd0vo35vKPG-w_SYPzDeQDjPpdRNULcq-cn69H2h4JAPooiM0GTSRAqzgNtRVnbIKVZc5POkCHICr_E5cuzF7HEzOnm-Waw6ytSC6b_i9vqeoKu2esW1kJjDYfEOmhty68dKenRiXOdsDvbaGByBPdltsIDyop8quIb3fyr87RF-07pd2dbCf855z-DXWXTwBnjoURMDKxP6ai1Iz95BKiSk5zsEAMfkpkG0KPmfeO2jGNpzHf0xlq8JiwcY9r3sLGx54MXqhZHaooTbDMkJaxcHYFCp0YYUCpc103Wk8ZOBVk55UPU30OzuVod6ewDvolAZROUqNFI2qMaP4uTxg4myJ8dEt1mm13GnZR4pKsmLfsn34wQJ2jvPx0DxPr7xcb7bxvvl2vv53oKY4pSSVDttLqnT3W1_aZjQ21x249oTRi0lg5uRz0jsOKEDg5URY66GH7EgaoYDQWqlJ_1Y2GKdfVhIHalYSoeZOSEyEcc_IvTXUP09lTcrAPNFN-rSM4fn3Me5BgxM9hVFDbMtEe1woS5daQeEcOpfTaVTzFTyrDFTmlFXoiUUQhE46m3Uujx8uf6yFh2HZ2s8KwtqoFax5B_I0JrGNflHZzVl8NxXzi8kb8q9uqTPXFVm-rA8CC9ywdOanv0BLXdnE0cxlCUKwyYxZu9jeVFkBc5SWdhddsSoYbvhXvat_sBWGn8I24UAZ-_lSKzOuRaZq-On08b2p9DKg8ytFcg9Yd_Xha2V0_0x-jXmE3CkYRjuNKCuSlsb24fHYJLdZZVzKzGhZXf_ilnzasv0WErx7aO4rULT-tsaK05wfW4uQdGVOz49Oz5EoxcT9qQm0HTk0vSCoHV2fhpG5XnFj3V3jUjn9dy0z3g5sdAtQfkNfyqPmmqGWR7aK4dkWwDs1dPT0_5lHr8ut_vN1diPA7tsSxqB4s3arPjBGHcnr9WFK_Xsih1ZhMdLjZaDa4c6O569W-xRjaP4-7xy7KL2_xE2v7PrHEICpzFBwfdV3QodDw5M9Po1REdHqKXFIe2ICYyOUzFdGj_kZaqYU1mHMcpLs7df71wAXN39tB4boX7CN7afSDeeP5kf_4DPIxWUA==))
 - [Using shared memory as storage](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/shared_memory_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJyVWA1vGzcS_SvEHgrIqSzZwRUtFNk41fbhhOTsg-0kKKpiTXEpiefVUuVypaiG__u9GXK1K9nppQ7iRPyYGb55fDPUU1LqsjS2KJPBr0-JyZLBaTfJZTGv5Fwng0RVmUy6SWkrp-hz_82kEG_EhV1tnZkvvOioI_H25O3fj_Hrh664_jS-HI_Exc3tf25uR_fjm-sebeBNH4zSRakzURWZdsIvtBitpMI_caYrPmlH0Yi3vRPRoQWTJM5NkqN3bGVrK7GUW1FYL6pSw4wpxczkWugvSq-8MIVQdrnKjSyUFhvjF-wq2uFwxC_RiJ16ifUSO1b4NGuvFNLvQqefhferQb-_2Wx6ksPuWTfv52Fx2f8wvri6vrs6Rui7bR-LHPAKp3-vjMPBp1shV4hMySnizeVGWCfk3GnMeUuRb5zxpph3RWlnfiOdZjuZKb0z08rvgVfHifO3FwA-WQC40Z0Y300S8fPobnzXZTufx_f_uvl4Lz6Pbm9H1_fjqztxc4tkXV-OKVX49E8xuv5FvB9fX3aFBnRwpb-sHJ0CoRqCVWcBwzut98KY2RBWudLKzIwSNY3E3K61K3AssdJuaQLhEGTGdnKzNF56HntxOHbVnxST4m-mUHmVaTFUlbL9kraotNS-p6rF-eGKTGJF1p9VhSLLMj9cYS1igYm1TufOVquyF4z034TU_YMpVS6Qgixd6qV121R_kQBAw2FYMnVGz8QlJgvAL73GAUo6cCRSEyOlNtgSwVZzLq9hE3sRUy4B8532t3qGWNJ0ntupzNNUrK3JsB9byVj6CDR13vHblS7kUsctg4H-4nXhUxoXa5mbLA0jlPz_90OgwsJy5bewvx2-MI5BtnwudosQTAG26vxoUjyRj35ffAYRjCu9kHluFZ2LkWifHbhYRzDVjHnAoR560cA9AxcWgNl0lZ0h1QCoysLfvLJVKaaVetS-pNEyt77sxu2bhVELdr5h-2utYMz8AQO5lVnJbtI0JjYVL84J4CodMGTDv7aR7PFs5-i3d5Q6dniNma7YaAoOLKiU5zNJ5SuZ705ip_9FIGLm7JKnHe6-dE4yEYSscPvrpWe7UOJI6vSMA-rsJTWEx6rIceDAFAXuvkEgEkjt0Nctz9Foyy9ma5-dl7ntiqfn5m_cfdScn_KlKuewHPadlpC5XNlH0RcX96PGDcODKase4e7l_RsMSMnTYCLldZ2Wm3EBYcTx_9A7_GrAqpKkBSmP_ku6bbSIjXAEJBNmZ6HDEzvk7raFWjhb1LZDjAh5KR-BV-VAxHov65RYSCo5hSkXQQyjpxKGOgfQMLfr9GcWEkGFC6yOpUkWW9GolPHbXnOFFnKtKQ5oS1GCzEthSEkwImsX0bBfSI9IVyvrcCfoELVN4kRB9aXDoNCNkhDrB3BEO_9w1CQojBDZkB8CzOmpKbI0JMq6shM0IqxrHfMKxbBOfZhEBjiMMJSFmCMT2eFX1SXSBB8RREA1MgIYBEJEtjO3cWoz45ZgI4l_llsClPN5WlI6LVcw_CpQFtFf0Moa1dA86DYVa1BRrkgsgo5TexHB7ArT071uPGVHoU4j0hBUA18vTsepr7BMebnPMSJw2CliQFQROdo_49q-CNUwoD0Bh02oQwikZGGYkuaixqLnoLYnXBxWYFJWDJUPu7MyDxWg9zUsEkTagL7Fsd1wNY_km8oytB1sqcHh4YCWLSfoVGAIilzMtYvmm3WRm3VFuBh9uhrdD4gatdBQ7pCcPWfoJxoXYciEjgLwUfOlgaWx7lCS2ltAulYev3IB6g2t5F7UcFMSX8
DN9JiJTrj5jbde_YGpciSeBEpd4WedSXJ1e3tzOxDvcQ--y1gygiV4TCLpxPOkeKYIsAU0MkWH2oS6FIfcvud7hAX7ojRJWOfRHNYqT1BJNKo4-xpYcZnb61vxAOD2Ty-xg7nZ2ykzAcZ8cNiCPlSvA69gMxjCfz7-zOsP2wwEeB7yQG3mK63F0_Hpc83yzyaDYiLvh0VdZJXj9tJZZGx-kGG2HPqFoApn4rQ2edFcfsF1SNCKV_bXihL2xt3_xr1cVktRVMspdKbuRer6U6sd3X5uBQ_6oIhvEGy2DpdT1i90PdR_erOkVs4ZejDUtiKG3AUMDXUCpycnJ-eEMhkE5DqIwEMTPnFE0bMFFYbOgA3vHhqetDvHs685qDG75AvFQUfARQm9WYYAg8E4kYaJfcM5dkuXxiXDiGw3zuK6yir3KWrsIq3FiHly3gAPacqaKKiDYQ9zpDFAv5IO5cXjOSnkFHreCo066L14mjad3HxLs4yfFmLfuIMeJdTkcDUr8eDUocv5S9vxqEFmfkdrmXrLqHzj9lcS8s2OWfTgPo09vXV_xXMNMjdrw9Y9PD9_Ub5INfECpNtIWWzVGsilhvTjRc-5a9rZmPvm-YBrYMKV4pUbzdUiXsnwdKLLsXFyRZJHjeTereS2XEjSw4eGGg8HDKp7crBo18zUzBoM6tlhi7MXwIEk1JaabDeaQaeEaq7RYHJ5cNByetmEiHe3lqV075hNj2eo-aVOgntj8EOX9FXAbm-HLvARl0godOWC6QKw45XumhrMD4zI7ddUtP0AEWch_yG_1Mek7enha4z7MzJ0Wlfq6blVXD9InHPRTl_onAUlg0SyfnXsuv2Dl_KwnbDz4XB4CkV7-9M5fO6_qF552b4LVSuTl6hqSre6OO6-UICTbhLl2jXfoiXFWqnTtz9Up5i2Kx--YkuOcX_P1Pffn_4ojqVTi7Nymf54Io6Pga4_5jAynR3ncjnl791yM23ZVErlGFyHL8kwgOdm8Zg8d-t5gLs3n-l18vwb__kfZVnMsw==))
 - [Using set as mapping table to handle large keys or indeterministic sentinels](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/mapping_table_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy1WQ1v20YS_SsDHe4iJZRkKck1UGyjOqdFjRb2IXZaFLYhr8iVRITkqsulFJ3g_35vZknqw3KdHnIOHIvk7OybmTcfS60buc7z2GR5Y3CzbsRRY9ALGonKpoWa6sagERaRagSN3BQ25Ovuy9uMXtKZma9sPJ05aoYt6h_13wR08ev5h_MhnV1-_Pflx-H1-eVFh2VF_pc41FmuIyqySFtyM03DuQrxp3wS0K_aMhDqd46oyQK3jfLZbaP1XrSsTEGpWlFmHBW5hpo4p0mcaNJfQj13FGcUmnSexCoLNS1jN5OtSj0Ch34vlZixU5BXWDHH1WRbkpSrofPPzLn5oNtdLpcdJbA7xk67iRfOu7-cn_1wcfVDG9DrZZ-yBJ4lq_8oYgvDxytScyAL1Rh4E7UkY0lNrcYzZxj50sYuzqYB5Wbilspq0RPFubPxuHA7zqtwwv5tAbhPZXDc8IrOr24b9K_h1flVIHp-O7_-6fLTNf02_PhxeHF9_sMVXX5EsC4-nHOocPUjDS9-p5_PLz4EpOE6bKW_zC1bAagxu1VH3odXWu_AmBgPK5_rMJ7EIVUMoqlZaJvBLJprm8aeawAZiZ4kTmOnnNx7ZJxs1b3NbrO_xVmYFJGm47AITTfnJeEo164TFrPTfYlIQSLqKmvV6vTgI1fME73zyM1skbtupBfYe7TQoTO2s686NvC0Vqnc7r70cf5e-JcitjBy5Di6I_1FwVsa6Go6fD-2sZ7QB53CVmeV0znNzJJDz0yeqXxGMIgUnEOJMZ-LOYmyipkG6RZnKqFIOVWrvWeH3PvlIp5TXoQzViOPBoONs-45jLt3AfseC-ZzYx3Ik6zoTfcdqOo0fdar3Af7mrNsEylSNgbpaaxDxdAPQLAamljSmRRkOENKgs7tYRa1r5ZqTs2z4VVLdBuwoow_cwibIhNybf2tJm79UWh4Lm8FTBrxxEzZiNODMqxc8FalBXlpQlsMqDj2zl9iyw5dG9KZeLUymndNlJ1q2TwPoEJzolqDNOf04tzkYKY6c5VBiQk_V7b6SuHZX-YtJRqwOvSTWeKDDXydqpUuTZFEkFGS-IqrmUCCK4AmldIV6RBMg3ejAplm6i1mqLlki8zFqUbhyh3T4zGqTeEdRlHMzlRJsgoOxYpDiCggWFHEuc6pCvZ4R0MsNEkSS2HGU5MUQoFUu5mJtuhRG1fWO9FrxXt4sIiRPgr8Bu5MJ1CuuFZHKIacB0VWcGvIE4MQohDy3hXLCclShK6wGtFjH1c6FioppP4pBqbtAhr8PdGeIp1BUco4AsRVjONXKgck2J526MI4XfpWEAnz4ICSdytxxoE9uVZNcAtlXc_UIkax8H4IUX7H9QacY776aqmMHARazrRHEWfzwomh5FZz0SvNS38RqkfaccXMPAZsHUcHkIA8lZsU92MwCBSMNgQQWLnUI6iPmFpbJQjGVTFlLnoKMDrpLXnBeb5CLBnETo3KRDnH0KJYCid8CSyrFrQ9qloD30_bEkjOttLjcGMZmA0xifNIKAIIh0ogSX3vVCo_CcT7du_el9DHnsrVRNdFS4VhyfV7VnbT7t3ds1CciaNrtWegkUkFF9OZ48I1CiSHmFtJwBRcafghENftHvaLMbzCC23hZ9ur2WKsct-4S2M3kfsesdU-fmVDYYRcsDYhZMfPCzs3XJG5-nXo3JW8gEanQVQBk0uzgRNTFA1RX5cbR0uMQ5CJfMyqzVQyBWI3SzeNuG56vpt9QvTaVSbU
fmFVlcluaWpr_UrH_krpmi63vCEJUO7iNHYHOdGUcTNTKHTX6Li-DlC6GvlN1hCvr5rX2BEiL4UaLRrQiD-s-b8HWj-wrMdOG_RnMx1-RsgnNUpPuoipE2KLOjTwSmxrNnqulroOcNI_qfby5iazXFif6IkDMUCKnGsiVOovu5K2lPTT9ROiVqMiZvQCPtEvKgtkqtmzwS_o8p_RqJxuRt5XPNvR2JikbMLGNltNUOZ1f-QYb0DVBSC1_BrWI46HO7vegaBJQm9I-w4puao8FKFUWcT0syCptImHNJ5PosFgqt3x0WlTQnkDRHctOjl5UsCKgO_6m5896d5z6np_SV3_OXX9v6Tu9XPqXu-o40MRCbdr-stT3H94_1y6SnWaFFlYFX9fex-V2U19kubsSfjNctkX1jqZ_eXXZPMWm1UB9Ac4LGAPEveSx0TZyw86k9hiWigpvM3gp4vtV3FWEPwPcXKxNBcZnmCNYMRCV6ObYu7N6F5A3h-IxzXPE2XXFKE_j8fCYLaQrSrX_8Mvw5AuXhPTQoNx5fgYx8sbHC3x4bHZfpEXCuigVO-x1E5KHFjS_4olT4F6Xa29Obr7U1wbwd7d10PbrOp_rfrXpeDd3S1i32ASMAc4zqmKsyYHo_I7uHo92x0c6lmxPLVgcs2qM041bPAIyT1_rpPEE7zgD7yQY86a_Uj3M_TtFBnBeFyUKRRQiMNWIF0i2BaTPnccmQKTWkBvPv1yevq-xHueqinXe95WxNC-_iOAZ8WUTZEn24akxuoKNpoOj-244kabItXy2I-TpfpNQ3liJuaXPkxUmGANT90y0G8dKXy9EJbTTPKQTnyYxTR_9j-GZ07X-45Z93pFQC_UCxzp0Hkfu2Td6wTUx-9r_L7pPDwEuwQ6VPj_35r7fdY8flrzW2j8J36_w--7b4r5G2ieqCT_Gnc8vN8LrbDupIxwh6-avhD7Vz2Dwc67Hok3Rb7P-DW-NNc5qHAI3ctAlfkeIzdwuC6z5r7OTtKZKXBgF1CL8u3WcmaS6vBnVTbV5SacvNWri_K9yfCqzg89mcRhrKUZKH9YKpPeZPKmrMzoocD0eV3iqTJTDkdyYD6YOFgeijakEKqFPzj7c9LWIYl1bfbYDJJoLG41gndGtdoTavdKH27FhTWN5s7isXd3h_9rtjqokmWERJoPmlyadt9hrf0N_YV74bHwgUM7cqdrCfhL6j8E3p1jrgJvj_5OiVEReMRhfoJ_W0QU7ZUt68dWPcnh-qc6k6wrS59f4rdNoF_ZEb87QCSPe8FmIhJ6nq53rzcbPDzJ1Xz_HP22ntT34uLXjGTNyRNJUgf-dH0UEAD2kYDIP598_D7WH-ibW8o6Y41-0OT3d1s3cS5ttlrb7Ciy-I9C-_03P88jaXLcW_vZv6VthL2gh9FhXMOAtdAjnFmaWyIVxkdqsiLdCdSJ7xH8yp1f0x3SEezvXVm5N0MhTta_5fDzAm8lD-SFPnLRlTWiitegGhdIxtKmwIwB6eg9_oiC9_TqVdzaGnQ_lgb7t1mEzVnj_isHL-7nP1_7brZsuInvNiMs_ymH3iM_uTSCBvfuONF28x1SI1uEYa__tujhsZk7_wVTow0fnISvXvW-o7ay4ewkT0ffHVG7jRLi2pLUkY7aiUrH8q1TEo-3dIZhmODmwn9PhBvoOtnnxkNQPUcW7TwHZRoPd_Lvv3bzBxQ=))
 
diff --git a/examples/static_set/device_ref_example.cu b/examples/static_set/device_ref_example.cu
index 76edd041d..74346616a 100644
--- a/examples/static_set/device_ref_example.cu
+++ b/examples/static_set/device_ref_example.cu
@@ -42,7 +42,7 @@ __global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::s
 
   constexpr auto cg_size = SetRef::cg_size;
 
-  auto tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());
 
   int64_t const loop_stride = gridDim.x * blockDim.x / cg_size;
   int64_t idx               = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
@@ -60,7 +60,7 @@ __global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, O
 
   constexpr auto cg_size = SetRef::cg_size;
 
-  auto tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());
 
   int64_t const loop_stride = gridDim.x * blockDim.x / cg_size;
   int64_t idx               = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
diff --git a/examples/static_set/device_subsets_example.cu b/examples/static_set/device_subsets_example.cu
index 89e1f81cc..0e3758649 100644
--- a/examples/static_set/device_subsets_example.cu
+++ b/examples/static_set/device_subsets_example.cu
@@ -80,7 +80,7 @@ __global__ void insert(ref_type* set_refs)
 {
   namespace cg = cooperative_groups;
 
-  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+  auto const tile = cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());
   // Get subset (or CG) index
   auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
 
@@ -105,7 +105,7 @@ __global__ void find(ref_type* set_refs)
 {
   namespace cg = cooperative_groups;
 
-  auto const tile = cg::tiled_partition<cg_size>(cg::this_thread_block());
+  auto const tile = cg::tiled_partition<cg_size, cg::thread_block>(cg::this_thread_block());
   auto const idx  = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size;
 
   auto raw_set_ref  = *(set_refs + idx);
diff --git a/include/cuco/bloom_filter_ref.cuh b/include/cuco/bloom_filter_ref.cuh
index 2f3dcfa2b..706f2b539 100644
--- a/include/cuco/bloom_filter_ref.cuh
+++ b/include/cuco/bloom_filter_ref.cuh
@@ -91,7 +91,7 @@ class bloom_filter_ref {
    * @param group The Cooperative Group this operation is executed with
    */
   template <class CG>
-  __device__ constexpr void clear(CG const& group);
+  __device__ constexpr void clear(CG group);
 
   /**
    * @brief Erases all information from the filter.
@@ -132,7 +132,7 @@ class bloom_filter_ref {
    * @param key The key to be added
    */
   template <class CG, class ProbeKey>
-  __device__ void add(CG const& group, ProbeKey const& key);
+  __device__ void add(CG group, ProbeKey const& key);
 
   /**
    * @brief Device function that adds all keys in the range `[first, last)` to the filter.
@@ -148,7 +148,7 @@ class bloom_filter_ref {
    * @param last End of the sequence of keys
    */
   template <class CG, class InputIt>
-  __device__ void add(CG const& group, InputIt first, InputIt last);
+  __device__ void add(CG group, InputIt first, InputIt last);
 
   /**
    * @brief Adds all keys in the range `[first, last)` to the filter.
@@ -255,11 +255,11 @@ class bloom_filter_ref {
    * @return `true` iff the key's fingerprint was present in the filter
    */
   template <class CG, class ProbeKey>
-  [[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const;
+  [[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const;
 
   // TODO
   // template <class CG, class InputIt, class OutputIt>
-  // __device__ void contains(CG const& group, InputIt first, InputIt last, OutputIt output_begin)
+  // __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin)
   // const;
 
   /**
diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
index bc194f7df..1cbc50a0d 100644
--- a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
+++ b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
@@ -95,7 +95,7 @@ class bloom_filter_impl {
   }
 
   template <class CG>
-  __device__ constexpr void clear(CG const& group)
+  __device__ constexpr void clear(CG group)
   {
     for (int i = group.thread_rank(); i < num_blocks_ * words_per_block; i += group.size()) {
       words_[i] = 0;
@@ -149,7 +149,7 @@ class bloom_filter_impl {
   }
 
   template <class CG, class ProbeKey>
-  __device__ void add(CG const& group, ProbeKey const& key)
+  __device__ void add(CG group, ProbeKey const& key)
   {
     constexpr auto num_threads         = tile_size_v<CG>;
     constexpr auto optimal_num_threads = add_optimal_cg_size();
@@ -166,7 +166,7 @@ class bloom_filter_impl {
   }
 
   template <class CG, class InputIt>
-  __device__ void add(CG const& group, InputIt first, InputIt last)
+  __device__ void add(CG group, InputIt first, InputIt last)
   {
     namespace cg = cooperative_groups;
 
@@ -208,7 +208,7 @@ class bloom_filter_impl {
       typename policy_type::hash_result_type hash_value;
       size_type block_index;
 
-      auto const worker_group  = cg::tiled_partition<worker_num_threads>(group);
+      auto const worker_group  = cg::tiled_partition<worker_num_threads, CG>(group);
       auto const worker_offset = worker_num_threads * worker_group.meta_group_rank();
 
       auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads);
@@ -229,7 +229,7 @@ class bloom_filter_impl {
   }
 
   template <class CG, class HashValue, class BlockIndex>
-  __device__ void add_impl(CG const& group, HashValue const& hash_value, BlockIndex block_index)
+  __device__ void add_impl(CG group, HashValue const& hash_value, BlockIndex block_index)
   {
     constexpr auto num_threads = tile_size_v<CG>;
 
@@ -327,7 +327,7 @@ class bloom_filter_impl {
   }
 
   template <class CG, class ProbeKey>
-  [[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const
+  [[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const
   {
     constexpr auto num_threads         = tile_size_v<CG>;
     constexpr auto optimal_num_threads = contains_optimal_cg_size();
@@ -359,7 +359,7 @@ class bloom_filter_impl {
 
   // TODO
   // template <class CG, class InputIt, class OutputIt>
-  // __device__ void contains(CG const& group, InputIt first, InputIt last, OutputIt output_begin)
+  // __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin)
   // const;
 
   template <class InputIt, class OutputIt>
@@ -432,7 +432,7 @@ class bloom_filter_impl {
   // [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const
   // [[nodiscard]] __host__ __device__ static uint32_t optimal_pattern_bits(size_t num_blocks)
   // template <typename CG, cuda::thread_scope NewScope = thread_scope>
-  // [[nodiscard]] __device__ constexpr auto make_copy(CG const& group, word_type* const
+  // [[nodiscard]] __device__ constexpr auto make_copy(CG group, word_type* const
   // memory_to_use, cuda_thread_scope<NewScope> scope = {}) const noexcept;
 
  private:
diff --git a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl
index 96d2c0573..cb5a47cbc 100644
--- a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl
+++ b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl
@@ -39,7 +39,7 @@ __host__ __device__ constexpr bloom_filter_ref<Key, Extent, Scope, Policy>::bloo
 
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
 template <class CG>
-__device__ constexpr void bloom_filter_ref<Key, Extent, Scope, Policy>::clear(CG const& group)
+__device__ constexpr void bloom_filter_ref<Key, Extent, Scope, Policy>::clear(CG group)
 {
   impl_.clear(group);
 }
@@ -66,15 +66,14 @@ __device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(ProbeKey const
 
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
 template <class CG, class ProbeKey>
-__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG const& group,
-                                                                  ProbeKey const& key)
+__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG group, ProbeKey const& key)
 {
   impl_.add(group, key);
 }
 
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
 template <class CG, class InputIt>
-__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG const& group,
+__device__ void bloom_filter_ref<Key, Extent, Scope, Policy>::add(CG group,
                                                                   InputIt first,
                                                                   InputIt last)
 {
@@ -125,7 +124,7 @@ template <class ProbeKey>
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
 template <class CG, class ProbeKey>
 [[nodiscard]] __device__ bool bloom_filter_ref<Key, Extent, Scope, Policy>::contains(
-  CG const& group, ProbeKey const& key) const
+  CG group, ProbeKey const& key) const
 {
   return impl_.contains(group, key);
 }
diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh
index 91361b971..8af37fc84 100644
--- a/include/cuco/detail/bloom_filter/kernels.cuh
+++ b/include/cuco/detail/bloom_filter/kernels.cuh
@@ -44,7 +44,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add(InputIt first,
   if (tile_start >= n) { return; }
   auto const tile_stop = (tile_start + items_per_tile < n) ? tile_start + items_per_tile : n;
 
-  auto const tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto const tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
 
   ref.add(tile, first + tile_start, first + tile_stop);
 }
@@ -63,7 +63,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add_if_n(
   auto const loop_stride = cuco::detail::grid_stride() / CGSize;
   auto idx               = cuco::detail::global_thread_id() / CGSize;
 
-  [[maybe_unused]] auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
+  [[maybe_unused]] auto const tile =
+    cg::tiled_partition<CGSize, cg::thread_block>(cg::this_thread_block());
 
   while (idx < n) {
     if (pred(*(stencil + idx))) {
@@ -94,7 +95,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
   auto const loop_stride = cuco::detail::grid_stride() / CGSize;
   auto idx               = cuco::detail::global_thread_id() / CGSize;
 
-  [[maybe_unused]] auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
+  [[maybe_unused]] auto const tile =
+    cg::tiled_partition<CGSize, cg::thread_block>(cg::this_thread_block());
 
   if constexpr (CGSize == 1) {
     while (idx < n) {
@@ -103,7 +105,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
       idx += loop_stride;
     }
   } else {
-    auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
+    auto const tile = cg::tiled_partition<CGSize, cg::thread_block>(cg::this_thread_block());
     while (idx < n) {
       typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
       auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh
index 5b1f328fd..6d3f9b9c7 100644
--- a/include/cuco/detail/dynamic_map_kernels.cuh
+++ b/include/cuco/detail/dynamic_map_kernels.cuh
@@ -166,7 +166,7 @@ CUCO_KERNEL void insert(InputIt first,
   __shared__ typename BlockReduce::TempStorage temp_storage;
   std::size_t thread_num_successes = 0;
 
-  auto tile = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   auto tid  = blockDim.x * blockIdx.x + threadIdx.x;
   auto it   = first + tid / tile_size;
 
@@ -312,7 +312,7 @@ CUCO_KERNEL void erase(InputIt first,
   extern __shared__ unsigned long long submap_block_num_successes[];
 
   auto block = cg::this_thread_block();
-  auto tile  = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile  = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   auto tid   = block_size * block.group_index().x + block.thread_rank();
   auto it    = first + tid / tile_size;
 
@@ -456,9 +456,9 @@ CUCO_KERNEL void find(InputIt first,
                       Hash hash,
                       KeyEqual key_equal)
 {
-  auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
-  auto tid                  = blockDim.x * blockIdx.x + threadIdx.x;
-  auto key_idx              = tid / tile_size;
+  auto tile    = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
+  auto tid     = blockDim.x * blockIdx.x + threadIdx.x;
+  auto key_idx = tid / tile_size;
   auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel();
   __shared__ Value writeBuffer[block_size];
 
@@ -677,7 +677,7 @@ CUCO_KERNEL void contains(InputIt first,
                           Hash hash,
                           KeyEqual key_equal)
 {
-  auto tile    = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile    = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   auto tid     = blockDim.x * blockIdx.x + threadIdx.x;
   auto key_idx = tid / tile_size;
   __shared__ bool writeBuffer[block_size];
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh
index 8eb413207..2de3fbe12 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh
@@ -106,7 +106,7 @@ class hyperloglog_impl {
    * @param group CUDA Cooperative group this operation is executed in
    */
   template <class CG>
-  __device__ constexpr void clear(CG const& group) noexcept
+  __device__ constexpr void clear(CG group) noexcept
   {
     for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) {
       new (&(this->sketch_[i])) register_type{};
@@ -280,8 +280,7 @@ class hyperloglog_impl {
    * @param other Other estimator reference to be merged into `*this`
    */
   template <class CG, cuda::thread_scope OtherScope>
-  __device__ constexpr void merge(CG const& group,
-                                  hyperloglog_impl<T, OtherScope, Hash> const& other)
+  __device__ constexpr void merge(CG group, hyperloglog_impl<T, OtherScope, Hash> const& other)
   {
     // TODO find a better way to do error handling in device code
     // if (other.precision_ != this->precision_) { __trap(); }
@@ -362,7 +361,8 @@ class hyperloglog_impl {
     }
 
     // warp reduce Z and V
-    auto const warp = cooperative_groups::tiled_partition<32>(group);
+    auto const warp =
+      cooperative_groups::tiled_partition<32, cooperative_groups::thread_block>(group);
 #if defined(CUCO_HAS_CG_REDUCE_UPDATE_ASYNC)
     cooperative_groups::reduce_update_async(
       warp, block_sum, thread_sum, cooperative_groups::plus<fp_type>());
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.inl b/include/cuco/detail/hyperloglog/hyperloglog_ref.inl
index 096b68bc9..1f60596a1 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.inl
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.inl
@@ -25,7 +25,7 @@ __host__ __device__ constexpr hyperloglog_ref<T, Scope, Hash>::hyperloglog_ref(
 
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class CG>
-__device__ constexpr void hyperloglog_ref<T, Scope, Hash>::clear(CG const& group) noexcept
+__device__ constexpr void hyperloglog_ref<T, Scope, Hash>::clear(CG group) noexcept
 {
   impl_.clear(group);
 }
@@ -70,7 +70,7 @@ __host__ constexpr void hyperloglog_ref<T, Scope, Hash>::add(InputIt first,
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class CG, cuda::thread_scope OtherScope>
 __device__ constexpr void hyperloglog_ref<T, Scope, Hash>::merge(
-  CG const& group, hyperloglog_ref<T, OtherScope, Hash> const& other)
+  CG group, hyperloglog_ref<T, OtherScope, Hash> const& other)
 {
   impl_.merge(group, other.impl_);
 }
diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh
index 62df3df40..79e10502c 100644
--- a/include/cuco/detail/open_addressing/kernels.cuh
+++ b/include/cuco/detail/open_addressing/kernels.cuh
@@ -84,7 +84,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(InputIt first,
         if (ref.insert(insert_element)) { thread_num_successes++; };
       } else {
         auto const tile =
-          cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
+          cooperative_groups::tiled_partition<CGSize, cooperative_groups::thread_block>(
+            cooperative_groups::this_thread_block());
         if (ref.insert(tile, insert_element) && tile.thread_rank() == 0) { thread_num_successes++; }
       }
     }
@@ -143,7 +144,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(
         ref.insert(insert_element);
       } else {
         auto const tile =
-          cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
+          cooperative_groups::tiled_partition<CGSize, cooperative_groups::thread_block>(
+            cooperative_groups::this_thread_block());
         ref.insert(tile, insert_element);
       }
     }
@@ -178,7 +180,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first,
       ref.erase(erase_element);
     } else {
       auto const tile =
-        cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
+        cooperative_groups::tiled_partition<CGSize, cooperative_groups::thread_block>(
+          cooperative_groups::this_thread_block());
       ref.erase(tile, erase_element);
     }
     idx += loop_stride;
@@ -218,7 +221,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first,
       ref.for_each(key, callback_op);
     } else {
       auto const tile =
-        cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
+        cooperative_groups::tiled_partition<CGSize, cooperative_groups::thread_block>(
+          cooperative_groups::this_thread_block());
       ref.for_each(tile, key, callback_op);
     }
     idx += loop_stride;
@@ -288,7 +292,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
       block.sync();
       if (idx < n) { *(output_begin + idx) = output_buffer[thread_idx]; }
     } else {
-      auto const tile = cg::tiled_partition<CGSize>(block);
+      auto const tile = cg::tiled_partition<CGSize, cg::thread_block>(block);
       if (idx < n) {
         typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
         auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
@@ -405,7 +409,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first,
       block.sync();
       if (idx < n) { *(output_begin + idx) = output_buffer[thread_idx]; }
     } else {
-      auto const tile = cg::tiled_partition<CGSize>(block);
+      auto const tile = cg::tiled_partition<CGSize, cg::thread_block>(block);
       if (idx < n) {
         typename cuda::std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
         auto const found                                                    = ref.find(tile, key);
@@ -500,7 +504,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first,
         *(inserted_begin + idx) = output_inserted_buffer[thread_idx];
       }
     } else {
-      auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
+      auto const tile = cg::tiled_partition<CGSize, cg::thread_block>(cg::this_thread_block());
       if (idx < n) {
         typename cuda::std::iterator_traits<InputIt>::value_type const& insert_element{
           *(first + idx)};
@@ -562,7 +566,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first,
       }
     } else {
       auto const tile =
-        cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
+        cooperative_groups::tiled_partition<CGSize, cooperative_groups::thread_block>(
+          cooperative_groups::this_thread_block());
       if constexpr (IsOuter) {
         auto temp_count = ref.count(tile, key);
         if (tile.all(temp_count == 0) and tile.thread_rank() == 0) { ++temp_count; }
@@ -621,7 +626,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count_each(InputIt first,
       }
     } else {
       auto const tile =
-        cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
+        cooperative_groups::tiled_partition<CGSize, cooperative_groups::thread_block>(
+          cooperative_groups::this_thread_block());
       if constexpr (IsOuter) {
         auto temp_count = ref.count(tile, key);
         if (tile.all(temp_count == 0) and tile.thread_rank() == 0) { ++temp_count; }
@@ -758,7 +764,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void rehash(
 
   auto constexpr cg_size = ContainerRef::cg_size;
   auto const block       = cg::this_thread_block();
-  auto const tile        = cg::tiled_partition<cg_size>(block);
+  auto const tile        = cg::tiled_partition<cg_size, cg::thread_block>(block);
 
   auto const thread_rank         = block.thread_rank();
   auto constexpr tiles_per_block = BlockSize / cg_size;  // tile.meta_group_size() but constexpr
diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh
index e6b9e5577..5d09093f7 100644
--- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh
+++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh
@@ -313,7 +313,7 @@ class open_addressing_ref_impl {
    * the ownership of the memory
    */
   template <typename CG>
-  __device__ void make_copy(CG const& g, value_type* const memory_to_use) const noexcept
+  __device__ void make_copy(CG g, value_type* const memory_to_use) const noexcept
   {
     auto const num_slots = this->capacity();
 #if defined(CUCO_HAS_CUDA_BARRIER)
@@ -348,7 +348,7 @@ class open_addressing_ref_impl {
    * @param tile The cooperative thread group used to initialize the container
    */
   template <typename CG>
-  __device__ constexpr void initialize(CG const& tile) noexcept
+  __device__ constexpr void initialize(CG tile) noexcept
   {
     auto tid          = tile.thread_rank();
     auto const extent = static_cast<size_type>(this->extent());
@@ -425,8 +425,8 @@ class open_addressing_ref_impl {
    *
    * @return True if the given element is successfully inserted
    */
-  template <bool SupportsErase, typename Value>
-  __device__ bool insert(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <bool SupportsErase, typename Value, typename ParentCG>
+  __device__ bool insert(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                          Value const& value) noexcept
   {
     auto const val = this->heterogeneous_value(value);
@@ -585,9 +585,9 @@ class open_addressing_ref_impl {
    * @return a pair consisting of an iterator to the element and a bool indicating whether the
    * insertion is successful or not.
    */
-  template <typename Value>
+  template <typename Value, typename ParentCG>
   __device__ cuda::std::pair<iterator, bool> insert_and_find(
-    cooperative_groups::thread_block_tile<cg_size> const& group, Value const& value) noexcept
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> group, Value const& value) noexcept
   {
 #if __CUDA_ARCH__ < 700
     // Spinning to ensure that the write to the value part took place requires
@@ -727,8 +727,8 @@ class open_addressing_ref_impl {
    *
    * @return True if the given element is successfully erased
    */
-  template <typename ProbeKey>
-  __device__ bool erase(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <typename ProbeKey, typename ParentCG>
+  __device__ bool erase(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                         ProbeKey const& key) noexcept
   {
     auto probing_iter =
@@ -824,9 +824,10 @@ class open_addressing_ref_impl {
    *
    * @return A boolean indicating whether the probe key is present
    */
-  template <typename ProbeKey>
+  template <typename ProbeKey, typename ParentCG>
   [[nodiscard]] __device__ bool contains(
-    cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
+    ProbeKey const& key) const noexcept
   {
     auto probing_iter =
       probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -907,9 +908,10 @@ class open_addressing_ref_impl {
    *
    * @return An iterator to the position at which the equivalent key is stored
    */
-  template <typename ProbeKey>
-  [[nodiscard]] __device__ iterator find(
-    cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
+  template <typename ProbeKey, typename ParentCG>
+  [[nodiscard]] __device__ iterator
+  find(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
+       ProbeKey const& key) const noexcept
   {
     auto probing_iter =
       probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -1003,9 +1005,10 @@ class open_addressing_ref_impl {
    *
    * @return Number of occurrences found by the current thread
    */
-  template <typename ProbeKey>
-  [[nodiscard]] __device__ size_type count(
-    cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
+  template <typename ProbeKey, typename ParentCG>
+  [[nodiscard]] __device__ size_type
+  count(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
+        ProbeKey const& key) const noexcept
   {
     auto probing_iter =
       probing_scheme_.template make_iterator<bucket_size>(group, key, storage_ref_.extent());
@@ -1195,8 +1198,8 @@ class open_addressing_ref_impl {
     auto constexpr max_matches_per_step = flushing_tile_size * bucket_size;
     auto constexpr buffer_size = buffer_multiplier * max_matches_per_step + flushing_tile_size;
 
-    auto const flushing_tile = cg::tiled_partition<flushing_tile_size>(block);
-    auto const probing_tile  = cg::tiled_partition<probing_tile_size>(block);
+    auto const flushing_tile = cg::tiled_partition<flushing_tile_size, cg::thread_block>(block);
+    auto const probing_tile  = cg::tiled_partition<probing_tile_size, cg::thread_block>(block);
 
     auto const flushing_tile_id = flushing_tile.meta_group_rank();
     auto const stride           = probing_tile.meta_group_size();
@@ -1208,7 +1211,7 @@ class open_addressing_ref_impl {
     if (flushing_tile.thread_rank() == 0) { counters[flushing_tile_id] = 0; }
     flushing_tile.sync();
 
-    auto flush_buffers = [&](auto const& tile) {
+    auto flush_buffers = [&](auto tile) {
       size_type offset = 0;
       auto const count = counters[flushing_tile_id];
       auto const rank  = tile.thread_rank();
@@ -1408,8 +1411,8 @@ class open_addressing_ref_impl {
    * @param key The key to search for
    * @param callback_op Function to apply to every matched slot
    */
-  template <class ProbeKey, class CallbackOp>
-  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <class ProbeKey, class CallbackOp, typename ParentCG>
+  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                            ProbeKey const& key,
                            CallbackOp&& callback_op) const noexcept
   {
@@ -1472,8 +1475,8 @@ class open_addressing_ref_impl {
    * @param callback_op Function to apply to every matched slot
    * @param sync_op Function that is allowed to synchronize `group` inbetween probing buckets
    */
-  template <class ProbeKey, class CallbackOp, class SyncOp>
-  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <class ProbeKey, class CallbackOp, class SyncOp, typename ParentCG>
+  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                            ProbeKey const& key,
                            CallbackOp&& callback_op,
                            SyncOp&& sync_op) const noexcept
diff --git a/include/cuco/detail/probe_sequence_impl.cuh b/include/cuco/detail/probe_sequence_impl.cuh
index a732363da..71285e3a8 100644
--- a/include/cuco/detail/probe_sequence_impl.cuh
+++ b/include/cuco/detail/probe_sequence_impl.cuh
@@ -206,9 +206,9 @@ class linear_probing_impl
    * @param k The key to get the slot for
    * @return Pointer to the initial slot for `k`
    */
-  template <typename ProbeKey>
-  __device__ __forceinline__ iterator
-  initial_slot(cooperative_groups::thread_block_tile<cg_size> const& g, ProbeKey const& k) noexcept
+  template <typename ProbeKey, typename ParentCG>
+  __device__ __forceinline__ iterator initial_slot(
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> g, ProbeKey const& k) noexcept
   {
     return const_cast<iterator>(cuda::std::as_const(*this).initial_slot(g, k));
   }
@@ -224,9 +224,9 @@ class linear_probing_impl
    * @param k The key to get the slot for
    * @return Pointer to the initial slot for `k`
    */
-  template <typename ProbeKey>
+  template <typename ProbeKey, typename ParentCG>
   __device__ __forceinline__ const_iterator initial_slot(
-    cooperative_groups::thread_block_tile<cg_size> const& g, ProbeKey const& k) const noexcept
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> g, ProbeKey const& k) const noexcept
   {
     auto const hash_value = [&]() {
       auto const tmp = hash_(k);
@@ -360,9 +360,9 @@ class double_hashing_impl
    * @param k The key to get the slot for
    * @return Pointer to the initial slot for `k`
    */
-  template <typename ProbeKey>
-  __device__ __forceinline__ iterator
-  initial_slot(cooperative_groups::thread_block_tile<cg_size> const& g, ProbeKey const& k) noexcept
+  template <typename ProbeKey, typename ParentCG>
+  __device__ __forceinline__ iterator initial_slot(
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> g, ProbeKey const& k) noexcept
   {
     return const_cast<iterator>(cuda::std::as_const(*this).initial_slot(g, k));
   }
@@ -379,9 +379,9 @@ class double_hashing_impl
    * @param k The key to get the slot for
    * @return Pointer to the initial slot for `k`
    */
-  template <typename ProbeKey>
+  template <typename ProbeKey, typename ParentCG>
   __device__ __forceinline__ const_iterator initial_slot(
-    cooperative_groups::thread_block_tile<cg_size> const& g, ProbeKey const& k) const noexcept
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> g, ProbeKey const& k) const noexcept
   {
     std::size_t index;
     auto const hash_value = hash1_(k);
diff --git a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl
index 5f8a55ae5..9e88512af 100644
--- a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl
+++ b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl
@@ -116,9 +116,9 @@ __host__ __device__ constexpr auto linear_probing<CGSize, Hash>::make_iterator(
 }
 
 template <int32_t CGSize, typename Hash>
-template <int32_t BucketSize, typename ProbeKey, typename Extent>
+template <int32_t BucketSize, typename ProbeKey, typename Extent, typename ParentCG>
 __host__ __device__ constexpr auto linear_probing<CGSize, Hash>::make_iterator(
-  cooperative_groups::thread_block_tile<cg_size> const& g,
+  cooperative_groups::thread_block_tile<cg_size, ParentCG> g,
   ProbeKey const& probe_key,
   Extent upper_bound) const noexcept
 {
@@ -182,9 +182,9 @@ __host__ __device__ constexpr auto double_hashing<CGSize, Hash1, Hash2>::make_it
 }
 
 template <int32_t CGSize, typename Hash1, typename Hash2>
-template <int32_t BucketSize, typename ProbeKey, typename Extent>
+template <int32_t BucketSize, typename ProbeKey, typename Extent, typename ParentCG>
 __host__ __device__ constexpr auto double_hashing<CGSize, Hash1, Hash2>::make_iterator(
-  cooperative_groups::thread_block_tile<cg_size> const& g,
+  cooperative_groups::thread_block_tile<cg_size, ParentCG> g,
   ProbeKey const& probe_key,
   Extent upper_bound) const noexcept
 {
diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl
index a9e39d08b..cf9d66804 100644
--- a/include/cuco/detail/static_map.inl
+++ b/include/cuco/detail/static_map.inl
@@ -522,7 +522,7 @@ __device__
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 template <typename CG, typename Hash, typename KeyEqual>
 __device__ bool static_map<Key, Value, Scope, Allocator>::device_mutable_view::insert(
-  CG const& g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept
+  CG g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept
 {
   auto current_slot = this->initial_slot(g, insert_pair.first, hash);
 
@@ -634,7 +634,7 @@ __device__ bool static_map<Key, Value, Scope, Allocator>::device_mutable_view::e
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 template <typename CG, typename Hash, typename KeyEqual>
 __device__ bool static_map<Key, Value, Scope, Allocator>::device_mutable_view::erase(
-  CG const& g, key_type const& k, Hash hash, KeyEqual key_equal) noexcept
+  CG g, key_type const& k, Hash hash, KeyEqual key_equal) noexcept
 {
   auto current_slot = this->initial_slot(g, k, hash);
   value_type const insert_pair =
@@ -834,7 +834,7 @@ __device__ bool static_map<Key, Value, Scope, Allocator>::device_view::contains(
 template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
 template <typename CG, typename ProbeKey, typename Hash, typename KeyEqual>
 __device__ cuda::std::enable_if_t<std::is_invocable_v<KeyEqual, ProbeKey, Key>, bool>
-static_map<Key, Value, Scope, Allocator>::device_view::contains(CG const& g,
+static_map<Key, Value, Scope, Allocator>::device_view::contains(CG g,
                                                                 ProbeKey const& k,
                                                                 Hash hash,
                                                                 KeyEqual key_equal) const noexcept
diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh
index ce9f68c5a..a1cdc5bf9 100644
--- a/include/cuco/detail/static_map/kernels.cuh
+++ b/include/cuco/detail/static_map/kernels.cuh
@@ -61,7 +61,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_assign(InputIt first,
       ref.insert_or_assign(insert_pair);
     } else {
       auto const tile =
-        cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
+        cooperative_groups::tiled_partition<CGSize, cooperative_groups::thread_block>(
+          cooperative_groups::this_thread_block());
       ref.insert_or_assign(tile, insert_pair);
     }
     idx += loop_stride;
@@ -119,7 +120,8 @@ __global__ void insert_or_apply(
       }
     } else {
       auto const tile =
-        cooperative_groups::tiled_partition<CGSize>(cooperative_groups::this_thread_block());
+        cooperative_groups::tiled_partition<CGSize, cooperative_groups::thread_block>(
+          cooperative_groups::this_thread_block());
       if constexpr (HasInit) {
         ref.insert_or_apply(tile, insert_pair, init, op);
       } else {
@@ -186,7 +188,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem(
   auto const loop_stride = cuco::detail::grid_stride() / CGSize;
   auto idx               = cuco::detail::global_thread_id() / CGSize;
 
-  auto warp                  = cg::tiled_partition<32>(block);
+  auto warp                  = cg::tiled_partition<32, cg::thread_block>(block);
   auto const warp_thread_idx = warp.thread_rank();
 
   // Shared map initialization
diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl
index 85849bdf9..367549e05 100644
--- a/include/cuco/detail/static_map/static_map_ref.inl
+++ b/include/cuco/detail/static_map/static_map_ref.inl
@@ -387,7 +387,7 @@ template <typename Key,
 template <typename CG, cuda::thread_scope NewScope>
 __device__ constexpr auto
 static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::make_copy(
-  CG const& tile,
+  CG tile,
   typename StorageRef::value_type* const memory_to_use,
   cuda_thread_scope<NewScope> scope) const noexcept
 {
@@ -412,7 +412,7 @@ template <typename Key,
 template <typename CG>
 __device__ constexpr void
 static_map_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::initialize(
-  CG const& tile) noexcept
+  CG tile) noexcept
 {
   this->impl_.initialize(tile);
 }
@@ -465,8 +465,8 @@ class operator_impl<
    *
    * @return True if the given element is successfully inserted
    */
-  template <typename Value>
-  __device__ bool insert(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <typename Value, typename ParentCG>
+  __device__ bool insert(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                          Value const& value) noexcept
   {
     auto& ref_ = static_cast<ref_type&>(*this);
@@ -556,8 +556,8 @@ class operator_impl<
    * @param group The Cooperative Group used to perform group insert
    * @param value The element to insert
    */
-  template <typename Value>
-  __device__ void insert_or_assign(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <typename Value, typename ParentCG>
+  __device__ void insert_or_assign(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                                    Value const& value) noexcept
   {
     ref_type& ref_ = static_cast<ref_type&>(*this);
@@ -753,8 +753,8 @@ class operator_impl<
    * @return Returns `true` if the given `value` is inserted successfully.
    */
 
-  template <typename Value, typename Op>
-  __device__ bool insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <typename Value, typename Op, typename ParentCG>
+  __device__ bool insert_or_apply(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                                   Value const& value,
                                   Op op)
   {
@@ -785,8 +785,8 @@ class operator_impl<
    *
    * @return Returns `true` if the given `value` is inserted successfully.
    */
-  template <typename Value, typename Init, typename Op>
-  __device__ bool insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <typename Value, typename Init, typename Op, typename ParentCG>
+  __device__ bool insert_or_apply(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                                   Value const& value,
                                   Init init,
                                   Op op)
@@ -843,9 +843,9 @@ class operator_impl<
    * @param op The callable object to perform binary operation between existing value at the slot
    *  and the element to insert.
    */
-  template <typename Value, typename Init, typename Op>
+  template <typename Value, typename Init, typename Op, typename ParentCG>
   __device__ bool dispatch_insert_or_apply(
-    cooperative_groups::thread_block_tile<cg_size> const& group,
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
     Value const& value,
     Init init,
     Op op)
@@ -953,10 +953,9 @@ class operator_impl<
    *
    * @return Returns `true` if the given `value` is inserted successfully.
    */
-  template <bool UseDirectApply, typename Value, typename Op>
-  __device__ bool insert_or_apply_impl(cooperative_groups::thread_block_tile<cg_size> const& group,
-                                       Value const& value,
-                                       Op op)
+  template <bool UseDirectApply, typename Value, typename Op, typename ParentCG>
+  __device__ bool insert_or_apply_impl(
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> group, Value const& value, Op op)
   {
     ref_type& ref_ = static_cast<ref_type&>(*this);
 
@@ -1149,9 +1148,9 @@ class operator_impl<
    * @return a pair consisting of an iterator to the element and a bool indicating whether the
    * insertion is successful or not.
    */
-  template <typename Value>
+  template <typename Value, typename ParentCG>
   __device__ cuda::std::pair<iterator, bool> insert_and_find(
-    cooperative_groups::thread_block_tile<cg_size> const& group, Value const& value) noexcept
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> group, Value const& value) noexcept
   {
     ref_type& ref_ = static_cast<ref_type&>(*this);
     return ref_.impl_.insert_and_find(group, value);
@@ -1203,8 +1202,8 @@ class operator_impl<
    *
    * @return True if the given element is successfully erased
    */
-  template <typename ProbeKey>
-  __device__ bool erase(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <typename ProbeKey, typename ParentCG>
+  __device__ bool erase(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                         ProbeKey const& key) noexcept
   {
     auto& ref_ = static_cast<ref_type&>(*this);
@@ -1264,9 +1263,10 @@ class operator_impl<
    *
    * @return A boolean indicating whether the probe key is present
    */
-  template <typename ProbeKey>
+  template <typename ProbeKey, typename ParentCG>
   [[nodiscard]] __device__ bool contains(
-    cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
+    ProbeKey const& key) const noexcept
   {
     auto const& ref_ = static_cast<ref_type const&>(*this);
     return ref_.impl_.contains(group, key);
@@ -1327,9 +1327,10 @@ class operator_impl<
    *
    * @return An iterator to the position at which the equivalent key is stored
    */
-  template <typename ProbeKey>
-  [[nodiscard]] __device__ iterator find(
-    cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
+  template <typename ProbeKey, typename ParentCG>
+  [[nodiscard]] __device__ iterator
+  find(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
+       ProbeKey const& key) const noexcept
   {
     auto const& ref_ = static_cast<ref_type const&>(*this);
     return ref_.impl_.find(group, key);
@@ -1395,8 +1396,8 @@ class operator_impl<
    * @param key The key to search for
    * @param callback_op Function to apply to the copy of the matched key-value pair
    */
-  template <class ProbeKey, class CallbackOp>
-  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <class ProbeKey, class CallbackOp, typename ParentCG>
+  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                            ProbeKey const& key,
                            CallbackOp&& callback_op) const noexcept
   {
@@ -1452,8 +1453,8 @@ class operator_impl<
    *
    * @return Number of occurrences found by the current thread
    */
-  template <typename ProbeKey>
-  __device__ size_type count(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <typename ProbeKey, typename ParentCG>
+  __device__ size_type count(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                              ProbeKey const& key) const noexcept
   {
     auto const& ref_ = static_cast<ref_type const&>(*this);
@@ -1516,7 +1517,7 @@ class operator_impl<
             class OutputProbeIt,
             class OutputMatchIt,
             class AtomicCounter>
-  __device__ void retrieve(cooperative_groups::thread_block const& block,
+  __device__ void retrieve(cooperative_groups::thread_block block,
                            InputProbeIt input_probe_begin,
                            InputProbeIt input_probe_end,
                            OutputProbeIt output_probe,
diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh
index 74b3f1065..363931abb 100644
--- a/include/cuco/detail/static_map_kernels.cuh
+++ b/include/cuco/detail/static_map_kernels.cuh
@@ -147,7 +147,7 @@ CUCO_KERNEL void insert(
   __shared__ typename BlockReduce::TempStorage temp_storage;
   std::size_t thread_num_successes = 0;
 
-  auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   int64_t const loop_stride = gridDim.x * block_size / tile_size;
   int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
 
@@ -254,7 +254,7 @@ CUCO_KERNEL void erase(
   __shared__ typename BlockReduce::TempStorage temp_storage;
   std::size_t thread_num_successes = 0;
 
-  auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   int64_t const loop_stride = gridDim.x * block_size / tile_size;
   int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
 
@@ -324,7 +324,7 @@ CUCO_KERNEL void insert_if_n(InputIt first,
   __shared__ typename BlockReduce::TempStorage temp_storage;
   std::size_t thread_num_successes = 0;
 
-  auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   int64_t const loop_stride = gridDim.x * block_size / tile_size;
   int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
 
@@ -440,7 +440,7 @@ template <std::size_t block_size,
 CUCO_KERNEL void find(
   InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal)
 {
-  auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   int64_t const loop_stride = gridDim.x * block_size / tile_size;
   int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
 #pragma nv_diagnostic push
@@ -558,7 +558,7 @@ template <std::size_t block_size,
 CUCO_KERNEL void contains(
   InputIt first, int64_t n, OutputIt output_begin, viewT view, Hash hash, KeyEqual key_equal)
 {
-  auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   int64_t const loop_stride = gridDim.x * block_size / tile_size;
   int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
   __shared__ bool writeBuffer[block_size / tile_size];
diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl
index 4e3b2881f..f6af1b4c3 100644
--- a/include/cuco/detail/static_multimap/device_view_impl.inl
+++ b/include/cuco/detail/static_multimap/device_view_impl.inl
@@ -77,9 +77,9 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
    * @param k The key to get the slot for
    * @return Pointer to the initial slot for `k`
    */
-  template <typename ProbeKey>
+  template <typename ProbeKey, typename ParentCG>
   __device__ __forceinline__ iterator
-  initial_slot(cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+  initial_slot(cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
                ProbeKey const& k) noexcept
   {
     return probe_sequence_.initial_slot(g, k);
@@ -96,9 +96,9 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
    * @param k The key to get the slot for
    * @return Pointer to the initial slot for `k`
    */
-  template <typename ProbeKey>
+  template <typename ProbeKey, typename ParentCG>
   __device__ __forceinline__ const_iterator
-  initial_slot(cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+  initial_slot(cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
                ProbeKey const& k) const noexcept
   {
     return probe_sequence_.initial_slot(g, k);
@@ -483,7 +483,7 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
    * @param output_begin Beginning of the output sequence of key/value pairs
    */
   template <typename CG, typename atomicT, typename OutputIt>
-  __device__ __forceinline__ void flush_output_buffer(CG const& g,
+  __device__ __forceinline__ void flush_output_buffer(CG g,
                                                       uint32_t const num_outputs,
                                                       value_type* output_buffer,
                                                       atomicT* num_matches,
@@ -541,7 +541,7 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
    * pairs
    */
   template <typename CG, typename atomicT, typename OutputIt1, typename OutputIt2>
-  __device__ __forceinline__ void flush_output_buffer(CG const& g,
+  __device__ __forceinline__ void flush_output_buffer(CG g,
                                                       uint32_t const num_outputs,
                                                       value_type* probe_output_buffer,
                                                       value_type* contained_output_buffer,
@@ -584,9 +584,13 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
    * @param equal The binary function to compare input element and slot content for equality
    * @return A boolean indicating whether the key/value pair represented by `element` was inserted
    */
-  template <bool is_pair_contains, bool uses_vector_load, typename ProbeT, typename Equal>
+  template <bool is_pair_contains,
+            bool uses_vector_load,
+            typename ProbeT,
+            typename Equal,
+            typename ParentCG>
   __device__ __forceinline__ cuda::std::enable_if_t<uses_vector_load, bool> contains(
-    cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+    cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
     ProbeT const& element,
     Equal equal) const noexcept
   {
@@ -650,9 +654,13 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
    * @param equal The binary function to compare input element and slot content for equality
    * @return A boolean indicating whether the key/value pair represented by `element` was inserted
    */
-  template <bool is_pair_contains, bool uses_vector_load, typename ProbeT, typename Equal>
+  template <bool is_pair_contains,
+            bool uses_vector_load,
+            typename ProbeT,
+            typename Equal,
+            typename ParentCG>
   __device__ __forceinline__ cuda::std::enable_if_t<not uses_vector_load, bool> contains(
-    cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+    cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
     ProbeT const& element,
     Equal equal) const noexcept
   {
@@ -706,7 +714,7 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
    */
   template <bool uses_vector_load, bool is_outer, typename CG, typename KeyEqual>
   __device__ __forceinline__ cuda::std::enable_if_t<uses_vector_load, std::size_t> count(
-    CG const& g, Key const& k, KeyEqual key_equal) noexcept
+    CG g, Key const& k, KeyEqual key_equal) noexcept
   {
     std::size_t count = 0;
     auto current_slot = initial_slot(g, k);
@@ -756,7 +764,7 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
    */
   template <bool uses_vector_load, bool is_outer, typename CG, typename KeyEqual>
   __device__ __forceinline__ cuda::std::enable_if_t<not uses_vector_load, std::size_t> count(
-    CG const& g, Key const& k, KeyEqual key_equal) noexcept
+    CG g, Key const& k, KeyEqual key_equal) noexcept
   {
     std::size_t count = 0;
     auto current_slot = initial_slot(g, k);
@@ -804,7 +812,7 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
    */
   template <bool uses_vector_load, bool is_outer, typename CG, typename PairEqual>
   __device__ __forceinline__ cuda::std::enable_if_t<uses_vector_load, std::size_t> pair_count(
-    CG const& g, value_type const& pair, PairEqual pair_equal) noexcept
+    CG g, value_type const& pair, PairEqual pair_equal) noexcept
   {
     std::size_t count = 0;
     auto key          = pair.first;
@@ -857,7 +865,7 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
    */
   template <bool uses_vector_load, bool is_outer, typename CG, typename PairEqual>
   __device__ __forceinline__ cuda::std::enable_if_t<not uses_vector_load, std::size_t> pair_count(
-    CG const& g, value_type const& pair, PairEqual pair_equal) noexcept
+    CG g, value_type const& pair, PairEqual pair_equal) noexcept
   {
     std::size_t count = 0;
     auto key          = pair.first;
@@ -923,8 +931,8 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
             typename atomicT,
             typename OutputIt,
             typename KeyEqual>
-  __device__ __forceinline__ void retrieve(FlushingCG const& flushing_cg,
-                                           ProbingCG const& probing_cg,
+  __device__ __forceinline__ void retrieve(FlushingCG flushing_cg,
+                                           ProbingCG probing_cg,
                                            Key const& k,
                                            uint32_t* flushing_cg_counter,
                                            value_type* output_buffer,
@@ -1033,7 +1041,7 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
             typename atomicT,
             typename OutputIt,
             typename KeyEqual>
-  __device__ __forceinline__ void retrieve(CG const& g,
+  __device__ __forceinline__ void retrieve(CG g,
                                            Key const& k,
                                            uint32_t* cg_counter,
                                            value_type* output_buffer,
@@ -1141,7 +1149,7 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
             typename OutputIt4,
             typename PairEqual>
   __device__ __forceinline__ cuda::std::enable_if_t<uses_vector_load, void> pair_retrieve(
-    ProbingCG const& probing_cg,
+    ProbingCG probing_cg,
     value_type const& pair,
     OutputIt1 probe_key_begin,
     OutputIt2 probe_val_begin,
@@ -1252,7 +1260,7 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
             typename OutputIt4,
             typename PairEqual>
   __device__ __forceinline__ cuda::std::enable_if_t<not uses_vector_load, void> pair_retrieve(
-    ProbingCG const& probing_cg,
+    ProbingCG probing_cg,
     value_type const& pair,
     OutputIt1 probe_key_begin,
     OutputIt2 probe_val_begin,
@@ -1348,8 +1356,8 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
             typename OutputIt1,
             typename OutputIt2,
             typename PairEqual>
-  __device__ __forceinline__ void pair_retrieve(FlushingCG const& flushing_cg,
-                                                ProbingCG const& probing_cg,
+  __device__ __forceinline__ void pair_retrieve(FlushingCG flushing_cg,
+                                                ProbingCG probing_cg,
                                                 value_type const& pair,
                                                 uint32_t* flushing_cg_counter,
                                                 value_type* probe_output_buffer,
@@ -1476,7 +1484,7 @@ class static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view_
             typename OutputIt1,
             typename OutputIt2,
             typename PairEqual>
-  __device__ __forceinline__ void pair_retrieve(CG const& g,
+  __device__ __forceinline__ void pair_retrieve(CG g,
                                                 value_type const& pair,
                                                 uint32_t* cg_counter,
                                                 value_type* probe_output_buffer,
diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh
index f43667a15..1e7b9d985 100644
--- a/include/cuco/detail/static_multimap/kernels.cuh
+++ b/include/cuco/detail/static_multimap/kernels.cuh
@@ -83,7 +83,7 @@ CUCO_KERNEL void initialize(pair_atomic_type* const slots, Key k, Value v, int64
 template <uint32_t block_size, uint32_t tile_size, typename InputIt, typename viewT>
 CUCO_KERNEL void insert(InputIt first, int64_t n, viewT view)
 {
-  auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   int64_t const loop_stride = gridDim.x * block_size / tile_size;
   int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
 
@@ -131,7 +131,7 @@ template <uint32_t block_size,
           typename Predicate>
 CUCO_KERNEL void insert_if_n(InputIt first, StencilIt s, int64_t n, viewT view, Predicate pred)
 {
-  auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   int64_t const loop_stride = gridDim.x * block_size / tile_size;
   int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
 
@@ -178,7 +178,7 @@ template <bool is_pair_contains,
           typename Equal>
 CUCO_KERNEL void contains(InputIt first, int64_t n, OutputIt output_begin, viewT view, Equal equal)
 {
-  auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   int64_t const loop_stride = gridDim.x * block_size / tile_size;
   int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
   __shared__ bool writeBuffer[block_size / tile_size];
@@ -237,7 +237,7 @@ template <uint32_t block_size,
 CUCO_KERNEL void count(
   InputIt first, int64_t n, atomicT* num_matches, viewT view, KeyEqual key_equal)
 {
-  auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   int64_t const loop_stride = gridDim.x * block_size / tile_size;
   int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
 
@@ -296,7 +296,7 @@ template <uint32_t block_size,
 CUCO_KERNEL void pair_count(
   InputIt first, int64_t n, atomicT* num_matches, viewT view, PairEqual pair_equal)
 {
-  auto tile                 = cg::tiled_partition<tile_size>(cg::this_thread_block());
+  auto tile = cg::tiled_partition<tile_size, cg::thread_block>(cg::this_thread_block());
   int64_t const loop_stride = gridDim.x * block_size / tile_size;
   int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / tile_size;
 
@@ -374,8 +374,9 @@ CUCO_KERNEL void retrieve(InputIt first,
   constexpr uint32_t num_flushing_cgs = block_size / flushing_cg_size;
   const uint32_t flushing_cg_id       = threadIdx.x / flushing_cg_size;
 
-  auto flushing_cg          = cg::tiled_partition<flushing_cg_size>(cg::this_thread_block());
-  auto probing_cg           = cg::tiled_partition<probing_cg_size>(cg::this_thread_block());
+  auto flushing_cg =
+    cg::tiled_partition<flushing_cg_size, cg::thread_block>(cg::this_thread_block());
+  auto probing_cg = cg::tiled_partition<probing_cg_size, cg::thread_block>(cg::this_thread_block());
   int64_t const loop_stride = gridDim.x * block_size / probing_cg_size;
   int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / probing_cg_size;
 
@@ -488,8 +489,9 @@ CUCO_KERNEL void pair_retrieve(InputIt first,
   constexpr uint32_t num_flushing_cgs = block_size / flushing_cg_size;
   const uint32_t flushing_cg_id       = threadIdx.x / flushing_cg_size;
 
-  auto flushing_cg          = cg::tiled_partition<flushing_cg_size>(cg::this_thread_block());
-  auto probing_cg           = cg::tiled_partition<probing_cg_size>(cg::this_thread_block());
+  auto flushing_cg =
+    cg::tiled_partition<flushing_cg_size, cg::thread_block>(cg::this_thread_block());
+  auto probing_cg = cg::tiled_partition<probing_cg_size, cg::thread_block>(cg::this_thread_block());
   int64_t const loop_stride = gridDim.x * block_size / probing_cg_size;
   int64_t idx               = (block_size * blockIdx.x + threadIdx.x) / probing_cg_size;
 
diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl
index ad50e7eaa..50de45ad2 100644
--- a/include/cuco/detail/static_multimap/static_multimap.inl
+++ b/include/cuco/detail/static_multimap/static_multimap.inl
@@ -1126,9 +1126,10 @@ template <typename Key,
           cuda::thread_scope Scope,
           typename Allocator,
           class ProbeSequence>
+template <typename ParentCG>
 __device__ __forceinline__ void
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_mutable_view::insert(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
   value_type const& insert_pair) noexcept
 {
   impl_.template insert<uses_vector_load()>(g, insert_pair);
@@ -1181,7 +1182,7 @@ template <typename Key,
 template <typename CG, typename atomicT, typename OutputIt>
 __device__ __forceinline__ void
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::flush_output_buffer(
-  CG const& g,
+  CG g,
   uint32_t const num_outputs,
   value_type* output_buffer,
   atomicT* num_matches,
@@ -1198,7 +1199,7 @@ template <typename Key,
 template <typename CG, typename atomicT, typename OutputIt1, typename OutputIt2>
 __device__ __forceinline__ void
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::flush_output_buffer(
-  CG const& g,
+  CG g,
   uint32_t const num_outputs,
   value_type* probe_output_buffer,
   value_type* contained_output_buffer,
@@ -1220,10 +1221,10 @@ template <typename Key,
           cuda::thread_scope Scope,
           typename Allocator,
           class ProbeSequence>
-template <typename ProbeKey, typename KeyEqual>
+template <typename ProbeKey, typename KeyEqual, typename ParentCG>
 __device__ __forceinline__ bool
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::contains(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
   ProbeKey const& k,
   KeyEqual key_equal) const noexcept
 {
@@ -1236,10 +1237,10 @@ template <typename Key,
           cuda::thread_scope Scope,
           typename Allocator,
           class ProbeSequence>
-template <typename ProbePair, typename PairEqual>
+template <typename ProbePair, typename PairEqual, typename ParentCG>
 __device__ __forceinline__ bool
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::pair_contains(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
   ProbePair const& p,
   PairEqual pair_equal) const noexcept
 {
@@ -1252,10 +1253,10 @@ template <typename Key,
           cuda::thread_scope Scope,
           typename Allocator,
           class ProbeSequence>
-template <typename KeyEqual>
+template <typename KeyEqual, typename ParentCG>
 __device__ __forceinline__ std::size_t
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::count(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
   Key const& k,
   KeyEqual key_equal) noexcept
 {
@@ -1268,10 +1269,10 @@ template <typename Key,
           cuda::thread_scope Scope,
           typename Allocator,
           class ProbeSequence>
-template <typename KeyEqual>
+template <typename KeyEqual, typename ParentCG>
 __device__ __forceinline__ std::size_t
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::count_outer(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
   Key const& k,
   KeyEqual key_equal) noexcept
 {
@@ -1284,10 +1285,10 @@ template <typename Key,
           cuda::thread_scope Scope,
           typename Allocator,
           class ProbeSequence>
-template <typename PairEqual>
+template <typename PairEqual, typename ParentCG>
 __device__ __forceinline__ std::size_t
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::pair_count(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
   value_type const& pair,
   PairEqual pair_equal) noexcept
 {
@@ -1300,10 +1301,10 @@ template <typename Key,
           cuda::thread_scope Scope,
           typename Allocator,
           class ProbeSequence>
-template <typename PairEqual>
+template <typename PairEqual, typename ParentCG>
 __device__ __forceinline__ std::size_t
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::pair_count_outer(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
   value_type const& pair,
   PairEqual pair_equal) noexcept
 {
@@ -1320,11 +1321,12 @@ template <uint32_t buffer_size,
           typename FlushingCG,
           typename atomicT,
           typename OutputIt,
-          typename KeyEqual>
+          typename KeyEqual,
+          typename ParentCG>
 __device__ __forceinline__ void
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::retrieve(
-  FlushingCG const& flushing_cg,
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& probing_cg,
+  FlushingCG flushing_cg,
+  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
   Key const& k,
   uint32_t* flushing_cg_counter,
   value_type* output_buffer,
@@ -1358,11 +1360,12 @@ template <uint32_t buffer_size,
           typename FlushingCG,
           typename atomicT,
           typename OutputIt,
-          typename KeyEqual>
+          typename KeyEqual,
+          typename ParentCG>
 __device__ __forceinline__ void
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::retrieve_outer(
-  FlushingCG const& flushing_cg,
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& probing_cg,
+  FlushingCG flushing_cg,
+  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
   Key const& k,
   uint32_t* flushing_cg_counter,
   value_type* output_buffer,
@@ -1396,10 +1399,11 @@ template <typename OutputIt1,
           typename OutputIt2,
           typename OutputIt3,
           typename OutputIt4,
-          typename PairEqual>
+          typename PairEqual,
+          typename ParentCG>
 __device__ __forceinline__ void
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::pair_retrieve(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& probing_cg,
+  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
   value_type const& pair,
   OutputIt1 probe_key_begin,
   OutputIt2 probe_val_begin,
@@ -1427,11 +1431,12 @@ template <uint32_t buffer_size,
           typename atomicT,
           typename OutputIt1,
           typename OutputIt2,
-          typename PairEqual>
+          typename PairEqual,
+          typename ParentCG>
 __device__ __forceinline__ void
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::pair_retrieve(
-  FlushingCG const& flushing_cg,
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& probing_cg,
+  FlushingCG flushing_cg,
+  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
   value_type const& pair,
   uint32_t* flushing_cg_counter,
   value_type* probe_output_buffer,
@@ -1476,10 +1481,11 @@ template <typename OutputIt1,
           typename OutputIt2,
           typename OutputIt3,
           typename OutputIt4,
-          typename PairEqual>
+          typename PairEqual,
+          typename ParentCG>
 __device__ __forceinline__ void
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::pair_retrieve_outer(
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& probing_cg,
+  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
   value_type const& pair,
   OutputIt1 probe_key_begin,
   OutputIt2 probe_val_begin,
@@ -1507,11 +1513,12 @@ template <uint32_t buffer_size,
           typename atomicT,
           typename OutputIt1,
           typename OutputIt2,
-          typename PairEqual>
+          typename PairEqual,
+          typename ParentCG>
 __device__ __forceinline__ void
 static_multimap<Key, Value, Scope, Allocator, ProbeSequence>::device_view::pair_retrieve_outer(
-  FlushingCG const& flushing_cg,
-  cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& probing_cg,
+  FlushingCG flushing_cg,
+  cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
   value_type const& pair,
   uint32_t* flushing_cg_counter,
   value_type* probe_output_buffer,
diff --git a/include/cuco/detail/static_multimap/static_multimap_ref.inl b/include/cuco/detail/static_multimap/static_multimap_ref.inl
index 8d1e6c126..ad856dd86 100644
--- a/include/cuco/detail/static_multimap/static_multimap_ref.inl
+++ b/include/cuco/detail/static_multimap/static_multimap_ref.inl
@@ -391,9 +391,7 @@ template <typename Key,
 template <typename CG, cuda::thread_scope NewScope>
 __device__ constexpr auto
 static_multimap_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::make_copy(
-  CG const& tile,
-  bucket_type* const memory_to_use,
-  cuda_thread_scope<NewScope> scope) const noexcept
+  CG tile, bucket_type* const memory_to_use, cuda_thread_scope<NewScope> scope) const noexcept
 {
   impl_.make_copy(tile, memory_to_use);
   return static_multimap_ref<Key, T, NewScope, KeyEqual, ProbingScheme, StorageRef, Operators...>{
@@ -416,7 +414,7 @@ template <typename Key,
 template <typename CG>
 __device__ constexpr void
 static_multimap_ref<Key, T, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::initialize(
-  CG const& tile) noexcept
+  CG tile) noexcept
 {
   impl_.initialize(tile);
 }
@@ -470,8 +468,8 @@ class operator_impl<
    *
    * @return True if the given element is successfully inserted
    */
-  template <typename Value>
-  __device__ bool insert(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <typename Value, typename ParentCG>
+  __device__ bool insert(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                          Value const& value) noexcept
   {
     auto& ref_ = static_cast<ref_type&>(*this);
@@ -536,9 +534,10 @@ class operator_impl<
    *
    * @return A boolean indicating whether the probe key is present
    */
-  template <typename ProbeKey>
+  template <typename ProbeKey, typename ParentCG>
   [[nodiscard]] __device__ bool contains(
-    cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
+    ProbeKey const& key) const noexcept
   {
     auto const& ref_ = static_cast<ref_type const&>(*this);
     return ref_.impl_.contains(group, key);
@@ -603,8 +602,8 @@ class operator_impl<
    * @param key The key to search for
    * @param callback_op Function to call on every element found
    */
-  template <class ProbeKey, class CallbackOp>
-  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <class ProbeKey, class CallbackOp, typename ParentCG>
+  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                            ProbeKey const& key,
                            CallbackOp&& callback_op) const noexcept
   {
@@ -641,8 +640,8 @@ class operator_impl<
    * @param callback_op Function to call on every element found
    * @param sync_op Function that is allowed to synchronize `group` inbetween probing buckets
    */
-  template <class ProbeKey, class CallbackOp, class SyncOp>
-  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <class ProbeKey, class CallbackOp, class SyncOp, typename ParentCG>
+  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                            ProbeKey const& key,
                            CallbackOp&& callback_op,
                            SyncOp&& sync_op) const noexcept
@@ -709,9 +708,10 @@ class operator_impl<
    *
    * @return An iterator to the position at which the equivalent key is stored
    */
-  template <typename ProbeKey>
-  [[nodiscard]] __device__ const_iterator find(
-    cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
+  template <typename ProbeKey, typename ParentCG>
+  [[nodiscard]] __device__ const_iterator
+  find(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
+       ProbeKey const& key) const noexcept
   {
     auto const& ref_ = static_cast<ref_type const&>(*this);
     return ref_.impl_.find(group, key);
@@ -765,8 +765,8 @@ class operator_impl<
    *
    * @return Number of occurrences found by the current thread
    */
-  template <typename ProbeKey>
-  __device__ size_type count(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <typename ProbeKey, typename ParentCG>
+  __device__ size_type count(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                              ProbeKey const& key) const noexcept
   {
     auto const& ref_ = static_cast<ref_type const&>(*this);
diff --git a/include/cuco/detail/static_multiset/static_multiset_ref.inl b/include/cuco/detail/static_multiset/static_multiset_ref.inl
index 054f7bb80..1a618059d 100644
--- a/include/cuco/detail/static_multiset/static_multiset_ref.inl
+++ b/include/cuco/detail/static_multiset/static_multiset_ref.inl
@@ -331,9 +331,7 @@ template <typename Key,
 template <typename CG, cuda::thread_scope NewScope>
 __device__ constexpr auto
 static_multiset_ref<Key, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::make_copy(
-  CG const& tile,
-  bucket_type* const memory_to_use,
-  cuda_thread_scope<NewScope> scope) const noexcept
+  CG tile, bucket_type* const memory_to_use, cuda_thread_scope<NewScope> scope) const noexcept
 {
   auto const storage_ref = this->storage_ref().make_copy(tile, memory_to_use);
   return static_multiset_ref<Key,
@@ -357,7 +355,7 @@ template <typename Key,
 template <typename CG>
 __device__ constexpr void
 static_multiset_ref<Key, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::initialize(
-  CG const& tile) noexcept
+  CG tile) noexcept
 {
   this->storage_ref().initialize(tile, this->empty_key_sentinel());
 }
@@ -409,8 +407,8 @@ class operator_impl<
    *
    * @return True if the given element is successfully inserted
    */
-  template <typename Value>
-  __device__ bool insert(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <typename Value, typename ParentCG>
+  __device__ bool insert(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                          Value const& value) noexcept
   {
     auto& ref_ = static_cast<ref_type&>(*this);
@@ -467,9 +465,10 @@ class operator_impl<
    *
    * @return A boolean indicating whether the probe key is present
    */
-  template <typename ProbeKey>
+  template <typename ProbeKey, typename ParentCG>
   [[nodiscard]] __device__ bool contains(
-    cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
+    ProbeKey const& key) const noexcept
   {
     auto const& ref_ = static_cast<ref_type const&>(*this);
     return ref_.impl_.contains(group, key);
@@ -530,9 +529,10 @@ class operator_impl<
    *
    * @return An iterator to the position at which the equivalent key is stored
    */
-  template <typename ProbeKey>
-  [[nodiscard]] __device__ const_iterator find(
-    cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
+  template <typename ProbeKey, typename ParentCG>
+  [[nodiscard]] __device__ const_iterator
+  find(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
+       ProbeKey const& key) const noexcept
   {
     auto const& ref_ = static_cast<ref_type const&>(*this);
     return ref_.impl_.find(group, key);
@@ -713,8 +713,8 @@ class operator_impl<
    * @param key The key to search for
    * @param callback_op Function to call on every element found
    */
-  template <class ProbeKey, class CallbackOp>
-  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <class ProbeKey, class CallbackOp, typename ParentCG>
+  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                            ProbeKey const& key,
                            CallbackOp&& callback_op) const noexcept
   {
@@ -751,8 +751,8 @@ class operator_impl<
    * @param callback_op Function to call on every element found
    * @param sync_op Function that is allowed to synchronize `group` inbetween probing buckets
    */
-  template <class ProbeKey, class CallbackOp, class SyncOp>
-  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <class ProbeKey, class CallbackOp, class SyncOp, typename ParentCG>
+  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                            ProbeKey const& key,
                            CallbackOp&& callback_op,
                            SyncOp&& sync_op) const noexcept
@@ -810,8 +810,8 @@ class operator_impl<
    *
    * @return Number of occurrences found by the current thread
    */
-  template <typename ProbeKey>
-  __device__ size_type count(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <typename ProbeKey, typename ParentCG>
+  __device__ size_type count(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                              ProbeKey const& key) const noexcept
   {
     auto const& ref_ = static_cast<ref_type const&>(*this);
diff --git a/include/cuco/detail/static_set/static_set_ref.inl b/include/cuco/detail/static_set/static_set_ref.inl
index 07a5c1cf6..35514fa38 100644
--- a/include/cuco/detail/static_set/static_set_ref.inl
+++ b/include/cuco/detail/static_set/static_set_ref.inl
@@ -331,7 +331,7 @@ template <typename Key,
 template <typename CG, cuda::thread_scope NewScope>
 __device__ constexpr auto
 static_set_ref<Key, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::make_copy(
-  CG const& tile,
+  CG tile,
   typename StorageRef::value_type* const memory_to_use,
   cuda_thread_scope<NewScope> scope) const noexcept
 {
@@ -354,7 +354,7 @@ template <typename Key,
 template <typename CG>
 __device__ constexpr void
 static_set_ref<Key, Scope, KeyEqual, ProbingScheme, StorageRef, Operators...>::initialize(
-  CG const& tile) noexcept
+  CG tile) noexcept
 {
   this->impl_.initialize(tile);
 }
@@ -404,8 +404,8 @@ class operator_impl<op::insert_tag,
    *
    * @return True if the given element is successfully inserted
    */
-  template <typename Value>
-  __device__ bool insert(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <typename Value, typename ParentCG>
+  __device__ bool insert(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                          Value const& value) noexcept
   {
     auto& ref_ = static_cast<ref_type&>(*this);
@@ -472,9 +472,9 @@ class operator_impl<op::insert_and_find_tag,
    * @return a pair consisting of an iterator to the element and a bool indicating whether the
    * insertion is successful or not.
    */
-  template <typename Value>
+  template <typename Value, typename ParentCG>
   __device__ cuda::std::pair<iterator, bool> insert_and_find(
-    cooperative_groups::thread_block_tile<cg_size> const& group, Value const& value) noexcept
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> group, Value const& value) noexcept
   {
     ref_type& ref_ = static_cast<ref_type&>(*this);
     return ref_.impl_.insert_and_find(group, value);
@@ -524,8 +524,8 @@ class operator_impl<op::erase_tag,
    *
    * @return True if the given element is successfully erased
    */
-  template <typename ProbeKey>
-  __device__ bool erase(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <typename ProbeKey, typename ParentCG>
+  __device__ bool erase(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                         ProbeKey const& key) noexcept
   {
     auto& ref_ = static_cast<ref_type&>(*this);
@@ -582,9 +582,10 @@ class operator_impl<op::contains_tag,
    *
    * @return A boolean indicating whether the probe key is present
    */
-  template <typename ProbeKey>
+  template <typename ProbeKey, typename ParentCG>
   [[nodiscard]] __device__ bool contains(
-    cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
+    ProbeKey const& key) const noexcept
   {
     auto const& ref_ = static_cast<ref_type const&>(*this);
     return ref_.impl_.contains(group, key);
@@ -643,9 +644,10 @@ class operator_impl<op::find_tag,
    *
    * @return An iterator to the position at which the equivalent key is stored
    */
-  template <typename ProbeKey>
-  [[nodiscard]] __device__ const_iterator find(
-    cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const noexcept
+  template <typename ProbeKey, typename ParentCG>
+  [[nodiscard]] __device__ const_iterator
+  find(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
+       ProbeKey const& key) const noexcept
   {
     auto const& ref_ = static_cast<ref_type const&>(*this);
     return ref_.impl_.find(group, key);
@@ -709,8 +711,8 @@ class operator_impl<op::for_each_tag,
    * @param key The key to search for
    * @param callback_op Function to apply to the copy of the matched slot
    */
-  template <class ProbeKey, class CallbackOp>
-  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <class ProbeKey, class CallbackOp, typename ParentCG>
+  __device__ void for_each(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                            ProbeKey const& key,
                            CallbackOp&& callback_op) const noexcept
   {
@@ -764,8 +766,8 @@ class operator_impl<op::count_tag,
    *
    * @return Number of occurrences found by the current thread
    */
-  template <typename ProbeKey>
-  __device__ size_type count(cooperative_groups::thread_block_tile<cg_size> const& group,
+  template <typename ProbeKey, typename ParentCG>
+  __device__ size_type count(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
                              ProbeKey const& key) const noexcept
   {
     auto const& ref_ = static_cast<ref_type const&>(*this);
diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh
index 9c5f52f14..3acfbe12c 100644
--- a/include/cuco/detail/utils.cuh
+++ b/include/cuco/detail/utils.cuh
@@ -131,7 +131,7 @@ __host__ __device__ constexpr SizeType sanitize_hash(HashType hash) noexcept
  * @return Converted hash value
  */
 template <typename SizeType, typename CG, typename HashType>
-__device__ constexpr SizeType sanitize_hash(CG const& group, HashType hash) noexcept
+__device__ constexpr SizeType sanitize_hash(CG group, HashType hash) noexcept
 {
   auto const base_hash = sanitize_hash<SizeType>(hash);
   auto const max_size  = cuda::std::numeric_limits<SizeType>::max();
diff --git a/include/cuco/hyperloglog_ref.cuh b/include/cuco/hyperloglog_ref.cuh
index 8946fa8c1..83aad101a 100644
--- a/include/cuco/hyperloglog_ref.cuh
+++ b/include/cuco/hyperloglog_ref.cuh
@@ -75,7 +75,7 @@ class hyperloglog_ref {
    * @param group CUDA Cooperative group this operation is executed in
    */
   template <class CG>
-  __device__ constexpr void clear(CG const& group) noexcept;
+  __device__ constexpr void clear(CG group) noexcept;
 
   /**
    * @brief Asynchronously resets the estimator, i.e., clears the current count estimate.
@@ -144,8 +144,7 @@ class hyperloglog_ref {
    * @param other Other estimator reference to be merged into `*this`
    */
   template <class CG, cuda::thread_scope OtherScope>
-  __device__ constexpr void merge(CG const& group,
-                                  hyperloglog_ref<T, OtherScope, Hash> const& other);
+  __device__ constexpr void merge(CG group, hyperloglog_ref<T, OtherScope, Hash> const& other);
 
   /**
    * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp
index f2bc89fd0..875635c01 100644
--- a/include/cuco/operator.hpp
+++ b/include/cuco/operator.hpp
@@ -56,8 +56,8 @@ inline namespace op {
  * template <typename Value>
  * __device__ bool insert(Value const& value) noexcept
  *
- * template <typename Value>
- * __device__ bool insert(cooperative_groups::thread_block_tile<cg_size> const& group,
+ * template <typename Value, typename ParentCG>
+ * __device__ bool insert(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
  *                        Value const& value) noexcept
  * ```
  *
@@ -85,9 +85,9 @@ struct insert_tag {
  * template <typename Value>
  * __device__ cuda::std::pair<iterator, bool> insert_and_find(Value const& value) noexcept
  *
- * template <typename Value>
+ * template <typename Value, typename ParentCG>
  * __device__ cuda::std::pair<iterator, bool> insert_and_find(
- *   cooperative_groups::thread_block_tile<cg_size> const& group, Value const& value) noexcept
+ *   cooperative_groups::thread_block_tile<cg_size, ParentCG> group, Value const& value) noexcept
  * ```
  *
  * Where:
@@ -114,8 +114,8 @@ struct insert_and_find_tag {
  * template <typename Value>
  * __device__ void insert_or_assign(Value const& value) noexcept
  *
- * template <typename Value>
- * __device__ void insert_or_assign(cooperative_groups::thread_block_tile<cg_size> const& group,
+ * template <typename Value, typename ParentCG>
+ * __device__ void insert_or_assign(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
  *                                  Value const& value) noexcept
  * ```
  *
@@ -144,13 +144,13 @@ struct insert_or_assign_tag {
  *           typename Op>
  * __device__ bool insert_or_apply(Value const& value, Init init, Op op)
  *
- * template <typename Value, typename Op>
- * __device__ bool insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+ * template <typename Value, typename Op, typename ParentCG>
+ * __device__ bool insert_or_apply(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
  *                                 Value const& value,
  *                                 Op op)
  *
- * template <typename Value, typename Init, typename Op>
- * __device__ bool insert_or_apply(cooperative_groups::thread_block_tile<cg_size> const& group,
+ * template <typename Value, typename Init, typename Op, typename ParentCG>
+ * __device__ bool insert_or_apply(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
  *                                 Value const& value,
  *                                 Init init,
  *                                 Op op)
@@ -182,13 +182,13 @@ struct insert_or_apply_tag {
  * template <typename ProbeKey>
  * __device__ bool erase(ProbeKey const& key) noexcept
  *
- * template <typename ProbeKey>
- * __device__ bool erase(cooperative_groups::thread_block_tile<cg_size> const& group,
+ * template <typename ProbeKey, typename ParentCG>
+ * __device__ bool erase(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
  *                       ProbeKey const& key) noexcept
  * ```
  *
  * Where:
- * @see @tparam ProbeKey Input key type which is convertible to the containser's 'key_type'
+ * @see @tparam ProbeKey Input key type which is convertible to the container's 'key_type'
  *
  * @see @param group The Cooperative Group used to perform this operation
  * @see @param key The key to search for
@@ -207,9 +207,9 @@ struct erase_tag {
  * template <typename ProbeKey>
  * __device__ bool contains(ProbeKey const& key) const noexcept
  *
- * template <typename ProbeKey>
+ * template <typename ProbeKey, typename ParentCG>
  * __device__ bool contains(
- *   cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const
+ *   cooperative_groups::thread_block_tile<cg_size, ParentCG> group, ProbeKey const& key) const
  * noexcept
  * ```
  *
@@ -233,8 +233,8 @@ struct contains_tag {
  * template <typename ProbeKey>
  * __device__ size_type count(ProbeKey const& key) const noexcept
  *
- * template <typename ProbeKey>
- * __device__ size_type count(cooperative_groups::thread_block_tile<cg_size> const& group,
+ * template <typename ProbeKey, typename ParentCG>
+ * __device__ size_type count(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
  *                            ProbeKey const& key) const noexcept
  * ```
  *
@@ -258,9 +258,9 @@ struct count_tag {
  * template <typename ProbeKey>
  * __device__ const_iterator find(ProbeKey const& key) const noexcept
  *
- * template <typename ProbeKey>
+ * template <typename ProbeKey, typename ParentCG>
  * __device__ const_iterator find(
- *   cooperative_groups::thread_block_tile<cg_size> const& group, ProbeKey const& key) const
+ *   cooperative_groups::thread_block_tile<cg_size, ParentCG> group, ProbeKey const& key) const
  * noexcept
  * ```
  *
@@ -295,7 +295,7 @@ struct find_tag {
  *           class OutputProbeIt,
  *           class OutputMatchIt,
  *           class AtomicCounter>
- * __device__ void retrieve(cooperative_groups::thread_block const& block,
+ * __device__ void retrieve(cooperative_groups::thread_block const& block,
  *                          InputProbeIt input_probe_begin,
  *                          InputProbeIt input_probe_end,
  *                          OutputProbeIt output_probe,
@@ -349,13 +349,13 @@ struct retrieve_tag {
  * template <class ProbeKey, class CallbackOp>
  * __device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept
  *
- * template <class ProbeKey, class CallbackOp>
- * __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
+ * template <class ProbeKey, class CallbackOp, typename ParentCG>
+ * __device__ void for_each(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
  *                          ProbeKey const& key,
  *                          CallbackOp&& callback_op) const noexcept
  *
- * template <class ProbeKey, class CallbackOp, class SyncOp>
- * __device__ void for_each(cooperative_groups::thread_block_tile<cg_size> const& group,
+ * template <class ProbeKey, class CallbackOp, class SyncOp, typename ParentCG>
+ * __device__ void for_each(cooperative_groups::thread_block_tile<cg_size, ParentCG> group,
  *                          ProbeKey const& key,
  *                          CallbackOp&& callback_op,
  *                          SyncOp&& sync_op) const noexcept
diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh
index 94022d249..6506c99f5 100644
--- a/include/cuco/probing_scheme.cuh
+++ b/include/cuco/probing_scheme.cuh
@@ -92,9 +92,9 @@ class linear_probing : private detail::probing_scheme_base<CGSize> {
    * @param upper_bound Upper bound of the iteration
    * @return An iterator whose value_type is convertible to slot index type
    */
-  template <int32_t BucketSize, typename ProbeKey, typename Extent>
+  template <int32_t BucketSize, typename ProbeKey, typename Extent, typename ParentCG>
   __host__ __device__ constexpr auto make_iterator(
-    cooperative_groups::thread_block_tile<cg_size> const& g,
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> g,
     ProbeKey const& probe_key,
     Extent upper_bound) const noexcept;
 
@@ -189,9 +189,9 @@ class double_hashing : private detail::probing_scheme_base<CGSize> {
    * @param upper_bound Upper bound of the iteration
    * @return An iterator whose value_type is convertible to slot index type
    */
-  template <int32_t BucketSize, typename ProbeKey, typename Extent>
+  template <int32_t BucketSize, typename ProbeKey, typename Extent, typename ParentCG>
   __host__ __device__ constexpr auto make_iterator(
-    cooperative_groups::thread_block_tile<cg_size> const& g,
+    cooperative_groups::thread_block_tile<cg_size, ParentCG> g,
     ProbeKey const& probe_key,
     Extent upper_bound) const noexcept;
 
diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
index 1d7bdeafe..870eb6a4a 100644
--- a/include/cuco/static_map.cuh
+++ b/include/cuco/static_map.cuh
@@ -1646,7 +1646,7 @@ class static_map {
      * @return Pointer to the initial slot for `k`
      */
     template <typename CG, typename ProbeKey, typename Hash>
-    __device__ iterator initial_slot(CG const& g, ProbeKey const& k, Hash hash) noexcept
+    __device__ iterator initial_slot(CG g, ProbeKey const& k, Hash hash) noexcept
     {
       return &slots_[(hash(k) + g.thread_rank()) % capacity_];
     }
@@ -1666,7 +1666,7 @@ class static_map {
      * @return Pointer to the initial slot for `k`
      */
     template <typename CG, typename ProbeKey, typename Hash>
-    __device__ const_iterator initial_slot(CG const& g, ProbeKey const& k, Hash hash) const noexcept
+    __device__ const_iterator initial_slot(CG g, ProbeKey const& k, Hash hash) const noexcept
     {
       return &slots_[(hash(k) + g.thread_rank()) % capacity_];
     }
@@ -1706,7 +1706,7 @@ class static_map {
      * @return The next slot after `s`
      */
     template <typename CG>
-    __device__ iterator next_slot(CG const& g, iterator s) noexcept
+    __device__ iterator next_slot(CG g, iterator s) noexcept
     {
       uint32_t index = s - slots_;
       return &slots_[(index + g.size()) % capacity_];
@@ -1724,7 +1724,7 @@ class static_map {
      * @return The next slot after `s`
      */
     template <typename CG>
-    __device__ const_iterator next_slot(CG const& g, const_iterator s) const noexcept
+    __device__ const_iterator next_slot(CG g, const_iterator s) const noexcept
     {
       uint32_t index = s - slots_;
       return &slots_[(index + g.size()) % capacity_];
@@ -2015,7 +2015,7 @@ class static_map {
      */
     template <typename CG>
     __device__ static device_mutable_view make_from_uninitialized_slots(
-      CG const& g,
+      CG g,
       pair_atomic_type* slots,
       std::size_t capacity,
       empty_key<Key> empty_key_sentinel,
@@ -2046,7 +2046,7 @@ class static_map {
      */
     template <typename CG>
     __device__ static device_mutable_view make_from_uninitialized_slots(
-      CG const& g,
+      CG g,
       pair_atomic_type* slots,
       std::size_t capacity,
       empty_key<Key> empty_key_sentinel,
@@ -2128,7 +2128,7 @@ class static_map {
     template <typename CG,
               typename Hash     = cuco::default_hash_function<key_type>,
               typename KeyEqual = cuda::std::equal_to<key_type>>
-    __device__ bool insert(CG const& g,
+    __device__ bool insert(CG g,
                            value_type const& insert_pair,
                            Hash hash          = Hash{},
                            KeyEqual key_equal = KeyEqual{}) noexcept;
@@ -2172,7 +2172,7 @@ class static_map {
     template <typename CG,
               typename Hash     = cuco::default_hash_function<key_type>,
               typename KeyEqual = cuda::std::equal_to<key_type>>
-    __device__ bool erase(CG const& g,
+    __device__ bool erase(CG g,
                           key_type const& k,
                           Hash hash          = Hash{},
                           KeyEqual key_equal = KeyEqual{}) noexcept;
@@ -2477,10 +2477,7 @@ class static_map {
               typename Hash     = cuco::default_hash_function<key_type>,
               typename KeyEqual = cuda::std::equal_to<key_type>>
     __device__ cuda::std::enable_if_t<std::is_invocable_v<KeyEqual, ProbeKey, Key>, bool> contains(
-      CG const& g,
-      ProbeKey const& k,
-      Hash hash          = Hash{},
-      KeyEqual key_equal = KeyEqual{}) const noexcept;
+      CG g, ProbeKey const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) const noexcept;
   };  // class device_view
 
   /**
diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh
index 91da7cf4f..51c2769ca 100644
--- a/include/cuco/static_map_ref.cuh
+++ b/include/cuco/static_map_ref.cuh
@@ -292,7 +292,7 @@ class static_map_ref
    */
   template <typename CG, cuda::thread_scope NewScope = thread_scope>
   [[nodiscard]] __device__ constexpr auto make_copy(
-    CG const& tile,
+    CG tile,
     typename StorageRef::value_type* const memory_to_use,
     cuda_thread_scope<NewScope> scope = {}) const noexcept;
 
@@ -306,7 +306,7 @@ class static_map_ref
    * @param tile The cooperative thread group used to initialize the map
    */
   template <typename CG>
-  __device__ constexpr void initialize(CG const& tile) noexcept;
+  __device__ constexpr void initialize(CG tile) noexcept;
 
  private:
   impl_type impl_;  ///< Static map ref implementation
diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh
index 1e86ee03c..9b8eaf8cb 100644
--- a/include/cuco/static_multimap.cuh
+++ b/include/cuco/static_multimap.cuh
@@ -1587,8 +1587,9 @@ class static_multimap {
      * @param g The Cooperative Group that performs the insert
      * @param insert_pair The pair to insert
      */
+    template <typename ParentCG>
     __device__ __forceinline__ void insert(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
       value_type const& insert_pair) noexcept;
 
    private:
@@ -1670,7 +1671,7 @@ class static_multimap {
      * @param output_begin Beginning of the output sequence of key/value pairs
      */
     template <typename CG, typename atomicT, typename OutputIt>
-    __device__ __forceinline__ void flush_output_buffer(CG const& g,
+    __device__ __forceinline__ void flush_output_buffer(CG g,
                                                         uint32_t const num_outputs,
                                                         value_type* output_buffer,
                                                         atomicT* num_matches,
@@ -1700,7 +1701,7 @@ class static_multimap {
      * pairs
      */
     template <typename CG, typename atomicT, typename OutputIt1, typename OutputIt2>
-    __device__ __forceinline__ void flush_output_buffer(CG const& g,
+    __device__ __forceinline__ void flush_output_buffer(CG g,
                                                         uint32_t const num_outputs,
                                                         value_type* probe_output_buffer,
                                                         value_type* contained_output_buffer,
@@ -1733,9 +1734,11 @@ class static_multimap {
      * @return A boolean indicating whether the key/value pair
      * containing `k` was inserted
      */
-    template <typename ProbeKey, typename KeyEqual = cuda::std::equal_to<key_type>>
+    template <typename ProbeKey,
+              typename KeyEqual = cuda::std::equal_to<key_type>,
+              typename ParentCG = void>
     __device__ __forceinline__ bool contains(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
       ProbeKey const& k,
       KeyEqual key_equal = KeyEqual{}) const noexcept;
 
@@ -1763,9 +1766,9 @@ class static_multimap {
      * for equality
      * @return A boolean indicating whether the input pair was inserted in the map
      */
-    template <typename ProbePair, typename PairEqual>
+    template <typename ProbePair, typename PairEqual, typename ParentCG>
     __device__ __forceinline__ bool pair_contains(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
       ProbePair const& p,
       PairEqual pair_equal) const noexcept;
 
@@ -1782,9 +1785,9 @@ class static_multimap {
      * for equality
      * @return Number of matches found by the current thread
      */
-    template <typename KeyEqual = cuda::std::equal_to<key_type>>
+    template <typename KeyEqual = cuda::std::equal_to<key_type>, typename ParentCG = void>
     __device__ __forceinline__ std::size_t count(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
       Key const& k,
       KeyEqual key_equal = KeyEqual{}) noexcept;
 
@@ -1802,9 +1805,9 @@ class static_multimap {
      * for equality
      * @return Number of matches found by the current thread
      */
-    template <typename KeyEqual = cuda::std::equal_to<key_type>>
+    template <typename KeyEqual = cuda::std::equal_to<key_type>, typename ParentCG = void>
     __device__ __forceinline__ std::size_t count_outer(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
       Key const& k,
       KeyEqual key_equal = KeyEqual{}) noexcept;
 
@@ -1821,9 +1824,9 @@ class static_multimap {
      * for equality
      * @return Number of matches found by the current thread
      */
-    template <typename PairEqual>
+    template <typename PairEqual, typename ParentCG>
     __device__ __forceinline__ std::size_t pair_count(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
       value_type const& pair,
       PairEqual pair_equal) noexcept;
 
@@ -1841,9 +1844,9 @@ class static_multimap {
      * for equality
      * @return Number of matches found by the current thread
      */
-    template <typename PairEqual>
+    template <typename PairEqual, typename ParentCG>
     __device__ __forceinline__ std::size_t pair_count_outer(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& g,
+      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> g,
       value_type const& pair,
       PairEqual pair_equal) noexcept;
 
@@ -1874,10 +1877,11 @@ class static_multimap {
               typename FlushingCG,
               typename atomicT,
               typename OutputIt,
-              typename KeyEqual = cuda::std::equal_to<key_type>>
+              typename KeyEqual = cuda::std::equal_to<key_type>,
+              typename ParentCG = void>
     __device__ __forceinline__ void retrieve(
-      FlushingCG const& flushing_cg,
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& probing_cg,
+      FlushingCG flushing_cg,
+      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
       Key const& k,
       uint32_t* flushing_cg_counter,
       value_type* output_buffer,
@@ -1914,10 +1918,11 @@ class static_multimap {
               typename FlushingCG,
               typename atomicT,
               typename OutputIt,
-              typename KeyEqual = cuda::std::equal_to<key_type>>
+              typename KeyEqual = cuda::std::equal_to<key_type>,
+              typename ParentCG = void>
     __device__ __forceinline__ void retrieve_outer(
-      FlushingCG const& flushing_cg,
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& probing_cg,
+      FlushingCG flushing_cg,
+      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
       Key const& k,
       uint32_t* flushing_cg_counter,
       value_type* output_buffer,
@@ -1958,9 +1963,10 @@ class static_multimap {
               typename OutputIt2,
               typename OutputIt3,
               typename OutputIt4,
-              typename PairEqual>
+              typename PairEqual,
+              typename ParentCG>
     __device__ __forceinline__ void pair_retrieve(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& probing_cg,
+      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
       value_type const& pair,
       OutputIt1 probe_key_begin,
       OutputIt2 probe_val_begin,
@@ -2002,10 +2008,11 @@ class static_multimap {
               typename atomicT,
               typename OutputIt1,
               typename OutputIt2,
-              typename PairEqual>
+              typename PairEqual,
+              typename ParentCG>
     __device__ __forceinline__ void pair_retrieve(
-      FlushingCG const& flushing_cg,
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& probing_cg,
+      FlushingCG flushing_cg,
+      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
       value_type const& pair,
       uint32_t* warp_counter,
       value_type* probe_output_buffer,
@@ -2050,9 +2057,10 @@ class static_multimap {
               typename OutputIt2,
               typename OutputIt3,
               typename OutputIt4,
-              typename PairEqual>
+              typename PairEqual,
+              typename ParentCG>
     __device__ __forceinline__ void pair_retrieve_outer(
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& probing_cg,
+      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
       value_type const& pair,
       OutputIt1 probe_key_begin,
       OutputIt2 probe_val_begin,
@@ -2094,10 +2102,11 @@ class static_multimap {
               typename atomicT,
               typename OutputIt1,
               typename OutputIt2,
-              typename PairEqual>
+              typename PairEqual,
+              typename ParentCG>
     __device__ __forceinline__ void pair_retrieve_outer(
-      FlushingCG const& flushing_cg,
-      cooperative_groups::thread_block_tile<ProbeSequence::cg_size> const& probing_cg,
+      FlushingCG flushing_cg,
+      cooperative_groups::thread_block_tile<ProbeSequence::cg_size, ParentCG> probing_cg,
       value_type const& pair,
       uint32_t* flushing_cg_counter,
       value_type* probe_output_buffer,
diff --git a/include/cuco/static_multimap_ref.cuh b/include/cuco/static_multimap_ref.cuh
index 2a0c69fd7..cbe27802f 100644
--- a/include/cuco/static_multimap_ref.cuh
+++ b/include/cuco/static_multimap_ref.cuh
@@ -291,7 +291,7 @@ class static_multimap_ref
    */
   template <typename CG, cuda::thread_scope NewScope = thread_scope>
   [[nodiscard]] __device__ constexpr auto make_copy(
-    CG const& tile,
+    CG tile,
     bucket_type* const memory_to_use,
     cuda_thread_scope<NewScope> scope = {}) const noexcept;
 
@@ -305,7 +305,7 @@ class static_multimap_ref
    * @param tile The cooperative thread group used to initialize the map
    */
   template <typename CG>
-  __device__ constexpr void initialize(CG const& tile) noexcept;
+  __device__ constexpr void initialize(CG tile) noexcept;
 
  private:
   impl_type impl_;  ///< Static map ref implementation
diff --git a/include/cuco/static_multiset_ref.cuh b/include/cuco/static_multiset_ref.cuh
index 313295d80..e23024006 100644
--- a/include/cuco/static_multiset_ref.cuh
+++ b/include/cuco/static_multiset_ref.cuh
@@ -271,7 +271,7 @@ class static_multiset_ref
    */
   template <typename CG, cuda::thread_scope NewScope = thread_scope>
   [[nodiscard]] __device__ constexpr auto make_copy(
-    CG const& tile,
+    CG tile,
     bucket_type* const memory_to_use,
     cuda_thread_scope<NewScope> scope = {}) const noexcept;
 
@@ -285,7 +285,7 @@ class static_multiset_ref
    * @param tile The cooperative thread group used to initialize the set
    */
   template <typename CG>
-  __device__ constexpr void initialize(CG const& tile) noexcept;
+  __device__ constexpr void initialize(CG tile) noexcept;
 
  private:
   impl_type impl_;
diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh
index 40a037443..327c95d75 100644
--- a/include/cuco/static_set_ref.cuh
+++ b/include/cuco/static_set_ref.cuh
@@ -269,7 +269,7 @@ class static_set_ref
    */
   template <typename CG, cuda::thread_scope NewScope = thread_scope>
   [[nodiscard]] __device__ constexpr auto make_copy(
-    CG const& tile,
+    CG tile,
     typename StorageRef::value_type* const memory_to_use,
     cuda_thread_scope<NewScope> scope = {}) const noexcept;
 
@@ -283,7 +283,7 @@ class static_set_ref
    * @param tile The cooperative thread group used to initialize the set
    */
   template <typename CG>
-  __device__ constexpr void initialize(CG const& tile) noexcept;
+  __device__ constexpr void initialize(CG tile) noexcept;
 
  private:
   impl_type impl_;
diff --git a/tests/static_multimap/for_each_test.cu b/tests/static_multimap/for_each_test.cu
index f7290707d..826f915d6 100644
--- a/tests/static_multimap/for_each_test.cu
+++ b/tests/static_multimap/for_each_test.cu
@@ -67,7 +67,8 @@ CUCO_KERNEL void for_each_check_cooperative(Ref ref,
 
   while (idx < n) {
     auto const tile =
-      cooperative_groups::tiled_partition<Ref::cg_size>(cooperative_groups::this_thread_block());
+      cooperative_groups::tiled_partition<Ref::cg_size, cooperative_groups::thread_block>(
+        cooperative_groups::this_thread_block());
     auto const& key            = *(first + idx);
     std::size_t thread_matches = 0;
     if constexpr (Synced) {
@@ -80,7 +81,7 @@ CUCO_KERNEL void for_each_check_cooperative(Ref ref,
             thread_matches++;
           }
         },
-        [] __device__(auto const& group) { group.sync(); });
+        [] __device__(auto group) { group.sync(); });
     } else {
       ref.for_each(tile, key, [&] __device__(auto const slot) {
         auto const [slot_key, slot_value] = slot;
diff --git a/tests/static_multiset/for_each_test.cu b/tests/static_multiset/for_each_test.cu
index b987ba660..6d663f439 100644
--- a/tests/static_multiset/for_each_test.cu
+++ b/tests/static_multiset/for_each_test.cu
@@ -66,7 +66,8 @@ CUCO_KERNEL void for_each_check_cooperative(Ref ref,
 
   while (idx < n) {
     auto const tile =
-      cooperative_groups::tiled_partition<Ref::cg_size>(cooperative_groups::this_thread_block());
+      cooperative_groups::tiled_partition<Ref::cg_size, cooperative_groups::thread_block>(
+        cooperative_groups::this_thread_block());
     auto const& key            = *(first + idx);
     std::size_t thread_matches = 0;
     if constexpr (Synced) {
@@ -76,7 +77,7 @@ CUCO_KERNEL void for_each_check_cooperative(Ref ref,
         [&] __device__(auto const slot) {
           if (ref.key_eq()(key, slot)) { thread_matches++; }
         },
-        [] __device__(auto const& group) { group.sync(); });
+        [] __device__(auto group) { group.sync(); });
     } else {
       ref.for_each(tile, key, [&] __device__(auto const slot) {
         if (ref.key_eq()(key, slot)) { thread_matches++; }
diff --git a/tests/utility/probing_scheme_test.cu b/tests/utility/probing_scheme_test.cu
index 0d232df08..39048946b 100644
--- a/tests/utility/probing_scheme_test.cu
+++ b/tests/utility/probing_scheme_test.cu
@@ -66,7 +66,8 @@ __global__ void generate_cg_probing_sequence(Key key,
 
   if (tid < cg_size) {
     auto const tile =
-      cooperative_groups::tiled_partition<cg_size>(cooperative_groups::this_thread_block());
+      cooperative_groups::tiled_partition<cg_size, cooperative_groups::thread_block>(
+        cooperative_groups::this_thread_block());
 
     auto iter = probing_scheme.template make_iterator<BucketSize>(tile, key, upper_bound);
 

From a1bc5447f5df0a0f1a9f81a4185658f29acb4a1a Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 15 Aug 2025 17:21:27 -0700
Subject: [PATCH 18/24] Use CMAKE_ARGS in build.sh and some more minor
 improvements

---
 ci/build.sh                  | 45 +++++++++++++++++++++++-------------
 cmake/roaring_testdata.cmake |  2 ++
 2 files changed, 31 insertions(+), 16 deletions(-)

diff --git a/ci/build.sh b/ci/build.sh
index 7ac9029e3..035a5a4c1 100755
--- a/ci/build.sh
+++ b/ci/build.sh
@@ -51,7 +51,12 @@ HOST_COMPILER=${CXX:-g++} # $CXX if set, otherwise `g++`
 CUDA_ARCHS=native # detect system's GPU architectures
 CXX_STANDARD=17
 
-EXTRA_CMAKE_OPTIONS=()
+# Initialize CMAKE_ARGS from environment variable if available
+if [ -n "${CMAKE_ARGS:-}" ]; then
+    read -ra CMAKE_ARGS <<< "$CMAKE_ARGS"
+else
+    CMAKE_ARGS=()
+fi
 
 function usage {
     echo "cuCollections build script"
@@ -105,8 +110,15 @@ function usage {
     echo "    Enables verbose mode for detailed output and builds with C++17 standard."
     echo "    Build files will be written to <repo_root>/build/local and symlinked to <repo_root>/build/latest."
     echo
-    echo "Pass-through:"
-    echo "  -- [CMake args...]  Anything after -- is forwarded to CMake"
+    echo "  Using CMAKE_ARGS Environment Variable:"
+    echo "    $ CMAKE_ARGS=\"-DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_FEATURE=ON\" $0 -t"
+    echo "    $ export CMAKE_ARGS=\"-DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_FEATURE=ON\""
+    echo "    $ $0 -t"
+    echo "    Uses CMAKE_ARGS environment variable to pass additional CMake options."
+    echo "    Can be overridden by using -- followed by specific arguments."
+    echo
+    echo "  Pass-through to CMake:"
+    echo "    -- [CMake args...]  Anything after -- is forwarded to CMake (overrides CMAKE_ARGS env var)"
     echo
     exit 1
 }
@@ -131,7 +143,7 @@ while [ "${#args[@]}" -ne 0 ]; do
     --arch) CUDA_ARCHS="${args[1]}";    args=("${args[@]:2}");;
     --std)  CXX_STANDARD="${args[1]}";  args=("${args[@]:2}");;
     -v | -verbose | --verbose) VERBOSE=1; args=("${args[@]:1}");;
-    --) EXTRA_CMAKE_OPTIONS+=("${args[@]:1}"); break;;
+    --) CMAKE_ARGS=("${args[@]:1}"); break;;
     -h | -help | --help) usage ;;
     *) echo "Unrecognized option: ${args[0]}"; usage ;;
     esac
@@ -162,14 +174,12 @@ if [ "$BUILD_TESTS" == "OFF" ] && [ "$BUILD_EXAMPLES" == "OFF" ] && [ "$BUILD_BE
     BUILD_BENCHMARKS=ON
 fi
 
+BUILD_DIR="$BUILD_PREFIX/$BUILD_INFIX"
 # Trigger clean (re-)build
 if [ "$CLEAN_BUILD" -eq 1 ]; then
     rm -rf BUILD_DIR
 fi
-
-BUILD_DIR="$BUILD_PREFIX/$BUILD_INFIX"
 mkdir -p $BUILD_DIR
-export BUILD_DIR # TODO remove
 
 # The most recent build will be symlinked to cuCollections/build/latest
 rm -f $BUILD_PREFIX/latest
@@ -194,10 +204,10 @@ CMAKE_OPTIONS="
     -DBUILD_BENCHMARKS=${BUILD_BENCHMARKS} \
 "
 
-echo "========================================"
-echo "-- START: $(date)"
+echo "[INFO]=============================================="
+echo "-- TIMESTAMP: $(date -u +"%Y-%m-%d %H:%M:%S UTC")"
 echo "-- GIT_SHA: $(git rev-parse HEAD 2>/dev/null || echo 'Not a repository')"
-echo "-- PWD: $(pwd)"
+echo "-- SRC_DIR: $(dirname $(pwd))"
 echo "-- BUILD_DIR: ${BUILD_DIR}"
 echo "-- BUILD_TYPE: ${BUILD_TYPE}"
 echo "-- PARALLEL_LEVEL: ${PARALLEL_LEVEL}"
@@ -206,23 +216,26 @@ echo "-- BUILD_TESTS: ${BUILD_TESTS}"
 echo "-- BUILD_EXAMPLES: ${BUILD_EXAMPLES}"
 echo "-- BUILD_BENCHMARKS: ${BUILD_BENCHMARKS}"
 
-if [ ${#EXTRA_CMAKE_OPTIONS[@]} -gt 0 ]; then
-    echo "-- EXTRA_CMAKE_OPTIONS: ${EXTRA_CMAKE_OPTIONS[*]}"
+if [ ${#CMAKE_ARGS[@]} -gt 0 ]; then
+    echo "-- CMAKE_ARGS: ${CMAKE_ARGS[*]}"
 else
-    echo "-- EXTRA_CMAKE_OPTIONS: (none)"
+    echo "-- CMAKE_ARGS: (none)"
 fi
 
+
 # configure
-cmake -S .. -B $BUILD_DIR $CMAKE_OPTIONS "${EXTRA_CMAKE_OPTIONS[@]}"
-echo "========================================"
+echo "[CONFIGURE]========================================"
+cmake -S .. -B $BUILD_DIR $CMAKE_OPTIONS "${CMAKE_ARGS[@]}"
 
 if command -v sccache >/dev/null; then
     source "./sccache_stats.sh" start
+else
+    echo "sccache stats: N/A"
 fi
 
 #build
+echo "[BUILD]============================================"
 cmake --build $BUILD_DIR --parallel $PARALLEL_LEVEL
-echo "========================================"
 echo "Build complete"
 
 if command -v sccache >/dev/null; then
diff --git a/cmake/roaring_testdata.cmake b/cmake/roaring_testdata.cmake
index 8dded834c..168519866 100644
--- a/cmake/roaring_testdata.cmake
+++ b/cmake/roaring_testdata.cmake
@@ -35,5 +35,7 @@ rapids_cmake_download_with_retry("${ROARING_FORMATSPEC_BASE}/testdata64/portable
                                  "${CUCO_ROARING_DATA_DIR}/portable_bitmap64.bin"
                                  "b5a553a759167f5f9ccb3fa21552d943b4c73235635b753376f4faf62067d178")
 
+message(STATUS "Roaring Bitmap test data downloaded to: ${CUCO_ROARING_DATA_DIR}")
+
 # Define macro only when data is available
 add_compile_definitions(CUCO_ROARING_DATA_DIR="${CUCO_ROARING_DATA_DIR}")
\ No newline at end of file

From 830ca6518e8a9c4d1f684cd0b2c55967b817fedb Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Fri, 15 Aug 2025 18:18:51 -0700
Subject: [PATCH 19/24] Use CMAKE_ARGS in build script and other minor
 improvements

---
 ci/build.sh | 48 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/ci/build.sh b/ci/build.sh
index 3d244f334..1d3074fb9 100755
--- a/ci/build.sh
+++ b/ci/build.sh
@@ -37,6 +37,9 @@ resolve_path() {
 # Ensure the script is being executed in its containing directory
 cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )";
 
+# Determine repo root as the parent of the `ci` directory
+REPO_ROOT="$(cd .. && pwd)"
+
 # Script defaults
 BUILD_TESTS=${BUILD_TESTS:-OFF}
 BUILD_EXAMPLES=${BUILD_EXAMPLES:-OFF}
@@ -51,6 +54,13 @@ HOST_COMPILER=${CXX:-g++} # $CXX if set, otherwise `g++`
 CUDA_ARCHS=native # detect system's GPU architectures
 CXX_STANDARD=17
 
+# Initialize CMAKE_ARGS from environment variable if available
+if [ -n "${CMAKE_ARGS:-}" ]; then
+    read -ra CMAKE_ARGS <<< "$CMAKE_ARGS"
+else
+    CMAKE_ARGS=()
+fi
+
 function usage {
     echo "cuCollections build script"
     echo "Usage: $0 [OPTIONS]"
@@ -103,6 +113,16 @@ function usage {
     echo "    Enables verbose mode for detailed output and builds with C++17 standard."
     echo "    Build files will be written to <repo_root>/build/local and symlinked to <repo_root>/build/latest."
     echo
+    echo "  Using CMAKE_ARGS Environment Variable:"
+    echo "    $ CMAKE_ARGS=\"-DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_FEATURE=ON\" $0 -t"
+    echo "    $ export CMAKE_ARGS=\"-DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_FEATURE=ON\""
+    echo "    $ $0 -t"
+    echo "    Uses CMAKE_ARGS environment variable to pass additional CMake options."
+    echo "    Can be overridden by using -- followed by specific arguments."
+    echo
+    echo "  Pass-through to CMake:"
+    echo "    -- [CMake args...]  Anything after -- is forwarded to CMake (overrides CMAKE_ARGS env var)"
+    echo
     exit 1
 }
 
@@ -126,6 +146,7 @@ while [ "${#args[@]}" -ne 0 ]; do
     --arch) CUDA_ARCHS="${args[1]}";    args=("${args[@]:2}");;
     --std)  CXX_STANDARD="${args[1]}";  args=("${args[@]:2}");;
     -v | -verbose | --verbose) VERBOSE=1; args=("${args[@]:1}");;
+    --) CMAKE_ARGS=("${args[@]:1}"); break;;
     -h | -help | --help) usage ;;
     *) echo "Unrecognized option: ${args[0]}"; usage ;;
     esac
@@ -156,14 +177,12 @@ if [ "$BUILD_TESTS" == "OFF" ] && [ "$BUILD_EXAMPLES" == "OFF" ] && [ "$BUILD_BE
     BUILD_BENCHMARKS=ON
 fi
 
+BUILD_DIR="$BUILD_PREFIX/$BUILD_INFIX"
 # Trigger clean (re-)build
 if [ "$CLEAN_BUILD" -eq 1 ]; then
-    rm -rf BUILD_DIR
+    rm -rf "$BUILD_DIR"  # expand the variable; a bare BUILD_DIR is just a literal relative path
 fi
-
-BUILD_DIR="$BUILD_PREFIX/$BUILD_INFIX"
 mkdir -p $BUILD_DIR
-export BUILD_DIR # TODO remove
 
 # The most recent build will be symlinked to cuCollections/build/latest
 rm -f $BUILD_PREFIX/latest
@@ -186,12 +205,13 @@ CMAKE_OPTIONS="
     -DBUILD_TESTS=${BUILD_TESTS} \
     -DBUILD_EXAMPLES=${BUILD_EXAMPLES} \
     -DBUILD_BENCHMARKS=${BUILD_BENCHMARKS} \
+    ${CMAKE_ARGS[*]}
 "
 
-echo "========================================"
-echo "-- START: $(date)"
-echo "-- GIT_SHA: $(git rev-parse HEAD 2>/dev/null || echo 'Not a repository')"
-echo "-- PWD: $(pwd)"
+echo "[INFO]=============================================="
+echo "-- TIMESTAMP: $(date -u +"%Y-%m-%d %H:%M:%S UTC")"
+echo "-- GIT_SHA: $(git rev-parse HEAD 2>/dev/null || echo 'N/A')"
+echo "-- SRC_DIR: ${REPO_ROOT}"
 echo "-- BUILD_DIR: ${BUILD_DIR}"
 echo "-- BUILD_TYPE: ${BUILD_TYPE}"
 echo "-- PARALLEL_LEVEL: ${PARALLEL_LEVEL}"
@@ -200,21 +220,29 @@ echo "-- BUILD_TESTS: ${BUILD_TESTS}"
 echo "-- BUILD_EXAMPLES: ${BUILD_EXAMPLES}"
 echo "-- BUILD_BENCHMARKS: ${BUILD_BENCHMARKS}"
 
+if [ ${#CMAKE_ARGS[@]} -gt 0 ]; then
+    echo "-- CMAKE_ARGS: ${CMAKE_ARGS[*]}"
+else
+    echo "-- CMAKE_ARGS: (none)"
+fi
+
 # configure
+echo "[CONFIGURE]========================================"
 cmake -S .. -B $BUILD_DIR $CMAKE_OPTIONS
-echo "========================================"
 
 if command -v sccache >/dev/null; then
     source "./sccache_stats.sh" start
+else
+    echo "sccache stats: N/A"
 fi
 
 #build
+echo "[BUILD]============================================"
 cmake --build $BUILD_DIR --parallel $PARALLEL_LEVEL
-echo "========================================"
 echo "Build complete"
 
 if command -v sccache >/dev/null; then
     source "./sccache_stats.sh" end
 else
     echo "sccache stats: N/A"
-fi
+fi
\ No newline at end of file

From 01169d805a2889077fc5128604f05c26f6e44cb1 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 18 Aug 2025 15:47:54 -0700
Subject: [PATCH 20/24] Address comments from code review

---
 README.md                                     |  2 +-
 examples/roaring_bitmap/host_bulk_example.cu  |  8 ++--
 .../roaring_bitmap/roaring_bitmap_impl.cuh    | 39 +++++++++-------
 .../roaring_bitmap/roaring_bitmap_storage.cuh | 11 ++---
 include/cuco/detail/roaring_bitmap/util.cuh   | 44 +++++++++++++------
 include/cuco/roaring_bitmap.cuh               |  5 ++-
 tests/roaring_bitmap/contains_test.cu         |  5 +--
 7 files changed, 65 insertions(+), 49 deletions(-)

diff --git a/README.md b/README.md
index ea40b39b8..8608e3680 100644
--- a/README.md
+++ b/README.md
@@ -266,4 +266,4 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 `cuco::roaring_bitmap` implements a Roaring bitmap following the [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec).
 
 #### Examples:
-- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WA1v2zYT_iv3qsAgN7blpB_ZnI_NjdPNWF97sNMVQ1MIlETbhGVRI6k4XpD__h5JfSbK2nV75wCxRR6fu-M9dzzqzpFUSsYT6Qw_3jkscoaHXScmySojK-oMnTCLiNN1JM9EqJ-959cJPIcLnu4FW60VuGEHjgZHr2D662Q8GcHFbP7LbD66msymfS1qxN-xkCaSRpAlERWg1hRGKQnxK5_pwq9UaDvgqD8AVwtcO_nctdM5MSh7nsGW7CHhCjJJEYZJWLKYAr0NaaqAJRDybRozkoQUdkytjaocx5gDv-UgPFAE5QmuSPFpWZcEokrT9WetVDr0vN1u1yfG7D4XKy-2wtJ7N7m4nC4ue2h6uex9EuPGgqC_Z0yg48EeSIqWhSRAe2OyAy6ArATFOcW15TvBFEtWXZB8qXZEUIMTMakECzLV2LzCTvS_LoDbRxLcuNECJotrB96MFpNF1-B8mFz9NHt_BR9G8_loejW5XMBsjsGajic6VPj0FkbT3-DnyXTcBYpbh6robSq0F2gq09tKI7uHC0obZiy5NUumNGRLFkJBIFjxGyoSdAtSKrbMUg2NjAxOzLZMEWXGHjlnVHnXyXXyjCVhnEUUTsMs5J7gRCCiHzC1JWk_zNbnj2QyxWKm9p4ShCnZX6fp-UOkiHhSRV6I_yK6PH9ykiWqfVLtU-pbBQ0BtRaZVF5Eb9AR_4aGiov-uk0k5iskRNw-mSUM906SuA5Rl1ti4CnZNtYy3jKoCZKsGkMW0gB6zy1jfzCZtEYAP8jijU9vCcac4v7a6UAwuoQx3WK00GtFMWZSxzjPnWZYkIUaBUkYc77JUgMMo18msqoKk8SmcK4JdpgYnGA-7Di8OOohEFgwwxhkN4XXL2vD4KZcKJNQyMAtUR1YCr7V1hj8j3Nr0hsj_daILJCin1yd0BIzeoU8z4I-Fg2vIVs8VWs6mMoplwx3bV_yFytBuAFm_dfuFs6inyivMoHE1nMhFwJ3HMdkFiMfYUq2NN53tcu4kcoILXkc853OFR0JOTQqevDROqurGc-UyBLZD1hiJl27S52vccgLYh54rw6Pj0n0naeNiIgiXquyzmNT_j07HhlRBD1n2uuXlR2WHv-gHa9feq3qOiWJf8DTiMJVncd50c9DvyUbzJFUVznojS_eX8z88ezD9N1sNPbns9F8Mv3Rv7pcXI1HV6Oz2VSfBgGWUqrKRDFFUFHExqzD-oB1J0H-wM90f4W_MYsDzmPLRhftHg5txiPtMFW_ybPF16zyU6LWaPwd4gLJUNeKJlRns7-hewln8PGT24HeOdgqNBw2ythpoRIMAGjuGyX6pMBWAEujVo4WMOlLtNG_KZZ0oTabYVF9ceSr804BBOB5cIGlCz38PaOYZMae4mSp6HCHvLwvOFFlSgHxjxFwfjka__eyv42e6aGeHivUGBfyDWlzyph-UkhrF9wWMdjgbg9O8OsUDgf6o38fnJmH2r6AgeunmVz7AcEIbzol9n1DCQIb0ArtFFsz-_vgYPNnmC-QyZ_FPa7hfvtluG2YtjA-QbC2_XQNaEBXLHE7XauCJpHbKcDvgca6G_p6Mr5--RfI2FoRDBX_H0zEElRx8ZFqW_o-R0vj3udpqcUsLW9NeLM4ttHG5--K56-I-Od0jZq6Dgd_Q9cDkcHtUY5WybaKvGqIPGkxs0nLHhjKdOIeaYg_MXdw-22-AA6A_c3MMCH90sy4qziC7XboEympUK5ulbUaPE8imih_SVC4rPFd7OHy37BFg_SxVBYv3J_Civx-Vvfg7r7Qr7_0g_7GpJihKpMp-tnmZt7Gml
H34VHVzYW4HA6R6kTsrS5Mdfc_WqqPuc0RFF0uvDQrQioEnJ6iC28JiplLlpbDAT38UI8eM-twJ-Lc9twXsylm6L5040eq7MVTsj-MK8YUSelm5Q7qNiOcNbg4ltFTvcYs8M2vM7tY0TheuVb4KTQMdCffy4Ij1S3BdO4pSxIatVAm2Cs8tYNsuaTCLZV3qsjMKXbexicMK4ct9vliXxqDdkeuoDhFBd4JlR8SqU7DNRHPz93CFkF2fsqNjJl3rbq-rmIYHyRoQ3EOHcZcUrdmSV548_tE0esXnX3JHkve5q2j6k-a419s4kktxLYvqh8ALsHUldgZx5FOBjxs9DsEJBdLal1Kp-yt8paq0WK1eErAxktzVOK3vVznd4V6sJsFQTd955URthbozdWlwLC2puoN3k56-e2k5hFZ4dr8_mGt1_IPb9hWhXyy2nQrK4rpupfFLUnvnlZkNYsnN1CvMx0tLvArmbNyH_Q4X7qPlNbtyA3DSlDrA3SVY2p_d2_5l2d4Qw1OmDzXTc-WaFTTLT9jywivv6Z9L7p23bH748m8nt9l2w2aUn7EBNrduuqkdFNmYahfszQ_Z7ahb--ISvADLGjtl7aiLBfo33wV4F9Fs8dSE639wuYUFLH1Gi239TrXMMwrtVto_B4H0DmKw3rOEBxBHlbuWlyrlQNccoiTz_RpWOksz4j261tr2PL7GZLBkBILCMNqYN41Nq56107VB-Sfa-cL7n-dhwsfnUy5d9afJGJLZKzTdfSbTyyNonqT6yQ3YXh49Co7xGlrFk46PYQ7Cw8ODo-hR0S4PpNb_3gAvR72zQr_Kd0NRL2YbAPz7jdmQQ0zDMMYB2_s61oc0ITbOPfdYh7LcmMei5Zz_8n8_Q-9NYGl))
\ No newline at end of file
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WAtv27YW_ivnqsAgN7aVpI9szmNz43Qz1msPtrtiaAqBkmibsCxqJBXHC_Lf7yGpZ6KsXbc7B4gt8vA7D37n8FB3jqRSMp5IZ_DxzmGRMzjqOjFJVhlZUWfghFlEnK4jeSZC_ew9v07gOVzydC_Yaq3ADTtwfHj8Cia_jkfjIVxOZ79MZ8PFeDrpa1Ej_o6FNJE0giyJqAC1pjBMSYhf-UwXfqVC2wHH_UNwtcC1k89dO51Tg7LnGWzJHhKuIJMUYZiEJYsp0NuQpgpYAiHfpjEjSUhhx9TaqMpxjDnwWw7CA0VQnuCKFJ-WdUkgqjRdf9ZKpQPP2-12fWLM7nOx8mIrLL1348uryfyqh6aXy94nMQYWBP09YwIdD_ZAUrQsJAHaG5MdcAFkJSjOKa4t3wmmWLLqguRLtSOCGpyISSVYkKlG8Ao70f-6AIaPJBi44RzG82sH3gzn43nX4HwYL36avl_Ah-FsNpwsxldzmM5wsyajsd4qfHoLw8lv8PN4MuoCxdChKnqbCu0Fmsp0WGlkYzintGHGkluzZEpDtmQhFASCFb-hIkG3IKViyyzV0MjI4MRsyxRRZuyRc0aVd51cJ89YEsZZROEszELuCU4EIvoBU1uS9sNsffFIJlMsZmrvKUGYkv11ml48RIqIJ1XkhfgvosuLJydZoton1T6lvlXQEFBrkUnlRfQGHfFvaKi46K_bRGK-QkLE7ZNZwjB2ksR1iLqcZr7cS0W3jeVL5AMlzTHGWwY1b5JVY8hqMnq855bIP5gEWyOAH2Txxqe3BKlAMex2OhCMLmFEt7iJGAxFcSul3vo8pZq7heTUKMjNmPNNlhpgGP4yllWxGCc2s3NNsMN84QTTZMfhxXEPgcCCGSIh6Sm8flkbBjflQpk8Q2JuierAUvCttsbgf5xZk94Y6bdGZI7M_eTqPJeY6Cukfxb0sZZ4DdniqVrTwQxPuWQYtX1JaywQ4QaY9V-7WziLfqK8ygTyXc-FXAiMOI7JLEaawoRsabzvapcxkMoILXkc851OIbPhA6OiBx-ts7rI8UyJLJH9gCVm0rVR6nyNQ14Q88B7dXRyQqLvPG1ERBTxWpV1Hpvy79nxyIhi03OmvX5Z2WHp8Q_a8fql16quU5L4BzykKCzqPM7Pgnzrt2SDOZLq4ge90eX7y6k_mn6YvJsOR_5sOpyNJz_6i6v5YjRcDM-nE31IBFhhqSoTxdRGzP40xqzDsoHlKEH-wM90v8DfmMUB57Flo4t2DwY245F2mKrf5Nnia1b5KVFrNP4OcYFkqGtFE6qz2d_QvYRz-PjJ7UDvAmxxGgwa1e2sUAkGADT3jRJ9gGCHgBVTK0cLmPQl2ujfFEu6UJvNsNa-OPbVRacAAvA8uMTShR7-nlFMMmNPceBUdLhDXt4XnKgypYD4xwg4uxqO_nvV30bP9FBPjxVqjAt5QNqcMqafFtLaBbdFDDYY7cNT_DqDo0P90b8Pzs1DLS5g4PppJtd-QHCHN50S-76hBIENaIV2hh2b_X1wsPkzzBfI5M_intRwv_0y3DZMWxifIFhbPF0DGtAVS9xO16qgSeR2CvB7oLFukr6ejK9f_gUytlYEQ8X_BxOxBFVcfKTalr7P0dK493laajFLy1uzvVkc293G5--K56_Y8c_pGjZ1HR3-DV0PRA5vj3O0SrZV5FVD5EmLmU1a9sBQphP3WEP8ibmHt9_mC-AA2N_MDLOlX5oZdxVHsAsPfSIlFcrVHbRWg-dJRBPlLwkKlzW-iz1c_hu2aJA-lsrihfEprMivbXUP7u4L_fpLP-hvTIopqjKZop
9tbuZtrBl1Hx5V3VyIy8EAqU7E3urCVHf_o6X6mNscQdHlwkuzIqRCwNkZuvCWoJi5e2k5HNDDD_XoMbMOIxHntue-mKCYofvSjR-psvdRyf6g5VFq0PQIcsSAVY27_W0mH_nYqcIzxA4w1FXHdMtb7LWx9jTPwDJ6BUuq64Pp3VOWJDRqIU2wV3huB9lySYVbWlNTPqPYexuvcGN5rl3PmTjjHkWuoDhFBV4WlR8Sqc7CNRHPL9zCFkF2fsqNjJl3rbq-rmO4Q0jRhuIcOoy5pG7Nkrz05jeK0vG8ty8jYOnbvHdUHUpz_ItNPK1tsu2M6keASzB5JfbGcaTTAY8b_XIB6cWS2h51SkrkTVWjyWrxlIDdL81Sid_21p3fFuqb3SwJuu27qIyw1cBwTEda87am6g3eT3r5_aTmEVnh2vwGYq3X8g-v3laFfLLedCsrium6l8U9SUdPK7KaxZMB1OtMT4sL_ErmvIyDHudL95HSuh25YVgLap2ArnNM7e_uLf_yHG-owQmT6brt2RKNavrlZ2wZ4QXYNPBF3657dn80npXVrN54g6aUHzGBdreuOi3dlFkY6vcvzc-5benbe6IS_ABLWvu1rSjMBfo3XwX4V9HswdREa7-yOQVFbMVGy23FzjUM8lptqxeGicTpmpiR3IQHRbu2oYXE93AIAzjCyWf6IKyUlcdD-82tdb_yqxmywLARKwfDMmDePjZueddO1QLkn2vnC65-nYcLHx1KuXfWnyRiS6Sq03X0u1CsiaJ6t-skN2F4dPwqO8JpaxZOOj2EOw8PDo5OoEdEuD6XW__kEHo9bJkV_lO6EYh6MdkG5m1wzIIaZhiGMQ7e2Be4OKCZtnHuu8U81uPGPFYr5_6T-fsfn-2NIg==))
\ No newline at end of file
diff --git a/examples/roaring_bitmap/host_bulk_example.cu b/examples/roaring_bitmap/host_bulk_example.cu
index 4e371eaa1..46d5481cb 100644
--- a/examples/roaring_bitmap/host_bulk_example.cu
+++ b/examples/roaring_bitmap/host_bulk_example.cu
@@ -24,6 +24,7 @@
 #include <thrust/logical.h>
 #include <thrust/universal_vector.h>
 
+#include <filesystem>
 #include <fstream>
 #include <iostream>
 #include <string>
@@ -95,10 +96,9 @@ bool check(std::string const& bitmap_file_path)
   }
 
   // Get file size
-  file.seekg(0, std::ios::end);
-  std::streamsize file_size = file.tellg();
-  file.seekg(0, std::ios::beg);
+  auto file_size = std::filesystem::file_size(bitmap_file_path);
 
+  // Allocate host memory for the bitmap file
   thrust::universal_host_pinned_vector<cuda::std::byte> buffer(file_size);
 
   // Read file into memory
@@ -130,7 +130,7 @@ int main()
   success &= check<cuda::std::uint32_t>(data_dir + "/bitmapwithruns.bin");
   success &= check<cuda::std::uint64_t>(data_dir + "/portable_bitmap64.bin");
 
-  std::cout << "success: " << (success ? "true" : "false") << std::endl;
+  std::cout << "success: " << std::boolalpha << success << std::endl;
 
   return success ? 0 : 1;
 #else
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
index 82762dbe0..b69ebf0dd 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
@@ -27,8 +27,7 @@
 #include <cuda/std/functional>
 #include <cuda/std/iterator>
 #include <cuda/stream_ref>
-#include <thrust/execution_policy.h>
-#include <thrust/fill.h>
+#include <thrust/iterator/constant_iterator.h>
 
 namespace cuco::detail {
 
@@ -75,9 +74,12 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
                                cuda::stream_ref stream = {}) const noexcept
   {
     if (this->empty()) {
-      auto nosync_exec_policy = thrust::cuda::par_nosync.on(stream.get());
-      thrust::fill(
-        nosync_exec_policy, contained, contained + cuda::std::distance(first, last), false);
+      cub::DeviceTransform::Transform(
+        thrust::constant_iterator<bool>(false),
+        contained,
+        cuda::std::distance(first, last),
+        cuda::proclaim_return_type<bool>([] __device__(auto /* dummy */) { return false; }),
+        stream.get());
     } else {
       cub::DeviceTransform::Transform(
         first,
@@ -176,8 +178,7 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
                                                     index * sizeof(cuda::std::uint32_t));
     }
     cuda::std::byte const* container = storage_ref_.data() + offset;
-    if (storage_ref_.metadata().has_run and
-        (storage_ref_.run_container_bitmap()[index / 8] & (1 << (index % 8)))) {
+    if (storage_ref_.metadata().has_run and check_bit(storage_ref_.run_container_bitmap(), index)) {
       return this->contains_run_container<Aligned>(container, lower);
     } else {
       cuda::std::uint32_t card;
@@ -188,7 +189,7 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
         card = 1u + misaligned_load<cuda::std::uint16_t>(
                       storage_ref_.key_cards() + (index * 2 + 1) * sizeof(cuda::std::uint16_t));
       }
-      if (card <= 4096) {
+      if (card <= storage_ref_type::metadata_type::max_array_container_card) {
         return this->contains_array_container<Aligned>(container, lower, card);
       } else {
         return this->contains_bitset_container(container, lower, card);
@@ -313,17 +314,21 @@ class roaring_bitmap_impl<cuda::std::uint64_t> {
                                OutputIt contained,
                                cuda::stream_ref stream = {}) const noexcept
   {
-    auto nosync_exec_policy = thrust::cuda::par_nosync.on(stream.get());
     if (this->empty()) {
-      thrust::fill(
-        nosync_exec_policy, contained, contained + cuda::std::distance(first, last), false);
+      cub::DeviceTransform::Transform(
+        thrust::constant_iterator<bool>(false),
+        contained,
+        cuda::std::distance(first, last),
+        cuda::proclaim_return_type<bool>([] __device__(auto /* dummy */) { return false; }),
+        stream.get());
     } else {
-      thrust::transform(nosync_exec_policy,
-                        first,
-                        last,
-                        contained,
-                        cuda::proclaim_return_type<bool>(
-                          [*this] __device__(auto key) { return this->contains(key); }));
+      cub::DeviceTransform::Transform(
+        first,
+        contained,
+        cuda::std::distance(first, last),
+        cuda::proclaim_return_type<bool>(
+          [*this] __device__(auto key) { return this->contains(key); }),
+        stream.get());
     }
   }
 
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh
index 349f1bb83..c2736fe54 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh
@@ -45,8 +45,7 @@ class roaring_bitmap_storage_ref<cuda::std::uint32_t> {
                                                  metadata_type const& metadata)
     : metadata_{metadata},
       data_{bitmap},
-      run_container_bitmap_{
-        reinterpret_cast<cuda::std::uint8_t const*>(bitmap + metadata.run_container_bitmap)},
+      run_container_bitmap_{bitmap + metadata.run_container_bitmap},
       key_cards_{bitmap + metadata.key_cards},
       container_offsets_{bitmap + metadata.container_offsets}
   {
@@ -64,7 +63,7 @@ class roaring_bitmap_storage_ref<cuda::std::uint32_t> {
 
   __host__ __device__ cuda::std::size_t size_bytes() const noexcept { return metadata_.size_bytes; }
 
-  __host__ __device__ cuda::std::uint8_t const* run_container_bitmap() const noexcept
+  __host__ __device__ cuda::std::byte const* run_container_bitmap() const noexcept
   {
     return run_container_bitmap_;
   }
@@ -79,7 +78,7 @@ class roaring_bitmap_storage_ref<cuda::std::uint32_t> {
  private:
   metadata_type metadata_;
   cuda::std::byte const* data_;
-  cuda::std::uint8_t const* run_container_bitmap_;
+  cuda::std::byte const* run_container_bitmap_;
   cuda::std::byte const* key_cards_;
   cuda::std::byte const* container_offsets_;
 };
@@ -208,10 +207,6 @@ class roaring_bitmap_storage<cuda::std::uint64_t, Allocator> {
       metadata_.num_buckets * sizeof(cuda::std::pair<cuda::std::uint32_t, bucket_ref_type>),
       cudaMemcpyHostToDevice,
       stream.get()));
-    // stream.wait();
-    // clear intermediate data
-    // bucket_metadata.clear();
-    // buckets_h.clear();
   }
 
   ref_type ref() const noexcept { return ref_; }
diff --git a/include/cuco/detail/roaring_bitmap/util.cuh b/include/cuco/detail/roaring_bitmap/util.cuh
index 01892e73a..a3cc04ae7 100644
--- a/include/cuco/detail/roaring_bitmap/util.cuh
+++ b/include/cuco/detail/roaring_bitmap/util.cuh
@@ -42,6 +42,14 @@ __host__ __device__ __forceinline__ T misaligned_load(cuda::std::byte const* ptr
   return value;
 }
 
+// Tests bit `index` of the byte array `bitmap` (bits are numbered LSB-first within each byte)
+__host__ __device__ __forceinline__ bool check_bit(cuda::std::byte const* bitmap,
+                                                   cuda::std::uint32_t index)
+{
+  auto const packed = static_cast<cuda::std::uint8_t>(bitmap[index / 8]);
+  return ((packed >> (index % 8)) & 1u) != 0;
+}
+
 template <class T>
 struct roaring_bitmap_metadata {
   static_assert(cuco::dependent_false<T>, "T must be either uint32_t or uint64_t");
@@ -49,8 +57,10 @@ struct roaring_bitmap_metadata {
 
 template <>
 struct roaring_bitmap_metadata<cuda::std::uint32_t> {
+  static constexpr cuda::std::uint32_t max_array_container_card = 4096;
+
   cuda::std::size_t size_bytes             = 0;
-  cuda::std::uint32_t num_keys             = 0;
+  cuda::std::size_t num_keys               = 0;
   cuda::std::uint32_t run_container_bitmap = 0;
   cuda::std::uint32_t key_cards            = 0;
   cuda::std::uint32_t container_offsets    = 0;
@@ -63,14 +73,18 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
     constexpr cuda::std::uint32_t serial_cookie_no_runcontainer = 12346;
     constexpr cuda::std::uint32_t serial_cookie                 = 12347;
     // constexpr cuda::std::uint32_t frozen_cookie                 = 13766; // not implemented
-    constexpr cuda::std::int32_t no_offset_threshold = 4;
+    constexpr cuda::std::int32_t no_offset_threshold     = 4;
+    constexpr cuda::std::int32_t max_containers          = 1 << 16;
+    constexpr cuda::std::uint32_t cookie_mask            = 0xFFFF;
+    constexpr cuda::std::uint32_t cookie_shift           = 16;
+    constexpr cuda::std::uint32_t bitset_container_bytes = 8192;
 
     cuda::std::byte const* buf = bitmap;
 
     cuda::std::uint32_t cookie;
     cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t));
     buf += sizeof(cuda::std::uint32_t);
-    if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) {
+    if ((cookie & cookie_mask) != serial_cookie && cookie != serial_cookie_no_runcontainer) {
       valid = false;
       NV_IF_TARGET(
         NV_IS_HOST,
@@ -80,13 +94,15 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
       return;
     }
 
-    if ((cookie & 0xFFFF) == serial_cookie)
-      num_containers = (cookie >> 16) + 1;
+    if ((cookie & cookie_mask) == serial_cookie)
+      // upper 16 bits of cookie are the number of containers - 1
+      num_containers = (cookie >> cookie_shift) + 1;
     else {
+      // following 4 bytes are the number of containers
       cuda::std::memcpy(&num_containers, buf, sizeof(cuda::std::uint32_t));
       buf += sizeof(cuda::std::uint32_t);
     }
-    if (num_containers < 0 or num_containers > (1 << 16)) {
+    if (num_containers < 0 or num_containers > max_containers) {
       valid = false;
       NV_IF_TARGET(
         NV_IS_HOST,
@@ -95,14 +111,16 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
       return;
     }
 
-    has_run = (cookie & 0xFFFF) == serial_cookie;
+    has_run = (cookie & cookie_mask) == serial_cookie;
     if (has_run) {
-      cuda::std::size_t s  = (num_containers + 7) / 8;
+      cuda::std::size_t s  = (num_containers + 7) / 8;  // ceil bytes to store run container bitmap
       run_container_bitmap = cuda::std::distance(bitmap, buf);
       buf += s;
     }
 
-    key_cards             = cuda::std::distance(bitmap, buf);
+    key_cards = cuda::std::distance(bitmap, buf);
+    // if the current address is aligned to 2 bytes, then all containers are aligned to at least 2
+    // bytes
     bool const aligned_16 = (reinterpret_cast<cuda::std::uintptr_t>(bitmap + key_cards) %
                              sizeof(cuda::std::uint16_t)) == 0;
     buf += num_containers * 2 * sizeof(cuda::std::uint16_t);
@@ -136,16 +154,14 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
     cuda::std::byte const* end =
       bitmap + misaligned_load<cuda::std::uint32_t>(
                  bitmap + container_offsets + (num_containers - 1) * sizeof(cuda::std::uint32_t));
-    if (has_run and (static_cast<cuda::std::uint8_t>(
-                       (bitmap + run_container_bitmap)[(num_containers - 1) / 8]) &
-                     (cuda::std::uint8_t(1) << ((num_containers - 1) % 8)))) {
+    if (has_run and check_bit(bitmap + run_container_bitmap, num_containers - 1)) {
       cuda::std::uint16_t const num_runs = misaligned_load<cuda::std::uint16_t>(end);
       end += sizeof(cuda::std::uint16_t) + num_runs * 2 * sizeof(cuda::std::uint16_t);
     } else {
-      if (card <= 4096) {  // TODO check if this is correct
+      if (card <= max_array_container_card) {
         end += card * sizeof(cuda::std::uint16_t);
       } else {
-        end += 8192;  // fixed size bitset container
+        end += bitset_container_bytes;  // fixed size bitset container
       }
     }
 
diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh
index 4ca3fb8a2..a4be0175d 100644
--- a/include/cuco/roaring_bitmap.cuh
+++ b/include/cuco/roaring_bitmap.cuh
@@ -74,7 +74,8 @@ class roaring_bitmap {
    *       `contains_async`.
    *
    * @tparam InputIt  Device-accessible random access input iterator of keys convertible to `T`
-   * @tparam OutputIt Device-accessible random access output iterator to `bool`
+   * @tparam OutputIt Device-accessible random access output iterator whose `value_type` is
+   * constructible from `bool`
    *
    * @param first Beginning of the sequence of keys
    * @param last  End of the sequence of keys
@@ -153,7 +154,7 @@ class roaring_bitmap {
   [[nodiscard]] ref_type ref() const noexcept;
 
  private:
-  storage_type storage_;
+  storage_type storage_;  ///< Storage type
 };
 
 }  // namespace cuco
diff --git a/tests/roaring_bitmap/contains_test.cu b/tests/roaring_bitmap/contains_test.cu
index db3b9cd33..4a30e12b4 100644
--- a/tests/roaring_bitmap/contains_test.cu
+++ b/tests/roaring_bitmap/contains_test.cu
@@ -26,6 +26,7 @@
 
 #include <catch2/catch_test_macros.hpp>
 
+#include <filesystem>
 #include <fstream>
 #include <string>
 #include <vector>
@@ -70,9 +71,7 @@ bool check(std::string const& bitmap_file_path)
   std::ifstream file(bitmap_file_path, std::ios::binary);
   if (!file.is_open()) { return false; }
 
-  file.seekg(0, std::ios::end);
-  std::streamsize file_size = file.tellg();
-  file.seekg(0, std::ios::beg);
+  auto file_size = std::filesystem::file_size(bitmap_file_path);
 
   thrust::universal_host_pinned_vector<cuda::std::byte> buffer(file_size);
 

From 4e68e8adac1c075d4989ec3afb61459f1a6df3d4 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 18 Aug 2025 16:18:35 -0700
Subject: [PATCH 21/24] Code simplifications

---
 .../roaring_bitmap/roaring_bitmap_impl.cuh    | 70 +++++++++----------
 include/cuco/detail/roaring_bitmap/util.cuh   | 10 ++-
 2 files changed, 38 insertions(+), 42 deletions(-)

diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
index b69ebf0dd..16b2001ee 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
@@ -113,12 +113,12 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
 // linear search
 #pragma unroll
       for (cuda::std::uint32_t i = 0; i < storage_ref_.metadata().num_containers; i++) {
+        cuda::std::byte const* key_ptr =
+          storage_ref_.key_cards() + (i * 2) * sizeof(cuda::std::uint16_t);
         if constexpr (Aligned) {
-          key = aligned_load<cuda::std::uint16_t>(storage_ref_.key_cards() +
-                                                  (i * 2) * sizeof(cuda::std::uint16_t));
+          key = aligned_load<cuda::std::uint16_t>(key_ptr);
         } else {
-          key = misaligned_load<cuda::std::uint16_t>(storage_ref_.key_cards() +
-                                                     (i * 2) * sizeof(cuda::std::uint16_t));
+          key = misaligned_load<cuda::std::uint16_t>(key_ptr);
         }
         if (key == upper) { return this->contains_container<Aligned>(lower, i); }
         if (key > upper) { return false; }
@@ -129,12 +129,12 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
       cuda::std::uint32_t right = storage_ref_.metadata().num_containers;
       while (left < right) {
         cuda::std::uint32_t mid = left + (right - left) / 2;
+        cuda::std::byte const* key_ptr =
+          storage_ref_.key_cards() + (mid * 2) * sizeof(cuda::std::uint16_t);
         if constexpr (Aligned) {
-          key = aligned_load<cuda::std::uint16_t>(storage_ref_.key_cards() +
-                                                  (mid * 2) * sizeof(cuda::std::uint16_t));
+          key = aligned_load<cuda::std::uint16_t>(key_ptr);
         } else {
-          key = misaligned_load<cuda::std::uint16_t>(storage_ref_.key_cards() +
-                                                     (mid * 2) * sizeof(cuda::std::uint16_t));
+          key = misaligned_load<cuda::std::uint16_t>(key_ptr);
         }
 
         if (key == upper) {
@@ -170,29 +170,29 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
   __device__ bool contains_container(cuda::std::uint16_t lower, cuda::std::uint32_t index) const
   {
     cuda::std::uint32_t offset;
+    cuda::std::byte const* offset_ptr =
+      storage_ref_.container_offsets() + index * sizeof(cuda::std::uint32_t);
     if (offsets_aligned_) {
-      offset = aligned_load<cuda::std::uint32_t>(storage_ref_.container_offsets() +
-                                                 index * sizeof(cuda::std::uint32_t));
+      offset = aligned_load<cuda::std::uint32_t>(offset_ptr);
     } else {
-      offset = misaligned_load<cuda::std::uint32_t>(storage_ref_.container_offsets() +
-                                                    index * sizeof(cuda::std::uint32_t));
+      offset = misaligned_load<cuda::std::uint32_t>(offset_ptr);
     }
     cuda::std::byte const* container = storage_ref_.data() + offset;
     if (storage_ref_.metadata().has_run and check_bit(storage_ref_.run_container_bitmap(), index)) {
       return this->contains_run_container<Aligned>(container, lower);
     } else {
       cuda::std::uint32_t card;
+      cuda::std::byte const* card_ptr =
+        storage_ref_.key_cards() + (index * 2 + 1) * sizeof(cuda::std::uint16_t);
       if constexpr (Aligned) {
-        card = 1u + aligned_load<cuda::std::uint16_t>(
-                      storage_ref_.key_cards() + (index * 2 + 1) * sizeof(cuda::std::uint16_t));
+        card = 1u + aligned_load<cuda::std::uint16_t>(card_ptr);
       } else {
-        card = 1u + misaligned_load<cuda::std::uint16_t>(
-                      storage_ref_.key_cards() + (index * 2 + 1) * sizeof(cuda::std::uint16_t));
+        card = 1u + misaligned_load<cuda::std::uint16_t>(card_ptr);
       }
       if (card <= storage_ref_type::metadata_type::max_array_container_card) {
         return this->contains_array_container<Aligned>(container, lower, card);
       } else {
-        return this->contains_bitset_container(container, lower, card);
+        return this->contains_bitset_container(container, lower);
       }
     }
   }
@@ -206,10 +206,11 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
     // Use linear search for small arrays, binary search for larger ones
     if (card < binary_search_threshold) {
       for (cuda::std::uint32_t i = 0; i < card; i++) {
+        cuda::std::byte const* elem_ptr = container + i * sizeof(cuda::std::uint16_t);
         if constexpr (Aligned) {
-          elem = aligned_load<cuda::std::uint16_t>(container + i * sizeof(cuda::std::uint16_t));
+          elem = aligned_load<cuda::std::uint16_t>(elem_ptr);
         } else {
-          elem = misaligned_load<cuda::std::uint16_t>(container + i * sizeof(cuda::std::uint16_t));
+          elem = misaligned_load<cuda::std::uint16_t>(elem_ptr);
         }
         if (elem == lower) { return true; }
       }
@@ -219,12 +220,12 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
       cuda::std::uint32_t right = card;
 
       while (left < right) {
-        cuda::std::uint32_t mid = left + (right - left) / 2;
+        cuda::std::uint32_t mid         = left + (right - left) / 2;
+        cuda::std::byte const* elem_ptr = container + mid * sizeof(cuda::std::uint16_t);
         if constexpr (Aligned) {
-          elem = aligned_load<cuda::std::uint16_t>(container + mid * sizeof(cuda::std::uint16_t));
+          elem = aligned_load<cuda::std::uint16_t>(elem_ptr);
         } else {
-          elem =
-            misaligned_load<cuda::std::uint16_t>(container + mid * sizeof(cuda::std::uint16_t));
+          elem = misaligned_load<cuda::std::uint16_t>(elem_ptr);
         }
         if (elem == lower) {
           return true;
@@ -239,11 +240,9 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
   }
 
   __device__ bool contains_bitset_container(cuda::std::byte const* container,
-                                            cuda::std::uint16_t lower,
-                                            cuda::std::uint32_t card) const
+                                            cuda::std::uint16_t lower) const
   {
-    return static_cast<cuda::std::uint8_t>(container[lower / 8]) &
-           (cuda::std::uint8_t(1) << (lower % 8));
+    return check_bit(container, lower);
   }
 
   template <bool Aligned>
@@ -262,19 +261,18 @@ class roaring_bitmap_impl<cuda::std::uint32_t> {
     cuda::std::uint32_t end;
 
     for (cuda::std::uint32_t i = 0; i < num_runs; i++) {
+      // the first 16 bits of the run container denote the number of runs,
+      // followed by the sequence of runs as (start, length - 1) U16 pairs
+      cuda::std::byte const* start_ptr = container + (i * 2 + 1) * sizeof(cuda::std::uint16_t);
       // TODO load start+end in one instruction
       if constexpr (Aligned) {
-        start =
-          aligned_load<cuda::std::uint16_t>(container + (i * 2 + 1) * sizeof(cuda::std::uint16_t));
-        end =
-          static_cast<cuda::std::uint32_t>(start) +
-          aligned_load<cuda::std::uint16_t>(container + (i * 2 + 2) * sizeof(cuda::std::uint16_t));
+        start = aligned_load<cuda::std::uint16_t>(start_ptr);
+        end   = static_cast<cuda::std::uint32_t>(start) +
+              aligned_load<cuda::std::uint16_t>(start_ptr + sizeof(cuda::std::uint16_t));
       } else {
-        start = misaligned_load<cuda::std::uint16_t>(container +
-                                                     (i * 2 + 1) * sizeof(cuda::std::uint16_t));
+        start = misaligned_load<cuda::std::uint16_t>(start_ptr);
         end   = static_cast<cuda::std::uint32_t>(start) +
-              misaligned_load<cuda::std::uint16_t>(container +
-                                                   (i * 2 + 2) * sizeof(cuda::std::uint16_t));
+              misaligned_load<cuda::std::uint16_t>(start_ptr + sizeof(cuda::std::uint16_t));
       }
       if (start <= lower && end >= lower) { return true; }
       if (start > lower) { break; }
diff --git a/include/cuco/detail/roaring_bitmap/util.cuh b/include/cuco/detail/roaring_bitmap/util.cuh
index a3cc04ae7..c59f65fad 100644
--- a/include/cuco/detail/roaring_bitmap/util.cuh
+++ b/include/cuco/detail/roaring_bitmap/util.cuh
@@ -138,14 +138,12 @@ struct roaring_bitmap_metadata<cuda::std::uint32_t> {
 
     cuda::std::uint32_t card = 0;
     for (cuda::std::int32_t i = 0; i < num_containers; i++) {
+      cuda::std::byte const* card_ptr =
+        bitmap + key_cards + (i * 2 + 1) * sizeof(cuda::std::uint16_t);
       if (aligned_16) {
-        card = aligned_load<cuda::std::uint16_t>(bitmap + key_cards +
-                                                 (i * 2 + 1) * sizeof(cuda::std::uint16_t)) +
-               1u;
+        card = 1u + aligned_load<cuda::std::uint16_t>(card_ptr);
       } else {
-        card = misaligned_load<cuda::std::uint16_t>(bitmap + key_cards +
-                                                    (i * 2 + 1) * sizeof(cuda::std::uint16_t)) +
-               1u;
+        card = 1u + misaligned_load<cuda::std::uint16_t>(card_ptr);
       }
       num_keys += card;
     }

From 1dfc1399c063ee61e071e61076ba55c6efbaf745 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Mon, 18 Aug 2025 16:44:12 -0700
Subject: [PATCH 22/24] Resolve merge conflict in build.sh

---
 ci/build.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ci/build.sh b/ci/build.sh
index e826033cf..1d3074fb9 100755
--- a/ci/build.sh
+++ b/ci/build.sh
@@ -72,9 +72,9 @@ function usage {
     echo "  --prefix: Build directory prefix (Defaults to <repo_root>/build)"
     echo "  -i/--infix: Build directory infix (Defaults to local)"
     echo "  -d/--debug: Debug build"
-    echo "  -p/--parallel: Build parallelism (Defaults to $PARALLEL_LEVEL if set, otherwise the system's number of CPU cores)"
-    echo "  --cuda: CUDA compiler (Defaults to $CUDACXX if set, otherwise nvcc)"
-    echo "  --cxx: Host compiler (Defaults to $CXX if set, otherwise g++)"
+    echo "  -p/--parallel: Build parallelism (Defaults to \$PARALLEL_LEVEL if set, otherwise the system's number of CPU cores)"
+    echo "  --cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)"
+    echo "  --cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)"
     echo "  --arch: Target CUDA arches, e.g. \"60-real;70;80-virtual\" (Defaults to the system's native GPU archs)"
     echo "  --std: CUDA/C++ standard (Defaults to 17)"
     echo "  -v/-verbose/--verbose: Enable shell echo for debugging"

From 245592e8f59d0556c8863b02d440c5379d0c1e03 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 19 Aug 2025 15:26:18 -0700
Subject: [PATCH 23/24] Use std::filesystem::file_size in benchmark

---
 benchmarks/roaring_bitmap/contains_bench.cu | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/benchmarks/roaring_bitmap/contains_bench.cu b/benchmarks/roaring_bitmap/contains_bench.cu
index da66bea4d..ac7ddc55f 100644
--- a/benchmarks/roaring_bitmap/contains_bench.cu
+++ b/benchmarks/roaring_bitmap/contains_bench.cu
@@ -27,6 +27,7 @@
 #include <thrust/device_vector.h>
 #include <thrust/universal_vector.h>
 
+#include <filesystem>
 #include <fstream>
 #include <string>
 
@@ -43,9 +44,7 @@ void roaring_bitmap_contains(nvbench::state& state, nvbench::type_list<T>)
   if (!file.is_open()) { state.skip("Bitmap file not found"); }
 
   // Get file size
-  file.seekg(0, std::ios::end);
-  std::streamsize file_size = file.tellg();
-  file.seekg(0, std::ios::beg);
+  auto const file_size = std::filesystem::file_size(bitmap_file);
 
   thrust::universal_host_pinned_vector<cuda::std::byte> buffer(file_size);
 

From 67d19ec0a63c67bf339cb5d4ef698ef004e0bc33 Mon Sep 17 00:00:00 2001
From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com>
Date: Tue, 19 Aug 2025 15:46:32 -0700
Subject: [PATCH 24/24] Move to experimental namespace

---
 README.md                                     |  4 ++--
 benchmarks/roaring_bitmap/contains_bench.cu   |  2 +-
 examples/roaring_bitmap/host_bulk_example.cu  |  3 ++-
 .../detail/roaring_bitmap/roaring_bitmap.inl  |  4 ++--
 .../roaring_bitmap/roaring_bitmap_impl.cuh    |  4 ++--
 .../roaring_bitmap/roaring_bitmap_ref.inl     |  4 ++--
 .../roaring_bitmap/roaring_bitmap_storage.cuh | 22 ++++++++++---------
 include/cuco/detail/roaring_bitmap/util.cuh   |  4 ++--
 include/cuco/roaring_bitmap.cuh               |  4 ++--
 include/cuco/roaring_bitmap_ref.cuh           |  4 ++--
 tests/roaring_bitmap/contains_test.cu         |  3 ++-
 11 files changed, 31 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index 8608e3680..ae00028e9 100644
--- a/README.md
+++ b/README.md
@@ -263,7 +263,7 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 
 ### roaring_bitmap
 
-`cuco::roaring_bitmap` implements a Roaring bitmap following the [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec).
+`cuco::experimental::roaring_bitmap` implements a Roaring bitmap following the [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec).
 
 #### Examples:
-- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WAtv27YW_ivnqsAgN7aVpI9szmNz43Qz1msPtrtiaAqBkmibsCxqJBXHC_Lf7yGpZ6KsXbc7B4gt8vA7D37n8FB3jqRSMp5IZ_DxzmGRMzjqOjFJVhlZUWfghFlEnK4jeSZC_ew9v07gOVzydC_Yaq3ADTtwfHj8Cia_jkfjIVxOZ79MZ8PFeDrpa1Ej_o6FNJE0giyJqAC1pjBMSYhf-UwXfqVC2wHH_UNwtcC1k89dO51Tg7LnGWzJHhKuIJMUYZiEJYsp0NuQpgpYAiHfpjEjSUhhx9TaqMpxjDnwWw7CA0VQnuCKFJ-WdUkgqjRdf9ZKpQPP2-12fWLM7nOx8mIrLL1348uryfyqh6aXy94nMQYWBP09YwIdD_ZAUrQsJAHaG5MdcAFkJSjOKa4t3wmmWLLqguRLtSOCGpyISSVYkKlG8Ao70f-6AIaPJBi44RzG82sH3gzn43nX4HwYL36avl_Ah-FsNpwsxldzmM5wsyajsd4qfHoLw8lv8PN4MuoCxdChKnqbCu0Fmsp0WGlkYzintGHGkluzZEpDtmQhFASCFb-hIkG3IKViyyzV0MjI4MRsyxRRZuyRc0aVd51cJ89YEsZZROEszELuCU4EIvoBU1uS9sNsffFIJlMsZmrvKUGYkv11ml48RIqIJ1XkhfgvosuLJydZoton1T6lvlXQEFBrkUnlRfQGHfFvaKi46K_bRGK-QkLE7ZNZwjB2ksR1iLqcZr7cS0W3jeVL5AMlzTHGWwY1b5JVY8hqMnq855bIP5gEWyOAH2Txxqe3BKlAMex2OhCMLmFEt7iJGAxFcSul3vo8pZq7heTUKMjNmPNNlhpgGP4yllWxGCc2s3NNsMN84QTTZMfhxXEPgcCCGSIh6Sm8flkbBjflQpk8Q2JuierAUvCttsbgf5xZk94Y6bdGZI7M_eTqPJeY6Cukfxb0sZZ4DdniqVrTwQxPuWQYtX1JaywQ4QaY9V-7WziLfqK8ygTyXc-FXAiMOI7JLEaawoRsabzvapcxkMoILXkc851OIbPhA6OiBx-ts7rI8UyJLJH9gCVm0rVR6nyNQ14Q88B7dXRyQqLvPG1ERBTxWpV1Hpvy79nxyIhi03OmvX5Z2WHp8Q_a8fql16quU5L4BzykKCzqPM7Pgnzrt2SDOZLq4ge90eX7y6k_mn6YvJsOR_5sOpyNJz_6i6v5YjRcDM-nE31IBFhhqSoTxdRGzP40xqzDsoHlKEH-wM90v8DfmMUB57Flo4t2DwY245F2mKrf5Nnia1b5KVFrNP4OcYFkqGtFE6qz2d_QvYRz-PjJ7UDvAmxxGgwa1e2sUAkGADT3jRJ9gGCHgBVTK0cLmPQl2ujfFEu6UJvNsNa-OPbVRacAAvA8uMTShR7-nlFMMmNPceBUdLhDXt4XnKgypYD4xwg4uxqO_nvV30bP9FBPjxVqjAt5QNqcMqafFtLaBbdFDDYY7cNT_DqDo0P90b8Pzs1DLS5g4PppJtd-QHCHN50S-76hBIENaIV2hh2b_X1wsPkzzBfI5M_intRwv_0y3DZMWxifIFhbPF0DGtAVS9xO16qgSeR2CvB7oLFukr6ejK9f_gUytlYEQ8X_BxOxBFVcfKTalr7P0dK493laajFLy1uzvVkc293G5--K56_Y8c_pGjZ1HR3-DV0PRA5vj3O0SrZV5FVD5EmLmU1a9sBQphP3WEP8ibmHt9_mC-AA2N_MDLOlX5oZdxVHsAsPfSIlFcrVHbRWg-dJRBPlLwkKlzW-iz1c_hu2aJA-lsrihfEprMivbXUP7u4L_fpLP-hvTIopqjKZop
9tbuZtrBl1Hx5V3VyIy8EAqU7E3urCVHf_o6X6mNscQdHlwkuzIqRCwNkZuvCWoJi5e2k5HNDDD_XoMbMOIxHntue-mKCYofvSjR-psvdRyf6g5VFq0PQIcsSAVY27_W0mH_nYqcIzxA4w1FXHdMtb7LWx9jTPwDJ6BUuq64Pp3VOWJDRqIU2wV3huB9lySYVbWlNTPqPYexuvcGN5rl3PmTjjHkWuoDhFBV4WlR8Sqc7CNRHPL9zCFkF2fsqNjJl3rbq-rmO4Q0jRhuIcOoy5pG7Nkrz05jeK0vG8ty8jYOnbvHdUHUpz_ItNPK1tsu2M6keASzB5JfbGcaTTAY8b_XIB6cWS2h51SkrkTVWjyWrxlIDdL81Sid_21p3fFuqb3SwJuu27qIyw1cBwTEda87am6g3eT3r5_aTmEVnh2vwGYq3X8g-v3laFfLLedCsrium6l8U9SUdPK7KaxZMB1OtMT4sL_ErmvIyDHudL95HSuh25YVgLap2ArnNM7e_uLf_yHG-owQmT6brt2RKNavrlZ2wZ4QXYNPBF3657dn80npXVrN54g6aUHzGBdreuOi3dlFkY6vcvzc-5benbe6IS_ABLWvu1rSjMBfo3XwX4V9HswdREa7-yOQVFbMVGy23FzjUM8lptqxeGicTpmpiR3IQHRbu2oYXE93AIAzjCyWf6IKyUlcdD-82tdb_yqxmywLARKwfDMmDePjZueddO1QLkn2vnC65-nYcLHx1KuXfWnyRiS6Sq03X0u1CsiaJ6t-skN2F4dPwqO8JpaxZOOj2EOw8PDo5OoEdEuD6XW__kEHo9bJkV_lO6EYh6MdkG5m1wzIIaZhiGMQ7e2Be4OKCZtnHuu8U81uPGPFYr5_6T-fsfn-2NIg==))
\ No newline at end of file
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WA1v2zYT_iv3qsAgN7aVpB_ZnI_NjdPNWF97sN0VQ1MIlETbhGVRI6k4XpD__h5JfSbK2nV75wCxRR7vnjs-dzzqzpFUSsYT6Qw-3jkscgZHXScmySojK-oMnDCLiNN1JM9EqJ-959cJPIdLnu4FW60VuGEHjg-PX8Hk1_FoPITL6eyX6Wy4GE8nfS1qxN-xkCaSRpAlERWg1hSGKQnxK5_pwq9UaBxw3D8EVwtcO_nctdM5NVr2PIMt2UPCFWSSohomYcliCvQ2pKkClkDIt2nMSBJS2DG1NqZyPQYO_JYr4YEiKE9wRYpPy7okEFVC15-1UunA83a7XZ8Y2H0uVl5shaX3bnx5NZlf9RB6uex9EmNgQdDfMybQ8WAPJEVkIQkQb0x2wAWQlaA4p7hGvhNMsWTVBcmXakcENXoiJpVgQaYawStwov91AQwfSTBwwzmM59cOvBnOx_Ou0fNhvPhp-n4BH4az2XCyGF_NYTrDzZqMxnqr8OktDCe_wc_jyagLFEOHpuhtKrQXCJXpsNLIxnBOaQPGkltYMqUhW7IQCgLBit9QkaBbkFKxZZZqCDIyemK2ZYooM_bIOWPKu06uk2csCeMsonAWZiH3BCcCNfoBU1uS9sNsffFIJlMsZmrvKUGYkv11ml481BQRT6rIC_FfRJcXT06yRLVPqn1KfWugIaDWIpPKi-gNOuLf0FBx0V-3icR8hYSI2yezhGHsJInrKupymvlyLxXdNpYvkQ-UNMcYbxnUvElWjSFrydjxnlsi_2ASbI0K_CCLNz69JUgFimG304FgdAkjusVNxGAoilsp9dbnKdXcLSSn1oLcjDnfZKlRDMNfxrIqFuPEZnZuCXaYL5xgmuw4vDjuoSKwygyRkPQUXr-sDYObcqFMniExt0R1YCn4VqMx-j_OLKQ3RvqtEZkjcz-5Os8lJvoK6Z8FfawlXkO2eKrWdDDDUy4ZRm1f0hoLRLgBZv3X7hbOop8orzKBfNdzIRcCI45jMouRpjAhWxrvu9plDKQyQksex3ynU8hs-MCY6MFH66wucjxTIktkP2CJmXRtlDpf45AXxDzwXh2dnJDoO0-DiIgiXquxzmMo_x6ORyCKTc-Z9vplhcPS4x_E8fql12quU5L4BzykKCzqPM7Pgnzrt2SDOZLq4ge90eX7y6k_mn6YvJsOR_5sOpyNJz_6i6v5YjRcDM-nE31IBFhhqSoTxdRGzP40xqzDsoHlKEH-wM90v8DfmMUB57Flo4u4BwOb8Ug7TNVv8mzxNav8lKg1gr9DvUAytLWiCdXZ7G_oXsI5fPzkdqB3AbY4DQaN6nZWmASjADT3jRF9gGCHgBVTG0cETPoSMfo3xZIu1GYzrLUvjn110SkUAXgeXGLpQg9_zygmmcFTHDgVHe6Ql_cFJ6pMKVT8YwScXQ1H_73qb6NneqinxwozxoU8IG1OGeinhbR2wW0Rgw1G-_AUv87g6FB_9O-Dc_NQiwsYdf00k2s_ILjDm06p-75hBBUbpZW2M-zY7O-Dg82f6XyBTP6s3pOa3m-_TG-bTlsYnyBYWzxdozSgK5a4na41QZPI7RTK74HGukn6ejK-fvkXyNhaEQwV_x9MxBJUcfGRaVv6PkdL497naanFLC1vzfZmcWx3G5-_K56_Ysc_Z2vYtHV0-DdsPRA5vD3OtVWyrSKvGiJPImY2adkDoEwn7rFW8SdwD2-_zRfAAbC_mRlmS780M-4qjmAXHvpESiqUqztobQbPk4gmyl8SFC5rfBd7uPw3bBGQPpbK4oXxKVDk17a6B3f3hX39pR_0NybFFE2ZTN
HPNjfzNtaMug-Pqm4uxOVggFQnYm9tYaq7_9FSfcxtjkrR5cJLsyKkQsDZGbrwlqCYuXtpORzQww_t6DGzDiMR59hzX0xQzNB96caPVNn7qGR_0PIoNdr0CHLEKKsad_vbTD7ysVOFZ4gdYKirjumWt9hrY-1pnoFl9AqWVNcH07unLElo1EKaYK_w3A6y5ZIKt0RTMz6j2Hsbr3BjeW5dz5k44x5FrqA4RQVeFpUfEqnOwjURzy_cAosgOz_lRsbMu9ZcX9cx3CGkaMNwrjqMuaRuDUleevMbRel43tuXEbD0xWpPBdsif0mMCBq3kKpfaY67doe_FPZpbeNtt1Q_FlyCCS2xX44jnSJ4BOkXDkg5ltT2rVPSJG-0Go1Xi_cE7B5q5kr8tjfx_AZRJ0CzTOhW8KICYSuE4Z2OvuZyzdQbvLP08jtLzSOywrX5rcSi1_IPr-PWhHyyBnUrFMV03cvi7qSjpw1Zy-LJAOp1ps_FBX4lc17GQY_zpfvIaB1HDgzrQ6070LWPqf3dveVknvcNMzhhsl-3QluitZoe-hlbRngpNk190cvrPt4fjWdlhas346Ap5UdMIO7WVaelmzILQ_1Opvk5t21-e59UKj_AMtd-lSuKdaH9m69S-Fe12cOqqa39GucUFLFVHJHbKp5bGOT121Y0DBOJ0zUxIzmEB4W8tqGFxPdwCAM4wsln-nCsjJVHRvttrnW_8usassCwESsHwzJg3kg2bn7XTtUW5J9r5wuug52HCx8dVLl31p8kYkukqtN19PtRrJOiet_rJDdheHT8KjvCaQsLJ50eqjsPDw6OTqBHRLg-l1v_5BB6PSysCv8p3RxEvZhsA_OGOGZBTWcYhjEO3tiXujigmbZx7rvFPNboxjxWK-f-
\ No newline at end of file
diff --git a/benchmarks/roaring_bitmap/contains_bench.cu b/benchmarks/roaring_bitmap/contains_bench.cu
index ac7ddc55f..20cb27a27 100644
--- a/benchmarks/roaring_bitmap/contains_bench.cu
+++ b/benchmarks/roaring_bitmap/contains_bench.cu
@@ -51,7 +51,7 @@ void roaring_bitmap_contains(nvbench::state& state, nvbench::type_list<T>)
   file.read(reinterpret_cast<char*>(thrust::raw_pointer_cast(buffer.data())), file_size);
   file.close();
 
-  cuco::roaring_bitmap<T> roaring_bitmap(thrust::raw_pointer_cast(buffer.data()));
+  cuco::experimental::roaring_bitmap<T> roaring_bitmap(thrust::raw_pointer_cast(buffer.data()));
 
   thrust::device_vector<T> items(num_items);
 
diff --git a/examples/roaring_bitmap/host_bulk_example.cu b/examples/roaring_bitmap/host_bulk_example.cu
index 46d5481cb..e70e6db3a 100644
--- a/examples/roaring_bitmap/host_bulk_example.cu
+++ b/examples/roaring_bitmap/host_bulk_example.cu
@@ -106,7 +106,8 @@ bool check(std::string const& bitmap_file_path)
   file.close();
 
   // Create roaring bitmap from the file
-  cuco::roaring_bitmap<KeyType> roaring_bitmap(thrust::raw_pointer_cast(buffer.data()));
+  cuco::experimental::roaring_bitmap<KeyType> roaring_bitmap(
+    thrust::raw_pointer_cast(buffer.data()));
 
   // Generate query keys (all should be contained in the bitmap)
   auto keys = generate_keys();
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
index 7159cc6ae..ff8dc3d13 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl
@@ -19,7 +19,7 @@
 #include <cuda/std/cstddef>
 #include <cuda/stream_ref>
 
-namespace cuco {
+namespace cuco::experimental {
 
 template <class T, class Allocator>
 roaring_bitmap<T, Allocator>::roaring_bitmap(cuda::std::byte const* bitmap,
@@ -85,4 +85,4 @@ typename roaring_bitmap<T, Allocator>::ref_type roaring_bitmap<T, Allocator>::re
 {
   return ref_type{storage_.ref()};
 }
-}  // namespace cuco
\ No newline at end of file
+}  // namespace cuco::experimental
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
index 16b2001ee..7276dfae8 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh
@@ -29,7 +29,7 @@
 #include <cuda/stream_ref>
 #include <thrust/iterator/constant_iterator.h>
 
-namespace cuco::detail {
+namespace cuco::experimental::detail {
 
 // primary template
 template <class T>
@@ -372,4 +372,4 @@ class roaring_bitmap_impl<cuda::std::uint64_t> {
   storage_ref_type storage_ref_;
 };
 
-}  // namespace cuco::detail
\ No newline at end of file
+}  // namespace cuco::experimental::detail
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
index 9536bb79f..01738ac7f 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl
@@ -22,7 +22,7 @@
 #include <cuda/std/type_traits>
 #include <cuda/stream_ref>
 
-namespace cuco {
+namespace cuco::experimental {
 
 template <class T>
 __host__ __device__ roaring_bitmap_ref<T>::roaring_bitmap_ref(storage_ref_type const& storage_ref)
@@ -87,4 +87,4 @@ __host__ __device__ cuda::std::size_t roaring_bitmap_ref<T>::size_bytes() const
   return impl_.size_bytes();
 }
 
-}  // namespace cuco
\ No newline at end of file
+}  // namespace cuco::experimental
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh
index c2736fe54..4c33f5ee4 100644
--- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh
+++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh
@@ -30,7 +30,7 @@
 #include <utility>
 #include <vector>
 
-namespace cuco::detail {
+namespace cuco::experimental::detail {
 
 template <class T>
 struct roaring_bitmap_storage_ref {
@@ -140,8 +140,8 @@ class roaring_bitmap_storage<cuda::std::uint32_t, Allocator> {
     : allocator_{alloc},
       metadata_{bitmap},
       data_{allocator_.allocate(metadata_.size_bytes),
-            detail::custom_deleter<cuda::std::size_t, allocator_type>{metadata_.size_bytes,
-                                                                      allocator_}},
+            cuco::detail::custom_deleter<cuda::std::size_t, allocator_type>{metadata_.size_bytes,
+                                                                            allocator_}},
       ref_{data_.get(), metadata_}
   {
     CUCO_CUDA_TRY(cudaMemcpyAsync(
@@ -153,7 +153,8 @@ class roaring_bitmap_storage<cuda::std::uint32_t, Allocator> {
  private:
   allocator_type allocator_;
   typename ref_type::metadata_type metadata_;
-  std::unique_ptr<cuda::std::byte, custom_deleter<cuda::std::size_t, allocator_type>> data_;
+  std::unique_ptr<cuda::std::byte, cuco::detail::custom_deleter<cuda::std::size_t, allocator_type>>
+    data_;
   ref_type ref_;
 };
 
@@ -186,10 +187,10 @@ class roaring_bitmap_storage<cuda::std::uint64_t, Allocator> {
           return typename ref_type::metadata_type{bitmap, bucket_metadata};
         }(bucket_metadata_)},
       data_{allocator_.allocate(metadata_.size_bytes),
-            detail::custom_deleter<cuda::std::size_t, allocator_type>{metadata_.size_bytes,
-                                                                      allocator_}},
+            cuco::detail::custom_deleter<cuda::std::size_t, allocator_type>{metadata_.size_bytes,
+                                                                            allocator_}},
       buckets_{bucket_allocator_.allocate(metadata_.num_buckets),
-               detail::custom_deleter<cuda::std::size_t, bucket_allocator_type>{
+               cuco::detail::custom_deleter<cuda::std::size_t, bucket_allocator_type>{
                  metadata_.num_buckets, bucket_allocator_}},
       ref_{data_.get(), metadata_, buckets_.get()}
   {
@@ -217,11 +218,12 @@ class roaring_bitmap_storage<cuda::std::uint64_t, Allocator> {
   std::vector<typename ref_type::metadata_type::bucket_metadata> bucket_metadata_;
   std::vector<cuda::std::pair<cuda::std::uint32_t, bucket_ref_type>> buckets_h_;
   typename ref_type::metadata_type metadata_;
-  std::unique_ptr<cuda::std::byte, custom_deleter<cuda::std::size_t, allocator_type>> data_;
+  std::unique_ptr<cuda::std::byte, cuco::detail::custom_deleter<cuda::std::size_t, allocator_type>>
+    data_;
   std::unique_ptr<cuda::std::pair<cuda::std::uint32_t, bucket_ref_type>,
-                  custom_deleter<cuda::std::size_t, bucket_allocator_type>>
+                  cuco::detail::custom_deleter<cuda::std::size_t, bucket_allocator_type>>
     buckets_;
   ref_type ref_;
 };
 
-}  // namespace cuco::detail
\ No newline at end of file
+}  // namespace cuco::experimental::detail
\ No newline at end of file
diff --git a/include/cuco/detail/roaring_bitmap/util.cuh b/include/cuco/detail/roaring_bitmap/util.cuh
index c59f65fad..1807b471d 100644
--- a/include/cuco/detail/roaring_bitmap/util.cuh
+++ b/include/cuco/detail/roaring_bitmap/util.cuh
@@ -26,7 +26,7 @@
 #include <nv/target>
 #include <vector>
 
-namespace cuco::detail {
+namespace cuco::experimental::detail {
 
 template <class T>
 __host__ __device__ __forceinline__ T aligned_load(cuda::std::byte const* ptr)
@@ -237,4 +237,4 @@ struct roaring_bitmap_metadata<cuda::std::uint64_t> {
     valid      = true;
   }
 };
-}  // namespace cuco::detail
\ No newline at end of file
+}  // namespace cuco::experimental::detail
\ No newline at end of file
diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh
index a4be0175d..e7c083bf3 100644
--- a/include/cuco/roaring_bitmap.cuh
+++ b/include/cuco/roaring_bitmap.cuh
@@ -23,7 +23,7 @@
 #include <cuda/std/cstddef>
 #include <cuda/stream_ref>
 
-namespace cuco {
+namespace cuco::experimental {
 
 /**
  * @brief GPU-accelerated container that owns a serialized Roaring bitmap.
@@ -157,6 +157,6 @@ class roaring_bitmap {
   storage_type storage_;  ///< Storage type
 };
 
-}  // namespace cuco
+}  // namespace cuco::experimental
 
 #include <cuco/detail/roaring_bitmap/roaring_bitmap.inl>
\ No newline at end of file
diff --git a/include/cuco/roaring_bitmap_ref.cuh b/include/cuco/roaring_bitmap_ref.cuh
index 88b704c28..071640a3b 100644
--- a/include/cuco/roaring_bitmap_ref.cuh
+++ b/include/cuco/roaring_bitmap_ref.cuh
@@ -21,7 +21,7 @@
 #include <cuda/std/cstddef>
 #include <cuda/stream_ref>
 
-namespace cuco {
+namespace cuco::experimental {
 
 /**
  * @brief Non-owning reference to a Roaring bitmap stored in its serialized format.
@@ -145,6 +145,6 @@ class roaring_bitmap_ref {
   impl_type impl_;
 };
 
-}  // namespace cuco
+}  // namespace cuco::experimental
 
 #include <cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl>
\ No newline at end of file
diff --git a/tests/roaring_bitmap/contains_test.cu b/tests/roaring_bitmap/contains_test.cu
index 4a30e12b4..42e0db3c8 100644
--- a/tests/roaring_bitmap/contains_test.cu
+++ b/tests/roaring_bitmap/contains_test.cu
@@ -78,7 +78,8 @@ bool check(std::string const& bitmap_file_path)
   file.read(reinterpret_cast<char*>(thrust::raw_pointer_cast(buffer.data())), file_size);
   file.close();
 
-  cuco::roaring_bitmap<KeyType> roaring_bitmap(thrust::raw_pointer_cast(buffer.data()));
+  cuco::experimental::roaring_bitmap<KeyType> roaring_bitmap(
+    thrust::raw_pointer_cast(buffer.data()));
 
   auto keys = generate_keys();
   thrust::device_vector<bool> contained(keys.size(), false);