From 9f0c40eca01edacd3b7760c605a73644e2c3ed07 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 4 Jul 2025 16:37:51 -0700 Subject: [PATCH 01/24] Simple lookup working --- examples/CMakeLists.txt | 29 +- examples/roaring_bitmap/bitmapwithoutruns.bin | Bin 0 -> 72616 bytes examples/roaring_bitmap/host_bulk_example.cu | 75 +++++ .../detail/roaring_bitmap/roaring_bitmap.inl | 94 ++++++ .../roaring_bitmap/roaring_bitmap_impl.cuh | 279 ++++++++++++++++++ .../roaring_bitmap/roaring_bitmap_ref.inl | 80 +++++ include/cuco/roaring_bitmap.cuh | 85 ++++++ include/cuco/roaring_bitmap_ref.cuh | 66 +++++ 8 files changed, 694 insertions(+), 14 deletions(-) create mode 100644 examples/roaring_bitmap/bitmapwithoutruns.bin create mode 100644 examples/roaring_bitmap/host_bulk_example.cu create mode 100644 include/cuco/detail/roaring_bitmap/roaring_bitmap.inl create mode 100644 include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh create mode 100644 include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl create mode 100644 include/cuco/roaring_bitmap.cuh create mode 100644 include/cuco/roaring_bitmap_ref.cuh diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 12b508404..e2328b496 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -33,17 +33,18 @@ endfunction(ConfigureExample) ### Example sources ############################################################################### ################################################################################################### -ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu") -ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu") -ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu") -ConfigureExample(STATIC_SET_SHARED_MEMORY_EXAMPLE 
"${CMAKE_CURRENT_SOURCE_DIR}/static_set/shared_memory_example.cu") -ConfigureExample(STATIC_SET_MAPPING_TABLE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/mapping_table_example.cu") -ConfigureExample(STATIC_MULTISET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multiset/host_bulk_example.cu") -ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu") -ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_ref_example.cu") -ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") -ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu") -ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu") -ConfigureExample(HYPERLOGLOG_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/host_bulk_example.cu") -ConfigureExample(HYPERLOGLOG_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/device_ref_example.cu") -ConfigureExample(BLOOM_FILTER_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/bloom_filter/host_bulk_example.cu") +# ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu") +# ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu") +# ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu") +# ConfigureExample(STATIC_SET_SHARED_MEMORY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/shared_memory_example.cu") +# ConfigureExample(STATIC_SET_MAPPING_TABLE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/mapping_table_example.cu") +# ConfigureExample(STATIC_MULTISET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multiset/host_bulk_example.cu") +# ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE 
"${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu") +# ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_ref_example.cu") +# ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") +# ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu") +# ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu") +# ConfigureExample(HYPERLOGLOG_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/host_bulk_example.cu") +# ConfigureExample(HYPERLOGLOG_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/device_ref_example.cu") +# ConfigureExample(BLOOM_FILTER_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/bloom_filter/host_bulk_example.cu") +ConfigureExample(ROARING_BITMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/roaring_bitmap/host_bulk_example.cu") diff --git a/examples/roaring_bitmap/bitmapwithoutruns.bin b/examples/roaring_bitmap/bitmapwithoutruns.bin new file mode 100644 index 0000000000000000000000000000000000000000..a99fd50aff79b98fa93b3219fc6226fa238d72e2 GIT binary patch literal 72616 zcmeI((;Mqb-01P7u^Kf^(llz)c6l$`wr$(CZQHi(wU=$%wte>dN1Susi}^kCedcB^ zWn;g=imxDp$dwDpiR8cJjX{C?7{d^C~dV-@h#MH*@{}PCxpu z6!3q!|5f0>8vNIQ|5^Y+%Xj_P&HwuO-y!H3a*o)}%;$&8?bYf=emjLHB)_G)m+n}` z^;xIq?4P%J!SY43mvqXK72j34R%3siE5D#o8lePchP!4eV+9{H+cK-`J;!A zZ$G*E^!&3^%oi4aTXt{Nv32V=P2bjkSMz=44`x5=oFvb_yKwEw{u|5hjDOJmN&Oea z-=zPbeSv;F`+W}Hj?JfrbM3|IN`5_sDY)PKdnt~kS)YD-=Kk56=PsW=dts+IS^8bM zYnAp_TV88?z3z?bH!I#MeLJla)a_Z%bA7iD{GXSA0Rlh(2mk>f00e*l5C8%|00;m9 zAOHk_01yBI|2G6mO!UD50zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_ z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_ z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_ z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_ 
z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2>d@2fFK0IAqrw49+Dsh(jWt} zAQ$qXB$NtD3#ErLL0O?3P;Mw6R1hix6^BYeWuXdCWvCid6RHE%hZ;dmp%zeUs2$W1 z>H>9#dO>}m0nlJ*7&H3L19~7D zumiC`Jdh%gI*=}qA&@zcEs!&iCy+l-C{Q#|B2YR|E>JO0B~U$3D^NGkAka9_EYLF0 zCeS|6DbO|0BhWk0FEB7LBrrTMDlj%MAuu^GEif}MCon&-D6llJBCtBJF0e7MC9plP zE3h|kAaE#fG;kttI&dyobtJ#Z^5 znj$Tb)<`>~Bhm%wj`TwMA_I`Y$S`CiG6orsOhTq2GmzQHJY*rV1X+%(Le?T1kj=<8 zWGAu**^eAVjv&X8Q^;B50&*F-hTKH%Aor0+$W!D6@)~)Ed_=w=-;rO)UnGEHD2Xy? z5Eao7s-h9pL><&a6KF~_4f+q75zT^TM{}Wh(E?~;v=~|vErXUvE1^}<8fa~_9@-FX zf;LB6p>5F)XlJw=+7s=A_D2VyL(viFXmlJp5uJigM`xjP(FN#YbQ!u5U4yPiH=$e6 z9q4X!ANntP7(IrbM9-k-(M#x6^agqxy@x(TpPx1Ns^LhWV zCSWoa#&j%-*;ovVV=1uISUM~NmKn>2<;3z}`LRM+QLF@38Y_oY#HwJ`v07MNtO3>- zYlgMN+F;QHMJBppaPGje=i`W(HI(7@Yi#@;|W6!Xc*c5RTwDPT?%h z;}Wjm8gAeg?&3b4#8csE@$`5mJS(09&yDB93*tra;&>^%EM5Vxj90^J;&t%)cq60|Z8p1VaP~ zkq8kg5g|;%Av_{Mq$JW1{}368EJSu97m=4JKolm55haN-M0uhTQI)7c)F$c?4T&a1 zbD|Z|mgqoqCb|(li9SSsVh}Nu7(t9C#t{>VDa3SQ7BQDtKrAMf5i5x`#Cl>Av6a|C z>?ZaR{}P9ZW5h|~3~`>gL|i3q5VwhY#6#i<@tk-?yd^#mpNVh8PvQ>&lPF1$G|7hCCf=o@OBQubh$!ug!G7p)bEJPM1OOU0>a%4ra3R#`3Mb;%7kd4V^ zWJ|IQ*`Dk~b|rg|y~%#$KynBv2J3%Q-# zMeZdJkcY^lWgq$gkuN@;3=l2!&G=#Zo*a zQ3|C|24zt$a06{yNoHL506hpJCCqMA}I zsMb_Fsw35f>Q42d`cea^!PGEnBsGQ_PfenxQZuO8)I4e-wS-zut)kXa8>r3HHfkre zhuTjaq>fO>sZ-Qh>H>9{x<=ik?ojusN7Pg51@)SGM}4HeP~WLv)L$wC%KbdVP5 z5UtV?+N2%YqZ4#WIt~2~osrH$XQy+~dFcXlVY(Pyk}gA+rz_D_=^Auxx*pw-ZbCPw zThVRl4s>U_8{L!cL-(f#(L?DG^k{k2VtN_9l3qivr#I1C=^gZL zdLR8SeV9H*pQO*w=jluIRr&^fo4!Xsq@U2w=~why`UCx${zm_#|IjdlG6X|293wC? z6J~TK%GgYdi8Cpf)J!@i1CyD_#^hx3F!`B6Oi`u;Q<^EqRAj0!)tOpMU8Vukm}$ne zWZE$8nNCbsrU%oT>BkIYhA_jKQOsCo0yCMJ#>`~qF!Py3%u;3rvzl4QY-F}D+nHU= zUgiLEh&jrfU`{jVn2XF6<~nnWxyw9Y9y8CFm&_aHJ@bkA%KTt{GZ2fgI7_iC%d--z zuo`Qy7VEM;n`Be5Y1#B_CN?XZgU!w6V+*oH*y3y{wk%tLt;|+qYqE9N`fMY%Dcgc= z&9-AZvR&BjY%jJiJAfU`4r52MW7zTRBz7u0gPqOJV;8bZ*yZdhb}hSs-OO%dcd~of z{p>;Z2z#78#hzs^u$S3u>`nF#d!K#8K4o99ui1C(NA?T*o&ClBWdj_>ksQMXIgtx- zDi`5Q&fz>R!KLKVaQ|=_xh!0EE*F=VE5H@zig6{mGF*AC5?7V0!PVyKaSgd9Tyw4! 
z*Ou$Rb>_NpJ-I$ye{K*rlpDc~=EiXoxhdRqZWcF}Tfi;mmT@b&HQah`6StMy!R_Yu zasP6MxntZ(?hJRHyTo1PZg97`d)!0r3HO|P#l7V|aG$wv+)wTg2M5t05u}4$PzcJw za8M6MgLW_$j0aN$QwP%pGXygSvjuYo^91t;3k8b?O9V>?%LOY2s|2eDYX$2D8w48% zn+012+XUMOI|aK2djxw2`vnIEhXjWQM+L_QCj=)4rv+yQ=LF{m7X_CFR|HoF*9A8Q zw*VhzXpEcS-{yYDR|H}si zOdthD2nwPQ5>z1~n1UmCLPAI>q!IoRG74FQ>_RRfuTVfJEEE$;3T1@yLM5T9P(!FK z)Ds#CO@!t`E1|8>LFg=W6M71Lg#N-HVW==d7%hwwCJIx8>B1~wuCPE@EG!dN3TuS* z!X{y>utV4_>=XVK4hzSGlfoI{yl_dlD%=om3-^SF!V}@S@Je_qd=Neh--MsS9|0Co zkq~K-69rKg!=f%mMO%!CaWRFMT1+Qq5HpL}#GGOtF~3+yEGm`|ON-^iieeS9x>!rB zD>e`ti_OHAVjHo&*h%av_7HoE{ltOd5OKITN*pUr5GRY%#F^q8alW`nTq>>*SBvY! zjp7z@ySPi-D;^LJiATi~;%V`mcu~9}UKekPcf|+dWAT~zQhX!67e9$##UJ8t5t0xI zmneyqcuA5JNs|o8l3dA`l2R%ut(0EMBxRLyNV%naQbDPRR9q@0m6a+;m8EJ@O{tDl zUuq;Zm0C!xrFK$Bsf*NI>LvA+21tXYVbVxxj5J=FBu$lONVBDR(n4v8v|L&xt(7)N zo26~iPHB&{Upgoqk&a8Jq_ffm>9TZ9x+&d}?n{rPr_u}Qwe(KOTVPQQb5LJ zQfB0!EXpBSl_Rn#JF+JyoJGzq=aTcv1?0kVF}b8%MlLT`lB>!!F;Sd@k zLUf1=2_ZQY4(Xw2$PUFq@lc9T>QK5+hEV2EwouMco>2Z!p-|CKiBRcKxlqMWl~DCi ztx(-igHYp8vrx-Wn^5~ur%=~Wk5KPWztF(YkkIhZsLCm~*#n6?|_0X-*-Oz*3Silw-UuOyXJN?Ikol1a&`DNmIb%4_AF@=^Js zd{=%cf0aNO3zK0c91M%$P*@E|!e-bBd*MVlWjIavpK!)-mT>lPu5jLPfpFn)v2e+7 znQ-}VrEt}7jd1O7y>P>DlW_BJt8m+Jhj8a`w{XvJpK$;1pzzS}i16s}xbVdAl<@TM ztnl3Mg7D(-vhd3An(+GYrtsGAj_~gAzVN@{!{KA$li@Sr^WjV3tKl2r+u?iRhv6sT z=iyi3x8V=r&*5+3pW#1YSVdJrrBzN9R9OwHx*An&HKxYZ6l!WUoti<-tY%Yls(IA> zY9Y0#T0$+YmQyRLRn+QgEw!%NKy9oxQ(LNS)b?s8wX51g?XC7x2dYEV;p!-LtU5uR ztWHyBs&mx&>LPWixj*#4c3NfBegNwcx{q4 zRhyyB*5+vowI$keZI!lG+n{aMwrM-HJ=%Wlpmsz%uAS1(Y8SN2+BNN_c1OFfJ<^_P zFSOU%JME+PMfS^?U^o)8IJ-ePu&#M>E z3+u)7l6o1vyk1GKs@KqK>-F@8dK105-b!z)chEcQ-SnP%AHBamNFS7wgOPmHHZey}n7`s_)Qu>-+S7^~3rx{iJ?IKd)cXuj)7S+xk8Iq5edF zuD{aX>L2va`ZxWj{zr!+XoQH+5iTM`EhB9r?IWEcT_Zgry(9f110zEs!y}_2 zV;zlW>tWm+JY*aI98g-2NMkAxC(ZXnLv@<#yU5xHVFQcz9z!+={ zGe#O?jPb@KW2!O3m~G5678*;8<;E&wt+Bz_Y-}@j8hecW#zEtVaojj%oHZ^OmyK)2 zP2-Mn-*{v^HC`C6jd#XJ(5H5!SUQ77s}6Va5>G|_*e z8KYUE*`v9ld7}lQg`>ryC8K4c<)f9NRiibcwWIZ-4Wmt>&7-ZNZKEBcoul2NJ)?c1 
z{iB1TL!%?2qod=Z6Qfh2)1$MZbE6BQi=)e;E2C?o>!X{ZTcbOoyQBM}|3(i-k3~;L z&qU8hFGa6LZ$xiL??oR*pG2QWUq#T+)U?f* z88=gysm*j|1~apn&CF@$G4q>+%%Wxqv$R>xtY}s-tDCjVx@H5jvDwUQX|^%jo1M(A zW)HKs+0Ptk4l#$Dqs+191aq=E&75h@G3T3$%%$cEbG5n7+-Pnwx0}1nz2*V)ka^TR zVV*Y6nHS9~=5_OydDnbkJ~p43FU>dRd-Id|)%;=pHX#eKaEr27i?<|8u{6uDEX%cg zD`};&(pu@QOjcGahn3sPXBD)HSjDYUR#~fpRoSX$)wJqZ^{qx$Q>%s5+G=NYw7OW` ztzK4NYk)P_8fJ~O##rO6N!C>n-`>li45$m{h z$~tRZur6EItee&y>%R5KdTPC}UR&?1kJcCKyY(=Dvm4q??B;eWyRF^9?re9n zd)j^M{`Meys6E0SZI81j+EeW5_AGm@y}({`4n_w0xE6Z^US%6@Bqus_@1?4R}@8+K5KaA=2f1V?tlj_yPq+le`G zCxw&RN#|s6GCSFvoK7Amzf;I5>XdLwJLQ~;P8FxRQ_HFAG;kU_&777_8>hX~$?59! zaC$raoPo{|XSg%U8S6}NCOgxdna&(%zO%?#>a1{9JL{Z{&K768v&-4*9B>XfN1YSS zY3H1C(YfMWcWya%od?ci=b7`;dE>lyJ~>~VAI@(FauFAIDVKG5S8^3sa}C#WUDtP$ zZYnpeo8HajW_5G8x!rtjLAQuo+%4snbt|})-D+-4w~kxiZR9p}Tez*=c5X+vi`(7p z<@R+4xP#qc?nrlxJKmk-PIYIvv)y^_LU)O~++F3abvL-1-EHnpcaOW@J?I{BkGrSb zv+f1=vU|E3bgyN}$b?hE&|`_BF7esRCMzudoWAcn=r7!wP|#8@b%#v(B@=ES^M zB9=0iCiYJ(V=PN7dn{KhZ>&J9aI9FYWUNfAe5_KeYOF@AcC22kVXR54d8}2eZLCAA zbF5peXRJ@Ge{4`}Xlz7mbZlH~Vr)umdTdr~Zfrqpaco&^Wo%7seQZ-~YivhscWhtm z-`L^UvDnGjnb`T*rP$Tjjo9tjz1YLpli2gvtJvGvhuG)Xx7g3vpBU_+9^ug*=Lw$d zg+1MidbSty;$8|bwU^Gz;AQr*c{#m2UVg8TSJW%vmG;Vc6}>86b+49J*K6Q4_L_Mu zy*6HZuano+>*4kG`gsGrA>MFrlsDF!;7#_Xc{9B^-h6M7x71tVt@hS=8@(;wc5j!r z*E`@H@{W2Zywlz}@1l3byYAic?s^Zr$KEsVrT4~r?|t&VdOy719^@lF?o&SN^S_ntmO>zTe1i>bLM) z`|bRWeiy&H-^=go5AX;3!~Bu{7=OG!$)D=a@Mrt;{DuA!f4RTPU+Zu1H~ZWCo&Fww zzkkp_;ve@<`DgtL{$>A~f78F?-}fK+PyHAEYyX}9(f{Iq_ka0+{XiUxlW`^pxc-44~cnmI_|W)>_~`h!_{8{>`1JU!_}ut{_~Q7o_{#X2`1<&! 
z_}2K2`0n_=_`mVP@ni9m@iX!B@k{Zm@f-2m@q6)y@h9=;@mKM;@elFO@o({;@jr1m zfhLFqo!}BeLQaGedLo*z6R|`*ks^^gkuH%TkvWkqku#Aekv~x=Q8ZB^Q94mBQ87^^ zQ9V&BQ8&>b(Kyj8(K68{(LT{B(KXQ{(L2#EF)%SCF+4FUF*Y$FF*z|UF*7kIF+Z^= zu{5zFu{yCXu`#hFu|2UXu{UubaVT*#aUyX#aV~K&aV2p*aVv2*@gVUy@htH&@h0&; z@hR~&@gwm&0VRf z00e*l5C8%|00;m9AOHk_01yBIKmZ5;0U!VbfB+Bx0zd!=00AKI|3Tn?{ih3XK_CDG qfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00{gq6Zk)YaHA&x literal 0 HcmV?d00001 diff --git a/examples/roaring_bitmap/host_bulk_example.cu b/examples/roaring_bitmap/host_bulk_example.cu new file mode 100644 index 000000000..85870f74b --- /dev/null +++ b/examples/roaring_bitmap/host_bulk_example.cu @@ -0,0 +1,75 @@ +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include + +int main(int argc, char* argv[]) +{ + if (argc != 2) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return -1; + } + + // Open file + std::ifstream file(argv[1], std::ios::binary); + if (!file.is_open()) { + std::cerr << "Failed to open " << argv[1] << std::endl; + return -1; + } + + // Get file size + file.seekg(0, std::ios::end); + std::streamsize file_size = file.tellg(); + file.seekg(0, std::ios::beg); + + // Allocate pinned host memory using cudaMallocHost + char* buffer; + CUCO_CUDA_TRY(cudaMallocHost(&buffer, file_size)); + + // Read file into memory + file.read(buffer, file_size); + file.close(); + + cuda::std::span bitmap(reinterpret_cast(buffer), + file_size); + cuco::roaring_bitmap roaring_bitmap(bitmap); + + std::vector keys; + for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) { + keys.push_back(k); + } + for (int k = 100000; k < 200000; ++k) { + keys.push_back(3 * k); + } + for (int k = 700000; k < 800000; ++k) { + keys.push_back(k); + } + + thrust::universal_vector keys_d(keys.begin(), keys.end()); + thrust::universal_vector contained(keys.size(), false); + + roaring_bitmap.contains(keys_d.begin(), keys_d.end(), contained.begin()); + + for (size_t i = 0; i < keys.size(); i++) { + if 
(not contained[i]) { + std::cout << "Error: " << keys_d[i] << " is not contained" << std::endl; + } + } + + // check if all elements are contained + bool all_contained = thrust::all_of(contained.begin(), contained.end(), ::cuda::std::identity{}); + std::cout << "all_contained: " << all_contained << std::endl; + + // Free the allocated memory + CUCO_CUDA_TRY(cudaFreeHost(buffer)); + + return 0; +} \ No newline at end of file diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl new file mode 100644 index 000000000..3a17a82d0 --- /dev/null +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2025 NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include +#include +#include + +namespace cuco { + +template +__host__ roaring_bitmap::roaring_bitmap( + cuda::std::span compressed_bitmap, + cuda_thread_scope scope, + Allocator const& alloc, + cuda::stream_ref stream) + : allocator_{alloc}, + data_{allocator_.allocate(compressed_bitmap.size()), + detail::custom_deleter{compressed_bitmap.size(), + allocator_}}, + ref_{compressed_bitmap, + cuda::std::span(data_.get(), compressed_bitmap.size()), + scope} // TODO move after memcpy? 
+{ + CUCO_CUDA_TRY(cudaMemcpyAsync(data_.get(), + compressed_bitmap.data(), + compressed_bitmap.size(), + cudaMemcpyHostToDevice, + stream.get())); + stream.wait(); // TODO check if this is necessary +} + +template +template +__host__ void roaring_bitmap::contains(InputIt first, + InputIt last, + OutputIt output, + cuda::stream_ref stream) const +{ + ref_.contains(first, last, output, stream); +} + +template +template +__host__ void roaring_bitmap::contains_async( + InputIt first, InputIt last, OutputIt output, cuda::stream_ref stream) const noexcept +{ + ref_.contains_async(first, last, output, stream); +} + +template +__host__ cuda::std::size_t roaring_bitmap::size() const noexcept +{ + return ref_.size(); +} + +template +__host__ cuda::std::span roaring_bitmap::data() + const noexcept +{ + return ref_.data(); +} + +template +__host__ typename roaring_bitmap::allocator_type +roaring_bitmap::allocator() const noexcept +{ + return allocator_; +} + +template +__host__ typename roaring_bitmap::ref_type<> +roaring_bitmap::ref() const noexcept +{ + return ref_; +} +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh new file mode 100644 index 000000000..248428b69 --- /dev/null +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2025 NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cuco::detail { + +// primary template +template +class roaring_bitmap_impl { + static_assert(cuco::dependent_false, "T must be either uint32_t or uint64_t"); +}; + +template +class roaring_bitmap_impl { + // Constants from the Roaring format spec + static constexpr cuda::std::uint32_t serial_cookie_no_runcontainer = 12346; + static constexpr cuda::std::uint32_t serial_cookie = 12347; + static constexpr cuda::std::uint32_t frozen_cookie = 13766; + static constexpr cuda::std::int32_t no_offset_threshold = 4; + + public: + static constexpr auto thread_scope = Scope; + + __host__ roaring_bitmap_impl(cuda::std::span compressed_bitmap_h, + cuda::std::span compressed_bitmap_d, + cuda_thread_scope /* scope */) + : data_{compressed_bitmap_d} + { + bool success = this->read_header(compressed_bitmap_h); + CUCO_EXPECTS(success, "Failed to read compressed bitmap"); + } + + __device__ roaring_bitmap_impl(cuda::std::span compressed_bitmap, + cuda_thread_scope /* scope */) + : data_{compressed_bitmap} + { + this->read_header(compressed_bitmap); // TODO error handling + } + + template + __host__ void contains(InputIt first, + InputIt last, + OutputIt contained, + cuda::stream_ref stream = {}) const + { + this->contains_async(first, last, contained, stream); + stream.wait(); + } + + template + __host__ void contains_async(InputIt first, + InputIt last, + OutputIt contained, + cuda::stream_ref stream = {}) const noexcept + { + auto nosync_exec_policy = thrust::cuda::par_nosync.on(stream.get()); + if (this->empty()) { + thrust::fill( + nosync_exec_policy, contained, contained + cuda::std::distance(first, last), false); + } else { + thrust::transform(nosync_exec_policy, + first, + last, + contained, + 
cuda::proclaim_return_type( + [*this] __device__(auto key) { return this->contains(key); })); + } + } + + __device__ bool contains(cuda::std::uint32_t value) const + { + cuda::std::uint16_t upper = value >> 16; + cuda::std::uint16_t lower = value & 0xFFFF; + + // TODO binary search on key_cards_ + for (cuda::std::int32_t i = 0; i < num_containers_; i++) { + if (key_cards_[i * 2] == upper) { + cuda::std::uint32_t card = key_cards_[i * 2 + 1] + 1; + cuda::std::uint16_t const* container = + reinterpret_cast(data_.data() + this->container_offset(i)); + if (this->is_run_container(i)) { + return this->contains_run_container(container, lower, card); + } else { + if (card <= 4096) { // TODO check if this is correct + return this->contains_array_container(container, lower, card); + } else { + return this->contains_bitset_container(container, lower, card); + } + } + } + } + return false; + } + + [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept { return size_; } + + [[nodiscard]] __host__ __device__ bool empty() const noexcept { return size_ == 0; } + + [[nodiscard]] __host__ __device__ cuda::std::span data() const noexcept + { + return data_; + } + + private: + __device__ bool is_run_container(cuda::std::int32_t i) const + { + if (not has_run_) return false; + return run_container_bitmap_[i / 8] & (1 << (i % 8)); + } + + __device__ bool contains_array_container(cuda::std::uint16_t const* container, + cuda::std::uint16_t lower, + cuda::std::uint32_t card) const + { + // TODO binary search on container + // if (card < 256) -> linear search + for (cuda::std::uint32_t i = 0; i < card; i++) { + if (container[i] == lower) { return true; } + } + return false; + } + + __device__ bool contains_bitset_container(cuda::std::uint16_t const* container, + cuda::std::uint16_t lower, + cuda::std::uint32_t card) const + { + // check if bit at position lower is set + return container[lower / 16] & (1 << (lower % 16)); + } + + __device__ bool 
contains_run_container(cuda::std::uint16_t const* container, + cuda::std::uint16_t lower, + cuda::std::uint32_t card) const + { + // TODO implement + return false; + } + + __device__ cuda::std::uint32_t container_offset(cuda::std::int32_t i) const + { + cuda::std::uint32_t offset; + cuda::std::memcpy( + &offset, offsets_ + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t)); + return offset; + } + + __host__ __device__ bool read_header(cuda::std::span compressed_bitmap) + { + cuda::std::size_t length = compressed_bitmap.size(); + cuda::std::byte const* buf = compressed_bitmap.data(); + [[maybe_unused]] cuda::std::size_t readbytes = 0; + + // cookie and num_containers + if (length < 4) { + // printf("length is less than 4\n"); + return false; + } + + cuda::std::uint32_t cookie; + cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t)); + readbytes += sizeof(cuda::std::uint32_t); + buf += sizeof(cuda::std::uint32_t); + if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) { + // printf("cookie is not serial cookie or serial cookie no runcontainer\n"); + return false; + } + + if ((cookie & 0xFFFF) == serial_cookie) + num_containers_ = (cookie >> 16) + 1; + else { + readbytes += sizeof(cuda::std::uint32_t); + if (readbytes > length) { + // printf("readbytes is greater than length\n"); + return false; + } + cuda::std::memcpy(&num_containers_, buf, sizeof(cuda::std::uint32_t)); + buf += sizeof(cuda::std::uint32_t); + } + if (num_containers_ < 0) { + // printf("num_containers_ is less than 0\n"); + return false; + } + if (num_containers_ > (1 << 16)) { + // printf("num_containers_ is greater than 65536\n"); + return false; + } + // printf("num_containers_: %d\n", num_containers_); + + has_run_ = (cookie & 0xFFFF) == serial_cookie; + if (has_run_) { + cuda::std::size_t s = (num_containers_ + 7) / 8; + readbytes += s; + if (readbytes > length) { + // printf("readbytes is greater than length\n"); + return false; + } + 
run_container_bitmap_ = reinterpret_cast(buf); + buf += s; + } + // printf("has_run: %d\n", has_run_); + + key_cards_ = reinterpret_cast(buf); + readbytes += num_containers_ * 2 * sizeof(cuda::std::uint16_t); + if (readbytes > length) { + // printf("readbytes is greater than length\n"); + return false; + } + buf += num_containers_ * 2 * sizeof(cuda::std::uint16_t); + + if ((!has_run_) || (num_containers_ >= no_offset_threshold)) { + readbytes += num_containers_ * 4; + if (readbytes > length) { + // printf("readbytes is greater than length\n"); + return false; + } + offsets_ = buf; + buf += num_containers_ * 4; + } + + readbytes += num_containers_ * 4; + if (readbytes > length) { + // printf("readbytes is greater than length\n"); + return false; + } + + size_ = 0; + for (cuda::std::int32_t i = 0; i < num_containers_; i++) { + // cuda::std::uint16_t key = key_cards_[i * 2]; + cuda::std::uint32_t card = key_cards_[i * 2 + 1] + 1; + size_ += card; + // printf("key: %d, card: %d\n", key, card); + } + + return true; + } + + cuda::std::span data_; + cuda::std::size_t size_; + cuda::std::int32_t num_containers_; + cuda::std::uint8_t const* run_container_bitmap_; + cuda::std::uint16_t const* key_cards_; + cuda::std::byte const* offsets_; + bool has_run_; +}; + +template +class roaring_bitmap_impl { + using bucket_type = roaring_bitmap_impl; + // TODO implement +}; + +} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl new file mode 100644 index 000000000..b66ea9e31 --- /dev/null +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2025 NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include +#include + +namespace cuco { + +template +__host__ roaring_bitmap_ref::roaring_bitmap_ref( + cuda::std::span compressed_bitmap_h, + cuda::std::span compressed_bitmap_d, + cuda_thread_scope scope) + : impl_{compressed_bitmap_h, compressed_bitmap_d, scope} +{ +} + +template +__device__ roaring_bitmap_ref::roaring_bitmap_ref( + cuda::std::span compressed_bitmap, cuda_thread_scope scope) + : impl_{compressed_bitmap, scope} +{ +} + +template +template +__host__ void roaring_bitmap_ref::contains(InputIt first, + InputIt last, + OutputIt output, + cuda::stream_ref stream) const +{ + impl_.contains(first, last, output, stream); +} + +template +template +__host__ void roaring_bitmap_ref::contains_async(InputIt first, + InputIt last, + OutputIt output, + cuda::stream_ref stream) const noexcept +{ + impl_.contains_async(first, last, output, stream); +} + +template +__device__ bool roaring_bitmap_ref::contains(T value) const +{ + return impl_.contains(value); +} + +template +__host__ __device__ cuda::std::size_t roaring_bitmap_ref::size() const noexcept +{ + return impl_.size(); +} + +template +__host__ __device__ cuda::std::span roaring_bitmap_ref::data() + const noexcept +{ + return impl_.data(); +} +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh new file mode 100644 index 000000000..b850431a7 --- /dev/null +++ b/include/cuco/roaring_bitmap.cuh @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2025 NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace cuco { + +template > +class roaring_bitmap { + public: + static constexpr auto thread_scope = Scope; + + using allocator_type = Allocator; + + template + using ref_type = roaring_bitmap_ref; + + __host__ roaring_bitmap(cuda::std::span compressed_bitmap, + cuda_thread_scope scope = {}, + Allocator const& alloc = {}, + cuda::stream_ref stream = {}); + + roaring_bitmap(roaring_bitmap const& other) = default; + roaring_bitmap(roaring_bitmap&& other) = default; + roaring_bitmap& operator=(roaring_bitmap const& other) = default; + roaring_bitmap& operator=(roaring_bitmap&& other) = default; + + ~roaring_bitmap() = default; + + template + __host__ void contains(InputIt first, + InputIt last, + OutputIt contained, + cuda::stream_ref stream = {}) const; + + template + __host__ void contains_async(InputIt first, + InputIt last, + OutputIt contained, + cuda::stream_ref stream = {}) const noexcept; + + // TODO contains_if, contains_if_async, empty + + [[nodiscard]] __host__ cuda::std::size_t size() const noexcept; + + [[nodiscard]] __host__ cuda::std::span data() const noexcept; + + [[nodiscard]] __host__ allocator_type allocator() const noexcept; + + [[nodiscard]] __host__ ref_type<> ref() const noexcept; + + private: + allocator_type allocator_; + std::unique_ptr> data_; + ref_type<> ref_; +}; + +} // 
namespace cuco + +#include \ No newline at end of file diff --git a/include/cuco/roaring_bitmap_ref.cuh b/include/cuco/roaring_bitmap_ref.cuh new file mode 100644 index 000000000..a26474cd9 --- /dev/null +++ b/include/cuco/roaring_bitmap_ref.cuh @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2025 NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include +#include + +namespace cuco { + +template +class roaring_bitmap_ref { + using impl_type = detail::roaring_bitmap_impl; + + public: + static constexpr auto thread_scope = impl_type::thread_scope; + + // This is tricky as it is not clear if compressed_bitmap resides in host or device memory. 
+ __host__ roaring_bitmap_ref(cuda::std::span compressed_bitmap_h, + cuda::std::span compressed_bitmap_d, + cuda_thread_scope scope = {}); + + __device__ roaring_bitmap_ref(cuda::std::span compressed_bitmap, + cuda_thread_scope scope = {}); + + template + __host__ void contains(InputIt first, + InputIt last, + OutputIt contained, + cuda::stream_ref stream = {}) const; + + template + __host__ void contains_async(InputIt first, + InputIt last, + OutputIt contained, + cuda::stream_ref stream = {}) const noexcept; + + __device__ bool contains(T value) const; + + [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept; + + [[nodiscard]] __host__ __device__ cuda::std::span data() const noexcept; + + private: + impl_type impl_; +}; + +} // namespace cuco + +#include \ No newline at end of file From 7ff8399905de735b0e9219d89a1a9a5294c8e5e4 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 4 Jul 2025 17:10:42 -0700 Subject: [PATCH 02/24] Preliminary benchmark --- benchmarks/CMakeLists.txt | 5 ++ benchmarks/roaring_bitmap/contains_bench.cu | 94 +++++++++++++++++++++ examples/CMakeLists.txt | 28 +++--- 3 files changed, 113 insertions(+), 14 deletions(-) create mode 100644 benchmarks/roaring_bitmap/contains_bench.cu diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 17b5b21c1..ebab3e888 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -100,3 +100,8 @@ ConfigureBench(HYPERLOGLOG_BENCH ConfigureBench(BLOOM_FILTER_BENCH bloom_filter/add_bench.cu bloom_filter/contains_bench.cu) + +################################################################################################### +# - roaring_bitmap benchmarks --------------------------------------------------------------------- +ConfigureBench(ROARING_BITMAP_BENCH + roaring_bitmap/contains_bench.cu) \ No newline at end of file diff --git a/benchmarks/roaring_bitmap/contains_bench.cu 
b/benchmarks/roaring_bitmap/contains_bench.cu new file mode 100644 index 000000000..f8a0fdcfd --- /dev/null +++ b/benchmarks/roaring_bitmap/contains_bench.cu @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include + +#include + +#include + +#include + +#include +#include +#include +#include + +void roaring_bitmap_contains(nvbench::state& state) +{ + namespace fs = std::filesystem; + + // Get the path of the current source file + fs::path source_file_path = __FILE__; + fs::path source_dir = source_file_path.parent_path(); + + fs::path path = source_dir / "../../examples/roaring_bitmap/bitmapwithoutruns.bin"; + fs::path full_path = path.lexically_normal(); + + // Open file + std::ifstream file(full_path, std::ios::binary); + if (!file.is_open()) { state.skip("Failed to open bitmap file"); } + + // Get file size + file.seekg(0, std::ios::end); + std::streamsize file_size = file.tellg(); + file.seekg(0, std::ios::beg); + + // Allocate pinned host memory using cudaMallocHost + char* buffer; + CUCO_CUDA_TRY(cudaMallocHost(&buffer, file_size)); + + // Read file into memory + file.read(buffer, file_size); + file.close(); + + cuda::std::span bitmap(reinterpret_cast(buffer), + file_size); + cuco::roaring_bitmap roaring_bitmap(bitmap); + + std::vector keys; + for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) { + keys.push_back(k); + } + for (cuda::std::uint32_t k = 100000; k < 
200000; ++k) { + keys.push_back(3 * k); + } + for (cuda::std::uint32_t k = 700000; k < 800000; ++k) { + keys.push_back(k); + } + + // multiply the keys for the benchmark + for (int i = 0; i < 13; i++) { + keys.insert(keys.end(), keys.begin(), keys.end()); + } + + thrust::device_vector keys_d(keys.begin(), keys.end()); + thrust::device_vector contained(keys.size(), false); + + state.add_element_count(keys.size()); + state.add_global_memory_reads(keys.size(), "InputSize"); + + state.exec([&](nvbench::launch& launch) { + roaring_bitmap.contains_async( + keys_d.begin(), keys_d.end(), contained.begin(), {launch.get_stream()}); + }); + + CUCO_CUDA_TRY(cudaFreeHost(buffer)); +} + +NVBENCH_BENCH(roaring_bitmap_contains) + .set_name("roaring_bitmap_contains") + .set_max_noise(cuco::benchmark::defaults::MAX_NOISE); \ No newline at end of file diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e2328b496..08bf51197 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -33,18 +33,18 @@ endfunction(ConfigureExample) ### Example sources ############################################################################### ################################################################################################### -# ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu") -# ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu") -# ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu") -# ConfigureExample(STATIC_SET_SHARED_MEMORY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/shared_memory_example.cu") -# ConfigureExample(STATIC_SET_MAPPING_TABLE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/mapping_table_example.cu") -# ConfigureExample(STATIC_MULTISET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multiset/host_bulk_example.cu") -# 
ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu") -# ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_ref_example.cu") -# ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") -# ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu") -# ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu") -# ConfigureExample(HYPERLOGLOG_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/host_bulk_example.cu") -# ConfigureExample(HYPERLOGLOG_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/device_ref_example.cu") -# ConfigureExample(BLOOM_FILTER_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/bloom_filter/host_bulk_example.cu") +ConfigureExample(STATIC_SET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/host_bulk_example.cu") +ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_ref_example.cu") +ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu") +ConfigureExample(STATIC_SET_SHARED_MEMORY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/shared_memory_example.cu") +ConfigureExample(STATIC_SET_MAPPING_TABLE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/mapping_table_example.cu") +ConfigureExample(STATIC_MULTISET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multiset/host_bulk_example.cu") +ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu") +ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_ref_example.cu") +ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") 
+ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu") +ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu") +ConfigureExample(HYPERLOGLOG_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/host_bulk_example.cu") +ConfigureExample(HYPERLOGLOG_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/device_ref_example.cu") +ConfigureExample(BLOOM_FILTER_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/bloom_filter/host_bulk_example.cu") ConfigureExample(ROARING_BITMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/roaring_bitmap/host_bulk_example.cu") From 77a4c1d387f4430941303e8bfdef4e84b78f1942 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 4 Jul 2025 18:19:44 -0700 Subject: [PATCH 03/24] Optimizations --- benchmarks/roaring_bitmap/contains_bench.cu | 6 +- .../roaring_bitmap/roaring_bitmap_impl.cuh | 87 ++++++++++++++----- 2 files changed, 69 insertions(+), 24 deletions(-) diff --git a/benchmarks/roaring_bitmap/contains_bench.cu b/benchmarks/roaring_bitmap/contains_bench.cu index f8a0fdcfd..443136315 100644 --- a/benchmarks/roaring_bitmap/contains_bench.cu +++ b/benchmarks/roaring_bitmap/contains_bench.cu @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #include #include @@ -38,7 +39,6 @@ void roaring_bitmap_contains(nvbench::state& state) fs::path path = source_dir / "../../examples/roaring_bitmap/bitmapwithoutruns.bin"; fs::path full_path = path.lexically_normal(); - // Open file std::ifstream file(full_path, std::ios::binary); if (!file.is_open()) { state.skip("Failed to open bitmap file"); } @@ -47,11 +47,9 @@ void roaring_bitmap_contains(nvbench::state& state) std::streamsize file_size = file.tellg(); file.seekg(0, std::ios::beg); - // Allocate pinned host memory using cudaMallocHost char* buffer; CUCO_CUDA_TRY(cudaMallocHost(&buffer, file_size)); - // Read file into memory file.read(buffer, file_size); file.close(); @@ -70,7 +68,7 @@ void roaring_bitmap_contains(nvbench::state& state) keys.push_back(k); } - // multiply the keys for the benchmark + // multiply the keys for more accurate benchmark numbers for (int i = 0; i < 13; i++) { keys.insert(keys.end(), keys.begin(), keys.end()); } diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh index 248428b69..7910d485b 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #pragma once #include @@ -44,6 +45,7 @@ class roaring_bitmap_impl { static constexpr cuda::std::uint32_t serial_cookie = 12347; static constexpr cuda::std::uint32_t frozen_cookie = 13766; static constexpr cuda::std::int32_t no_offset_threshold = 4; + static constexpr cuda::std::uint32_t binary_search_threshold = 8; // TODO determine optimal value public: static constexpr auto thread_scope = Scope; @@ -99,20 +101,25 @@ class roaring_bitmap_impl { cuda::std::uint16_t upper = value >> 16; cuda::std::uint16_t lower = value & 0xFFFF; - // TODO binary search on key_cards_ - for (cuda::std::int32_t i = 0; i < num_containers_; i++) { - if (key_cards_[i * 2] == upper) { - cuda::std::uint32_t card = key_cards_[i * 2 + 1] + 1; - cuda::std::uint16_t const* container = - reinterpret_cast(data_.data() + this->container_offset(i)); - if (this->is_run_container(i)) { - return this->contains_run_container(container, lower, card); + // Binary search on key_cards_ to find container with matching upper key + cuda::std::uint32_t left = 0; + cuda::std::uint32_t right = num_containers_; + + if (num_containers_ < binary_search_threshold) { + for (cuda::std::uint32_t i = 0; i < num_containers_; i++) { + if (key_cards_[i * 2] == upper) { return this->contains_container(lower, i); } + } + } else { + while (left < right) { + cuda::std::uint32_t mid = left + (right - left) / 2; + cuda::std::uint16_t mid_key = key_cards_[mid * 2]; + + if (mid_key == upper) { + return this->contains_container(lower, mid); + } else if (mid_key < upper) { + left = mid + 1; } else { - if (card <= 4096) { // TODO check if this is correct - return this->contains_array_container(container, lower, card); - } else { - return this->contains_bitset_container(container, lower, card); - } + right = mid; } } } @@ -135,16 +142,48 @@ class roaring_bitmap_impl { return run_container_bitmap_[i / 8] & (1 << (i % 8)); } + __device__ bool contains_container(cuda::std::uint16_t lower, cuda::std::uint32_t index) const + { + 
cuda::std::uint32_t card = key_cards_[index * 2 + 1] + 1; + cuda::std::uint16_t const* container = + reinterpret_cast(data_.data() + this->container_offset(index)); + if (this->is_run_container(index)) { + return this->contains_run_container(container, lower, card); + } else { + if (card <= 4096) { // TODO check if this is correct + return this->contains_array_container(container, lower, card); + } else { + return this->contains_bitset_container(container, lower, card); + } + } + } + __device__ bool contains_array_container(cuda::std::uint16_t const* container, cuda::std::uint16_t lower, cuda::std::uint32_t card) const { - // TODO binary search on container - // if (card < 256) -> linear search - for (cuda::std::uint32_t i = 0; i < card; i++) { - if (container[i] == lower) { return true; } + // Use linear search for small arrays, binary search for larger ones + if (card < binary_search_threshold) { + for (cuda::std::uint32_t i = 0; i < card; i++) { + if (container[i] == lower) { return true; } + } + return false; + } else { + cuda::std::uint32_t left = 0; + cuda::std::uint32_t right = card; + + while (left < right) { + cuda::std::uint32_t mid = left + (right - left) / 2; + if (container[mid] == lower) { + return true; + } else if (container[mid] < lower) { + left = mid + 1; + } else { + right = mid; + } + } + return false; } - return false; } __device__ bool contains_bitset_container(cuda::std::uint16_t const* container, @@ -166,8 +205,13 @@ class roaring_bitmap_impl { __device__ cuda::std::uint32_t container_offset(cuda::std::int32_t i) const { cuda::std::uint32_t offset; - cuda::std::memcpy( - &offset, offsets_ + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t)); + if (offsets_aligned_) { + offset = + *reinterpret_cast(offsets_ + i * sizeof(cuda::std::uint32_t)); + } else { + cuda::std::memcpy( + &offset, offsets_ + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t)); + } return offset; } @@ -241,6 +285,8 @@ class roaring_bitmap_impl { 
return false; } offsets_ = buf; + offsets_aligned_ = + (reinterpret_cast(offsets_) % sizeof(cuda::std::uint32_t)) == 0; buf += num_containers_ * 4; } @@ -267,6 +313,7 @@ class roaring_bitmap_impl { cuda::std::uint8_t const* run_container_bitmap_; cuda::std::uint16_t const* key_cards_; cuda::std::byte const* offsets_; + bool offsets_aligned_; bool has_run_; }; From 142ac06b758a44fe935bdc2a79cff63a6ad2b8b3 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 7 Jul 2025 17:54:35 -0700 Subject: [PATCH 04/24] v2 --- benchmarks/roaring_bitmap/contains_bench.cu | 5 +- examples/roaring_bitmap/host_bulk_example.cu | 14 +- .../detail/roaring_bitmap/roaring_bitmap.inl | 29 +- .../roaring_bitmap/roaring_bitmap_impl.cuh | 256 ++++++++++-------- .../roaring_bitmap/roaring_bitmap_ref.inl | 24 +- include/cuco/roaring_bitmap.cuh | 3 +- include/cuco/roaring_bitmap_ref.cuh | 13 +- 7 files changed, 190 insertions(+), 154 deletions(-) diff --git a/benchmarks/roaring_bitmap/contains_bench.cu b/benchmarks/roaring_bitmap/contains_bench.cu index 443136315..2f727c541 100644 --- a/benchmarks/roaring_bitmap/contains_bench.cu +++ b/benchmarks/roaring_bitmap/contains_bench.cu @@ -53,9 +53,8 @@ void roaring_bitmap_contains(nvbench::state& state) file.read(buffer, file_size); file.close(); - cuda::std::span bitmap(reinterpret_cast(buffer), - file_size); - cuco::roaring_bitmap roaring_bitmap(bitmap); + cuco::roaring_bitmap roaring_bitmap( + reinterpret_cast(buffer)); std::vector keys; for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) { diff --git a/examples/roaring_bitmap/host_bulk_example.cu b/examples/roaring_bitmap/host_bulk_example.cu index 85870f74b..bbbbe6005 100644 --- a/examples/roaring_bitmap/host_bulk_example.cu +++ b/examples/roaring_bitmap/host_bulk_example.cu @@ -38,9 +38,8 @@ int main(int argc, char* argv[]) file.read(buffer, file_size); file.close(); - cuda::std::span bitmap(reinterpret_cast(buffer), - file_size); - 
cuco::roaring_bitmap roaring_bitmap(bitmap); + cuco::roaring_bitmap roaring_bitmap( + reinterpret_cast(buffer)); std::vector keys; for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) { @@ -58,13 +57,18 @@ int main(int argc, char* argv[]) roaring_bitmap.contains(keys_d.begin(), keys_d.end(), contained.begin()); + size_t num_errors = 0; for (size_t i = 0; i < keys.size(); i++) { if (not contained[i]) { - std::cout << "Error: " << keys_d[i] << " is not contained" << std::endl; + if (num_errors <= 10) { + std::cout << "Error: " << keys_d[i] << " is not contained" << std::endl; + } + num_errors++; } } + if (num_errors > 0) { std::cout << "num_errors: " << num_errors << std::endl; } - // check if all elements are contained + // check if all elements are contained and written to output bool all_contained = thrust::all_of(contained.begin(), contained.end(), ::cuda::std::identity{}); std::cout << "all_contained: " << all_contained << std::endl; diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl index 3a17a82d0..efb64c448 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl @@ -26,25 +26,20 @@ namespace cuco { template -__host__ roaring_bitmap::roaring_bitmap( - cuda::std::span compressed_bitmap, - cuda_thread_scope scope, - Allocator const& alloc, - cuda::stream_ref stream) +__host__ roaring_bitmap::roaring_bitmap(cuda::std::byte const* bitmap, + cuda_thread_scope scope, + Allocator const& alloc, + cuda::stream_ref stream) : allocator_{alloc}, - data_{allocator_.allocate(compressed_bitmap.size()), - detail::custom_deleter{compressed_bitmap.size(), - allocator_}}, - ref_{compressed_bitmap, - cuda::std::span(data_.get(), compressed_bitmap.size()), - scope} // TODO move after memcpy? 
+ metadata_{ref_type<>::read_metadata(bitmap)}, + data_{ + allocator_.allocate(metadata_.size_bytes), + detail::custom_deleter{metadata_.size_bytes, allocator_}}, + ref_{data_.get(), metadata_, scope} { - CUCO_CUDA_TRY(cudaMemcpyAsync(data_.get(), - compressed_bitmap.data(), - compressed_bitmap.size(), - cudaMemcpyHostToDevice, - stream.get())); - stream.wait(); // TODO check if this is necessary + CUCO_CUDA_TRY(cudaMemcpyAsync( + data_.get(), bitmap, metadata_.size_bytes, cudaMemcpyHostToDevice, stream.get())); + // stream.wait(); // TODO check if this is necessary } template diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh index 7910d485b..8dc6fe633 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh @@ -30,8 +30,30 @@ #include #include +#include + namespace cuco::detail { +template +struct roaring_bitmap_metadata { + static_assert(cuco::dependent_false, "T must be either uint32_t or uint64_t"); +}; + +template <> +struct roaring_bitmap_metadata { + cuda::std::size_t size_bytes = 0; + cuda::std::size_t num_keys = 0; + cuda::std::size_t run_container_bitmap = 0; + cuda::std::size_t key_cards = 0; + cuda::std::size_t container_offsets = 0; + cuda::std::int32_t num_containers = 0; + bool has_run = false; + bool offsets_aligned = false; + bool valid = false; +}; + +// TODO implement roaring_bitmap_metadata + // primary template template class roaring_bitmap_impl { @@ -48,22 +70,33 @@ class roaring_bitmap_impl { static constexpr cuda::std::uint32_t binary_search_threshold = 8; // TODO determine optimal value public: + using metadata_type = roaring_bitmap_metadata; static constexpr auto thread_scope = Scope; - __host__ roaring_bitmap_impl(cuda::std::span compressed_bitmap_h, - cuda::std::span compressed_bitmap_d, - cuda_thread_scope /* scope */) - : data_{compressed_bitmap_d} + __host__ __device__ 
roaring_bitmap_impl(cuda::std::byte const* bitmap, + metadata_type metadata, + cuda_thread_scope /* scope */) { - bool success = this->read_header(compressed_bitmap_h); - CUCO_EXPECTS(success, "Failed to read compressed bitmap"); + NV_IF_TARGET( + NV_IS_HOST, + CUCO_EXPECTS(metadata.valid, "Invalid bitmap format");) // TODO device error handling + + if (metadata.valid) { + data_ = cuda::std::span{bitmap, metadata.size_bytes}; + size_ = metadata.num_keys; + num_containers_ = metadata.num_containers; + run_container_bitmap_ = + reinterpret_cast(bitmap + metadata.run_container_bitmap); + key_cards_ = reinterpret_cast(bitmap + metadata.key_cards); + offsets_ = reinterpret_cast(bitmap + metadata.container_offsets); + offsets_aligned_ = metadata.offsets_aligned; + has_run_ = metadata.has_run; + } } - __device__ roaring_bitmap_impl(cuda::std::span compressed_bitmap, - cuda_thread_scope /* scope */) - : data_{compressed_bitmap} + __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap, cuda_thread_scope scope) + : roaring_bitmap_impl(bitmap, read_metadata(bitmap), scope) { - this->read_header(compressed_bitmap); // TODO error handling } template @@ -135,19 +168,102 @@ class roaring_bitmap_impl { return data_; } + __host__ __device__ static metadata_type const read_metadata( + cuda::std::byte const* bitmap) noexcept + { + cuda::std::byte const* buf = bitmap; + metadata_type metadata; + + cuda::std::uint32_t cookie; + cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t)); + buf += sizeof(cuda::std::uint32_t); + if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) { + metadata.valid = false; + return metadata; + } + + if ((cookie & 0xFFFF) == serial_cookie) + metadata.num_containers = (cookie >> 16) + 1; + else { + cuda::std::memcpy(&metadata.num_containers, buf, sizeof(cuda::std::uint32_t)); + buf += sizeof(cuda::std::uint32_t); + } + if (metadata.num_containers < 0) { + metadata.valid = false; + return metadata; + } + if 
(metadata.num_containers > (1 << 16)) { + metadata.valid = false; + return metadata; + } + + metadata.has_run = (cookie & 0xFFFF) == serial_cookie; + if (metadata.has_run) { + metadata.valid = false; + return metadata; // TODO run container bitmap is not supported yet + cuda::std::size_t s = (metadata.num_containers + 7) / 8; + metadata.run_container_bitmap = cuda::std::distance(bitmap, buf); + buf += s; + } + + metadata.key_cards = cuda::std::distance(bitmap, buf); + buf += metadata.num_containers * 2 * sizeof(cuda::std::uint16_t); + + if ((!metadata.has_run) || (metadata.num_containers >= no_offset_threshold)) { + metadata.container_offsets = cuda::std::distance(bitmap, buf); + metadata.offsets_aligned = + (reinterpret_cast(bitmap + metadata.container_offsets) % + sizeof(cuda::std::uint32_t)) == 0; + buf += metadata.num_containers * 4; + } + + metadata.num_keys = 0; + cuda::std::uint16_t const* key_cards = + reinterpret_cast(bitmap + metadata.key_cards); + cuda::std::uint32_t card = 0; + for (cuda::std::int32_t i = 0; i < metadata.num_containers; i++) { + // cuda::std::uint16_t key = key_cards[i * 2]; + card = key_cards[i * 2 + 1] + 1; + metadata.num_keys += card; + } + + // find end of roaring bitmap + cuda::std::byte const* end = bitmap + container_offset(bitmap + metadata.container_offsets, + metadata.offsets_aligned, + metadata.num_containers - 1); + if (is_run_container( + reinterpret_cast(bitmap + metadata.run_container_bitmap), + metadata.has_run, + metadata.num_containers - 1)) { + // TODO implement + } else { + if (card <= 4096) { // TODO check if this is correct + end += card * sizeof(cuda::std::uint16_t); + } else { + end += 8192; // fixed size bitset container + } + } + + metadata.size_bytes = static_cast(cuda::std::distance(bitmap, end)); + metadata.valid = true; + return metadata; + } + private: - __device__ bool is_run_container(cuda::std::int32_t i) const + __host__ __device__ static bool is_run_container(cuda::std::uint8_t const* 
run_container_bitmap, + bool has_run, + cuda::std::int32_t i) { - if (not has_run_) return false; - return run_container_bitmap_[i / 8] & (1 << (i % 8)); + if (not has_run) return false; + return run_container_bitmap[i / 8] & (1 << (i % 8)); } __device__ bool contains_container(cuda::std::uint16_t lower, cuda::std::uint32_t index) const { - cuda::std::uint32_t card = key_cards_[index * 2 + 1] + 1; - cuda::std::uint16_t const* container = - reinterpret_cast(data_.data() + this->container_offset(index)); - if (this->is_run_container(index)) { + cuda::std::uint32_t card = key_cards_[index * 2 + 1] + 1; + cuda::std::uint16_t const* container = reinterpret_cast( + data_.data() + container_offset(offsets_, offsets_aligned_, index)); + if (is_run_container(run_container_bitmap_, has_run_, index)) { return this->contains_run_container(container, lower, card); } else { if (card <= 4096) { // TODO check if this is correct @@ -202,116 +318,26 @@ class roaring_bitmap_impl { return false; } - __device__ cuda::std::uint32_t container_offset(cuda::std::int32_t i) const + __host__ __device__ static cuda::std::uint32_t container_offset(cuda::std::byte const* offsets, + bool offsets_aligned, + cuda::std::int32_t i) { - cuda::std::uint32_t offset; - if (offsets_aligned_) { + cuda::std::uint32_t offset = 0; + if (offsets_aligned) { offset = - *reinterpret_cast(offsets_ + i * sizeof(cuda::std::uint32_t)); + *reinterpret_cast(offsets + i * sizeof(cuda::std::uint32_t)); } else { cuda::std::memcpy( - &offset, offsets_ + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t)); + &offset, offsets + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t)); } return offset; } - __host__ __device__ bool read_header(cuda::std::span compressed_bitmap) - { - cuda::std::size_t length = compressed_bitmap.size(); - cuda::std::byte const* buf = compressed_bitmap.data(); - [[maybe_unused]] cuda::std::size_t readbytes = 0; - - // cookie and num_containers - if (length < 4) { - // 
printf("length is less than 4\n"); - return false; - } - - cuda::std::uint32_t cookie; - cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t)); - readbytes += sizeof(cuda::std::uint32_t); - buf += sizeof(cuda::std::uint32_t); - if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) { - // printf("cookie is not serial cookie or serial cookie no runcontainer\n"); - return false; - } - - if ((cookie & 0xFFFF) == serial_cookie) - num_containers_ = (cookie >> 16) + 1; - else { - readbytes += sizeof(cuda::std::uint32_t); - if (readbytes > length) { - // printf("readbytes is greater than length\n"); - return false; - } - cuda::std::memcpy(&num_containers_, buf, sizeof(cuda::std::uint32_t)); - buf += sizeof(cuda::std::uint32_t); - } - if (num_containers_ < 0) { - // printf("num_containers_ is less than 0\n"); - return false; - } - if (num_containers_ > (1 << 16)) { - // printf("num_containers_ is greater than 65536\n"); - return false; - } - // printf("num_containers_: %d\n", num_containers_); - - has_run_ = (cookie & 0xFFFF) == serial_cookie; - if (has_run_) { - cuda::std::size_t s = (num_containers_ + 7) / 8; - readbytes += s; - if (readbytes > length) { - // printf("readbytes is greater than length\n"); - return false; - } - run_container_bitmap_ = reinterpret_cast(buf); - buf += s; - } - // printf("has_run: %d\n", has_run_); - - key_cards_ = reinterpret_cast(buf); - readbytes += num_containers_ * 2 * sizeof(cuda::std::uint16_t); - if (readbytes > length) { - // printf("readbytes is greater than length\n"); - return false; - } - buf += num_containers_ * 2 * sizeof(cuda::std::uint16_t); - - if ((!has_run_) || (num_containers_ >= no_offset_threshold)) { - readbytes += num_containers_ * 4; - if (readbytes > length) { - // printf("readbytes is greater than length\n"); - return false; - } - offsets_ = buf; - offsets_aligned_ = - (reinterpret_cast(offsets_) % sizeof(cuda::std::uint32_t)) == 0; - buf += num_containers_ * 4; - } - - readbytes 
+= num_containers_ * 4; - if (readbytes > length) { - // printf("readbytes is greater than length\n"); - return false; - } - - size_ = 0; - for (cuda::std::int32_t i = 0; i < num_containers_; i++) { - // cuda::std::uint16_t key = key_cards_[i * 2]; - cuda::std::uint32_t card = key_cards_[i * 2 + 1] + 1; - size_ += card; - // printf("key: %d, card: %d\n", key, card); - } - - return true; - } - cuda::std::span data_; cuda::std::size_t size_; cuda::std::int32_t num_containers_; cuda::std::uint8_t const* run_container_bitmap_; - cuda::std::uint16_t const* key_cards_; + cuda::std::uint16_t const* key_cards_; // TODO uint8? cuda::std::byte const* offsets_; bool offsets_aligned_; bool has_run_; diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl index b66ea9e31..08465f215 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl @@ -15,6 +15,7 @@ */ #pragma once +#include #include #include @@ -24,18 +25,17 @@ namespace cuco { template -__host__ roaring_bitmap_ref::roaring_bitmap_ref( - cuda::std::span compressed_bitmap_h, - cuda::std::span compressed_bitmap_d, - cuda_thread_scope scope) - : impl_{compressed_bitmap_h, compressed_bitmap_d, scope} +__host__ __device__ roaring_bitmap_ref::roaring_bitmap_ref(cuda::std::byte const* bitmap, + metadata_type const metadata, + cuda_thread_scope scope) + : impl_{bitmap, metadata, scope} { } template -__device__ roaring_bitmap_ref::roaring_bitmap_ref( - cuda::std::span compressed_bitmap, cuda_thread_scope scope) - : impl_{compressed_bitmap, scope} +__device__ roaring_bitmap_ref::roaring_bitmap_ref(cuda::std::byte const* bitmap, + cuda_thread_scope scope) + : impl_{bitmap, scope} { } @@ -77,4 +77,12 @@ __host__ __device__ cuda::std::span roaring_bitmap_ref +__host__ __device__ typename roaring_bitmap_ref::metadata_type const +roaring_bitmap_ref::read_metadata(cuda::std::byte const* 
bitmap) noexcept +{ + return impl_type::read_metadata(bitmap); +} + } // namespace cuco \ No newline at end of file diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh index b850431a7..8b36ba880 100644 --- a/include/cuco/roaring_bitmap.cuh +++ b/include/cuco/roaring_bitmap.cuh @@ -40,7 +40,7 @@ class roaring_bitmap { template using ref_type = roaring_bitmap_ref; - __host__ roaring_bitmap(cuda::std::span compressed_bitmap, + __host__ roaring_bitmap(cuda::std::byte const* bitmap, cuda_thread_scope scope = {}, Allocator const& alloc = {}, cuda::stream_ref stream = {}); @@ -76,6 +76,7 @@ class roaring_bitmap { private: allocator_type allocator_; + typename ref_type<>::metadata_type metadata_; std::unique_ptr> data_; ref_type<> ref_; }; diff --git a/include/cuco/roaring_bitmap_ref.cuh b/include/cuco/roaring_bitmap_ref.cuh index a26474cd9..d06757fe2 100644 --- a/include/cuco/roaring_bitmap_ref.cuh +++ b/include/cuco/roaring_bitmap_ref.cuh @@ -29,15 +29,15 @@ class roaring_bitmap_ref { using impl_type = detail::roaring_bitmap_impl; public: + using metadata_type = typename impl_type::metadata_type; static constexpr auto thread_scope = impl_type::thread_scope; // This is tricky as it is not clear if compressed_bitmap resides in host or device memory. 
- __host__ roaring_bitmap_ref(cuda::std::span compressed_bitmap_h, - cuda::std::span compressed_bitmap_d, - cuda_thread_scope scope = {}); + __host__ __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap, + metadata_type const metadata, + cuda_thread_scope scope = {}); - __device__ roaring_bitmap_ref(cuda::std::span compressed_bitmap, - cuda_thread_scope scope = {}); + __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap, cuda_thread_scope scope = {}); template __host__ void contains(InputIt first, @@ -57,6 +57,9 @@ class roaring_bitmap_ref { [[nodiscard]] __host__ __device__ cuda::std::span data() const noexcept; + [[nodiscard]] __host__ __device__ static metadata_type const read_metadata( + cuda::std::byte const* bitmap) noexcept; + private: impl_type impl_; }; From 5977bead161c47f9f5e19d54bfee5ac8f5ba7239 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 8 Jul 2025 17:34:00 -0700 Subject: [PATCH 05/24] Get rid of span and scope --- .../detail/roaring_bitmap/roaring_bitmap.inl | 61 +++++++++-------- .../roaring_bitmap/roaring_bitmap_impl.cuh | 38 +++++------ .../roaring_bitmap/roaring_bitmap_ref.inl | 65 ++++++++++--------- include/cuco/roaring_bitmap.cuh | 26 +++----- include/cuco/roaring_bitmap_ref.cuh | 21 +++--- 5 files changed, 106 insertions(+), 105 deletions(-) diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl index efb64c448..9c36c2e90 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl @@ -17,72 +17,77 @@ #include #include -#include #include -#include +#include #include namespace cuco { -template -__host__ roaring_bitmap::roaring_bitmap(cuda::std::byte const* bitmap, - cuda_thread_scope scope, - Allocator const& alloc, - cuda::stream_ref stream) +template +__host__ roaring_bitmap::roaring_bitmap(cuda::std::byte const* bitmap, + Allocator 
const& alloc, + cuda::stream_ref stream) : allocator_{alloc}, - metadata_{ref_type<>::read_metadata(bitmap)}, + metadata_{ref_type::read_metadata(bitmap)}, data_{ allocator_.allocate(metadata_.size_bytes), detail::custom_deleter{metadata_.size_bytes, allocator_}}, - ref_{data_.get(), metadata_, scope} + ref_{data_.get(), metadata_} { CUCO_CUDA_TRY(cudaMemcpyAsync( data_.get(), bitmap, metadata_.size_bytes, cudaMemcpyHostToDevice, stream.get())); // stream.wait(); // TODO check if this is necessary } -template +template template -__host__ void roaring_bitmap::contains(InputIt first, - InputIt last, - OutputIt output, - cuda::stream_ref stream) const +__host__ void roaring_bitmap::contains(InputIt first, + InputIt last, + OutputIt output, + cuda::stream_ref stream) const { ref_.contains(first, last, output, stream); } -template +template template -__host__ void roaring_bitmap::contains_async( - InputIt first, InputIt last, OutputIt output, cuda::stream_ref stream) const noexcept +__host__ void roaring_bitmap::contains_async(InputIt first, + InputIt last, + OutputIt output, + cuda::stream_ref stream) const noexcept { ref_.contains_async(first, last, output, stream); } -template -__host__ cuda::std::size_t roaring_bitmap::size() const noexcept +template +__host__ cuda::std::size_t roaring_bitmap::size() const noexcept { return ref_.size(); } -template -__host__ cuda::std::span roaring_bitmap::data() - const noexcept +template +__host__ cuda::std::byte const* roaring_bitmap::data() const noexcept { return ref_.data(); } -template -__host__ typename roaring_bitmap::allocator_type -roaring_bitmap::allocator() const noexcept +template +__host__ cuda::std::size_t roaring_bitmap::size_bytes() const noexcept +{ + return ref_.size_bytes(); +} + +template +__host__ typename roaring_bitmap::allocator_type +roaring_bitmap::allocator() const noexcept { return allocator_; } -template -__host__ typename roaring_bitmap::ref_type<> -roaring_bitmap::ref() const noexcept +template 
+__host__ typename roaring_bitmap::ref_type roaring_bitmap::ref() + const noexcept { return ref_; } diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh index 8dc6fe633..2e527047e 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh @@ -17,14 +17,12 @@ #pragma once #include -#include #include #include #include #include #include -#include #include #include #include @@ -55,13 +53,13 @@ struct roaring_bitmap_metadata { // TODO implement roaring_bitmap_metadata // primary template -template +template class roaring_bitmap_impl { static_assert(cuco::dependent_false, "T must be either uint32_t or uint64_t"); }; -template -class roaring_bitmap_impl { +template <> +class roaring_bitmap_impl { // Constants from the Roaring format spec static constexpr cuda::std::uint32_t serial_cookie_no_runcontainer = 12346; static constexpr cuda::std::uint32_t serial_cookie = 12347; @@ -70,19 +68,18 @@ class roaring_bitmap_impl { static constexpr cuda::std::uint32_t binary_search_threshold = 8; // TODO determine optimal value public: - using metadata_type = roaring_bitmap_metadata; - static constexpr auto thread_scope = Scope; + using metadata_type = roaring_bitmap_metadata; __host__ __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap, - metadata_type metadata, - cuda_thread_scope /* scope */) + metadata_type const& metadata) { NV_IF_TARGET( NV_IS_HOST, CUCO_EXPECTS(metadata.valid, "Invalid bitmap format");) // TODO device error handling if (metadata.valid) { - data_ = cuda::std::span{bitmap, metadata.size_bytes}; + data_ = bitmap; + size_bytes_ = metadata.size_bytes; size_ = metadata.num_keys; num_containers_ = metadata.num_containers; run_container_bitmap_ = @@ -94,8 +91,8 @@ class roaring_bitmap_impl { } } - __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap, cuda_thread_scope scope) - : 
roaring_bitmap_impl(bitmap, read_metadata(bitmap), scope) + __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap) + : roaring_bitmap_impl{bitmap, read_metadata(bitmap)} { } @@ -163,9 +160,11 @@ class roaring_bitmap_impl { [[nodiscard]] __host__ __device__ bool empty() const noexcept { return size_ == 0; } - [[nodiscard]] __host__ __device__ cuda::std::span data() const noexcept + [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept { return data_; } + + [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept { - return data_; + return size_bytes_; } __host__ __device__ static metadata_type const read_metadata( @@ -262,7 +261,7 @@ class roaring_bitmap_impl { { cuda::std::uint32_t card = key_cards_[index * 2 + 1] + 1; cuda::std::uint16_t const* container = reinterpret_cast( - data_.data() + container_offset(offsets_, offsets_aligned_, index)); + data_ + container_offset(offsets_, offsets_aligned_, index)); if (is_run_container(run_container_bitmap_, has_run_, index)) { return this->contains_run_container(container, lower, card); } else { @@ -333,7 +332,8 @@ class roaring_bitmap_impl { return offset; } - cuda::std::span data_; + cuda::std::byte const* data_; + cuda::std::size_t size_bytes_; cuda::std::size_t size_; cuda::std::int32_t num_containers_; cuda::std::uint8_t const* run_container_bitmap_; @@ -343,9 +343,9 @@ class roaring_bitmap_impl { bool has_run_; }; -template -class roaring_bitmap_impl { - using bucket_type = roaring_bitmap_impl; +template <> +class roaring_bitmap_impl { + using bucket_type = roaring_bitmap_impl; // TODO implement }; diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl index 08465f215..a4f252104 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl @@ -16,71 +16,74 @@ #pragma once #include -#include #include -#include 
+#include #include namespace cuco { -template -__host__ __device__ roaring_bitmap_ref::roaring_bitmap_ref(cuda::std::byte const* bitmap, - metadata_type const metadata, - cuda_thread_scope scope) - : impl_{bitmap, metadata, scope} +template +__host__ __device__ roaring_bitmap_ref::roaring_bitmap_ref(cuda::std::byte const* bitmap, + metadata_type const& metadata) + : impl_{bitmap, metadata} { } -template -__device__ roaring_bitmap_ref::roaring_bitmap_ref(cuda::std::byte const* bitmap, - cuda_thread_scope scope) - : impl_{bitmap, scope} +template +template > */> +__device__ roaring_bitmap_ref::roaring_bitmap_ref(cuda::std::byte const* bitmap) : impl_{bitmap} { } -template +template template -__host__ void roaring_bitmap_ref::contains(InputIt first, - InputIt last, - OutputIt output, - cuda::stream_ref stream) const +__host__ void roaring_bitmap_ref::contains(InputIt first, + InputIt last, + OutputIt output, + cuda::stream_ref stream) const { impl_.contains(first, last, output, stream); } -template +template template -__host__ void roaring_bitmap_ref::contains_async(InputIt first, - InputIt last, - OutputIt output, - cuda::stream_ref stream) const noexcept +__host__ void roaring_bitmap_ref::contains_async(InputIt first, + InputIt last, + OutputIt output, + cuda::stream_ref stream) const noexcept { impl_.contains_async(first, last, output, stream); } -template -__device__ bool roaring_bitmap_ref::contains(T value) const +template +__device__ bool roaring_bitmap_ref::contains(T value) const { return impl_.contains(value); } -template -__host__ __device__ cuda::std::size_t roaring_bitmap_ref::size() const noexcept +template +__host__ __device__ cuda::std::size_t roaring_bitmap_ref::size() const noexcept { return impl_.size(); } -template -__host__ __device__ cuda::std::span roaring_bitmap_ref::data() - const noexcept +template +__host__ __device__ cuda::std::byte const* roaring_bitmap_ref::data() const noexcept { return impl_.data(); } -template -__host__ __device__ 
typename roaring_bitmap_ref::metadata_type const -roaring_bitmap_ref::read_metadata(cuda::std::byte const* bitmap) noexcept +template +__host__ __device__ cuda::std::size_t roaring_bitmap_ref::size_bytes() const noexcept +{ + return impl_.size_bytes(); +} + +template +__host__ __device__ typename roaring_bitmap_ref::metadata_type const +roaring_bitmap_ref::read_metadata(cuda::std::byte const* bitmap) noexcept { return impl_type::read_metadata(bitmap); } diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh index 8b36ba880..3f3aa071a 100644 --- a/include/cuco/roaring_bitmap.cuh +++ b/include/cuco/roaring_bitmap.cuh @@ -18,32 +18,24 @@ #include #include #include -#include #include -#include #include #include namespace cuco { -template > +template > class roaring_bitmap { public: - static constexpr auto thread_scope = Scope; - using allocator_type = Allocator; - template - using ref_type = roaring_bitmap_ref; + using ref_type = roaring_bitmap_ref; __host__ roaring_bitmap(cuda::std::byte const* bitmap, - cuda_thread_scope scope = {}, - Allocator const& alloc = {}, - cuda::stream_ref stream = {}); + Allocator const& alloc = {}, + cuda::stream_ref stream = {}); roaring_bitmap(roaring_bitmap const& other) = default; roaring_bitmap(roaring_bitmap&& other) = default; @@ -68,17 +60,19 @@ class roaring_bitmap { [[nodiscard]] __host__ cuda::std::size_t size() const noexcept; - [[nodiscard]] __host__ cuda::std::span data() const noexcept; + [[nodiscard]] __host__ cuda::std::byte const* data() const noexcept; + + [[nodiscard]] __host__ cuda::std::size_t size_bytes() const noexcept; [[nodiscard]] __host__ allocator_type allocator() const noexcept; - [[nodiscard]] __host__ ref_type<> ref() const noexcept; + [[nodiscard]] __host__ ref_type ref() const noexcept; private: allocator_type allocator_; - typename ref_type<>::metadata_type metadata_; + typename ref_type::metadata_type metadata_; std::unique_ptr> data_; - ref_type<> ref_; + ref_type ref_; }; } // 
namespace cuco diff --git a/include/cuco/roaring_bitmap_ref.cuh b/include/cuco/roaring_bitmap_ref.cuh index d06757fe2..156877c0f 100644 --- a/include/cuco/roaring_bitmap_ref.cuh +++ b/include/cuco/roaring_bitmap_ref.cuh @@ -16,28 +16,25 @@ #pragma once #include -#include #include -#include #include namespace cuco { -template +template class roaring_bitmap_ref { - using impl_type = detail::roaring_bitmap_impl; + using impl_type = detail::roaring_bitmap_impl; public: - using metadata_type = typename impl_type::metadata_type; - static constexpr auto thread_scope = impl_type::thread_scope; + using metadata_type = typename impl_type::metadata_type; - // This is tricky as it is not clear if compressed_bitmap resides in host or device memory. __host__ __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap, - metadata_type const metadata, - cuda_thread_scope scope = {}); + metadata_type const& metadata); - __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap, cuda_thread_scope scope = {}); + template >> + __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap); template __host__ void contains(InputIt first, @@ -55,7 +52,9 @@ class roaring_bitmap_ref { [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept; - [[nodiscard]] __host__ __device__ cuda::std::span data() const noexcept; + [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept; + + [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept; [[nodiscard]] __host__ __device__ static metadata_type const read_metadata( cuda::std::byte const* bitmap) noexcept; From 18acbed0c7b60ecad3f96303ab5d49f99a5ffb0c Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 8 Jul 2025 17:39:01 -0700 Subject: [PATCH 06/24] Add empty() --- .../detail/roaring_bitmap/roaring_bitmap.inl | 42 +++++++++++-------- .../roaring_bitmap/roaring_bitmap_ref.inl | 7 ++++ include/cuco/roaring_bitmap.cuh | 35 
+++++++++------- include/cuco/roaring_bitmap_ref.cuh | 7 +++- 4 files changed, 55 insertions(+), 36 deletions(-) diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl index 9c36c2e90..fcc4fbd81 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once #include @@ -25,9 +26,9 @@ namespace cuco { template -__host__ roaring_bitmap::roaring_bitmap(cuda::std::byte const* bitmap, - Allocator const& alloc, - cuda::stream_ref stream) +roaring_bitmap::roaring_bitmap(cuda::std::byte const* bitmap, + Allocator const& alloc, + cuda::stream_ref stream) : allocator_{alloc}, metadata_{ref_type::read_metadata(bitmap)}, data_{ @@ -42,52 +43,57 @@ __host__ roaring_bitmap::roaring_bitmap(cuda::std::byte const* bit template template -__host__ void roaring_bitmap::contains(InputIt first, - InputIt last, - OutputIt output, - cuda::stream_ref stream) const +void roaring_bitmap::contains(InputIt first, + InputIt last, + OutputIt output, + cuda::stream_ref stream) const { ref_.contains(first, last, output, stream); } template template -__host__ void roaring_bitmap::contains_async(InputIt first, - InputIt last, - OutputIt output, - cuda::stream_ref stream) const noexcept +void roaring_bitmap::contains_async(InputIt first, + InputIt last, + OutputIt output, + cuda::stream_ref stream) const noexcept { ref_.contains_async(first, last, output, stream); } template -__host__ cuda::std::size_t roaring_bitmap::size() const noexcept +cuda::std::size_t roaring_bitmap::size() const noexcept { return ref_.size(); } template -__host__ cuda::std::byte const* roaring_bitmap::data() const noexcept +bool roaring_bitmap::empty() const noexcept +{ + return ref_.empty(); +} + +template +cuda::std::byte const* roaring_bitmap::data() const 
noexcept { return ref_.data(); } template -__host__ cuda::std::size_t roaring_bitmap::size_bytes() const noexcept +cuda::std::size_t roaring_bitmap::size_bytes() const noexcept { return ref_.size_bytes(); } template -__host__ typename roaring_bitmap::allocator_type -roaring_bitmap::allocator() const noexcept +typename roaring_bitmap::allocator_type roaring_bitmap::allocator() + const noexcept { return allocator_; } template -__host__ typename roaring_bitmap::ref_type roaring_bitmap::ref() - const noexcept +typename roaring_bitmap::ref_type roaring_bitmap::ref() const noexcept { return ref_; } diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl index a4f252104..088e7e7b4 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once #include @@ -69,6 +70,12 @@ __host__ __device__ cuda::std::size_t roaring_bitmap_ref::size() const noexce return impl_.size(); } +template +__host__ __device__ bool roaring_bitmap_ref::empty() const noexcept +{ + return impl_.empty(); +} + template __host__ __device__ cuda::std::byte const* roaring_bitmap_ref::data() const noexcept { diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh index 3f3aa071a..d8269662b 100644 --- a/include/cuco/roaring_bitmap.cuh +++ b/include/cuco/roaring_bitmap.cuh @@ -13,6 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ + #pragma once #include @@ -33,9 +34,9 @@ class roaring_bitmap { using ref_type = roaring_bitmap_ref; - __host__ roaring_bitmap(cuda::std::byte const* bitmap, - Allocator const& alloc = {}, - cuda::stream_ref stream = {}); + roaring_bitmap(cuda::std::byte const* bitmap, + Allocator const& alloc = {}, + cuda::stream_ref stream = {}); roaring_bitmap(roaring_bitmap const& other) = default; roaring_bitmap(roaring_bitmap&& other) = default; @@ -45,28 +46,30 @@ class roaring_bitmap { ~roaring_bitmap() = default; template - __host__ void contains(InputIt first, - InputIt last, - OutputIt contained, - cuda::stream_ref stream = {}) const; + void contains(InputIt first, + InputIt last, + OutputIt contained, + cuda::stream_ref stream = {}) const; template - __host__ void contains_async(InputIt first, - InputIt last, - OutputIt contained, - cuda::stream_ref stream = {}) const noexcept; + void contains_async(InputIt first, + InputIt last, + OutputIt contained, + cuda::stream_ref stream = {}) const noexcept; // TODO contains_if, contains_if_async, empty - [[nodiscard]] __host__ cuda::std::size_t size() const noexcept; + [[nodiscard]] cuda::std::size_t size() const noexcept; + + [[nodiscard]] bool empty() const noexcept; - [[nodiscard]] __host__ cuda::std::byte const* data() const noexcept; + [[nodiscard]] cuda::std::byte const* data() const noexcept; - [[nodiscard]] __host__ cuda::std::size_t size_bytes() const noexcept; + [[nodiscard]] cuda::std::size_t size_bytes() const noexcept; - [[nodiscard]] __host__ allocator_type allocator() const noexcept; + [[nodiscard]] allocator_type allocator() const noexcept; - [[nodiscard]] __host__ ref_type ref() const noexcept; + [[nodiscard]] ref_type ref() const noexcept; private: allocator_type allocator_; diff --git a/include/cuco/roaring_bitmap_ref.cuh b/include/cuco/roaring_bitmap_ref.cuh index 156877c0f..4e9939c05 100644 --- a/include/cuco/roaring_bitmap_ref.cuh +++ b/include/cuco/roaring_bitmap_ref.cuh @@ -13,6 +13,7 @@ * See the 
License for the specific language governing permissions and * limitations under the License. */ + #pragma once #include @@ -32,8 +33,8 @@ class roaring_bitmap_ref { __host__ __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap, metadata_type const& metadata); - template >> + template >> __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap); template @@ -52,6 +53,8 @@ class roaring_bitmap_ref { [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept; + [[nodiscard]] __host__ __device__ bool empty() const noexcept; + [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept; [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept; From 64bf0f3b36216845bf17be3ac18d025184ee6fad Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 9 Jul 2025 16:05:01 -0700 Subject: [PATCH 07/24] Add storage class --- .../detail/roaring_bitmap/roaring_bitmap.inl | 16 +- .../roaring_bitmap/roaring_bitmap_impl.cuh | 190 +++--------------- .../roaring_bitmap/roaring_bitmap_ref.inl | 12 +- .../roaring_bitmap/roaring_bitmap_storage.cuh | 107 ++++++++++ include/cuco/detail/roaring_bitmap/util.cuh | 160 +++++++++++++++ include/cuco/roaring_bitmap.cuh | 14 +- include/cuco/roaring_bitmap_ref.cuh | 8 +- 7 files changed, 309 insertions(+), 198 deletions(-) create mode 100644 include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh create mode 100644 include/cuco/detail/roaring_bitmap/util.cuh diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl index fcc4fbd81..964f74f45 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl @@ -16,11 +16,7 @@ #pragma once -#include -#include - #include -#include #include namespace cuco { @@ -29,16 +25,8 @@ template roaring_bitmap::roaring_bitmap(cuda::std::byte const* bitmap, Allocator const& 
alloc, cuda::stream_ref stream) - : allocator_{alloc}, - metadata_{ref_type::read_metadata(bitmap)}, - data_{ - allocator_.allocate(metadata_.size_bytes), - detail::custom_deleter{metadata_.size_bytes, allocator_}}, - ref_{data_.get(), metadata_} + : storage_{bitmap, alloc, stream}, ref_{storage_.ref()} { - CUCO_CUDA_TRY(cudaMemcpyAsync( - data_.get(), bitmap, metadata_.size_bytes, cudaMemcpyHostToDevice, stream.get())); - // stream.wait(); // TODO check if this is necessary } template @@ -89,7 +77,7 @@ template typename roaring_bitmap::allocator_type roaring_bitmap::allocator() const noexcept { - return allocator_; + return storage_.allocator(); } template diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh index 2e527047e..29c70b343 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh @@ -17,6 +17,8 @@ #pragma once #include +#include +#include #include #include @@ -28,30 +30,8 @@ #include #include -#include - namespace cuco::detail { -template -struct roaring_bitmap_metadata { - static_assert(cuco::dependent_false, "T must be either uint32_t or uint64_t"); -}; - -template <> -struct roaring_bitmap_metadata { - cuda::std::size_t size_bytes = 0; - cuda::std::size_t num_keys = 0; - cuda::std::size_t run_container_bitmap = 0; - cuda::std::size_t key_cards = 0; - cuda::std::size_t container_offsets = 0; - cuda::std::int32_t num_containers = 0; - bool has_run = false; - bool offsets_aligned = false; - bool valid = false; -}; - -// TODO implement roaring_bitmap_metadata - // primary template template class roaring_bitmap_impl { @@ -60,39 +40,32 @@ class roaring_bitmap_impl { template <> class roaring_bitmap_impl { - // Constants from the Roaring format spec - static constexpr cuda::std::uint32_t serial_cookie_no_runcontainer = 12346; - static constexpr cuda::std::uint32_t serial_cookie = 12347; - static 
constexpr cuda::std::uint32_t frozen_cookie = 13766; - static constexpr cuda::std::int32_t no_offset_threshold = 4; - static constexpr cuda::std::uint32_t binary_search_threshold = 8; // TODO determine optimal value - public: - using metadata_type = roaring_bitmap_metadata; + using storage_ref_type = roaring_bitmap_storage_ref; - __host__ __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap, - metadata_type const& metadata) - { - NV_IF_TARGET( - NV_IS_HOST, - CUCO_EXPECTS(metadata.valid, "Invalid bitmap format");) // TODO device error handling + static constexpr cuda::std::uint32_t binary_search_threshold = 8; // TODO determine optimal value - if (metadata.valid) { - data_ = bitmap; - size_bytes_ = metadata.size_bytes; - size_ = metadata.num_keys; - num_containers_ = metadata.num_containers; + __host__ __device__ roaring_bitmap_impl(storage_ref_type const& storage_ref) + { + auto const& meta = storage_ref.metadata(); + if (meta.valid) { + data_ = storage_ref.data(); + size_bytes_ = meta.size_bytes; + size_ = meta.num_keys; + num_containers_ = meta.num_containers; run_container_bitmap_ = - reinterpret_cast(bitmap + metadata.run_container_bitmap); - key_cards_ = reinterpret_cast(bitmap + metadata.key_cards); - offsets_ = reinterpret_cast(bitmap + metadata.container_offsets); - offsets_aligned_ = metadata.offsets_aligned; - has_run_ = metadata.has_run; + reinterpret_cast(storage_ref.data() + meta.run_container_bitmap); + key_cards_ = + reinterpret_cast(storage_ref.data() + meta.key_cards); + offsets_ = + reinterpret_cast(storage_ref.data() + meta.container_offsets); + offsets_aligned_ = meta.offsets_aligned; + has_run_ = meta.has_run; } } __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap) - : roaring_bitmap_impl{bitmap, read_metadata(bitmap)} + : roaring_bitmap_impl{storage_ref_type{bitmap}} { } @@ -128,18 +101,21 @@ class roaring_bitmap_impl { __device__ bool contains(cuda::std::uint32_t value) const { - cuda::std::uint16_t upper = value >> 16; - 
cuda::std::uint16_t lower = value & 0xFFFF; - - // Binary search on key_cards_ to find container with matching upper key - cuda::std::uint32_t left = 0; - cuda::std::uint32_t right = num_containers_; + cuda::std::uint16_t const upper = value >> 16; + cuda::std::uint16_t const lower = value & 0xFFFF; if (num_containers_ < binary_search_threshold) { +// linear search +#pragma unroll for (cuda::std::uint32_t i = 0; i < num_containers_; i++) { - if (key_cards_[i * 2] == upper) { return this->contains_container(lower, i); } + cuda::std::uint16_t const key = key_cards_[i * 2]; + if (key == upper) { return this->contains_container(lower, i); } + if (key > upper) { return false; } } } else { + // binary search + cuda::std::uint32_t left = 0; + cuda::std::uint32_t right = num_containers_; while (left < right) { cuda::std::uint32_t mid = left + (right - left) / 2; cuda::std::uint16_t mid_key = key_cards_[mid * 2]; @@ -167,96 +143,7 @@ class roaring_bitmap_impl { return size_bytes_; } - __host__ __device__ static metadata_type const read_metadata( - cuda::std::byte const* bitmap) noexcept - { - cuda::std::byte const* buf = bitmap; - metadata_type metadata; - - cuda::std::uint32_t cookie; - cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t)); - buf += sizeof(cuda::std::uint32_t); - if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) { - metadata.valid = false; - return metadata; - } - - if ((cookie & 0xFFFF) == serial_cookie) - metadata.num_containers = (cookie >> 16) + 1; - else { - cuda::std::memcpy(&metadata.num_containers, buf, sizeof(cuda::std::uint32_t)); - buf += sizeof(cuda::std::uint32_t); - } - if (metadata.num_containers < 0) { - metadata.valid = false; - return metadata; - } - if (metadata.num_containers > (1 << 16)) { - metadata.valid = false; - return metadata; - } - - metadata.has_run = (cookie & 0xFFFF) == serial_cookie; - if (metadata.has_run) { - metadata.valid = false; - return metadata; // TODO run container bitmap is 
not supported yet - cuda::std::size_t s = (metadata.num_containers + 7) / 8; - metadata.run_container_bitmap = cuda::std::distance(bitmap, buf); - buf += s; - } - - metadata.key_cards = cuda::std::distance(bitmap, buf); - buf += metadata.num_containers * 2 * sizeof(cuda::std::uint16_t); - - if ((!metadata.has_run) || (metadata.num_containers >= no_offset_threshold)) { - metadata.container_offsets = cuda::std::distance(bitmap, buf); - metadata.offsets_aligned = - (reinterpret_cast(bitmap + metadata.container_offsets) % - sizeof(cuda::std::uint32_t)) == 0; - buf += metadata.num_containers * 4; - } - - metadata.num_keys = 0; - cuda::std::uint16_t const* key_cards = - reinterpret_cast(bitmap + metadata.key_cards); - cuda::std::uint32_t card = 0; - for (cuda::std::int32_t i = 0; i < metadata.num_containers; i++) { - // cuda::std::uint16_t key = key_cards[i * 2]; - card = key_cards[i * 2 + 1] + 1; - metadata.num_keys += card; - } - - // find end of roaring bitmap - cuda::std::byte const* end = bitmap + container_offset(bitmap + metadata.container_offsets, - metadata.offsets_aligned, - metadata.num_containers - 1); - if (is_run_container( - reinterpret_cast(bitmap + metadata.run_container_bitmap), - metadata.has_run, - metadata.num_containers - 1)) { - // TODO implement - } else { - if (card <= 4096) { // TODO check if this is correct - end += card * sizeof(cuda::std::uint16_t); - } else { - end += 8192; // fixed size bitset container - } - } - - metadata.size_bytes = static_cast(cuda::std::distance(bitmap, end)); - metadata.valid = true; - return metadata; - } - private: - __host__ __device__ static bool is_run_container(cuda::std::uint8_t const* run_container_bitmap, - bool has_run, - cuda::std::int32_t i) - { - if (not has_run) return false; - return run_container_bitmap[i / 8] & (1 << (i % 8)); - } - __device__ bool contains_container(cuda::std::uint16_t lower, cuda::std::uint32_t index) const { cuda::std::uint32_t card = key_cards_[index * 2 + 1] + 1; @@ -313,25 
+200,10 @@ class roaring_bitmap_impl { cuda::std::uint16_t lower, cuda::std::uint32_t card) const { - // TODO implement + // TODO implement linear search return false; } - __host__ __device__ static cuda::std::uint32_t container_offset(cuda::std::byte const* offsets, - bool offsets_aligned, - cuda::std::int32_t i) - { - cuda::std::uint32_t offset = 0; - if (offsets_aligned) { - offset = - *reinterpret_cast(offsets + i * sizeof(cuda::std::uint32_t)); - } else { - cuda::std::memcpy( - &offset, offsets + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t)); - } - return offset; - } - cuda::std::byte const* data_; cuda::std::size_t size_bytes_; cuda::std::size_t size_; diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl index 088e7e7b4..9536bb79f 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl @@ -25,9 +25,8 @@ namespace cuco { template -__host__ __device__ roaring_bitmap_ref::roaring_bitmap_ref(cuda::std::byte const* bitmap, - metadata_type const& metadata) - : impl_{bitmap, metadata} +__host__ __device__ roaring_bitmap_ref::roaring_bitmap_ref(storage_ref_type const& storage_ref) + : impl_{storage_ref} { } @@ -88,11 +87,4 @@ __host__ __device__ cuda::std::size_t roaring_bitmap_ref::size_bytes() const return impl_.size_bytes(); } -template -__host__ __device__ typename roaring_bitmap_ref::metadata_type const -roaring_bitmap_ref::read_metadata(cuda::std::byte const* bitmap) noexcept -{ - return impl_type::read_metadata(bitmap); -} - } // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh new file mode 100644 index 000000000..49805afb8 --- /dev/null +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2025 NVIDIA 
CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace cuco::detail { + +template +struct roaring_bitmap_storage_ref { + static_assert(cuco::dependent_false, "T must be either uint32_t or uint64_t"); +}; + +template <> +class roaring_bitmap_storage_ref { + public: + using metadata_type = roaring_bitmap_metadata; + __host__ __device__ roaring_bitmap_storage_ref(cuda::std::byte const* bitmap, + metadata_type const& metadata) + : data_{bitmap}, metadata_{metadata} + { + } + + __device__ roaring_bitmap_storage_ref(cuda::std::byte const* bitmap) + : data_{bitmap}, metadata_{metadata_type{bitmap}} + { + } + + __host__ __device__ metadata_type const& metadata() const noexcept { return metadata_; } + + __host__ __device__ cuda::std::byte const* data() const noexcept { return data_; } + + private: + cuda::std::byte const* data_; + metadata_type metadata_; +}; + +template +struct roaring_bitmap_storage { + static_assert(cuco::dependent_false, "T must be either uint32_t or uint64_t"); +}; + +template +class roaring_bitmap_storage { + public: + using allocator_type = + typename std::allocator_traits::template rebind_alloc; + using ref_type = roaring_bitmap_storage_ref; + + roaring_bitmap_storage(roaring_bitmap_storage const& other) = default; + roaring_bitmap_storage(roaring_bitmap_storage&& other) = default; + roaring_bitmap_storage& 
operator=(roaring_bitmap_storage const& other) = default; + roaring_bitmap_storage& operator=(roaring_bitmap_storage&& other) = default; + + ~roaring_bitmap_storage() = default; + + roaring_bitmap_storage(cuda::std::byte const* bitmap, + Allocator const& alloc, + cuda::stream_ref stream) + : allocator_{alloc}, + metadata_{bitmap}, + data_{allocator_.allocate(metadata_.size_bytes), + detail::custom_deleter{metadata_.size_bytes, + allocator_}}, + ref_{data_.get(), metadata_} + { + CUCO_CUDA_TRY(cudaMemcpyAsync( + data_.get(), bitmap, metadata_.size_bytes, cudaMemcpyHostToDevice, stream.get())); + // stream.wait(); // TODO check if this is necessary + } + + ref_type ref() const noexcept { return ref_; } + + private: + allocator_type allocator_; + typename ref_type::metadata_type metadata_; + std::unique_ptr> data_; + ref_type ref_; +}; + +// TODO implement roaring_bitmap_metadata + +} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/roaring_bitmap/util.cuh b/include/cuco/detail/roaring_bitmap/util.cuh new file mode 100644 index 000000000..a9510800b --- /dev/null +++ b/include/cuco/detail/roaring_bitmap/util.cuh @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2025 NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include +#include +#include + +#include + +namespace cuco::detail { + +__host__ __device__ cuda::std::uint32_t container_offset(cuda::std::byte const* offsets, + bool offsets_aligned, + cuda::std::int32_t i) +{ + cuda::std::uint32_t offset = 0; + if (offsets_aligned) { + offset = + *reinterpret_cast(offsets + i * sizeof(cuda::std::uint32_t)); + } else { + cuda::std::memcpy( + &offset, offsets + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t)); + } + return offset; +} + +__host__ __device__ bool is_run_container(cuda::std::uint8_t const* run_container_bitmap, + bool has_run, + cuda::std::int32_t i) +{ + if (not has_run) return false; + return run_container_bitmap[i / 8] & (1 << (i % 8)); +} + +template +struct roaring_bitmap_metadata { + static_assert(cuco::dependent_false, "T must be either uint32_t or uint64_t"); +}; + +template <> +struct roaring_bitmap_metadata { + cuda::std::size_t size_bytes = 0; + cuda::std::size_t num_keys = 0; + cuda::std::size_t run_container_bitmap = 0; + cuda::std::size_t key_cards = 0; + cuda::std::size_t container_offsets = 0; + cuda::std::int32_t num_containers = 0; + bool has_run = false; + bool offsets_aligned = false; + bool valid = false; + + __host__ __device__ roaring_bitmap_metadata(cuda::std::byte const* bitmap) + { + constexpr cuda::std::uint32_t serial_cookie_no_runcontainer = 12346; + constexpr cuda::std::uint32_t serial_cookie = 12347; + // constexpr cuda::std::uint32_t frozen_cookie = 13766; + constexpr cuda::std::int32_t no_offset_threshold = 4; + + cuda::std::byte const* buf = bitmap; + + cuda::std::uint32_t cookie; + cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t)); + buf += sizeof(cuda::std::uint32_t); + if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) { + valid = false; + NV_IF_TARGET(NV_IS_HOST, + CUCO_FAIL("Invalid bitmap format");) // TODO device error handling + return; + } + + if ((cookie & 0xFFFF) == serial_cookie) + 
num_containers = (cookie >> 16) + 1; + else { + cuda::std::memcpy(&num_containers, buf, sizeof(cuda::std::uint32_t)); + buf += sizeof(cuda::std::uint32_t); + } + if (num_containers < 0) { + valid = false; + NV_IF_TARGET(NV_IS_HOST, + CUCO_FAIL("Invalid bitmap format");) // TODO device error handling + return; + } + if (num_containers > (1 << 16)) { + valid = false; + NV_IF_TARGET(NV_IS_HOST, + CUCO_FAIL("Invalid bitmap format");) // TODO device error handling + return; + } + + has_run = (cookie & 0xFFFF) == serial_cookie; + if (has_run) { + valid = false; // TODO run container bitmap is not supported yet + NV_IF_TARGET(NV_IS_HOST, + CUCO_FAIL("Invalid bitmap format");) // TODO device error handling + return; + cuda::std::size_t s = (num_containers + 7) / 8; + run_container_bitmap = cuda::std::distance(bitmap, buf); + buf += s; + } + + key_cards = cuda::std::distance(bitmap, buf); + buf += num_containers * 2 * sizeof(cuda::std::uint16_t); + + if ((!has_run) || (num_containers >= no_offset_threshold)) { + container_offsets = cuda::std::distance(bitmap, buf); + offsets_aligned = (reinterpret_cast(bitmap + container_offsets) % + sizeof(cuda::std::uint32_t)) == 0; + buf += num_containers * 4; + } + + num_keys = 0; + cuda::std::uint16_t const* cards = + reinterpret_cast(bitmap + key_cards); + cuda::std::uint32_t card = 0; + for (cuda::std::int32_t i = 0; i < num_containers; i++) { + // cuda::std::uint16_t key = key_cards[i * 2]; + card = cards[i * 2 + 1] + 1; + num_keys += card; + } + + // find end of roaring bitmap + cuda::std::byte const* end = + bitmap + container_offset(bitmap + container_offsets, offsets_aligned, num_containers - 1); + if (is_run_container(reinterpret_cast(bitmap + run_container_bitmap), + has_run, + num_containers - 1)) { + // TODO implement + } else { + if (card <= 4096) { // TODO check if this is correct + end += card * sizeof(cuda::std::uint16_t); + } else { + end += 8192; // fixed size bitset container + } + } + + size_bytes = 
static_cast(cuda::std::distance(bitmap, end)); + valid = true; + } +}; + +// TODO implement roaring_bitmap_metadata + +} // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh index d8269662b..69ad93ae8 100644 --- a/include/cuco/roaring_bitmap.cuh +++ b/include/cuco/roaring_bitmap.cuh @@ -16,23 +16,21 @@ #pragma once -#include +#include #include #include #include #include -#include - namespace cuco { template > class roaring_bitmap { public: - using allocator_type = Allocator; - - using ref_type = roaring_bitmap_ref; + using storage_type = detail::roaring_bitmap_storage; + using allocator_type = typename storage_type::allocator_type; + using ref_type = roaring_bitmap_ref; roaring_bitmap(cuda::std::byte const* bitmap, Allocator const& alloc = {}, @@ -72,9 +70,7 @@ class roaring_bitmap { [[nodiscard]] ref_type ref() const noexcept; private: - allocator_type allocator_; - typename ref_type::metadata_type metadata_; - std::unique_ptr> data_; + storage_type storage_; ref_type ref_; }; diff --git a/include/cuco/roaring_bitmap_ref.cuh b/include/cuco/roaring_bitmap_ref.cuh index 4e9939c05..41994099f 100644 --- a/include/cuco/roaring_bitmap_ref.cuh +++ b/include/cuco/roaring_bitmap_ref.cuh @@ -28,10 +28,9 @@ class roaring_bitmap_ref { using impl_type = detail::roaring_bitmap_impl; public: - using metadata_type = typename impl_type::metadata_type; + using storage_ref_type = typename impl_type::storage_ref_type; - __host__ __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap, - metadata_type const& metadata); + __host__ __device__ roaring_bitmap_ref(storage_ref_type const& storage_ref); template >> @@ -59,9 +58,6 @@ class roaring_bitmap_ref { [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept; - [[nodiscard]] __host__ __device__ static metadata_type const read_metadata( - cuda::std::byte const* bitmap) noexcept; - private: impl_type impl_; }; From 
26e23da628d1741a9820e319d046d2a3b98c83f3 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 9 Jul 2025 17:36:49 -0700 Subject: [PATCH 08/24] Improve member order to reduce struct size --- .../cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh index 29c70b343..74839b1b5 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh @@ -49,16 +49,16 @@ class roaring_bitmap_impl { { auto const& meta = storage_ref.metadata(); if (meta.valid) { - data_ = storage_ref.data(); - size_bytes_ = meta.size_bytes; - size_ = meta.num_keys; - num_containers_ = meta.num_containers; + data_ = storage_ref.data(); + size_bytes_ = meta.size_bytes; + size_ = meta.num_keys; run_container_bitmap_ = reinterpret_cast(storage_ref.data() + meta.run_container_bitmap); key_cards_ = reinterpret_cast(storage_ref.data() + meta.key_cards); offsets_ = reinterpret_cast(storage_ref.data() + meta.container_offsets); + num_containers_ = meta.num_containers; offsets_aligned_ = meta.offsets_aligned; has_run_ = meta.has_run; } @@ -207,10 +207,10 @@ class roaring_bitmap_impl { cuda::std::byte const* data_; cuda::std::size_t size_bytes_; cuda::std::size_t size_; - cuda::std::int32_t num_containers_; cuda::std::uint8_t const* run_container_bitmap_; cuda::std::uint16_t const* key_cards_; // TODO uint8? 
cuda::std::byte const* offsets_; + cuda::std::int32_t num_containers_; bool offsets_aligned_; bool has_run_; }; From 90b6fc56bc492a18858575aeb11d651c780f9a32 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 13 Aug 2025 10:37:26 -0700 Subject: [PATCH 09/24] 64-bit roaring bitmap --- examples/roaring_bitmap/bitmapwithruns.bin | Bin 0 -> 48056 bytes examples/roaring_bitmap/host_bulk_example.cu | 144 ++++++--- examples/roaring_bitmap/portable_bitmap64.bin | Bin 0 -> 16506 bytes .../detail/roaring_bitmap/roaring_bitmap.inl | 16 +- .../roaring_bitmap/roaring_bitmap_impl.cuh | 273 ++++++++++++++---- .../roaring_bitmap/roaring_bitmap_storage.cuh | 133 ++++++++- include/cuco/detail/roaring_bitmap/util.cuh | 182 ++++++++---- include/cuco/roaring_bitmap.cuh | 110 ++++++- include/cuco/roaring_bitmap_ref.cuh | 87 +++++- 9 files changed, 749 insertions(+), 196 deletions(-) create mode 100644 examples/roaring_bitmap/bitmapwithruns.bin create mode 100644 examples/roaring_bitmap/portable_bitmap64.bin diff --git a/examples/roaring_bitmap/bitmapwithruns.bin b/examples/roaring_bitmap/bitmapwithruns.bin new file mode 100644 index 0000000000000000000000000000000000000000..5ed243753e169295a32d6251db66180f23ceac06 GIT binary patch literal 48056 zcmeIuQyb)3w5Z{vVpUYBB$bL&skUv~wr$(CZQHhO+qPG)cJFo0f7tsf^BLb5*YnMt zfdl{y015&iU;{8fdI(UZDhgDsiv4%-|000=3<=DyNCAKU(!k#=^#A~P^j|*sU-o|$ z{I7=pHSoU{0>HA}0I=r2HvJa>ynrrXJE#Th2(g1)Ln~mWaJl5SR1eaf$h0Bbj9dfq zwJ21fSdLOwIlR(`syAvLsJo)!gr+@OHfUR-V}`CuPoVFM0T+hs7_nf?hzT90)R<9V zPKpKGl5fi&tUj@R!{!;=2kdUKzrx`h$F0-&`45+FTsv@U#k~oSdOU0Js>HhtpOmk_ zj~Bl$fIE-{#0aJXUqdROrqDV2oA)5ai8LEB%*ZkzM~gfa3gjqamB7nU@R~Jm<&t1Go*`10DiTfaky~;4Sb0_zZjnz5~C2zd#5? 
zKpdpNFvx=uPzGb54qBiK`d~^h4VWIx1ZD+ufVshZU_r15SR5<`mIW(-mBDIYO|TAF zA8Z6R1zUiv!FFIrunX87>;?7(2Y`dYVch zsvW8qY8Yw~Y94A8Y8&bh>Ky78>KW=2>K_^u8X6iA8XX!Jni!fAnjV@Jnj2aWS{zyy zS{YgsS|8dJ+8Wvs+8x>#Iv6?H+nJ`auJsA<%GW6f_o^08NIbK{KH_(0phS zv=mwat%lY?8=)=Gc4!y07dikPhK@lep)=5V=n`}lx&hsW?m-WsC(v`~74#PR0DXqO zLf@fZ&|fG7BQOqAa2V#{2rR=fScfgxg?%_BoCZ!0XM(fBIpExIKDZ!U1TGGjg3H1c z;L30{xF%c&t`9eYo5C&N)^IzxBise<4)=ol!UN#J@Gy8JJO&;QPlBhyGvL|qJa{3z z1YQoWg4e(i!Q7^hEj~{gFY)P-Fx$8X1R7M5Z9qky*%GWC5}m zS%$1c)*$PVO~_Vc2eKR4ha5zXAjgqY$XVn9av8aX+(hmm_mM})Q{)Bm8hM9&M7|*Z zB0rGd2!KK;iV`S|vZ#PcsDf&!f!e5tCec)AS~LTi8O?^~MDw8e(L!iZv;;r(I93WPi&elX zW7V*lSRJfB)(C5gwZK|q?XZqm7pyzh3+sywzy@Q(u#wmpY&|73?~83%iRwz#e1Iu$R~y>^=4g`v?1m z{lxxYAP(agPT~yC;UXTzRXmQHxPvF~6nJVp9i9=-f@jBb;d$`_cwxL4UJ@^Zm&YsN zRq+~lZM+`d5O0Dv$6MiT@eX)ryc^yV?}PWp2jN5U5%_3)96k}Bf=|b1;dAi?_+oq+ zz7k)9ug5pxTk#$EZhRkp5I=$+$4}vB@eBB6{2G1}zk}b$AK_2&7x-)Z9sUvjg8z&E zz<=Wa0TC!c5H!IO0wEC!p%DgQ6CRNyQW0s13`Axk8vMJeuY)!T!JCa?FOpZt>*Ou+F8P3bOgr4~?&sb$nk zY7MoX+C*)oc2K*iebhnf2z8t~MV+NCP?xD|)J^IRb)R}fJ*8eyuc>#`N9qgpFZF}^ zO#w7SqclO&G)oJ#L@TsL8?;S(bdpX*r=>H{ndxkFPC5^rpDsifrAyGI>2h>Mx(Z#L zu0_|S8_2>r*dJDas-bL@F5739{WAsV-41J!yL|>(E(6{M(^h5dy{hWS9zokFWpXsmkclsCo zmku!qgEJHpW_TvT$V`mU8H;flpGnE2VbU|1n5;|=CO4ChDaaIIiZi8{vP=c0GECE6T?%&)5EjEbHfY5 zi^I#pE5mET>%*JETf;lTyTkj!2g66g$HS+>XTul5m&4b>H^X)aV-413JvPavV$-r2*vxD;HYb~h&CeEMi?Suy z(rh`lB3p&6&eme@oHvdxkyFUShAZH`v?kJ@z5{gniDwV&AeK z*w5@&_B;EF{mX_pgu^+C3v)ad;bbnx>72#6oX@4?(s1dyOk7qj2bY`6#}(v?aK*V& zTv@IHSDCBE)#U1M^|?k|Q?3Qqnrp{(~#&F}gN!(O!1~;3V z$1UWRaLc(>+*)n}x0&0FZ0*=W^Y8eN z{1^UT{s;e?2Lwn!1wx<&RuBY9Py|gd1Y7Wgq>xHTD`XHd3)zI6LLMQ%P)H~$ln_b_ z<%Eht6`{INOQD@+h33)6&|!W?0~ zut-=ctPoZU>x7NM7Gb-vOV}$M5Dp8+gphr$!#x$sJOD|`?> z3txrr!Y|>k5E2m)7b!6;@?u1k#h9pzmgtJUm{Lq5rWZ4bS;ZV;ZZV%&P%I)A7fXp{ z#R_6&v6@&@tRvPJ8;MQD7Gi6$o!C+AB6b&hiG9TZ;$U%@I8q!Vju$71Q^gtLY;m5r zP+TG|7gvdE#SP+SahteP+#~K64~a*`6XI#{oOn^ZB3>77iFd^Z;$!id_)>f$z861< 
z|A^njpW+`8jKC2rLPnSf7ZD@Th#HAU%!m_7L{dajN76+yMzTb*M{-5-MhZja*aMs`GYNA^VyMvg>|M@~h~MlM7yN3KO~M(#xJM;=9< zMqWf-N8Uv~M!rP;jr@rGjsOxQp%Nj{5-SOkBq@?68Imn|Qc_AKrIj*BnWbz}PAQL+ zUn(ROl}bpZrE*e5sftuxswLHx8c2<$W>QP3jnrQ1Bz2W~NWG?`qo1PxM88FUM*l=X8J01blo^?mML8;~a$GiLM^4Bo>Xd_+DjpOVkY7v#(G zHTkA|N4_sVlAp>ij{qEbbvuGCWMDh-szN;9RU(ne{obW*x1J(S)`KV_gY zL>aD(QpPG1l*!68Wu`JmnXfETmMSZh)yg_$qq0TWuIy6wDhHIq$}#1naz;6?TvDzo zH{YDM0u{fQr;>bl+Vgn<-77r`KyFfM8#D~4XeBwQDrrz>Z+x>s;{P0)2Qjy zOlnp&hnic>rxsL;sKwP%YFV{{T3M~8)>P}L_0>jdQ?-TKT5YFxRJ*9%)m~~}b$~ip z9j1;{$Ef4gN$ON}hB{lFr!G{NsLRz=>RNS!x>?<(?o{`v`_)70QT2p+T0N&;RIjMl z)m!Ra^?~|WeWt!t->C1^PwGGFH}$9bM+IYW42zL5CdS3YSTv@_;xRMk#1gR-vDC42 zv5c`SvFx#2vAnSYvBI%pv68VevGTD>v8u5evD&eEv4*iGvF5Q>v9_@evCgq>v7WI$ zvHr0^v7xaMvC*+{v5B!MvFWi{vAMAYvBj}vv6ZnkvGuV{v8}NkvE8wKv4gQAvE#8* zv9qxYvCFY*v750wvHP({v8S;YvDdM8v5&DYv43MfV!va625G29Xtc&^f+lH-rfG&| zYo3Ro7~1b+rasW38FiQfs5N*E(rk zwH{h;t)Dhf8=?)@MrmWU3EE_Bnl@9Lqs`YAX-l;g+G=f`wo%)nZP#{bd$j}FVeOc9 zQahua*Dh&SwHw-P?Vk2fd!jwpUTJT&587w#tM*;{rTx`HaU_n%sdzZf$0Kn$9*gU7 zEAGbqc*=O1c=~vzc-DB1cQ>zc++@`c*-`11Iw_}ch}_~!Vw_|EvA z`2P5z_|f=@`04n$_{I2@`1Sa$_}%z}_~ZDq_{;d4`1|;$_&@P)@t^TOaZrbKOeb|l z=X6nz>Z%^sP2JHGdI~+Yo=(rGXVJ6kx%9kx0llzZOfRXI(aYTrX zo9nIgwt5G>v))bbsrS+Q>x1;6`UrirK2D#gPtm9Av-G+80)4T*Okb(5(bwyn^sV|1 zeYd_(Kd2wkkL#!Ov-$=7vVKj!so&A>>yPxO`V0NF{!ag>f6@Qdf9St;z<>5V|#sp)sG0m81%rWL0i;Shl3S+gg&e&*dF}54K zjJ?JIdRd-Id|kNM5~Y5p-m3$`$ev>1!C zL@R2kR@^cz$4Xc!tkhOIE2EXg%5LSd@>&I~!d5Y>q*cZ$Z&k9YS~aZNRz0hs)x>IU zwX)h;9jwk)H>;=B$LenlvW8kCtkKpuYoay9nr_Xq=2{D^#nv)wrM1RdZ*8)+T05-W z);{Z?b;LSuowCka7p%+HHS4Bz$GUGlvYuKmtk>2%>!bC>`q%nl{k8xbvQe9`X`8hL zTe1~fvklv}Jv(WqveVib?96sHJExt;&Tkj8i`pgZ(snt!qFu$VZr8Hw+70Z+b~C%B z-NtTjce1-H|*Q?J^P{k#C~qSvftVt?9cXB`@8+i{%eOE#K9fP2|K(K zabzdv=#J&Mj_;&&(m3gzOiorOhm+gM=M;2`IK`b(PFbgdQ`xEJ)O6}N^_@meQ>TT~ z+G*!>bh~!`x z`<+A1QRjqn+BxT3bgnqpom;ihm?yXoAFZWcGYo6F7X7H|u@#oUr^8MnM!$*tV$JGW*+nyW`x6?i6>rJIkHxE^rsS%iNXj8h5?B$=&MiaCf`= z+=K2B_qcn?J?ma@FT2;=o9-R=zWd01>b`JayYJkO?icr8_lNu21w6<@J;I|s))PF* 
zQ#{QxJlpfUq?gJ|>t*mVd)d64ULG&MSI8^smGDY?<-Ces6|cHi%d6`(@EUu~yp~=Y zuf5mF>+1FJdVBr6f!+{rxHrlh>rL<`d(*s`-W+efx5!)St?*WR>%5KL7H_+^%iHT6 z@D6*&yp!G;@4R=(yXxKWZhQBwWM(dtbfp-Y@U37fK)rJV7PG2|f`? z$cb1&Pgn^z;U`ii(j?L+G9|JmawKvm@+Ar;iX@6BN+rrBDkLf=swHYB>LltX8YP-0 zS|nO0+9f(Bx+J|;LZGd|~we$-d}xNrK7pYT)osr__*Mn8+6-OuIc^$Ylg{bGJezl>krujE(tYxuSO zdVWK{iQn9B<+t@a_?`W3eow!T-`^kP5A{d*qy2IIM1P7u-Jj*p^%wYy{bl}2e~rK1 z-{f!gclf*gef~lJh=1Ha<)8I0_?P`_{!Ramf8T%PKlNYuul;xaNB@ieum8jU?E^_D zi6)68on(_jQc5aGEomg}q?b%4Qzg?TGbA%7vn6vT^Ca^p3nhytOC(Dt%Oxu&t0b!@ zYbEO@8zdVin69`4vF`0RH~{|2xb70P7F}00000 literal 0 HcmV?d00001 diff --git a/examples/roaring_bitmap/host_bulk_example.cu b/examples/roaring_bitmap/host_bulk_example.cu index bbbbe6005..3309881a2 100644 --- a/examples/roaring_bitmap/host_bulk_example.cu +++ b/examples/roaring_bitmap/host_bulk_example.cu @@ -1,28 +1,92 @@ -#include +/* + * Copyright (c) 2025 NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ #include +#include -#include +#include +#include #include #include -#include - #include #include +#include #include -int main(int argc, char* argv[]) +/** + * @file host_bulk_example.cu + * @brief Demonstrates usage of the roaring_bitmap "bulk" lookup host APIs. 
+ * + * In this example we load two 32-bit bitmaps and one 64-bit bitmap (portable format) from the + * [RoaringBitmapFormatSpec](https://github.com/RoaringBitmap/RoaringFormatSpec) repository and + * check if the bulk lookup API returns the correct results. Namely, we test the following files: + * - + * [examples/roaring_bitmap/bitmapwithoutruns.bin](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata/bitmapwithoutruns.bin) + * - + * [examples/roaring_bitmap/bitmapwithruns.bin](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata/bitmapwithruns.bin) + * - + * [examples/roaring_bitmap/portable_bitmap64.bin](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata64/portable_bitmap64.bin) + * + */ + +template +bool check(std::string const& bitmap_file_path) { - if (argc != 2) { - std::cerr << "Usage: " << argv[0] << " " << std::endl; - return -1; - } + auto generate_keys = []() -> thrust::device_vector { + if constexpr (cuda::std::is_same_v) { + // reference: + // https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata/README.md#test-data + std::vector keys; + for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) { + keys.push_back(k); + } + for (int k = 100000; k < 200000; ++k) { + keys.push_back(3 * k); + } + for (int k = 700000; k < 800000; ++k) { + keys.push_back(k); + } + return thrust::device_vector(keys.begin(), keys.end()); + } else if constexpr (cuda::std::is_same_v) { + // reference: + // https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata64/README.md#portable_bitmap64bin + std::vector keys; + for (cuda::std::uint64_t k = 0x00000ull; k < 0x09000ull; ++k) { + keys.push_back(k); + } + for (cuda::std::uint64_t k = 0x0A000ull; k < 0x10000ull; ++k) { + keys.push_back(k); + } + keys.push_back(0x20000ull); + keys.push_back(0x20005ull); + for (cuda::std::uint64_t i = 0; i < 0x10000ull; i += 2ull) { + keys.push_back(0x80000ull + i); + } + return 
thrust::device_vector(keys.begin(), keys.end()); + } else { + static_assert(cuco::dependent_false, "KeyType must be uint32_t or uint64_t"); + return {}; + } + }; // Open file - std::ifstream file(argv[1], std::ios::binary); + std::ifstream file(bitmap_file_path, std::ios::binary); if (!file.is_open()) { - std::cerr << "Failed to open " << argv[1] << std::endl; - return -1; + std::cerr << "Failed to open " << bitmap_file_path << std::endl; + return false; } // Get file size @@ -30,50 +94,36 @@ int main(int argc, char* argv[]) std::streamsize file_size = file.tellg(); file.seekg(0, std::ios::beg); - // Allocate pinned host memory using cudaMallocHost - char* buffer; - CUCO_CUDA_TRY(cudaMallocHost(&buffer, file_size)); + thrust::universal_host_pinned_vector buffer(file_size); // Read file into memory - file.read(buffer, file_size); + file.read(reinterpret_cast(thrust::raw_pointer_cast(buffer.data())), file_size); file.close(); - cuco::roaring_bitmap roaring_bitmap( - reinterpret_cast(buffer)); + cuco::roaring_bitmap roaring_bitmap(thrust::raw_pointer_cast(buffer.data())); - std::vector keys; - for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) { - keys.push_back(k); - } - for (int k = 100000; k < 200000; ++k) { - keys.push_back(3 * k); - } - for (int k = 700000; k < 800000; ++k) { - keys.push_back(k); - } + auto keys = generate_keys(); + thrust::device_vector contained(keys.size(), false); - thrust::universal_vector keys_d(keys.begin(), keys.end()); - thrust::universal_vector contained(keys.size(), false); + roaring_bitmap.contains(keys.begin(), keys.end(), contained.begin()); - roaring_bitmap.contains(keys_d.begin(), keys_d.end(), contained.begin()); + bool all_contained = thrust::all_of(contained.begin(), contained.end(), ::cuda::std::identity{}); + return all_contained; +} - size_t num_errors = 0; - for (size_t i = 0; i < keys.size(); i++) { - if (not contained[i]) { - if (num_errors <= 10) { - std::cout << "Error: " << keys_d[i] << " is not contained" << 
std::endl; - } - num_errors++; - } - } - if (num_errors > 0) { std::cout << "num_errors: " << num_errors << std::endl; } +int main() +{ + auto data_dir_prefix = []() -> std::string { + std::string source_path = __FILE__; + auto pos = source_path.find_last_of("/\\"); + return (pos == std::string::npos) ? std::string(".") : source_path.substr(0, pos); + }; - // check if all elements are contained and written to output - bool all_contained = thrust::all_of(contained.begin(), contained.end(), ::cuda::std::identity{}); - std::cout << "all_contained: " << all_contained << std::endl; + bool success = check(data_dir_prefix() + "/bitmapwithoutruns.bin"); + success &= check(data_dir_prefix() + "/bitmapwithruns.bin"); + success &= check(data_dir_prefix() + "/portable_bitmap64.bin"); - // Free the allocated memory - CUCO_CUDA_TRY(cudaFreeHost(buffer)); + std::cout << "success: " << (success ? "true" : "false") << std::endl; - return 0; + return success ? 0 : 1; } \ No newline at end of file diff --git a/examples/roaring_bitmap/portable_bitmap64.bin b/examples/roaring_bitmap/portable_bitmap64.bin new file mode 100644 index 0000000000000000000000000000000000000000..acd0f9007d6902f2fa29b82d8f5ee6662a4291d2 GIT binary patch literal 16506 zcmeI&F%Cdb3;@s~5*IOJFgb_WQCz_h9MKIZi`^!9P1@i8x4yr&j5nsfiXyMaUCL~m zIM+7&E_28npZ6?V?B|ka)G-SJ1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ z;P(Reu7JgX-+!Y42oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+z&C*l;jK-t literal 0 HcmV?d00001 diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl index 964f74f45..7159cc6ae 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl @@ -25,7 +25,7 @@ template roaring_bitmap::roaring_bitmap(cuda::std::byte const* bitmap, Allocator const& alloc, cuda::stream_ref stream) - : storage_{bitmap, alloc, stream}, ref_{storage_.ref()} + : storage_{bitmap, alloc, stream} { } @@ -36,7 +36,7 @@ 
void roaring_bitmap::contains(InputIt first, OutputIt output, cuda::stream_ref stream) const { - ref_.contains(first, last, output, stream); + ref_type{storage_.ref()}.contains(first, last, output, stream); } template @@ -46,31 +46,31 @@ void roaring_bitmap::contains_async(InputIt first, OutputIt output, cuda::stream_ref stream) const noexcept { - ref_.contains_async(first, last, output, stream); + ref_type{storage_.ref()}.contains_async(first, last, output, stream); } template cuda::std::size_t roaring_bitmap::size() const noexcept { - return ref_.size(); + return ref_type{storage_.ref()}.size(); } template bool roaring_bitmap::empty() const noexcept { - return ref_.empty(); + return ref_type{storage_.ref()}.empty(); } template cuda::std::byte const* roaring_bitmap::data() const noexcept { - return ref_.data(); + return ref_type{storage_.ref()}.data(); } template cuda::std::size_t roaring_bitmap::size_bytes() const noexcept { - return ref_.size_bytes(); + return ref_type{storage_.ref()}.size_bytes(); } template @@ -83,6 +83,6 @@ typename roaring_bitmap::allocator_type roaring_bitmap typename roaring_bitmap::ref_type roaring_bitmap::ref() const noexcept { - return ref_; + return ref_type{storage_.ref()}; } } // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh index 74839b1b5..42752f2d6 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh @@ -46,26 +46,15 @@ class roaring_bitmap_impl { static constexpr cuda::std::uint32_t binary_search_threshold = 8; // TODO determine optimal value __host__ __device__ roaring_bitmap_impl(storage_ref_type const& storage_ref) - { - auto const& meta = storage_ref.metadata(); - if (meta.valid) { - data_ = storage_ref.data(); - size_bytes_ = meta.size_bytes; - size_ = meta.num_keys; - run_container_bitmap_ = - 
reinterpret_cast(storage_ref.data() + meta.run_container_bitmap); - key_cards_ = - reinterpret_cast(storage_ref.data() + meta.key_cards); - offsets_ = - reinterpret_cast(storage_ref.data() + meta.container_offsets); - num_containers_ = meta.num_containers; - offsets_aligned_ = meta.offsets_aligned; - has_run_ = meta.has_run; - } - } - - __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap) - : roaring_bitmap_impl{storage_ref_type{bitmap}} + : storage_ref_{storage_ref}, + offsets_aligned_{(reinterpret_cast( + storage_ref_.data() + storage_ref_.metadata().container_offsets)) % + sizeof(cuda::std::uint32_t) == + 0}, + aligned_16_{(reinterpret_cast(storage_ref_.data() + + storage_ref_.metadata().key_cards)) % + sizeof(cuda::std::uint16_t) == + 0} // if base address of key_cards is aligned, then all containers are aligned { } @@ -100,29 +89,54 @@ class roaring_bitmap_impl { } __device__ bool contains(cuda::std::uint32_t value) const + { + if (storage_ref_.metadata().num_keys == 0) { return false; } + + if (aligned_16_) { + return this->dispatch_contains(value); + } else { + return this->dispatch_contains(value); + } + } + + template + __device__ bool dispatch_contains(cuda::std::uint32_t value) const { cuda::std::uint16_t const upper = value >> 16; cuda::std::uint16_t const lower = value & 0xFFFF; + cuda::std::uint16_t key; - if (num_containers_ < binary_search_threshold) { + if (storage_ref_.metadata().num_containers < binary_search_threshold) { // linear search #pragma unroll - for (cuda::std::uint32_t i = 0; i < num_containers_; i++) { - cuda::std::uint16_t const key = key_cards_[i * 2]; - if (key == upper) { return this->contains_container(lower, i); } + for (cuda::std::uint32_t i = 0; i < storage_ref_.metadata().num_containers; i++) { + if constexpr (Aligned) { + key = aligned_load(storage_ref_.key_cards() + + (i * 2) * sizeof(cuda::std::uint16_t)); + } else { + key = misaligned_load(storage_ref_.key_cards() + + (i * 2) * sizeof(cuda::std::uint16_t)); + } + 
if (key == upper) { return this->contains_container(lower, i); } if (key > upper) { return false; } } } else { // binary search cuda::std::uint32_t left = 0; - cuda::std::uint32_t right = num_containers_; + cuda::std::uint32_t right = storage_ref_.metadata().num_containers; while (left < right) { - cuda::std::uint32_t mid = left + (right - left) / 2; - cuda::std::uint16_t mid_key = key_cards_[mid * 2]; + cuda::std::uint32_t mid = left + (right - left) / 2; + if constexpr (Aligned) { + key = aligned_load(storage_ref_.key_cards() + + (mid * 2) * sizeof(cuda::std::uint16_t)); + } else { + key = misaligned_load(storage_ref_.key_cards() + + (mid * 2) * sizeof(cuda::std::uint16_t)); + } - if (mid_key == upper) { - return this->contains_container(lower, mid); - } else if (mid_key < upper) { + if (key == upper) { + return this->contains_container(lower, mid); + } else if (key < upper) { left = mid + 1; } else { right = mid; @@ -132,42 +146,70 @@ class roaring_bitmap_impl { return false; } - [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept { return size_; } + [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept + { + return storage_ref_.metadata().num_keys; + } - [[nodiscard]] __host__ __device__ bool empty() const noexcept { return size_ == 0; } + [[nodiscard]] __host__ __device__ bool empty() const noexcept { return this->size() == 0; } - [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept { return data_; } + [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept + { + return storage_ref_.data(); + } [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept { - return size_bytes_; + return storage_ref_.metadata().size_bytes; } - private: + template __device__ bool contains_container(cuda::std::uint16_t lower, cuda::std::uint32_t index) const { - cuda::std::uint32_t card = key_cards_[index * 2 + 1] + 1; - cuda::std::uint16_t const* container = 
reinterpret_cast( - data_ + container_offset(offsets_, offsets_aligned_, index)); - if (is_run_container(run_container_bitmap_, has_run_, index)) { - return this->contains_run_container(container, lower, card); + cuda::std::uint32_t offset; + if (offsets_aligned_) { + offset = aligned_load(storage_ref_.container_offsets() + + index * sizeof(cuda::std::uint32_t)); + } else { + offset = misaligned_load(storage_ref_.container_offsets() + + index * sizeof(cuda::std::uint32_t)); + } + cuda::std::byte const* container = storage_ref_.data() + offset; + if (storage_ref_.metadata().has_run and + (storage_ref_.run_container_bitmap()[index / 8] & (1 << (index % 8)))) { + return this->contains_run_container(container, lower); } else { - if (card <= 4096) { // TODO check if this is correct - return this->contains_array_container(container, lower, card); + cuda::std::uint32_t card; + if constexpr (Aligned) { + card = 1u + aligned_load( + storage_ref_.key_cards() + (index * 2 + 1) * sizeof(cuda::std::uint16_t)); + } else { + card = 1u + misaligned_load( + storage_ref_.key_cards() + (index * 2 + 1) * sizeof(cuda::std::uint16_t)); + } + if (card <= 4096) { + return this->contains_array_container(container, lower, card); } else { return this->contains_bitset_container(container, lower, card); } } } - __device__ bool contains_array_container(cuda::std::uint16_t const* container, + template + __device__ bool contains_array_container(cuda::std::byte const* container, cuda::std::uint16_t lower, cuda::std::uint32_t card) const { + cuda::std::uint16_t elem; // Use linear search for small arrays, binary search for larger ones if (card < binary_search_threshold) { for (cuda::std::uint32_t i = 0; i < card; i++) { - if (container[i] == lower) { return true; } + if constexpr (Aligned) { + elem = aligned_load(container + i * sizeof(cuda::std::uint16_t)); + } else { + elem = misaligned_load(container + i * sizeof(cuda::std::uint16_t)); + } + if (elem == lower) { return true; } } return false; } 
else { @@ -176,9 +218,15 @@ class roaring_bitmap_impl { while (left < right) { cuda::std::uint32_t mid = left + (right - left) / 2; - if (container[mid] == lower) { + if constexpr (Aligned) { + elem = aligned_load(container + mid * sizeof(cuda::std::uint16_t)); + } else { + elem = + misaligned_load(container + mid * sizeof(cuda::std::uint16_t)); + } + if (elem == lower) { return true; - } else if (container[mid] < lower) { + } else if (elem < lower) { left = mid + 1; } else { right = mid; @@ -188,37 +236,136 @@ class roaring_bitmap_impl { } } - __device__ bool contains_bitset_container(cuda::std::uint16_t const* container, + __device__ bool contains_bitset_container(cuda::std::byte const* container, cuda::std::uint16_t lower, cuda::std::uint32_t card) const { - // check if bit at position lower is set - return container[lower / 16] & (1 << (lower % 16)); + return static_cast(container[lower / 8]) & + (cuda::std::uint8_t(1) << (lower % 8)); } - __device__ bool contains_run_container(cuda::std::uint16_t const* container, - cuda::std::uint16_t lower, - cuda::std::uint32_t card) const + template + __device__ bool contains_run_container(cuda::std::byte const* container, + cuda::std::uint16_t lower) const { - // TODO implement linear search + // TODO implement binary search + cuda::std::uint16_t num_runs; + if constexpr (Aligned) { + num_runs = aligned_load(container); + } else { + num_runs = misaligned_load(container); + } + + cuda::std::uint16_t start; + cuda::std::uint32_t end; + + for (cuda::std::uint32_t i = 0; i < num_runs; i++) { + // TODO load start+end in one instruction + if constexpr (Aligned) { + start = + aligned_load(container + (i * 2 + 1) * sizeof(cuda::std::uint16_t)); + end = + static_cast(start) + + aligned_load(container + (i * 2 + 2) * sizeof(cuda::std::uint16_t)); + } else { + start = misaligned_load(container + + (i * 2 + 1) * sizeof(cuda::std::uint16_t)); + end = static_cast(start) + + misaligned_load(container + + (i * 2 + 2) * 
sizeof(cuda::std::uint16_t)); + } + if (start <= lower && end >= lower) { return true; } + if (start > lower) { break; } + } return false; } - cuda::std::byte const* data_; - cuda::std::size_t size_bytes_; - cuda::std::size_t size_; - cuda::std::uint8_t const* run_container_bitmap_; - cuda::std::uint16_t const* key_cards_; // TODO uint8? - cuda::std::byte const* offsets_; - cuda::std::int32_t num_containers_; + storage_ref_type storage_ref_; bool offsets_aligned_; - bool has_run_; + bool aligned_16_; }; template <> class roaring_bitmap_impl { - using bucket_type = roaring_bitmap_impl; - // TODO implement + public: + using bucket_type = roaring_bitmap_impl; + using storage_ref_type = roaring_bitmap_storage_ref; + + __host__ __device__ roaring_bitmap_impl(storage_ref_type const& storage_ref) + : storage_ref_{storage_ref} + { + } + + template + __host__ void contains(InputIt first, + InputIt last, + OutputIt contained, + cuda::stream_ref stream = {}) const + { + this->contains_async(first, last, contained, stream); + stream.wait(); + } + + template + __host__ void contains_async(InputIt first, + InputIt last, + OutputIt contained, + cuda::stream_ref stream = {}) const noexcept + { + auto nosync_exec_policy = thrust::cuda::par_nosync.on(stream.get()); + if (this->empty()) { + thrust::fill( + nosync_exec_policy, contained, contained + cuda::std::distance(first, last), false); + } else { + thrust::transform(nosync_exec_policy, + first, + last, + contained, + cuda::proclaim_return_type( + [*this] __device__(auto key) { return this->contains(key); })); + } + } + + __device__ bool contains(cuda::std::uint64_t value) const + { + cuda::std::uint32_t bucket_key = value >> 32; + cuda::std::uint32_t bucket_value = value & 0xFFFFFFFF; + + // binary search in storage_ref_.buckets() + cuda::std::uint32_t left = 0; + cuda::std::uint32_t right = storage_ref_.metadata().num_buckets; + while (left < right) { + cuda::std::uint32_t mid = left + (right - left) / 2; + if 
(storage_ref_.buckets()[mid].first == bucket_key) { + return bucket_type{storage_ref_.buckets()[mid].second}.contains( + bucket_value); // TODO is constructing the ref in-place a bad idea? + } else if (storage_ref_.buckets()[mid].first < bucket_key) { + left = mid + 1; + } else { + right = mid; + } + } + return false; + } + + [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept + { + return storage_ref_.metadata().num_keys; + } + + [[nodiscard]] __host__ __device__ bool empty() const noexcept { return this->size() == 0; } + + [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept + { + return storage_ref_.data(); + } + + [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept + { + return storage_ref_.metadata().size_bytes; + } + + storage_ref_type storage_ref_; }; } // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh index 49805afb8..349f1bb83 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh @@ -27,6 +27,8 @@ #include #include +#include +#include namespace cuco::detail { @@ -41,12 +43,18 @@ class roaring_bitmap_storage_ref { using metadata_type = roaring_bitmap_metadata; __host__ __device__ roaring_bitmap_storage_ref(cuda::std::byte const* bitmap, metadata_type const& metadata) - : data_{bitmap}, metadata_{metadata} + : metadata_{metadata}, + data_{bitmap}, + run_container_bitmap_{ + reinterpret_cast(bitmap + metadata.run_container_bitmap)}, + key_cards_{bitmap + metadata.key_cards}, + container_offsets_{bitmap + metadata.container_offsets} { + assert(metadata.valid); } __device__ roaring_bitmap_storage_ref(cuda::std::byte const* bitmap) - : data_{bitmap}, metadata_{metadata_type{bitmap}} + : roaring_bitmap_storage_ref{bitmap, metadata_type{bitmap}} { } @@ -54,9 
+62,58 @@ class roaring_bitmap_storage_ref { __host__ __device__ cuda::std::byte const* data() const noexcept { return data_; } + __host__ __device__ cuda::std::size_t size_bytes() const noexcept { return metadata_.size_bytes; } + + __host__ __device__ cuda::std::uint8_t const* run_container_bitmap() const noexcept + { + return run_container_bitmap_; + } + + __host__ __device__ cuda::std::byte const* key_cards() const noexcept { return key_cards_; } + + __host__ __device__ cuda::std::byte const* container_offsets() const noexcept + { + return container_offsets_; + } + private: + metadata_type metadata_; cuda::std::byte const* data_; + cuda::std::uint8_t const* run_container_bitmap_; + cuda::std::byte const* key_cards_; + cuda::std::byte const* container_offsets_; +}; + +template <> +class roaring_bitmap_storage_ref { + public: + using metadata_type = roaring_bitmap_metadata; + + __host__ __device__ roaring_bitmap_storage_ref( + cuda::std::byte const* bitmap, + metadata_type const& metadata, + cuda::std::pair>* buckets) + : metadata_{metadata}, data_{bitmap}, buckets_{buckets} + { + } + + __host__ __device__ metadata_type const& metadata() const noexcept { return metadata_; } + + __host__ __device__ cuda::std::byte const* data() const noexcept { return data_; } + + __host__ __device__ cuda::std::size_t size_bytes() const noexcept { return metadata_.size_bytes; } + + __host__ __device__ + cuda::std::pair>* + buckets() const noexcept + { + return buckets_; + } + + private: metadata_type metadata_; + cuda::std::byte const* data_; + cuda::std::pair>* buckets_; }; template @@ -90,7 +147,6 @@ class roaring_bitmap_storage { { CUCO_CUDA_TRY(cudaMemcpyAsync( data_.get(), bitmap, metadata_.size_bytes, cudaMemcpyHostToDevice, stream.get())); - // stream.wait(); // TODO check if this is necessary } ref_type ref() const noexcept { return ref_; } @@ -102,6 +158,75 @@ class roaring_bitmap_storage { ref_type ref_; }; -// TODO implement roaring_bitmap_metadata +template +class 
roaring_bitmap_storage { + public: + using allocator_type = + typename std::allocator_traits::template rebind_alloc; + using ref_type = roaring_bitmap_storage_ref; + using bucket_ref_type = roaring_bitmap_storage_ref; + using bucket_allocator_type = typename std::allocator_traits::template rebind_alloc< + cuda::std::pair>; + + roaring_bitmap_storage(roaring_bitmap_storage const& other) = default; + roaring_bitmap_storage(roaring_bitmap_storage&& other) = default; + roaring_bitmap_storage& operator=(roaring_bitmap_storage const& other) = default; + roaring_bitmap_storage& operator=(roaring_bitmap_storage&& other) = default; + + ~roaring_bitmap_storage() = default; + + roaring_bitmap_storage(cuda::std::byte const* bitmap, + Allocator const& alloc, + cuda::stream_ref stream) + : allocator_{alloc}, + bucket_allocator_{alloc}, + bucket_metadata_{}, + buckets_h_{}, + metadata_{ + [bitmap](std::vector& bucket_metadata) { + return typename ref_type::metadata_type{bitmap, bucket_metadata}; + }(bucket_metadata_)}, + data_{allocator_.allocate(metadata_.size_bytes), + detail::custom_deleter{metadata_.size_bytes, + allocator_}}, + buckets_{bucket_allocator_.allocate(metadata_.num_buckets), + detail::custom_deleter{ + metadata_.num_buckets, bucket_allocator_}}, + ref_{data_.get(), metadata_, buckets_.get()} + { + assert(metadata_.valid); + buckets_h_.reserve(bucket_metadata_.size()); + for (auto const& meta : bucket_metadata_) { + buckets_h_.emplace_back(meta.key, + bucket_ref_type{data_.get() + meta.byte_offset, meta.metadata}); + } + CUCO_CUDA_TRY(cudaMemcpyAsync( + data_.get(), bitmap, metadata_.size_bytes, cudaMemcpyHostToDevice, stream.get())); + CUCO_CUDA_TRY(cudaMemcpyAsync( + buckets_.get(), + buckets_h_.data(), + metadata_.num_buckets * sizeof(cuda::std::pair), + cudaMemcpyHostToDevice, + stream.get())); + // stream.wait(); + // clear intermediate data + // bucket_metadata.clear(); + // buckets_h.clear(); + } + + ref_type ref() const noexcept { return ref_; } + + 
private: + allocator_type allocator_; + bucket_allocator_type bucket_allocator_; + std::vector bucket_metadata_; + std::vector> buckets_h_; + typename ref_type::metadata_type metadata_; + std::unique_ptr> data_; + std::unique_ptr, + custom_deleter> + buckets_; + ref_type ref_; +}; } // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/roaring_bitmap/util.cuh b/include/cuco/detail/roaring_bitmap/util.cuh index a9510800b..01892e73a 100644 --- a/include/cuco/detail/roaring_bitmap/util.cuh +++ b/include/cuco/detail/roaring_bitmap/util.cuh @@ -21,32 +21,25 @@ #include #include #include +#include #include +#include namespace cuco::detail { -__host__ __device__ cuda::std::uint32_t container_offset(cuda::std::byte const* offsets, - bool offsets_aligned, - cuda::std::int32_t i) +template +__host__ __device__ __forceinline__ T aligned_load(cuda::std::byte const* ptr) { - cuda::std::uint32_t offset = 0; - if (offsets_aligned) { - offset = - *reinterpret_cast(offsets + i * sizeof(cuda::std::uint32_t)); - } else { - cuda::std::memcpy( - &offset, offsets + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t)); - } - return offset; + return *reinterpret_cast(cuda::std::assume_aligned(ptr)); } -__host__ __device__ bool is_run_container(cuda::std::uint8_t const* run_container_bitmap, - bool has_run, - cuda::std::int32_t i) +template +__host__ __device__ __forceinline__ T misaligned_load(cuda::std::byte const* ptr) { - if (not has_run) return false; - return run_container_bitmap[i / 8] & (1 << (i % 8)); + T value; + cuda::std::memcpy(&value, ptr, sizeof(T)); + return value; } template @@ -56,21 +49,20 @@ struct roaring_bitmap_metadata { template <> struct roaring_bitmap_metadata { - cuda::std::size_t size_bytes = 0; - cuda::std::size_t num_keys = 0; - cuda::std::size_t run_container_bitmap = 0; - cuda::std::size_t key_cards = 0; - cuda::std::size_t container_offsets = 0; - cuda::std::int32_t num_containers = 0; - bool has_run = false; - bool 
offsets_aligned = false; - bool valid = false; + cuda::std::size_t size_bytes = 0; + cuda::std::uint32_t num_keys = 0; + cuda::std::uint32_t run_container_bitmap = 0; + cuda::std::uint32_t key_cards = 0; + cuda::std::uint32_t container_offsets = 0; + cuda::std::int32_t num_containers = 0; + bool has_run = false; + bool valid = false; __host__ __device__ roaring_bitmap_metadata(cuda::std::byte const* bitmap) { constexpr cuda::std::uint32_t serial_cookie_no_runcontainer = 12346; constexpr cuda::std::uint32_t serial_cookie = 12347; - // constexpr cuda::std::uint32_t frozen_cookie = 13766; + // constexpr cuda::std::uint32_t frozen_cookie = 13766; // not implemented constexpr cuda::std::int32_t no_offset_threshold = 4; cuda::std::byte const* buf = bitmap; @@ -80,8 +72,11 @@ struct roaring_bitmap_metadata { buf += sizeof(cuda::std::uint32_t); if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) { valid = false; - NV_IF_TARGET(NV_IS_HOST, - CUCO_FAIL("Invalid bitmap format");) // TODO device error handling + NV_IF_TARGET( + NV_IS_HOST, + CUCO_FAIL( + "Invalid bitmap format: cookie type invalid or not supported");) // TODO device error + // handling return; } @@ -91,57 +86,61 @@ struct roaring_bitmap_metadata { cuda::std::memcpy(&num_containers, buf, sizeof(cuda::std::uint32_t)); buf += sizeof(cuda::std::uint32_t); } - if (num_containers < 0) { + if (num_containers < 0 or num_containers > (1 << 16)) { valid = false; - NV_IF_TARGET(NV_IS_HOST, - CUCO_FAIL("Invalid bitmap format");) // TODO device error handling - return; - } - if (num_containers > (1 << 16)) { - valid = false; - NV_IF_TARGET(NV_IS_HOST, - CUCO_FAIL("Invalid bitmap format");) // TODO device error handling + NV_IF_TARGET( + NV_IS_HOST, + CUCO_FAIL( + "Invalid bitmap format: num_containers out of range");) // TODO device error handling return; } has_run = (cookie & 0xFFFF) == serial_cookie; if (has_run) { - valid = false; // TODO run container bitmap is not supported yet - 
NV_IF_TARGET(NV_IS_HOST, - CUCO_FAIL("Invalid bitmap format");) // TODO device error handling - return; cuda::std::size_t s = (num_containers + 7) / 8; run_container_bitmap = cuda::std::distance(bitmap, buf); buf += s; } - key_cards = cuda::std::distance(bitmap, buf); + key_cards = cuda::std::distance(bitmap, buf); + bool const aligned_16 = (reinterpret_cast(bitmap + key_cards) % + sizeof(cuda::std::uint16_t)) == 0; buf += num_containers * 2 * sizeof(cuda::std::uint16_t); if ((!has_run) || (num_containers >= no_offset_threshold)) { container_offsets = cuda::std::distance(bitmap, buf); - offsets_aligned = (reinterpret_cast(bitmap + container_offsets) % - sizeof(cuda::std::uint32_t)) == 0; - buf += num_containers * 4; + buf += num_containers * sizeof(cuda::std::uint32_t); + } else { + valid = false; + NV_IF_TARGET( + NV_IS_HOST, + CUCO_FAIL("Invalid bitmap format: not implemented");) // TODO device error handling + return; } - num_keys = 0; - cuda::std::uint16_t const* cards = - reinterpret_cast(bitmap + key_cards); cuda::std::uint32_t card = 0; for (cuda::std::int32_t i = 0; i < num_containers; i++) { - // cuda::std::uint16_t key = key_cards[i * 2]; - card = cards[i * 2 + 1] + 1; + if (aligned_16) { + card = aligned_load(bitmap + key_cards + + (i * 2 + 1) * sizeof(cuda::std::uint16_t)) + + 1u; + } else { + card = misaligned_load(bitmap + key_cards + + (i * 2 + 1) * sizeof(cuda::std::uint16_t)) + + 1u; + } num_keys += card; } - // find end of roaring bitmap + // find end of roaring bitmap (re-use card from last container) cuda::std::byte const* end = - bitmap + container_offset(bitmap + container_offsets, offsets_aligned, num_containers - 1); - if (is_run_container(reinterpret_cast(bitmap + run_container_bitmap), - has_run, - num_containers - 1)) { - // TODO implement + bitmap + misaligned_load( + bitmap + container_offsets + (num_containers - 1) * sizeof(cuda::std::uint32_t)); + if (has_run and (static_cast( + (bitmap + run_container_bitmap)[(num_containers - 1) / 
8]) & + (cuda::std::uint8_t(1) << ((num_containers - 1) % 8)))) { + cuda::std::uint16_t const num_runs = misaligned_load(end); + end += sizeof(cuda::std::uint16_t) + num_runs * 2 * sizeof(cuda::std::uint16_t); } else { if (card <= 4096) { // TODO check if this is correct end += card * sizeof(cuda::std::uint16_t); @@ -155,6 +154,73 @@ struct roaring_bitmap_metadata { } }; -// TODO implement roaring_bitmap_metadata +template <> +struct roaring_bitmap_metadata { + cuda::std::size_t num_buckets = 0; + cuda::std::size_t size_bytes = 0; + cuda::std::size_t num_keys = 0; + bool valid = false; + + struct bucket_metadata { + cuda::std::size_t byte_offset; + cuda::std::uint32_t key; + roaring_bitmap_metadata metadata; + + bucket_metadata(cuda::std::size_t offset, + cuda::std::uint32_t k, + roaring_bitmap_metadata const& meta) + : byte_offset{offset}, key{k}, metadata{meta} + { + } + }; + __host__ roaring_bitmap_metadata(cuda::std::byte const* bitmap, + std::vector& bucket_metadata) + { + cuda::std::size_t byte_offset = 0; + cuda::std::byte const* bitmap_ptr = bitmap; + cuda::std::memcpy(&num_buckets, bitmap_ptr, sizeof(cuda::std::uint64_t)); + byte_offset += sizeof(cuda::std::uint64_t); // skip num_buckets + + bucket_metadata.clear(); + bucket_metadata.reserve(num_buckets); + + for (cuda::std::size_t i = 0; i < num_buckets; ++i) { + cuda::std::uint32_t bucket_key; + cuda::std::memcpy(&bucket_key, bitmap_ptr + byte_offset, sizeof(cuda::std::uint32_t)); + byte_offset += sizeof(cuda::std::uint32_t); // skip bucket key + roaring_bitmap_metadata bucket_meta{bitmap_ptr + byte_offset}; + if (!bucket_meta.valid) { + valid = false; + return; + } + bucket_metadata.emplace_back(byte_offset, bucket_key, bucket_meta); + num_keys += bucket_meta.num_keys; + byte_offset += bucket_meta.size_bytes; // skip bucket + } + size_bytes = byte_offset; + valid = true; + } + + __host__ __device__ roaring_bitmap_metadata(cuda::std::byte const* bitmap) + { + cuda::std::size_t byte_offset = 0; + 
cuda::std::byte const* bitmap_ptr = bitmap; + cuda::std::memcpy(&num_buckets, bitmap_ptr, sizeof(cuda::std::uint64_t)); + byte_offset += sizeof(cuda::std::uint64_t); // skip num_buckets + + for (cuda::std::size_t i = 0; i < num_buckets; ++i) { + byte_offset += sizeof(cuda::std::uint32_t); // skip bucket key + roaring_bitmap_metadata bucket_meta{bitmap_ptr + byte_offset}; + if (!bucket_meta.valid) { + valid = false; + return; + } + num_keys += bucket_meta.num_keys; + byte_offset += bucket_meta.size_bytes; // skip bucket + } + size_bytes = byte_offset; + valid = true; + } +}; } // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh index 69ad93ae8..4ca3fb8a2 100644 --- a/include/cuco/roaring_bitmap.cuh +++ b/include/cuco/roaring_bitmap.cuh @@ -25,53 +25,135 @@ namespace cuco { +/** + * @brief GPU-accelerated container that owns a serialized Roaring bitmap. + * + * The `roaring_bitmap` provides host-side bulk membership queries over a bitmap stored in the + * [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec). + * The serialized bytes are copied to device-accessible storage upon construction, and queries are + * executed on the GPU. + * + * In addition to bulk host APIs such as `contains`/`contains_async`, this container exposes a + * non-owning reference object via `ref()` that can be used for device-side per-thread queries. + * + * @tparam T Key type. Must be `cuda::std::uint32_t` or `cuda::std::uint64_t`. + * @tparam Allocator Allocator type used to manage device-accessible storage for the serialized + * bytes. 
+ */ template > class roaring_bitmap { public: - using storage_type = detail::roaring_bitmap_storage; - using allocator_type = typename storage_type::allocator_type; - using ref_type = roaring_bitmap_ref; - + using value_type = T; ///< Key type + using storage_type = detail::roaring_bitmap_storage; ///< Storage implementation + using allocator_type = typename storage_type::allocator_type; ///< Allocator type + using ref_type = roaring_bitmap_ref; ///< Non-owning reference type + + /** + * @brief Constructs a `roaring_bitmap` by copying the serialized bytes to device-accessible + * storage. + * + * @param bitmap Pointer to the beginning of the serialized bitmap in host memory + * @param alloc Allocator used to allocate device-accessible storage + * @param stream CUDA stream used for device memory operations during construction + */ roaring_bitmap(cuda::std::byte const* bitmap, Allocator const& alloc = {}, cuda::stream_ref stream = {}); - roaring_bitmap(roaring_bitmap const& other) = default; - roaring_bitmap(roaring_bitmap&& other) = default; - roaring_bitmap& operator=(roaring_bitmap const& other) = default; - roaring_bitmap& operator=(roaring_bitmap&& other) = default; - - ~roaring_bitmap() = default; - + roaring_bitmap(roaring_bitmap const& other) = default; ///< Copy constructor + roaring_bitmap(roaring_bitmap&& other) = default; ///< Move constructor + roaring_bitmap& operator=(roaring_bitmap const& other) = default; ///< Copy assignment + roaring_bitmap& operator=(roaring_bitmap&& other) = default; ///< Move assignment + + ~roaring_bitmap() = default; ///< Destructor + + /** + * @brief Bulk membership query for keys in `[first, last)`. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_async`. 
+ * + * @tparam InputIt Device-accessible random access input iterator of keys convertible to `T` + * @tparam OutputIt Device-accessible random access output iterator to `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param contained Output iterator where results are written; `true` iff the corresponding key + * is present in the bitmap + * @param stream CUDA stream used for device memory operations and kernel launches + */ template void contains(InputIt first, InputIt last, OutputIt contained, cuda::stream_ref stream = {}) const; + /** + * @brief Asynchronously performs a bulk membership query for keys in `[first, last)`. + * + * @tparam InputIt Device-accessible random access input iterator of keys convertible to `T` + * @tparam OutputIt Device-accessible random access output iterator to `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param contained Output iterator where results are written; `true` iff the corresponding key + * is present in the bitmap + * @param stream CUDA stream used for device memory operations and kernel launches + */ template void contains_async(InputIt first, InputIt last, OutputIt contained, cuda::stream_ref stream = {}) const noexcept; - // TODO contains_if, contains_if_async, empty - + /** + * @brief Number of keys stored in the bitmap. + * + * @return Count of keys in the bitmap + */ [[nodiscard]] cuda::std::size_t size() const noexcept; + /** + * @brief Checks whether the bitmap contains no keys. + * + * @return `true` iff `size() == 0` + */ [[nodiscard]] bool empty() const noexcept; + /** + * @brief Returns a pointer to the beginning of the serialized bitmap bytes in device-accessible + * storage. + * + * @return Pointer to the serialized storage + */ [[nodiscard]] cuda::std::byte const* data() const noexcept; + /** + * @brief Size in bytes of the serialized bitmap storage. 
+ * + * @return Number of bytes occupied by the serialized bitmap + */ [[nodiscard]] cuda::std::size_t size_bytes() const noexcept; + /** + * @brief Returns the allocator used to manage device-accessible storage. + * + * @return Allocator instance + */ [[nodiscard]] allocator_type allocator() const noexcept; + /** + * @brief Returns a non-owning reference to the underlying bitmap suitable for device-side use. + * + * The returned reference type provides device functions such as `contains(T)` for per-thread + * membership testing. + * + * @return Non-owning reference to the underlying bitmap + */ [[nodiscard]] ref_type ref() const noexcept; private: storage_type storage_; - ref_type ref_; }; } // namespace cuco diff --git a/include/cuco/roaring_bitmap_ref.cuh b/include/cuco/roaring_bitmap_ref.cuh index 41994099f..88b704c28 100644 --- a/include/cuco/roaring_bitmap_ref.cuh +++ b/include/cuco/roaring_bitmap_ref.cuh @@ -23,39 +23,122 @@ namespace cuco { +/** + * @brief Non-owning reference to a Roaring bitmap stored in its serialized format. + * + * A `roaring_bitmap_ref` provides device and host APIs to query membership against a bitmap that + * is laid out according to the [Roaring bitmap format + * specification](https://github.com/RoaringBitmap/RoaringFormatSpec). The object does not own the + * underlying storage; it simply provides algorithms over the referenced bytes. + * + * @note The reference reads directly from the serialized representation without deserializing. + * It supports 32-bit and 64-bit key types. For 32-bit bitmaps the layout follows the + * "Standard 32-bit Roaring Bitmap" format; for 64-bit bitmaps, the "portable" format is + * supported. + * + * @tparam T Key type stored in the bitmap. Must be `cuda::std::uint32_t` or `cuda::std::uint64_t`. 
+ */ template class roaring_bitmap_ref { using impl_type = detail::roaring_bitmap_impl; public: - using storage_ref_type = typename impl_type::storage_ref_type; - + using value_type = T; ///< Key type stored in the bitmap + using storage_ref_type = typename impl_type::storage_ref_type; ///< Implementation storage ref + + /** + * @brief Constructs a non-owning reference from an implementation-specific storage reference. + * + * @param storage_ref Reference to the underlying serialized bitmap storage + */ __host__ __device__ roaring_bitmap_ref(storage_ref_type const& storage_ref); + /** + * @brief Constructs a device-side reference from a raw pointer to a 32-bit Roaring bitmap. + * + * @note This constructor is only available when `T == cuda::std::uint32_t` and can be used in + * device code to create a lightweight view over device-resident serialized bytes. + * + * @param bitmap Pointer to the beginning of the serialized bitmap in device memory + */ template >> __device__ roaring_bitmap_ref(cuda::std::byte const* bitmap); + /** + * @brief Bulk membership query for keys in `[first, last)`. + * + * @note This function synchronizes the given stream. For asynchronous execution use + * `contains_async`. + * + * @tparam InputIt Device-accessible random access input iterator of keys convertible to `T` + * @tparam OutputIt Device-accessible random access output iterator to `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param contained Output iterator where results are written; `true` iff the corresponding key + * is present in the bitmap + * @param stream CUDA stream used for device memory operations and kernel launches + */ template __host__ void contains(InputIt first, InputIt last, OutputIt contained, cuda::stream_ref stream = {}) const; + /** + * @brief Asynchronously performs a bulk membership query for keys in `[first, last)`. 
+ * + * @tparam InputIt Device-accessible random access input iterator of keys convertible to `T` + * @tparam OutputIt Device-accessible random access output iterator to `bool` + * + * @param first Beginning of the sequence of keys + * @param last End of the sequence of keys + * @param contained Output iterator where results are written; `true` iff the corresponding key + * is present in the bitmap + * @param stream CUDA stream used for device memory operations and kernel launches + */ template __host__ void contains_async(InputIt first, InputIt last, OutputIt contained, cuda::stream_ref stream = {}) const noexcept; + /** + * @brief Device-side membership query for a single key. + * + * @param value Key to test for membership + * + * @return `true` iff `value` is contained in the bitmap + */ __device__ bool contains(T value) const; + /** + * @brief Number of keys stored in the bitmap. + * + * @return Count of keys in the bitmap + */ [[nodiscard]] __host__ __device__ cuda::std::size_t size() const noexcept; + /** + * @brief Checks whether the bitmap contains no keys. + * + * @return `true` iff `size() == 0` + */ [[nodiscard]] __host__ __device__ bool empty() const noexcept; + /** + * @brief Returns a pointer to the beginning of the serialized bitmap bytes. + * + * @return Pointer to the serialized storage + */ [[nodiscard]] __host__ __device__ cuda::std::byte const* data() const noexcept; + /** + * @brief Size in bytes of the serialized bitmap storage. 
+ * + * @return Number of bytes occupied by the serialized bitmap + */ [[nodiscard]] __host__ __device__ cuda::std::size_t size_bytes() const noexcept; private: From a56e3a9392aed3ef6ec9c41daa97660ef773000c Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 13 Aug 2025 10:54:52 -0700 Subject: [PATCH 10/24] Update readme --- README.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index c66f76f30..546ebf99a 100644 --- a/README.md +++ b/README.md @@ -259,4 +259,11 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection `cuco::bloom_filter` implements a Blocked Bloom Filter for approximate set membership queries. #### Examples: -- [Host-bulk APIs (Default fingerprinting policy)](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJydVmtvGjkU_StXsx8WmuEVbVUJQiSapLtoK5IF2qpaVsjj8TBWBnvqBwRF-e977ZmBgZBqtVRqwL6Pc889vvZzoJnWXAod9P9-Dngc9HthkBGxsmTFgn5AbUyCMNDSKup-d94tBLyDG5nvFF-lBhq0CZfdy99CmHwd345HcHM_fbifjubj-0nb2Xr7z5wyoVkMVsRMgUkZjHJC8U-5E8JXphwQuGx3oeEMFkG5twiaAx9lJy2syQ6ENGA1wzBcQ8IzBuyJstwAF0DlOs84EZTBlpvUpyrjeDjwvQwiI0PQnqBHjr-SuiUQs4fuPqkxeb_T2W63beJht6VadbLCWHc-j2_uJrO7FkLfu30RGTILiv2wXGHh0Q5IjsgoiRBvRrYgFZCVYrhnpEO-VdxwsQpBy8RsiWI-Tsy1UTyy5oi8CifWXzdA-ohA4kYzGM8WAXwczcaz0Mf5Np7_cf9lDt9G0-loMh_fzeB-is2a3I5dq_DXJxhNvsOf48ltCAypw1TsKVeuCoTKHa0sLjicMXYEI5EFLJ0zyhNOoVIQrOSGKYFlQc7UmhdaQ5Cxj5PxNTfE-LVXxflUnYVYiF-4oJmNGVxRS2UnyqRcL7Hvhqk2ten1sY1JldWmQ6UVpu02X23FbIMplhtGjVTnTdgTo9YBW-YSm7Y7b6Wxuwyl1j7FwCU2hZG1X-bCoOK4aGwkj5sL8YyFgVukWLdxHIOw6-Uj22kntiH0ur92u90B7D-dTucKfmeCKWJYuQ3O_nwkkxduw0Pcd9Btvx-UkcbIrjKe64QrbSAlWeLjuWCy3PD0vpFAvErQKjMPXntoG5Xd0uhx6SvzOPCr22xVmygR8M0tszvynKmDBmaXF0I7tgCcBE5eaLL0JkOXfbB3neVk60553auYDDFLiM0MFA12mjwF5Kt3kuv365q7qnJdl_GeaxW-lKm1ift91KCBqys8kR9t9ojAPO-e47fxJFgOU7lyJK6Knru5WMDsY6yFgNoHw_tcTMRZmbtQZ79_pPMaatewR
tU5P1v3LpWiG26rHbEVyrYZeo82ZnDfe80yDbFOK_nSWyHvdZdB3QAdvVj2thd1sRRWYh-mcDjaqgIcYBwafF7M5Tz3Ds6wlDOJ40aFIiwzNX_KWiRldu0scRBidxoF8BASkml2zN1ZR1F3PDoptRhlKX9Zpna107efqziFN1xane1KDeGs35eGjnN3EepU2iyGIh34q80oy1q51HixbBj44YHEzB-mw16dFTyq7i7Ur6gJD4VXjf0vcH2ZJiUG8Bbzt7W7RpjwqE6Gizui3N3W9QOhvWOuZEQyvN7wQomJIahzZamxGCushSmjsKeUR9yga8HrSd2fHqbXXTxjOZbl5oUskGArIkSOrHhfXtVWq8oNhmPTlOiUafcEif3MwnrP0yn2dIqSTvEGnUkmkS7HtgM7LI64X2xU8vIXWuNYbGcaVF8qT6yTQbMJnTJgIb9Cu0Xe5H_kPa2kvvSTvKJ5fkA6UeKLBb9XJPhl17dyPTmsnw48xVAVArr48wVfq-4NiK9CdXjUBmJDae_yve3htsxN8eINWhhoSC8ueh-gRRRNh3q9_NCFVgvvLYP_GczB4lZG1pF_Bmc8qsWklGa4uCkerriA9YrH4CWs9vHmONpH7oKXf_y_fwHeCexw)) \ No newline at end of file +- [Host-bulk APIs (Default fingerprinting policy)](https://github.com/NVIDIA/cuCollections/blob/dev/examples/bloom_filter/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJydVmtvGjkU_StXsx8WmuEVbVUJQiSapLtoK5IF2qpaVsjj8TBWBnvqBwRF-e977ZmBgZBqtVRqwL6Pc889vvZzoJnWXAod9P9-Dngc9HthkBGxsmTFgn5AbUyCMNDSKup-d94tBLyDG5nvFF-lBhq0CZfdy99CmHwd345HcHM_fbifjubj-0nb2Xr7z5wyoVkMVsRMgUkZjHJC8U-5E8JXphwQuGx3oeEMFkG5twiaAx9lJy2syQ6ENGA1wzBcQ8IzBuyJstwAF0DlOs84EZTBlpvUpyrjeDjwvQwiI0PQnqBHjr-SuiUQs4fuPqkxeb_T2W63beJht6VadbLCWHc-j2_uJrO7FkLfu30RGTILiv2wXGHh0Q5IjsgoiRBvRrYgFZCVYrhnpEO-VdxwsQpBy8RsiWI-Tsy1UTyy5oi8CifWXzdA-ohA4kYzGM8WAXwczcaz0Mf5Np7_cf9lDt9G0-loMh_fzeB-is2a3I5dq_DXJxhNvsOf48ltCAypw1TsKVeuCoTKHa0sLjicMXYEI5EFLJ0zyhNOoVIQrOSGKYFlQc7UmhdaQ5Cxj5PxNTfE-LVXxflUnYVYiF-4oJmNGVxRS2UnyqRcL7Hvhqk2ten1sY1JldWmQ6UVpu02X23FbIMplhtGjVTnTdgTo9YBW-YSm7Y7b6Wxuwyl1j7FwCU2hZG1X-bCoOK4aGwkj5sL8YyFgVukWLdxHIOw6-Uj22kntiH0ur92u90B7D-dTucKfmeCKWJYuQ3O_nwkkxduw0Pcd9Btvx-UkcbIrjKe64QrbSAlWeLjuWCy3PD0vpFAvErQKjMPXntoG5Xd0uhx6SvzOPCr22xVmygR8M0tszvynKmDBmaXF0I7tgCcBE5eaLL0JkOXfbB3neVk60553auYDDFLiM0MFA12mjwF5Kt3kuv365q7qnJdl_GeaxW-lKm1ift91KCBqys8kR9t9ojAPO-e47fxJFgOU7lyJK6Knru5WMDsY6yFgNoHw_tcTMRZmbtQZ79_pPMaatewRtU5P1v3LpWiG26rHbEVyrYZeo82ZnDfe80yDbFOK_nSWyHvdZdB3QAdvVj2thd1sRRWYh-mcDjaqgIcYBwafF7M5Tz3Ds
6wlDOJ40aFIiwzNX_KWiRldu0scRBidxoF8BASkml2zN1ZR1F3PDoptRhlKX9Zpna107efqziFN1xane1KDeGs35eGjnN3EepU2iyGIh34q80oy1q51HixbBj44YHEzB-mw16dFTyq7i7Ur6gJD4VXjf0vcH2ZJiUG8Bbzt7W7RpjwqE6Gizui3N3W9QOhvWOuZEQyvN7wQomJIahzZamxGCushSmjsKeUR9yga8HrSd2fHqbXXTxjOZbl5oUskGArIkSOrHhfXtVWq8oNhmPTlOiUafcEif3MwnrP0yn2dIqSTvEGnUkmkS7HtgM7LI64X2xU8vIXWuNYbGcaVF8qT6yTQbMJnTJgIb9Cu0Xe5H_kPa2kvvSTvKJ5fkA6UeKLBb9XJPhl17dyPTmsnw48xVAVArr48wVfq-4NiK9CdXjUBmJDae_yve3htsxN8eINWhhoSC8ueh-gRRRNh3q9_NCFVgvvLYP_GczB4lZG1pF_Bmc8qsWklGa4uCkerriA9YrH4CWs9vHmONpH7oKXf_y_fwHeCexw)) + +### roaring_bitmap + +`cuco::roaring_bitmap` implements a Roaring bitmap following the [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec). + +#### Examples: +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WAtv2zYQ_is3DSjkxrac9Dk3zuY16WasS4Yk21A0hUBJtE1YEjWSiuMF-e-7IyVbcp31uTlAJJHH7x787njSrae51kLm2hu-vfVE4g33u17K8lnJZtwbenGZMK_raVmqmJ6Dh1c5PISXslgpMZsb8OMOHAwOnsDpH5PjyRhenp3_dnY-vpycnfZJ1Iq_FjHPNU-gzBOuwMw5jAsW46Wa6cIfXJEdcNAfgE8CV141d-V1XliUlSwhYyvIpYFSc4QRGqYi5cBvYl4YEDnEMitSwfKYw1KYuVVV4Vhz4E0FIiPDUJ7higKfpk1JYGZtOv3mxhTDIFgul31mze5LNQtSJ6yD15OXJ6cXJz00fb3s9zzFwILif5VCoePRCliBlsUsQntTtgSpgM0UxzkjyfKlEkbksy5oOTVLprjFSYQ2SkSlaQWvthP9bwpg-FiOgRtfwOTiyoMfxxeTi67F-XNy-fPZ75fw5_j8fHx6OTm5gLNz3KzT4wltFT69gvHpG_hlcnrcBY6hQ1X8plDkBZoqKKw8cTG84LxlxlQ6s3TBYzEVMdQEgpm85ipHt6DgKhOOamhkYnFSkQnDjB17zzmrKrjKvxV5nJYJh8O4jGWgJFOIF0bCZKzox-X86D2Z0ohUmFVgFBNG9-dFgTJtqYQF2iSBWRU8dGItGDNXpTZBwq_RmPCax0aq_nyXSCpnuKnp7skyF-i_ZmkToik3xc3jLGutFXLHIG1yPmsNOUgLGDx0rPvBZsMcAcKoTBchv2G4bxyj5KYjJfgUjnmGEUevDce4a9qniv_t4CKTCAWJlEq5KAsLDOPfJnqT2ZPcpWGlCZZIbsmQ00sJjw56CAQOzO46MpTD08eNYfALqYxNCmRRxkwHpkpmZI3Ff3vuTPrRSr-yIhdIs3c-JaXGrJwhV8uoj4kftGTrp82aDqZjIbXAqK3WHMRsjhcgnP_kbu0s-onyplRITpqLpVIYcRzTZYqsglOW8XTVJZcxkMYKTWWayiXxnXZCD62KnvOkCpHeonDgLlStZGlUmet-JPLP8S-IUhkFGdOGq4BMSphhu-E7n2
bYf2nVJ5lUk6V6fvr4q1r19PFuBZ2a7gElm-FoGuYOZjlWjxxZAL_w1SXeYy5GUqaOUz5iDocub5E8mHAPKs6HxI2wYGaOwLeIC6zEM2DGc045GS74SsMI3r7zO9A7AldLhsNWMTqsVYIFAGKwVUI1Gw9lLHCkHC0QOtRoY3hdL-lCY7YUuXl0EJqjTg0EEARI8ilXHE_RYWPwa-39-cn4-NeTfpZ8S0M9Gqu1WJsqD3dZCRSbF7U0nTv-DjFYYPgGL_ByCPsD-tH93sg-NBwFC9cvSj0PI4Zbtuisse9aShDYgm7QDrHrcfd7e4t_w3yEfP4g7rMG7vOPw92F6erVPYzZFU_fgkZ8JnK_03UqeJ74nRr8DnhKjcbns-vp4_-VXZjDG369l82YzB9BNWvyh6lGYo5qN3bLyjR1O4jP39XPn7GLH9I1buvaH3yBri2Rwc1BhbaR3SnypCVyr8XCJaLYMlRQMh4QxL-YO7h5Xi2APRBfyHa7pR_L9tsNR7A7jUOmNVfGp96S1GDRT3huwilD4XUh7mK7VN1DhgZBxGFdkDA-tRXV60zTg9u7Wj9d6IGumBNnqMp2EvTs8q3qGO2ov32edCshqYdDpDpTK6cL09f_hqT6mK8SQdHl2ku7IuZKweEhuvCKoZh9JyE5HKDhbT00ZtdhJNLK9soXGxQ7dLd24ydu3HuaFn9bV6wpmvPFzB80bUY4Z3B9dqKntMYuCO3dyC02PE1nvhO-Dw03ulPFsubIpiG3TXIh8pwnOygTrQwerVE5xULlr5V3NjtzzrHJtT7htkrIsKVWq7UxaHfiK45TXOErlAljLFKH8Zyph0d-bYtiy7CQVsbO-05dn6oY7g8StKW4go5TqblfW-Io2W6SNq1Be_yjFVfYtimpepFWb1KFfXfeUQN0RIcFvV7zxKUc-UAZZ8lR42-_ybkl-t4k7W5Q6-kaynZdLE3DtQTaXNtH43Lqv7e4iVcpwERoHG2U5PgaeXvn_K0I3lKDE5bmdI5njFCbHR0FNEyECpEDU3HT6OmavWEzD6sh97nFpdoIwvDV5PVJGFaJZqHxPQa2fqPmsv5U5EmY4u6S71decEW_dempfPEJZjRq6h4OcxzswPfNQQTo42IYtlToMsJpyjla8aJZu-yG6DKO6fPByDXEuxuQrRhhePaw6Ox-a6ntr4EffBnyp8K6M-Qe2J0vDg55XdBi9MVV2UrVsKqvfq36exxAdzkO05zNFwr8Vr1t0HGzcoBL9omQXtejz2BYLdTms56XX8fx_sGTch-nZWHcNz-vh7CjeG9v_xn0mIrnI52FzwbQ62GnZ_CfobMu6aUsi-yHwFREDcw4jlMcvHbf7nCAtmrh3XXreSxPrXmsFd7dO_v3DxFG5ro=)) \ No newline at end of file From 078ea6758b51af06fc1003a34aad251105cef118 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 13 Aug 2025 17:26:17 -0700 Subject: [PATCH 11/24] Download roaring bitmap .bin files at build time --- CMakeLists.txt | 6 ++ README.md | 2 +- benchmarks/roaring_bitmap/contains_bench.cu | 88 ++++++++++-------- cmake/roaring_testdata.cmake | 39 ++++++++ examples/roaring_bitmap/bitmapwithoutruns.bin | Bin 72616 -> 0 bytes examples/roaring_bitmap/bitmapwithruns.bin | Bin 48056 -> 0 bytes 
examples/roaring_bitmap/host_bulk_example.cu | 48 ++++++---- examples/roaring_bitmap/portable_bitmap64.bin | Bin 16506 -> 0 bytes 8 files changed, 125 insertions(+), 58 deletions(-) create mode 100644 cmake/roaring_testdata.cmake delete mode 100644 examples/roaring_bitmap/bitmapwithoutruns.bin delete mode 100644 examples/roaring_bitmap/bitmapwithruns.bin delete mode 100644 examples/roaring_bitmap/portable_bitmap64.bin diff --git a/CMakeLists.txt b/CMakeLists.txt index a00476b4e..2bf5f58e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,12 @@ target_include_directories(cuco INTERFACE target_link_libraries(cuco INTERFACE CCCL::CCCL CUDA::toolkit) target_compile_features(cuco INTERFACE cxx_std_17 cuda_std_17) +################################################################################################### +# - Optionally download RoaringFormatSpec test data ----------------------------------------------- + +option(CUCO_DOWNLOAD_ROARING_TESTDATA "Download RoaringFormatSpec test data" ON) +include(${CMAKE_CURRENT_LIST_DIR}/cmake/roaring_testdata.cmake) + ################################################################################################### # - optionally build tests ------------------------------------------------------------------------ diff --git a/README.md b/README.md index 546ebf99a..f9cc3efc0 100644 --- a/README.md +++ b/README.md @@ -266,4 +266,4 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection `cuco::roaring_bitmap` implements a Roaring bitmap following the [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec). 
#### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WAtv2zYQ_is3DSjkxrac9Dk3zuY16WasS4Yk21A0hUBJtE1YEjWSiuMF-e-7IyVbcp31uTlAJJHH7x787njSrae51kLm2hu-vfVE4g33u17K8lnJZtwbenGZMK_raVmqmJ6Dh1c5PISXslgpMZsb8OMOHAwOnsDpH5PjyRhenp3_dnY-vpycnfZJ1Iq_FjHPNU-gzBOuwMw5jAsW46Wa6cIfXJEdcNAfgE8CV141d-V1XliUlSwhYyvIpYFSc4QRGqYi5cBvYl4YEDnEMitSwfKYw1KYuVVV4Vhz4E0FIiPDUJ7higKfpk1JYGZtOv3mxhTDIFgul31mze5LNQtSJ6yD15OXJ6cXJz00fb3s9zzFwILif5VCoePRCliBlsUsQntTtgSpgM0UxzkjyfKlEkbksy5oOTVLprjFSYQ2SkSlaQWvthP9bwpg-FiOgRtfwOTiyoMfxxeTi67F-XNy-fPZ75fw5_j8fHx6OTm5gLNz3KzT4wltFT69gvHpG_hlcnrcBY6hQ1X8plDkBZoqKKw8cTG84LxlxlQ6s3TBYzEVMdQEgpm85ipHt6DgKhOOamhkYnFSkQnDjB17zzmrKrjKvxV5nJYJh8O4jGWgJFOIF0bCZKzox-X86D2Z0ohUmFVgFBNG9-dFgTJtqYQF2iSBWRU8dGItGDNXpTZBwq_RmPCax0aq_nyXSCpnuKnp7skyF-i_ZmkToik3xc3jLGutFXLHIG1yPmsNOUgLGDx0rPvBZsMcAcKoTBchv2G4bxyj5KYjJfgUjnmGEUevDce4a9qniv_t4CKTCAWJlEq5KAsLDOPfJnqT2ZPcpWGlCZZIbsmQ00sJjw56CAQOzO46MpTD08eNYfALqYxNCmRRxkwHpkpmZI3Ff3vuTPrRSr-yIhdIs3c-JaXGrJwhV8uoj4kftGTrp82aDqZjIbXAqK3WHMRsjhcgnP_kbu0s-onyplRITpqLpVIYcRzTZYqsglOW8XTVJZcxkMYKTWWayiXxnXZCD62KnvOkCpHeonDgLlStZGlUmet-JPLP8S-IUhkFGdOGq4BMSphhu-E7n2bYf2nVJ5lUk6V6fvr4q1r19PFuBZ2a7gElm-FoGuYOZjlWjxxZAL_w1SXeYy5GUqaOUz5iDocub5E8mHAPKs6HxI2wYGaOwLeIC6zEM2DGc045GS74SsMI3r7zO9A7AldLhsNWMTqsVYIFAGKwVUI1Gw9lLHCkHC0QOtRoY3hdL-lCY7YUuXl0EJqjTg0EEARI8ilXHE_RYWPwa-39-cn4-NeTfpZ8S0M9Gqu1WJsqD3dZCRSbF7U0nTv-DjFYYPgGL_ByCPsD-tH93sg-NBwFC9cvSj0PI4Zbtuisse9aShDYgm7QDrHrcfd7e4t_w3yEfP4g7rMG7vOPw92F6erVPYzZFU_fgkZ8JnK_03UqeJ74nRr8DnhKjcbns-vp4_-VXZjDG369l82YzB9BNWvyh6lGYo5qN3bLyjR1O4jP39XPn7GLH9I1buvaH3yBri2Rwc1BhbaR3SnypCVyr8XCJaLYMlRQMh4QxL-YO7h5Xi2APRBfyHa7pR_L9tsNR7A7jUOmNVfGp96S1GDRT3huwilD4XUh7mK7VN1DhgZBxGFdkDA-tRXV60zTg9u7Wj9d6IGumBNnqMp2EvTs8q3qGO2ov32edCshqYdDpDpTK6cL09f_hqT6mK8SQdHl2ku7IuZKweEhuvCKoZh9JyE5HKDhbT00ZtdhJNLK9soXGxQ7dLd24ydu3HuaFn9bV6wpmvPFzB80bUY4Z3B9dqKntMYuCO3
dyC02PE1nvhO-Dw03ulPFsubIpiG3TXIh8pwnOygTrQwerVE5xULlr5V3NjtzzrHJtT7htkrIsKVWq7UxaHfiK45TXOErlAljLFKH8Zyph0d-bYtiy7CQVsbO-05dn6oY7g8StKW4go5TqblfW-Io2W6SNq1Be_yjFVfYtimpepFWb1KFfXfeUQN0RIcFvV7zxKUc-UAZZ8lR42-_ybkl-t4k7W5Q6-kaynZdLE3DtQTaXNtH43Lqv7e4iVcpwERoHG2U5PgaeXvn_K0I3lKDE5bmdI5njFCbHR0FNEyECpEDU3HT6OmavWEzD6sh97nFpdoIwvDV5PVJGFaJZqHxPQa2fqPmsv5U5EmY4u6S71decEW_dempfPEJZjRq6h4OcxzswPfNQQTo42IYtlToMsJpyjla8aJZu-yG6DKO6fPByDXEuxuQrRhhePaw6Ox-a6ntr4EffBnyp8K6M-Qe2J0vDg55XdBi9MVV2UrVsKqvfq36exxAdzkO05zNFwr8Vr1t0HGzcoBL9omQXtejz2BYLdTms56XX8fx_sGTch-nZWHcNz-vh7CjeG9v_xn0mIrnI52FzwbQ62GnZ_CfobMu6aUsi-yHwFREDcw4jlMcvHbf7nCAtmrh3XXreSxPrXmsFd7dO_v3DxFG5ro=)) \ No newline at end of file +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WAtv4zYS_itzWqCQN7blZB9pncfVG2d7Rrd2YXu7KDYLgZJom7AsqiQVxxfkv3dISpbkKN1He-cAsUUOv3nwm-FQ946kUjKeSKf_8d5hkdM_bjsxSZYZWVKn74RZRJy2I3kmQv3sPb9J4Dlc8XQn2HKlwA1bcNI7eQXj30bD0QCuJtNfJ9PBfDQZd7WoEX_HQppIGkGWRFSAWlEYpCTEr3ymDb9Roe2Ak24PXC1w4-RzN07rzKDseAYbsoOEK8gkRRgmYcFiCvQupKkClkDIN2nMSBJS2DK1MqpyHGMO_J6D8EARlCe4IsWnRVUSiNqbrj8rpdK-52232y4xZne5WHqxFZbeu9HV9Xh23UHT98veJzEGFgT9I2MCHQ92QFK0LCQB2huTLXABZCkozimuLd8KpliybIPkC7UlghqciEklWJCpWvAKO9H_qgCGjyQYuMEMRrMbB94MZqNZ2-B8GM3_M3k_hw-D6XQwno-uZzCZ4maNhyO9Vfj0Fgbj3-Hn0XjYBoqhQ1X0LhXaCzSV6bDSyMZwRmnNjAW3ZsmUhmzBQigIBEt-S0WCbkFKxYZZqqGRkcGJ2YYposzYI-eMKu8muUmesSSMs4jCeZiF3BOcCET0A6Y2JO2G2erykUymWMzUzlOCMCW7qzS9PESKiCdV5IX4L6KLyycnWaKaJ9Uupb5VUBNQK5FJ5UX0Fh3xb2mouOiumkRivkRCxM2TWcIwdpLEVYiq3AI3npJNbS3jDYOaIMmyNmQhDaD33DL2R5NJKwTwgyxe-_SO4J5TjK-dDgSjCxjSDe4Weq0o7pnUe5znTn1bkIUaBUkYc77OUgMMg19HsqwKo8SmcK4JtpgYnGA-bDm8OOkgEFgwwxhkN4XXLyvD4KZcKJNQyMANUS1YCL7R1hj8j1Nr0hsj_daIzJCin1yd0BIzeok8z4IuFg2vJls8lWtamMoplwyjttvzFytBuAZm_dfuFs6inyivMoHE1nMhFwIjjmMyi5GPMCYbGu_a2mUMpDJCCx7HfKtzRe-E7BsVHetJHiJ5QH7PfulKxzMlskR2A5aYJa6NYOtbnPWCmAfeq-PTUxL94GkDI6JIs7LW15n5_7PxqwwsiJQ_v35Z2mgp9w
_a-Ppls7rWPjF-xBOOwryaG_lBktNpQ9aYd6munNAZXr2_mvjDyYfxu8lg6E8ng-lo_JM_v57Nh4P54GIy1idMgOWZqn3ymcKqKGJjJmPNwVqWICfhZ7qb42-sDAHnsWW4i3b3-7aKIJUx_b_LM9DXTPVTolZo_D3iAslQ15ImVFcIf013Ei7g4ye3BZ1LsJWt36-VxvNCJRgA0PlklOjTB9sLLLdaOVrApC_RRv-2WNKGymyGhfrFia8uWwUQgOfBFZZD9PCPjGLiGnuK06qkyj3y-aHgS5l9BcQ_Rs7p9WD4y3V3Ez3TQx09VqgxLuQBaXLKmH5WSGsX3AYxWGO0e2f4dQ7HPf3Rv48uzEMlLmDgumkmV35AcIfXrT32Q00JAhvQEu0c2z37--ho_VeYL5DJn8U9reB-_2W4TZi22D5BsKZ4ugY0oEuWuK22VUGTyG0V4A9AY91hfTsZX7_8CjI2VgRDxf8FE7EElVx8pNqWvs_R0rj3eVpqMUvLO7O9WRzb3cbnH4rnb9jxz-ka1HUd9_6GrgOR3t1JjlbKNoq8qok8aTGzScsODGU6cU80xF-Y27v7Pl8AR8D-ZmaYLf3SzLgvOYItfOgTKalQrm6_tRo8TyKaKH9BUHhf49vYF-a_YYMG6WNpX7wwPoUV-Z2v6sH9Q6Fff-kH_Y1JMUFVJlP0s83NvDU2o-7hUdXOhbjs95HqROysLkx1919aqou5zREUXS68NCtCKgScn6MLbwmKmYublsMBPXyoR4-ZdRiJOLc998UExQw97N34iSp7mZXsv8YVY4qkdL10e1WbEc4aXBzL6KleYxb45teFXaxoHC9dK_wUGm50K49lwZHy5mFuAylLEho1UCbYKTy1g2yxoMLdK2-VOzOl2M0bn3BbOWzw7iB2e2PQ7sgVFKeowHum8kMi1Xm4IuL5pVvYIsjWT7mRMfOuVdfVVQz3BwlaU5xDhzGX1K1YkhfevOsr7g_FbWHPHkveem9Y9if18S828ayyxbYvqh4ALsHUldhRx5FOBjxs9HsJJBdLKl1Ka99b5S1VrcVq8JSA3S_NUYnf9sKe3z-qm10vCLrpuyyNsLVAB1eXAsPaiqo3eOPp5DeeikdkiWvzO421Xssf3tqtCvlktWmXVhTTVS-Lm5eOnlZkNYsnA6jXmY4WF_ilzMU-DnqcL9xHSqt25IZhJaj0AbrKMbW7f7D8yzO8pgYnTJ7rpmdDNKrplp-xRYRXatO-F1277tj94Whaze992w2aUn7EBNrduOps76bMwlC_uql_LmxD39wR7cGPsKA9cbPMy3KB_t03AX4tmj2W6mjNFzanoIit12i5rde5hn5eqd1C479xAJ2jOKznDMER5LByV_a1XNnDJcc4-UyfhqXO_RnRfH1r3Lb8foZkMKTEAsKwGpj3l7Wr3o1T9gH558b5gvtf63Dho5Mp9876k0RsgYx12o5-m4qlUZRvh53kNgyPT15lxzhtzcJJp4NwF-HR0fEpdIgIVxdy45_2oNPBvlnhP6W7gagTk01g3ifHLKhghmEY4-CtfQWMA5pwa-ehXcxjWa7NY9FyHj6Zvz8Bdgyhgg==)) \ No newline at end of file diff --git a/benchmarks/roaring_bitmap/contains_bench.cu b/benchmarks/roaring_bitmap/contains_bench.cu index 2f727c541..da66bea4d 100644 --- a/benchmarks/roaring_bitmap/contains_bench.cu +++ b/benchmarks/roaring_bitmap/contains_bench.cu @@ -18,74 +18,80 @@ #include #include +#include #include +#include +#include #include +#include -#include #include -#include -#include 
+#include -void roaring_bitmap_contains(nvbench::state& state) -{ - namespace fs = std::filesystem; - - // Get the path of the current source file - fs::path source_file_path = __FILE__; - fs::path source_dir = source_file_path.parent_path(); +using namespace cuco::benchmark; // defaults +using namespace cuco::utility; // key_generator, distribution - fs::path path = source_dir / "../../examples/roaring_bitmap/bitmapwithoutruns.bin"; - fs::path full_path = path.lexically_normal(); +template +void roaring_bitmap_contains(nvbench::state& state, nvbench::type_list) +{ + auto const num_items = state.get_int64("NumInputs"); + auto const bitmap_file = state.get_string_or_default("BitmapFile", {}); - std::ifstream file(full_path, std::ios::binary); - if (!file.is_open()) { state.skip("Failed to open bitmap file"); } + std::ifstream file(bitmap_file, std::ios::binary); + if (!file.is_open()) { state.skip("Bitmap file not found"); return; } // Get file size file.seekg(0, std::ios::end); std::streamsize file_size = file.tellg(); file.seekg(0, std::ios::beg); - char* buffer; - CUCO_CUDA_TRY(cudaMallocHost(&buffer, file_size)); + thrust::universal_host_pinned_vector buffer(file_size); - file.read(buffer, file_size); + file.read(reinterpret_cast(thrust::raw_pointer_cast(buffer.data())), file_size); file.close(); - cuco::roaring_bitmap roaring_bitmap( - reinterpret_cast(buffer)); + cuco::roaring_bitmap roaring_bitmap(thrust::raw_pointer_cast(buffer.data())); - std::vector keys; - for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) { - keys.push_back(k); - } - for (cuda::std::uint32_t k = 100000; k < 200000; ++k) { - keys.push_back(3 * k); - } - for (cuda::std::uint32_t k = 700000; k < 800000; ++k) { - keys.push_back(k); - } + thrust::device_vector items(num_items); - // multiply the keys for more accurate benchmark numbers - for (int i = 0; i < 13; i++) { - keys.insert(keys.end(), keys.begin(), keys.end()); - } + key_generator gen{}; + gen.generate(distribution::unique{}, 
items.begin(), items.end()); - thrust::device_vector keys_d(keys.begin(), keys.end()); - thrust::device_vector contained(keys.size(), false); + thrust::device_vector contained(items.size(), false); - state.add_element_count(keys.size()); - state.add_global_memory_reads(keys.size(), "InputSize"); + state.add_element_count(items.size()); + state.add_global_memory_reads(items.size(), "InputSize"); + + auto& summ = state.add_summary("BitmapSizeMB"); + summ.set_string("hint", "BitmapSize"); + summ.set_string("short_name", "BitmapSizeMB"); + summ.set_string("description", "Bitmap size in MB"); + summ.set_float64("value", static_cast(file_size) / (1024 * 1024)); state.exec([&](nvbench::launch& launch) { roaring_bitmap.contains_async( - keys_d.begin(), keys_d.end(), contained.begin(), {launch.get_stream()}); + items.begin(), items.end(), contained.begin(), {launch.get_stream()}); }); - - CUCO_CUDA_TRY(cudaFreeHost(buffer)); } -NVBENCH_BENCH(roaring_bitmap_contains) +NVBENCH_BENCH_TYPES(roaring_bitmap_contains, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("roaring_bitmap_contains") + .add_int64_power_of_two_axis("NumInputs", {32}) +// Default benchmark is only available if the Roaring bitmap testdata has been downloaded +#ifdef CUCO_ROARING_DATA_DIR + .add_string_axis("BitmapFile", {std::string(CUCO_ROARING_DATA_DIR) + "/bitmapwithruns.bin"}) +#endif + .set_max_noise(cuco::benchmark::defaults::MAX_NOISE); + +NVBENCH_BENCH_TYPES(roaring_bitmap_contains, + NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("roaring_bitmap_contains") + .add_int64_power_of_two_axis("NumInputs", {31}) +// Default benchmark is only available if the Roaring bitmap testdata has been downloaded +#ifdef CUCO_ROARING_DATA_DIR + .add_string_axis("BitmapFile", {std::string(CUCO_ROARING_DATA_DIR) + "/portable_bitmap64.bin"}) +#endif .set_max_noise(cuco::benchmark::defaults::MAX_NOISE); \ No newline at end of file diff --git a/cmake/roaring_testdata.cmake b/cmake/roaring_testdata.cmake new file mode 
100644 index 000000000..8dded834c --- /dev/null +++ b/cmake/roaring_testdata.cmake @@ -0,0 +1,39 @@ +# ============================================================================= +# Copyright (c) 2025, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +# Only act if enabled +if(NOT CUCO_DOWNLOAD_ROARING_TESTDATA) + return() +endif() + +set(CUCO_ROARING_DATA_DIR "${CMAKE_BINARY_DIR}/data/roaring_bitmap") + +file(MAKE_DIRECTORY "${CUCO_ROARING_DATA_DIR}") + +set(ROARING_FORMATSPEC_BASE "https://raw.githubusercontent.com/RoaringBitmap/RoaringFormatSpec/5177ad9") + +rapids_cmake_download_with_retry("${ROARING_FORMATSPEC_BASE}/testdata/bitmapwithoutruns.bin" + "${CUCO_ROARING_DATA_DIR}/bitmapwithoutruns.bin" + "d719ae2e0150a362ef7cf51c361527585891f01460b1a92bcfb6a7257282a442") + +rapids_cmake_download_with_retry("${ROARING_FORMATSPEC_BASE}/testdata/bitmapwithruns.bin" + "${CUCO_ROARING_DATA_DIR}/bitmapwithruns.bin" + "1f1909bfdd354fa2f0694fe88b8076833ca5383ad9fc3f68f2709c84a2ab70e3") + +rapids_cmake_download_with_retry("${ROARING_FORMATSPEC_BASE}/testdata64/portable_bitmap64.bin" + "${CUCO_ROARING_DATA_DIR}/portable_bitmap64.bin" + "b5a553a759167f5f9ccb3fa21552d943b4c73235635b753376f4faf62067d178") + +# Define macro only when data is available +add_compile_definitions(CUCO_ROARING_DATA_DIR="${CUCO_ROARING_DATA_DIR}") \ No newline at end of file diff --git 
a/examples/roaring_bitmap/bitmapwithoutruns.bin b/examples/roaring_bitmap/bitmapwithoutruns.bin deleted file mode 100644 index a99fd50aff79b98fa93b3219fc6226fa238d72e2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 72616 zcmeI((;Mqb-01P7u^Kf^(llz)c6l$`wr$(CZQHi(wU=$%wte>dN1Susi}^kCedcB^ zWn;g=imxDp$dwDpiR8cJjX{C?7{d^C~dV-@h#MH*@{}PCxpu z6!3q!|5f0>8vNIQ|5^Y+%Xj_P&HwuO-y!H3a*o)}%;$&8?bYf=emjLHB)_G)m+n}` z^;xIq?4P%J!SY43mvqXK72j34R%3siE5D#o8lePchP!4eV+9{H+cK-`J;!A zZ$G*E^!&3^%oi4aTXt{Nv32V=P2bjkSMz=44`x5=oFvb_yKwEw{u|5hjDOJmN&Oea z-=zPbeSv;F`+W}Hj?JfrbM3|IN`5_sDY)PKdnt~kS)YD-=Kk56=PsW=dts+IS^8bM zYnAp_TV88?z3z?bH!I#MeLJla)a_Z%bA7iD{GXSA0Rlh(2mk>f00e*l5C8%|00;m9 zAOHk_01yBI|2G6mO!UD50zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_ z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_ z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_ z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00;m9AOHk_ z01yBIKmZ5;0U!VbfB+Bx0zd!=00AHX1b_e#00KY&2>d@2fFK0IAqrw49+Dsh(jWt} zAQ$qXB$NtD3#ErLL0O?3P;Mw6R1hix6^BYeWuXdCWvCid6RHE%hZ;dmp%zeUs2$W1 z>H>9#dO>}m0nlJ*7&H3L19~7D zumiC`Jdh%gI*=}qA&@zcEs!&iCy+l-C{Q#|B2YR|E>JO0B~U$3D^NGkAka9_EYLF0 zCeS|6DbO|0BhWk0FEB7LBrrTMDlj%MAuu^GEif}MCon&-D6llJBCtBJF0e7MC9plP zE3h|kAaE#fG;kttI&dyobtJ#Z^5 znj$Tb)<`>~Bhm%wj`TwMA_I`Y$S`CiG6orsOhTq2GmzQHJY*rV1X+%(Le?T1kj=<8 zWGAu**^eAVjv&X8Q^;B50&*F-hTKH%Aor0+$W!D6@)~)Ed_=w=-;rO)UnGEHD2Xy? 
z5Eao7s-h9pL><&a6KF~_4f+q75zT^TM{}Wh(E?~;v=~|vErXUvE1^}<8fa~_9@-FX zf;LB6p>5F)XlJw=+7s=A_D2VyL(viFXmlJp5uJigM`xjP(FN#YbQ!u5U4yPiH=$e6 z9q4X!ANntP7(IrbM9-k-(M#x6^agqxy@x(TpPx1Ns^LhWV zCSWoa#&j%-*;ovVV=1uISUM~NmKn>2<;3z}`LRM+QLF@38Y_oY#HwJ`v07MNtO3>- zYlgMN+F;QHMJBppaPGje=i`W(HI(7@Yi#@;|W6!Xc*c5RTwDPT?%h z;}Wjm8gAeg?&3b4#8csE@$`5mJS(09&yDB93*tra;&>^%EM5Vxj90^J;&t%)cq60|Z8p1VaP~ zkq8kg5g|;%Av_{Mq$JW1{}368EJSu97m=4JKolm55haN-M0uhTQI)7c)F$c?4T&a1 zbD|Z|mgqoqCb|(li9SSsVh}Nu7(t9C#t{>VDa3SQ7BQDtKrAMf5i5x`#Cl>Av6a|C z>?ZaR{}P9ZW5h|~3~`>gL|i3q5VwhY#6#i<@tk-?yd^#mpNVh8PvQ>&lPF1$G|7hCCf=o@OBQubh$!ug!G7p)bEJPM1OOU0>a%4ra3R#`3Mb;%7kd4V^ zWJ|IQ*`Dk~b|rg|y~%#$KynBv2J3%Q-# zMeZdJkcY^lWgq$gkuN@;3=l2!&G=#Zo*a zQ3|C|24zt$a06{yNoHL506hpJCCqMA}I zsMb_Fsw35f>Q42d`cea^!PGEnBsGQ_PfenxQZuO8)I4e-wS-zut)kXa8>r3HHfkre zhuTjaq>fO>sZ-Qh>H>9{x<=ik?ojusN7Pg51@)SGM}4HeP~WLv)L$wC%KbdVP5 z5UtV?+N2%YqZ4#WIt~2~osrH$XQy+~dFcXlVY(Pyk}gA+rz_D_=^Auxx*pw-ZbCPw zThVRl4s>U_8{L!cL-(f#(L?DG^k{k2VtN_9l3qivr#I1C=^gZL zdLR8SeV9H*pQO*w=jluIRr&^fo4!Xsq@U2w=~why`UCx${zm_#|IjdlG6X|293wC? z6J~TK%GgYdi8Cpf)J!@i1CyD_#^hx3F!`B6Oi`u;Q<^EqRAj0!)tOpMU8Vukm}$ne zWZE$8nNCbsrU%oT>BkIYhA_jKQOsCo0yCMJ#>`~qF!Py3%u;3rvzl4QY-F}D+nHU= zUgiLEh&jrfU`{jVn2XF6<~nnWxyw9Y9y8CFm&_aHJ@bkA%KTt{GZ2fgI7_iC%d--z zuo`Qy7VEM;n`Be5Y1#B_CN?XZgU!w6V+*oH*y3y{wk%tLt;|+qYqE9N`fMY%Dcgc= z&9-AZvR&BjY%jJiJAfU`4r52MW7zTRBz7u0gPqOJV;8bZ*yZdhb}hSs-OO%dcd~of z{p>;Z2z#78#hzs^u$S3u>`nF#d!K#8K4o99ui1C(NA?T*o&ClBWdj_>ksQMXIgtx- zDi`5Q&fz>R!KLKVaQ|=_xh!0EE*F=VE5H@zig6{mGF*AC5?7V0!PVyKaSgd9Tyw4! 
z*Ou$Rb>_NpJ-I$ye{K*rlpDc~=EiXoxhdRqZWcF}Tfi;mmT@b&HQah`6StMy!R_Yu zasP6MxntZ(?hJRHyTo1PZg97`d)!0r3HO|P#l7V|aG$wv+)wTg2M5t05u}4$PzcJw za8M6MgLW_$j0aN$QwP%pGXygSvjuYo^91t;3k8b?O9V>?%LOY2s|2eDYX$2D8w48% zn+012+XUMOI|aK2djxw2`vnIEhXjWQM+L_QCj=)4rv+yQ=LF{m7X_CFR|HoF*9A8Q zw*VhzXpEcS-{yYDR|H}si zOdthD2nwPQ5>z1~n1UmCLPAI>q!IoRG74FQ>_RRfuTVfJEEE$;3T1@yLM5T9P(!FK z)Ds#CO@!t`E1|8>LFg=W6M71Lg#N-HVW==d7%hwwCJIx8>B1~wuCPE@EG!dN3TuS* z!X{y>utV4_>=XVK4hzSGlfoI{yl_dlD%=om3-^SF!V}@S@Je_qd=Neh--MsS9|0Co zkq~K-69rKg!=f%mMO%!CaWRFMT1+Qq5HpL}#GGOtF~3+yEGm`|ON-^iieeS9x>!rB zD>e`ti_OHAVjHo&*h%av_7HoE{ltOd5OKITN*pUr5GRY%#F^q8alW`nTq>>*SBvY! zjp7z@ySPi-D;^LJiATi~;%V`mcu~9}UKekPcf|+dWAT~zQhX!67e9$##UJ8t5t0xI zmneyqcuA5JNs|o8l3dA`l2R%ut(0EMBxRLyNV%naQbDPRR9q@0m6a+;m8EJ@O{tDl zUuq;Zm0C!xrFK$Bsf*NI>LvA+21tXYVbVxxj5J=FBu$lONVBDR(n4v8v|L&xt(7)N zo26~iPHB&{Upgoqk&a8Jq_ffm>9TZ9x+&d}?n{rPr_u}Qwe(KOTVPQQb5LJ zQfB0!EXpBSl_Rn#JF+JyoJGzq=aTcv1?0kVF}b8%MlLT`lB>!!F;Sd@k zLUf1=2_ZQY4(Xw2$PUFq@lc9T>QK5+hEV2EwouMco>2Z!p-|CKiBRcKxlqMWl~DCi ztx(-igHYp8vrx-Wn^5~ur%=~Wk5KPWztF(YkkIhZsLCm~*#n6?|_0X-*-Oz*3Silw-UuOyXJN?Ikol1a&`DNmIb%4_AF@=^Js zd{=%cf0aNO3zK0c91M%$P*@E|!e-bBd*MVlWjIavpK!)-mT>lPu5jLPfpFn)v2e+7 znQ-}VrEt}7jd1O7y>P>DlW_BJt8m+Jhj8a`w{XvJpK$;1pzzS}i16s}xbVdAl<@TM ztnl3Mg7D(-vhd3An(+GYrtsGAj_~gAzVN@{!{KA$li@Sr^WjV3tKl2r+u?iRhv6sT z=iyi3x8V=r&*5+3pW#1YSVdJrrBzN9R9OwHx*An&HKxYZ6l!WUoti<-tY%Yls(IA> zY9Y0#T0$+YmQyRLRn+QgEw!%NKy9oxQ(LNS)b?s8wX51g?XC7x2dYEV;p!-LtU5uR ztWHyBs&mx&>LPWixj*#4c3NfBegNwcx{q4 zRhyyB*5+vowI$keZI!lG+n{aMwrM-HJ=%Wlpmsz%uAS1(Y8SN2+BNN_c1OFfJ<^_P zFSOU%JME+PMfS^?U^o)8IJ-ePu&#M>E z3+u)7l6o1vyk1GKs@KqK>-F@8dK105-b!z)chEcQ-SnP%AHBamNFS7wgOPmHHZey}n7`s_)Qu>-+S7^~3rx{iJ?IKd)cXuj)7S+xk8Iq5edF zuD{aX>L2va`ZxWj{zr!+XoQH+5iTM`EhB9r?IWEcT_Zgry(9f110zEs!y}_2 zV;zlW>tWm+JY*aI98g-2NMkAxC(ZXnLv@<#yU5xHVFQcz9z!+={ zGe#O?jPb@KW2!O3m~G5678*;8<;E&wt+Bz_Y-}@j8hecW#zEtVaojj%oHZ^OmyK)2 zP2-Mn-*{v^HC`C6jd#XJ(5H5!SUQ77s}6Va5>G|_*e z8KYUE*`v9ld7}lQg`>ryC8K4c<)f9NRiibcwWIZ-4Wmt>&7-ZNZKEBcoul2NJ)?c1 
z{iB1TL!%?2qod=Z6Qfh2)1$MZbE6BQi=)e;E2C?o>!X{ZTcbOoyQBM}|3(i-k3~;L z&qU8hFGa6LZ$xiL??oR*pG2QWUq#T+)U?f* z88=gysm*j|1~apn&CF@$G4q>+%%Wxqv$R>xtY}s-tDCjVx@H5jvDwUQX|^%jo1M(A zW)HKs+0Ptk4l#$Dqs+191aq=E&75h@G3T3$%%$cEbG5n7+-Pnwx0}1nz2*V)ka^TR zVV*Y6nHS9~=5_OydDnbkJ~p43FU>dRd-Id|)%;=pHX#eKaEr27i?<|8u{6uDEX%cg zD`};&(pu@QOjcGahn3sPXBD)HSjDYUR#~fpRoSX$)wJqZ^{qx$Q>%s5+G=NYw7OW` ztzK4NYk)P_8fJ~O##rO6N!C>n-`>li45$m{h z$~tRZur6EItee&y>%R5KdTPC}UR&?1kJcCKyY(=Dvm4q??B;eWyRF^9?re9n zd)j^M{`Meys6E0SZI81j+EeW5_AGm@y}({`4n_w0xE6Z^US%6@Bqus_@1?4R}@8+K5KaA=2f1V?tlj_yPq+le`G zCxw&RN#|s6GCSFvoK7Amzf;I5>XdLwJLQ~;P8FxRQ_HFAG;kU_&777_8>hX~$?59! zaC$raoPo{|XSg%U8S6}NCOgxdna&(%zO%?#>a1{9JL{Z{&K768v&-4*9B>XfN1YSS zY3H1C(YfMWcWya%od?ci=b7`;dE>lyJ~>~VAI@(FauFAIDVKG5S8^3sa}C#WUDtP$ zZYnpeo8HajW_5G8x!rtjLAQuo+%4snbt|})-D+-4w~kxiZR9p}Tez*=c5X+vi`(7p z<@R+4xP#qc?nrlxJKmk-PIYIvv)y^_LU)O~++F3abvL-1-EHnpcaOW@J?I{BkGrSb zv+f1=vU|E3bgyN}$b?hE&|`_BF7esRCMzudoWAcn=r7!wP|#8@b%#v(B@=ES^M zB9=0iCiYJ(V=PN7dn{KhZ>&J9aI9FYWUNfAe5_KeYOF@AcC22kVXR54d8}2eZLCAA zbF5peXRJ@Ge{4`}Xlz7mbZlH~Vr)umdTdr~Zfrqpaco&^Wo%7seQZ-~YivhscWhtm z-`L^UvDnGjnb`T*rP$Tjjo9tjz1YLpli2gvtJvGvhuG)Xx7g3vpBU_+9^ug*=Lw$d zg+1MidbSty;$8|bwU^Gz;AQr*c{#m2UVg8TSJW%vmG;Vc6}>86b+49J*K6Q4_L_Mu zy*6HZuano+>*4kG`gsGrA>MFrlsDF!;7#_Xc{9B^-h6M7x71tVt@hS=8@(;wc5j!r z*E`@H@{W2Zywlz}@1l3byYAic?s^Zr$KEsVrT4~r?|t&VdOy719^@lF?o&SN^S_ntmO>zTe1i>bLM) z`|bRWeiy&H-^=go5AX;3!~Bu{7=OG!$)D=a@Mrt;{DuA!f4RTPU+Zu1H~ZWCo&Fww zzkkp_;ve@<`DgtL{$>A~f78F?-}fK+PyHAEYyX}9(f{Iq_ka0+{XiUxlW`^pxc-44~cnmI_|W)>_~`h!_{8{>`1JU!_}ut{_~Q7o_{#X2`1<&! 
z_}2K2`0n_=_`mVP@ni9m@iX!B@k{Zm@f-2m@q6)y@h9=;@mKM;@elFO@o({;@jr1m zfhLFqo!}BeLQaGedLo*z6R|`*ks^^gkuH%TkvWkqku#Aekv~x=Q8ZB^Q94mBQ87^^ zQ9V&BQ8&>b(Kyj8(K68{(LT{B(KXQ{(L2#EF)%SCF+4FUF*Y$FF*z|UF*7kIF+Z^= zu{5zFu{yCXu`#hFu|2UXu{UubaVT*#aUyX#aV~K&aV2p*aVv2*@gVUy@htH&@h0&; z@hR~&@gwm&0VRf z00e*l5C8%|00;m9AOHk_01yBIKmZ5;0U!VbfB+Bx0zd!=00AKI|3Tn?{ih3XK_CDG qfB+Bx0zd!=00AHX1b_e#00KY&2mk>f00e*l5C8%|00{gq6Zk)YaHA&x diff --git a/examples/roaring_bitmap/bitmapwithruns.bin b/examples/roaring_bitmap/bitmapwithruns.bin deleted file mode 100644 index 5ed243753e169295a32d6251db66180f23ceac06..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 48056 zcmeIuQyb)3w5Z{vVpUYBB$bL&skUv~wr$(CZQHhO+qPG)cJFo0f7tsf^BLb5*YnMt zfdl{y015&iU;{8fdI(UZDhgDsiv4%-|000=3<=DyNCAKU(!k#=^#A~P^j|*sU-o|$ z{I7=pHSoU{0>HA}0I=r2HvJa>ynrrXJE#Th2(g1)Ln~mWaJl5SR1eaf$h0Bbj9dfq zwJ21fSdLOwIlR(`syAvLsJo)!gr+@OHfUR-V}`CuPoVFM0T+hs7_nf?hzT90)R<9V zPKpKGl5fi&tUj@R!{!;=2kdUKzrx`h$F0-&`45+FTsv@U#k~oSdOU0Js>HhtpOmk_ zj~Bl$fIE-{#0aJXUqdROrqDV2oA)5ai8LEB%*ZkzM~gfa3gjqamB7nU@R~Jm<&t1Go*`10DiTfaky~;4Sb0_zZjnz5~C2zd#5? 
zKpdpNFvx=uPzGb54qBiK`d~^h4VWIx1ZD+ufVshZU_r15SR5<`mIW(-mBDIYO|TAF zA8Z6R1zUiv!FFIrunX87>;?7(2Y`dYVch zsvW8qY8Yw~Y94A8Y8&bh>Ky78>KW=2>K_^u8X6iA8XX!Jni!fAnjV@Jnj2aWS{zyy zS{YgsS|8dJ+8Wvs+8x>#Iv6?H+nJ`auJsA<%GW6f_o^08NIbK{KH_(0phS zv=mwat%lY?8=)=Gc4!y07dikPhK@lep)=5V=n`}lx&hsW?m-WsC(v`~74#PR0DXqO zLf@fZ&|fG7BQOqAa2V#{2rR=fScfgxg?%_BoCZ!0XM(fBIpExIKDZ!U1TGGjg3H1c z;L30{xF%c&t`9eYo5C&N)^IzxBise<4)=ol!UN#J@Gy8JJO&;QPlBhyGvL|qJa{3z z1YQoWg4e(i!Q7^hEj~{gFY)P-Fx$8X1R7M5Z9qky*%GWC5}m zS%$1c)*$PVO~_Vc2eKR4ha5zXAjgqY$XVn9av8aX+(hmm_mM})Q{)Bm8hM9&M7|*Z zB0rGd2!KK;iV`S|vZ#PcsDf&!f!e5tCec)AS~LTi8O?^~MDw8e(L!iZv;;r(I93WPi&elX zW7V*lSRJfB)(C5gwZK|q?XZqm7pyzh3+sywzy@Q(u#wmpY&|73?~83%iRwz#e1Iu$R~y>^=4g`v?1m z{lxxYAP(agPT~yC;UXTzRXmQHxPvF~6nJVp9i9=-f@jBb;d$`_cwxL4UJ@^Zm&YsN zRq+~lZM+`d5O0Dv$6MiT@eX)ryc^yV?}PWp2jN5U5%_3)96k}Bf=|b1;dAi?_+oq+ zz7k)9ug5pxTk#$EZhRkp5I=$+$4}vB@eBB6{2G1}zk}b$AK_2&7x-)Z9sUvjg8z&E zz<=Wa0TC!c5H!IO0wEC!p%DgQ6CRNyQW0s13`Axk8vMJeuY)!T!JCa?FOpZt>*Ou+F8P3bOgr4~?&sb$nk zY7MoX+C*)oc2K*iebhnf2z8t~MV+NCP?xD|)J^IRb)R}fJ*8eyuc>#`N9qgpFZF}^ zO#w7SqclO&G)oJ#L@TsL8?;S(bdpX*r=>H{ndxkFPC5^rpDsifrAyGI>2h>Mx(Z#L zu0_|S8_2>r*dJDas-bL@F5739{WAsV-41J!yL|>(E(6{M(^h5dy{hWS9zokFWpXsmkclsCo zmku!qgEJHpW_TvT$V`mU8H;flpGnE2VbU|1n5;|=CO4ChDaaIIiZi8{vP=c0GECE6T?%&)5EjEbHfY5 zi^I#pE5mET>%*JETf;lTyTkj!2g66g$HS+>XTul5m&4b>H^X)aV-413JvPavV$-r2*vxD;HYb~h&CeEMi?Suy z(rh`lB3p&6&eme@oHvdxkyFUShAZH`v?kJ@z5{gniDwV&AeK z*w5@&_B;EF{mX_pgu^+C3v)ad;bbnx>72#6oX@4?(s1dyOk7qj2bY`6#}(v?aK*V& zTv@IHSDCBE)#U1M^|?k|Q?3Qqnrp{(~#&F}gN!(O!1~;3V z$1UWRaLc(>+*)n}x0&0FZ0*=W^Y8eN z{1^UT{s;e?2Lwn!1wx<&RuBY9Py|gd1Y7Wgq>xHTD`XHd3)zI6LLMQ%P)H~$ln_b_ z<%Eht6`{INOQD@+h33)6&|!W?0~ zut-=ctPoZU>x7NM7Gb-vOV}$M5Dp8+gphr$!#x$sJOD|`?> z3txrr!Y|>k5E2m)7b!6;@?u1k#h9pzmgtJUm{Lq5rWZ4bS;ZV;ZZV%&P%I)A7fXp{ z#R_6&v6@&@tRvPJ8;MQD7Gi6$o!C+AB6b&hiG9TZ;$U%@I8q!Vju$71Q^gtLY;m5r zP+TG|7gvdE#SP+SahteP+#~K64~a*`6XI#{oOn^ZB3>77iFd^Z;$!id_)>f$z861< 
z|A^njpW+`8jKC2rLPnSf7ZD@Th#HAU%!m_7L{dajN76+yMzTb*M{-5-MhZja*aMs`GYNA^VyMvg>|M@~h~MlM7yN3KO~M(#xJM;=9< zMqWf-N8Uv~M!rP;jr@rGjsOxQp%Nj{5-SOkBq@?68Imn|Qc_AKrIj*BnWbz}PAQL+ zUn(ROl}bpZrE*e5sftuxswLHx8c2<$W>QP3jnrQ1Bz2W~NWG?`qo1PxM88FUM*l=X8J01blo^?mML8;~a$GiLM^4Bo>Xd_+DjpOVkY7v#(G zHTkA|N4_sVlAp>ij{qEbbvuGCWMDh-szN;9RU(ne{obW*x1J(S)`KV_gY zL>aD(QpPG1l*!68Wu`JmnXfETmMSZh)yg_$qq0TWuIy6wDhHIq$}#1naz;6?TvDzo zH{YDM0u{fQr;>bl+Vgn<-77r`KyFfM8#D~4XeBwQDrrz>Z+x>s;{P0)2Qjy zOlnp&hnic>rxsL;sKwP%YFV{{T3M~8)>P}L_0>jdQ?-TKT5YFxRJ*9%)m~~}b$~ip z9j1;{$Ef4gN$ON}hB{lFr!G{NsLRz=>RNS!x>?<(?o{`v`_)70QT2p+T0N&;RIjMl z)m!Ra^?~|WeWt!t->C1^PwGGFH}$9bM+IYW42zL5CdS3YSTv@_;xRMk#1gR-vDC42 zv5c`SvFx#2vAnSYvBI%pv68VevGTD>v8u5evD&eEv4*iGvF5Q>v9_@evCgq>v7WI$ zvHr0^v7xaMvC*+{v5B!MvFWi{vAMAYvBj}vv6ZnkvGuV{v8}NkvE8wKv4gQAvE#8* zv9qxYvCFY*v750wvHP({v8S;YvDdM8v5&DYv43MfV!va625G29Xtc&^f+lH-rfG&| zYo3Ro7~1b+rasW38FiQfs5N*E(rk zwH{h;t)Dhf8=?)@MrmWU3EE_Bnl@9Lqs`YAX-l;g+G=f`wo%)nZP#{bd$j}FVeOc9 zQahua*Dh&SwHw-P?Vk2fd!jwpUTJT&587w#tM*;{rTx`HaU_n%sdzZf$0Kn$9*gU7 zEAGbqc*=O1c=~vzc-DB1cQ>zc++@`c*-`11Iw_}ch}_~!Vw_|EvA z`2P5z_|f=@`04n$_{I2@`1Sa$_}%z}_~ZDq_{;d4`1|;$_&@P)@t^TOaZrbKOeb|l z=X6nz>Z%^sP2JHGdI~+Yo=(rGXVJ6kx%9kx0llzZOfRXI(aYTrX zo9nIgwt5G>v))bbsrS+Q>x1;6`UrirK2D#gPtm9Av-G+80)4T*Okb(5(bwyn^sV|1 zeYd_(Kd2wkkL#!Ov-$=7vVKj!so&A>>yPxO`V0NF{!ag>f6@Qdf9St;z<>5V|#sp)sG0m81%rWL0i;Shl3S+gg&e&*dF}54K zjJ?JIdRd-Id|kNM5~Y5p-m3$`$ev>1!C zL@R2kR@^cz$4Xc!tkhOIE2EXg%5LSd@>&I~!d5Y>q*cZ$Z&k9YS~aZNRz0hs)x>IU zwX)h;9jwk)H>;=B$LenlvW8kCtkKpuYoay9nr_Xq=2{D^#nv)wrM1RdZ*8)+T05-W z);{Z?b;LSuowCka7p%+HHS4Bz$GUGlvYuKmtk>2%>!bC>`q%nl{k8xbvQe9`X`8hL zTe1~fvklv}Jv(WqveVib?96sHJExt;&Tkj8i`pgZ(snt!qFu$VZr8Hw+70Z+b~C%B z-NtTjce1-H|*Q?J^P{k#C~qSvftVt?9cXB`@8+i{%eOE#K9fP2|K(K zabzdv=#J&Mj_;&&(m3gzOiorOhm+gM=M;2`IK`b(PFbgdQ`xEJ)O6}N^_@meQ>TT~ z+G*!>bh~!`x z`<+A1QRjqn+BxT3bgnqpom;ihm?yXoAFZWcGYo6F7X7H|u@#oUr^8MnM!$*tV$JGW*+nyW`x6?i6>rJIkHxE^rsS%iNXj8h5?B$=&MiaCf`= z+=K2B_qcn?J?ma@FT2;=o9-R=zWd01>b`JayYJkO?icr8_lNu21w6<@J;I|s))PF* 
zQ#{QxJlpfUq?gJ|>t*mVd)d64ULG&MSI8^smGDY?<-Ces6|cHi%d6`(@EUu~yp~=Y zuf5mF>+1FJdVBr6f!+{rxHrlh>rL<`d(*s`-W+efx5!)St?*WR>%5KL7H_+^%iHT6 z@D6*&yp!G;@4R=(yXxKWZhQBwWM(dtbfp-Y@U37fK)rJV7PG2|f`? z$cb1&Pgn^z;U`ii(j?L+G9|JmawKvm@+Ar;iX@6BN+rrBDkLf=swHYB>LltX8YP-0 zS|nO0+9f(Bx+J|;LZGd|~we$-d}xNrK7pYT)osr__*Mn8+6-OuIc^$Ylg{bGJezl>krujE(tYxuSO zdVWK{iQn9B<+t@a_?`W3eow!T-`^kP5A{d*qy2IIM1P7u-Jj*p^%wYy{bl}2e~rK1 z-{f!gclf*gef~lJh=1Ha<)8I0_?P`_{!Ramf8T%PKlNYuul;xaNB@ieum8jU?E^_D zi6)68on(_jQc5aGEomg}q?b%4Qzg?TGbA%7vn6vT^Ca^p3nhytOC(Dt%Oxu&t0b!@ zYbEO@8zdVin69`4vF`0RH~{|2xb70P7F}00000 diff --git a/examples/roaring_bitmap/host_bulk_example.cu b/examples/roaring_bitmap/host_bulk_example.cu index 3309881a2..33ca281bf 100644 --- a/examples/roaring_bitmap/host_bulk_example.cu +++ b/examples/roaring_bitmap/host_bulk_example.cu @@ -13,9 +13,12 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #include #include +#include +#include #include #include #include @@ -34,11 +37,16 @@ * [RoaringBitmapFormatSpec](https://github.com/RoaringBitmap/RoaringFormatSpec) repository and * check if the bulk lookup API returns the correct results. 
Namely, we test the following files: * - - * [examples/roaring_bitmap/bitmapwithoutruns.bin](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata/bitmapwithoutruns.bin) + * [examples/roaring_bitmap/bitmapwithoutruns.bin + * (32-bit)](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata/bitmapwithoutruns.bin) * - - * [examples/roaring_bitmap/bitmapwithruns.bin](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata/bitmapwithruns.bin) + * [examples/roaring_bitmap/bitmapwithruns.bin + * (32-bit)](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata/bitmapwithruns.bin) * - - * [examples/roaring_bitmap/portable_bitmap64.bin](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata64/portable_bitmap64.bin) + * [examples/roaring_bitmap/portable_bitmap64.bin + * (64-bit)](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata64/portable_bitmap64.bin) + * + * @note This example requires the cmake option -DCUCO_DOWNLOAD_ROARING_TESTDATA=ON to be set. 
* */ @@ -47,8 +55,8 @@ bool check(std::string const& bitmap_file_path) { auto generate_keys = []() -> thrust::device_vector { if constexpr (cuda::std::is_same_v) { - // reference: - // https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata/README.md#test-data + // Create query keys for the bitmapwith{out}runs.bin files: + // https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata/README.md#test-data std::vector keys; for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) { keys.push_back(k); @@ -61,8 +69,8 @@ bool check(std::string const& bitmap_file_path) } return thrust::device_vector(keys.begin(), keys.end()); } else if constexpr (cuda::std::is_same_v) { - // reference: - // https://github.com/RoaringBitmap/RoaringFormatSpec/blob/master/testdata64/README.md#portable_bitmap64bin + // Create query keys for the portable_bitmap64.bin file: + // https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata64/README.md#portable_bitmap64bin std::vector keys; for (cuda::std::uint64_t k = 0x00000ull; k < 0x09000ull; ++k) { keys.push_back(k); @@ -100,30 +108,38 @@ bool check(std::string const& bitmap_file_path) file.read(reinterpret_cast(thrust::raw_pointer_cast(buffer.data())), file_size); file.close(); + // Create roaring bitmap from the file cuco::roaring_bitmap roaring_bitmap(thrust::raw_pointer_cast(buffer.data())); + // Generate query keys (all should be contained in the bitmap) auto keys = generate_keys(); + + // Create a vector to store the results thrust::device_vector contained(keys.size(), false); + // Bulk-lookup query keys against the bitmap roaring_bitmap.contains(keys.begin(), keys.end(), contained.begin()); + // Check if all the keys are contained in the bitmap bool all_contained = thrust::all_of(contained.begin(), contained.end(), ::cuda::std::identity{}); return all_contained; } int main() { - auto data_dir_prefix = []() -> std::string { - std::string source_path = __FILE__; - auto pos = 
source_path.find_last_of("/\\"); - return (pos == std::string::npos) ? std::string(".") : source_path.substr(0, pos); - }; - - bool success = check(data_dir_prefix() + "/bitmapwithoutruns.bin"); - success &= check(data_dir_prefix() + "/bitmapwithruns.bin"); - success &= check(data_dir_prefix() + "/portable_bitmap64.bin"); +#ifdef CUCO_ROARING_DATA_DIR + std::string const data_dir = CUCO_ROARING_DATA_DIR; + bool success = check(data_dir + "/bitmapwithoutruns.bin"); + success &= check(data_dir + "/bitmapwithruns.bin"); + success &= check(data_dir + "/portable_bitmap64.bin"); std::cout << "success: " << (success ? "true" : "false") << std::endl; return success ? 0 : 1; +#else + std::cerr << "This example requires CUCO_ROARING_DATA_DIR to be defined (build with cmake option " + "-DCUCO_DOWNLOAD_ROARING_TESTDATA=ON)" + << std::endl; + return 1; +#endif } \ No newline at end of file diff --git a/examples/roaring_bitmap/portable_bitmap64.bin b/examples/roaring_bitmap/portable_bitmap64.bin deleted file mode 100644 index acd0f9007d6902f2fa29b82d8f5ee6662a4291d2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16506 zcmeI&F%Cdb3;@s~5*IOJFgb_WQCz_h9MKIZi`^!9P1@i8x4yr&j5nsfiXyMaUCL~m zIM+7&E_28npZ6?V?B|ka)G-SJ1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ z;P(Reu7JgX-+!Y42oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+z&C*l;jK-t From 6cd8413fb7fc00c39696c5ed0855bd3f811a559f Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 13 Aug 2025 17:27:12 -0700 Subject: [PATCH 12/24] Allow build script to handle extra cmake args --- ci/build.sh | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/ci/build.sh b/ci/build.sh index 3d244f334..7ac9029e3 100755 --- a/ci/build.sh +++ b/ci/build.sh @@ -51,6 +51,8 @@ HOST_COMPILER=${CXX:-g++} # $CXX if set, otherwise `g++` CUDA_ARCHS=native # detect system's GPU architectures CXX_STANDARD=17 +EXTRA_CMAKE_OPTIONS=() + function usage { 
echo "cuCollections build script" echo "Usage: $0 [OPTIONS]" @@ -62,9 +64,9 @@ function usage { echo " --prefix: Build directory prefix (Defaults to /build)" echo " -i/--infix: Build directory infix (Defaults to local)" echo " -d/--debug: Debug build" - echo " -p/--parallel: Build parallelism (Defaults to \$PARALLEL_LEVEL if set, otherwise the system's number of CPU cores)" - echo " --cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)" - echo " --cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)" + echo " -p/--parallel: Build parallelism (Defaults to \$PARALLEL_LEVEL if set, otherwise the system's number of CPU cores)" + echo " --cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)" + echo " --cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)" echo " --arch: Target CUDA arches, e.g. \"60-real;70;80-virtual\" (Defaults to the system's native GPU archs)" echo " --std: CUDA/C++ standard (Defaults to 17)" echo " -v/-verbose/--verbose: Enable shell echo for debugging" @@ -103,6 +105,9 @@ function usage { echo " Enables verbose mode for detailed output and builds with C++17 standard." echo " Build files will be written to /build/local and symlinked to /build/latest." echo + echo "Pass-through:" + echo " -- [CMake args...] 
Anything after -- is forwarded to CMake" + echo exit 1 } @@ -126,6 +131,7 @@ while [ "${#args[@]}" -ne 0 ]; do --arch) CUDA_ARCHS="${args[1]}"; args=("${args[@]:2}");; --std) CXX_STANDARD="${args[1]}"; args=("${args[@]:2}");; -v | -verbose | --verbose) VERBOSE=1; args=("${args[@]:1}");; + --) EXTRA_CMAKE_OPTIONS+=("${args[@]:1}"); break;; -h | -help | --help) usage ;; *) echo "Unrecognized option: ${args[0]}"; usage ;; esac @@ -200,8 +206,14 @@ echo "-- BUILD_TESTS: ${BUILD_TESTS}" echo "-- BUILD_EXAMPLES: ${BUILD_EXAMPLES}" echo "-- BUILD_BENCHMARKS: ${BUILD_BENCHMARKS}" +if [ ${#EXTRA_CMAKE_OPTIONS[@]} -gt 0 ]; then + echo "-- EXTRA_CMAKE_OPTIONS: ${EXTRA_CMAKE_OPTIONS[*]}" +else + echo "-- EXTRA_CMAKE_OPTIONS: (none)" +fi + # configure -cmake -S .. -B $BUILD_DIR $CMAKE_OPTIONS +cmake -S .. -B $BUILD_DIR $CMAKE_OPTIONS "${EXTRA_CMAKE_OPTIONS[@]}" echo "========================================" if command -v sccache >/dev/null; then From 42e5d01b2be42efd9594d93227c91e7f1a750647 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Wed, 13 Aug 2025 17:46:53 -0700 Subject: [PATCH 13/24] Add unit test --- tests/CMakeLists.txt | 5 + tests/roaring_bitmap/contains_test.cu | 134 ++++++++++++++++++++++++++ 2 files changed, 139 insertions(+) create mode 100644 tests/roaring_bitmap/contains_test.cu diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 21828b360..23258d445 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -152,3 +152,8 @@ ConfigureTest(BLOOM_FILTER_TEST bloom_filter/unique_sequence_test.cu bloom_filter/arrow_policy_test.cu bloom_filter/variable_cg_test.cu) + +################################################################################################### +# - roaring_bitmap --------------------------------------------------------------------------------- +ConfigureTest(ROARING_BITMAP_TEST + roaring_bitmap/contains_test.cu) diff --git a/tests/roaring_bitmap/contains_test.cu 
b/tests/roaring_bitmap/contains_test.cu new file mode 100644 index 000000000..db3b9cd33 --- /dev/null +++ b/tests/roaring_bitmap/contains_test.cu @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2025 NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +namespace { +template +bool check(std::string const& bitmap_file_path) +{ + auto generate_keys = []() -> thrust::device_vector { + if constexpr (cuda::std::is_same_v) { + std::vector keys; + for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) { + keys.push_back(k); + } + for (int k = 100000; k < 200000; ++k) { + keys.push_back(3 * k); + } + for (int k = 700000; k < 800000; ++k) { + keys.push_back(k); + } + return thrust::device_vector(keys.begin(), keys.end()); + } else if constexpr (cuda::std::is_same_v) { + std::vector keys; + for (cuda::std::uint64_t k = 0x00000ull; k < 0x09000ull; ++k) { + keys.push_back(k); + } + for (cuda::std::uint64_t k = 0x0A000ull; k < 0x10000ull; ++k) { + keys.push_back(k); + } + keys.push_back(0x20000ull); + keys.push_back(0x20005ull); + for (cuda::std::uint64_t i = 0; i < 0x10000ull; i += 2ull) { + keys.push_back(0x80000ull + i); + } + return thrust::device_vector(keys.begin(), keys.end()); + } else { + static_assert(cuco::dependent_false, "KeyType must be uint32_t or uint64_t"); + return {}; + } + }; + + std::ifstream 
file(bitmap_file_path, std::ios::binary); + if (!file.is_open()) { return false; } + + file.seekg(0, std::ios::end); + std::streamsize file_size = file.tellg(); + file.seekg(0, std::ios::beg); + + thrust::universal_host_pinned_vector buffer(file_size); + + file.read(reinterpret_cast(thrust::raw_pointer_cast(buffer.data())), file_size); + file.close(); + + cuco::roaring_bitmap roaring_bitmap(thrust::raw_pointer_cast(buffer.data())); + + auto keys = generate_keys(); + thrust::device_vector contained(keys.size(), false); + + roaring_bitmap.contains(keys.begin(), keys.end(), contained.begin()); + + bool const all_contained = + thrust::all_of(contained.begin(), contained.end(), ::cuda::std::identity{}); + return all_contained; +} +} // namespace + +TEST_CASE("roaring_bitmap bulk contains from RoaringFormatSpec testdata", "[roaring_bitmap]") +{ +#ifndef CUCO_ROARING_DATA_DIR + SKIP( + "CUCO_ROARING_DATA_DIR is not defined. Configure with -DCUCO_DOWNLOAD_ROARING_TESTDATA=ON to " + "run this test."); +#else + std::string const data_dir = CUCO_ROARING_DATA_DIR; + + SECTION("32-bit: bitmapwithoutruns.bin") + { + std::string const path = data_dir + "/bitmapwithoutruns.bin"; + if (!std::ifstream(path).good()) { + std::string const msg = std::string("Missing file: ") + path; + SKIP(msg.c_str()); + } + REQUIRE(check(path)); + } + + SECTION("32-bit: bitmapwithruns.bin") + { + std::string const path = data_dir + "/bitmapwithruns.bin"; + if (!std::ifstream(path).good()) { + std::string const msg = std::string("Missing file: ") + path; + SKIP(msg.c_str()); + } + REQUIRE(check(path)); + } + + SECTION("64-bit: portable_bitmap64.bin") + { + std::string const path = data_dir + "/portable_bitmap64.bin"; + if (!std::ifstream(path).good()) { + std::string const msg = std::string("Missing file: ") + path; + SKIP(msg.c_str()); + } + REQUIRE(check(path)); + } +#endif +} \ No newline at end of file From 20dc8168ebc408657981cb441fe971eaaae9553e Mon Sep 17 00:00:00 2001 From: Daniel Juenger 
<2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 14 Aug 2025 05:36:47 -0700 Subject: [PATCH 14/24] Minor doc fix --- README.md | 2 +- examples/roaring_bitmap/host_bulk_example.cu | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index f9cc3efc0..052636255 100644 --- a/README.md +++ b/README.md @@ -266,4 +266,4 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection `cuco::roaring_bitmap` implements a Roaring bitmap following the [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec). #### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WAtv4zYS_itzWqCQN7blZB9pncfVG2d7Rrd2YXu7KDYLgZJom7AsqiQVxxfkv3dISpbkKN1He-cAsUUOv3nwm-FQ946kUjKeSKf_8d5hkdM_bjsxSZYZWVKn74RZRJy2I3kmQv3sPb9J4Dlc8XQn2HKlwA1bcNI7eQXj30bD0QCuJtNfJ9PBfDQZd7WoEX_HQppIGkGWRFSAWlEYpCTEr3ymDb9Roe2Ak24PXC1w4-RzN07rzKDseAYbsoOEK8gkRRgmYcFiCvQupKkClkDIN2nMSBJS2DK1MqpyHGMO_J6D8EARlCe4IsWnRVUSiNqbrj8rpdK-52232y4xZne5WHqxFZbeu9HV9Xh23UHT98veJzEGFgT9I2MCHQ92QFK0LCQB2huTLXABZCkozimuLd8KpliybIPkC7UlghqciEklWJCpWvAKO9H_qgCGjyQYuMEMRrMbB94MZqNZ2-B8GM3_M3k_hw-D6XQwno-uZzCZ4maNhyO9Vfj0Fgbj3-Hn0XjYBoqhQ1X0LhXaCzSV6bDSyMZwRmnNjAW3ZsmUhmzBQigIBEt-S0WCbkFKxYZZqqGRkcGJ2YYposzYI-eMKu8muUmesSSMs4jCeZiF3BOcCET0A6Y2JO2G2erykUymWMzUzlOCMCW7qzS9PESKiCdV5IX4L6KLyycnWaKaJ9Uupb5VUBNQK5FJ5UX0Fh3xb2mouOiumkRivkRCxM2TWcIwdpLEVYiq3AI3npJNbS3jDYOaIMmyNmQhDaD33DL2R5NJKwTwgyxe-_SO4J5TjK-dDgSjCxjSDe4Weq0o7pnUe5znTn1bkIUaBUkYc77OUgMMg19HsqwKo8SmcK4JtpgYnGA-bDm8OOkgEFgwwxhkN4XXLyvD4KZcKJNQyMANUS1YCL7R1hj8j1Nr0hsj_daIzJCin1yd0BIzeok8z4IuFg2vJls8lWtamMoplwyjttvzFytBuAZm_dfuFs6inyivMoHE1nMhFwIjjmMyi5GPMCYbGu_a2mUMpDJCCx7HfKtzRe-E7BsVHetJHiJ5QH7PfulKxzMlskR2A5aYJa6NYOtbnPWCmAfeq-PTUxL94GkDI6JIs7LW15n5_7PxqwwsiJQ_v35Z2mgp9w_a-Ppls7rWPjF-xBOOwryaG_lBktNpQ9aYd6munNAZXr2_mvjDyYfxu8lg6E8ng-lo_J
M_v57Nh4P54GIy1idMgOWZqn3ymcKqKGJjJmPNwVqWICfhZ7qb42-sDAHnsWW4i3b3-7aKIJUx_b_LM9DXTPVTolZo_D3iAslQ15ImVFcIf013Ei7g4ye3BZ1LsJWt36-VxvNCJRgA0PlklOjTB9sLLLdaOVrApC_RRv-2WNKGymyGhfrFia8uWwUQgOfBFZZD9PCPjGLiGnuK06qkyj3y-aHgS5l9BcQ_Rs7p9WD4y3V3Ez3TQx09VqgxLuQBaXLKmH5WSGsX3AYxWGO0e2f4dQ7HPf3Rv48uzEMlLmDgumkmV35AcIfXrT32Q00JAhvQEu0c2z37--ho_VeYL5DJn8U9reB-_2W4TZi22D5BsKZ4ugY0oEuWuK22VUGTyG0V4A9AY91hfTsZX7_8CjI2VgRDxf8FE7EElVx8pNqWvs_R0rj3eVpqMUvLO7O9WRzb3cbnH4rnb9jxz-ka1HUd9_6GrgOR3t1JjlbKNoq8qok8aTGzScsODGU6cU80xF-Y27v7Pl8AR8D-ZmaYLf3SzLgvOYItfOgTKalQrm6_tRo8TyKaKH9BUHhf49vYF-a_YYMG6WNpX7wwPoUV-Z2v6sH9Q6Fff-kH_Y1JMUFVJlP0s83NvDU2o-7hUdXOhbjs95HqROysLkx1919aqou5zREUXS68NCtCKgScn6MLbwmKmYublsMBPXyoR4-ZdRiJOLc998UExQw97N34iSp7mZXsv8YVY4qkdL10e1WbEc4aXBzL6KleYxb45teFXaxoHC9dK_wUGm50K49lwZHy5mFuAylLEho1UCbYKTy1g2yxoMLdK2-VOzOl2M0bn3BbOWzw7iB2e2PQ7sgVFKeowHum8kMi1Xm4IuL5pVvYIsjWT7mRMfOuVdfVVQz3BwlaU5xDhzGX1K1YkhfevOsr7g_FbWHPHkveem9Y9if18S828ayyxbYvqh4ALsHUldhRx5FOBjxs9HsJJBdLKl1Ka99b5S1VrcVq8JSA3S_NUYnf9sKe3z-qm10vCLrpuyyNsLVAB1eXAsPaiqo3eOPp5DeeikdkiWvzO421Xssf3tqtCvlktWmXVhTTVS-Lm5eOnlZkNYsnA6jXmY4WF_ilzMU-DnqcL9xHSqt25IZhJaj0AbrKMbW7f7D8yzO8pgYnTJ7rpmdDNKrplp-xRYRXatO-F1277tj94Whaze992w2aUn7EBNrduOps76bMwlC_uql_LmxD39wR7cGPsKA9cbPMy3KB_t03AX4tmj2W6mjNFzanoIit12i5rde5hn5eqd1C479xAJ2jOKznDMER5LByV_a1XNnDJcc4-UyfhqXO_RnRfH1r3Lb8foZkMKTEAsKwGpj3l7Wr3o1T9gH558b5gvtf63Dho5Mp9876k0RsgYx12o5-m4qlUZRvh53kNgyPT15lxzhtzcJJp4NwF-HR0fEpdIgIVxdy45_2oNPBvlnhP6W7gagTk01g3ifHLKhghmEY4-CtfQWMA5pwa-ehXcxjWa7NY9FyHj6Zvz8Bdgyhgg==)) \ No newline at end of file +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in 
godbolt](https://godbolt.org/clientstate/eJy9WA1v2zYT_iv3qsAgN7blpB_ZnI_NjdPNWF97sNMVQ1MIlETbhGVRI6k4XpD__h5JfSbK2nV75wCxRR6fu-M9dzzqzpFUSsYT6Qw_3jkscoaHXScmySojK-oMnTCLiNN1JM9EqJ-959cJPIcLnu4FW60VuGEHjgZHr2D662Q8GcHFbP7LbD66msymfS1qxN-xkCaSRpAlERWg1hRGKQnxK5_pwq9UaDvgqD8AVwtcO_nctdM5MSh7nsGW7CHhCjJJEYZJWLKYAr0NaaqAJRDybRozkoQUdkytjaocx5gDv-UgPFAE5QmuSPFpWZcEokrT9WetVDr0vN1u1yfG7D4XKy-2wtJ7N7m4nC4ue2h6uex9EuPGgqC_Z0yg48EeSIqWhSRAe2OyAy6ArATFOcW15TvBFEtWXZB8qXZEUIMTMakECzLV2LzCTvS_LoDbRxLcuNECJotrB96MFpNF1-B8mFz9NHt_BR9G8_loejW5XMBsjsGajic6VPj0FkbT3-DnyXTcBYpbh6robSq0F2gq09tKI7uHC0obZiy5NUumNGRLFkJBIFjxGyoSdAtSKrbMUg2NjAxOzLZMEWXGHjlnVHnXyXXyjCVhnEUUTsMs5J7gRCCiHzC1JWk_zNbnj2QyxWKm9p4ShCnZX6fp-UOkiHhSRV6I_yK6PH9ykiWqfVLtU-pbBQ0BtRaZVF5Eb9AR_4aGiov-uk0k5iskRNw-mSUM906SuA5Rl1ti4CnZNtYy3jKoCZKsGkMW0gB6zy1jfzCZtEYAP8jijU9vCcac4v7a6UAwuoQx3WK00GtFMWZSxzjPnWZYkIUaBUkYc77JUgMMo18msqoKk8SmcK4JdpgYnGA-7Di8OOohEFgwwxhkN4XXL2vD4KZcKJNQyMAtUR1YCr7V1hj8j3Nr0hsj_daILJCin1yd0BIzeoU8z4I-Fg2vIVs8VWs6mMoplwx3bV_yFytBuAFm_dfuFs6inyivMoHE1nMhFwJ3HMdkFiMfYUq2NN53tcu4kcoILXkc853OFR0JOTQqevDROqurGc-UyBLZD1hiJl27S52vccgLYh54rw6Pj0n0naeNiIgiXquyzmNT_j07HhlRBD1n2uuXlR2WHv-gHa9feq3qOiWJf8DTiMJVncd50c9DvyUbzJFUVznojS_eX8z88ezD9N1sNPbns9F8Mv3Rv7pcXI1HV6Oz2VSfBgGWUqrKRDFFUFHExqzD-oB1J0H-wM90f4W_MYsDzmPLRhftHg5txiPtMFW_ybPF16zyU6LWaPwd4gLJUNeKJlRns7-hewln8PGT24HeOdgqNBw2ythpoRIMAGjuGyX6pMBWAEujVo4WMOlLtNG_KZZ0oTabYVF9ceSr804BBOB5cIGlCz38PaOYZMae4mSp6HCHvLwvOFFlSgHxjxFwfjka__eyv42e6aGeHivUGBfyDWlzyph-UkhrF9wWMdjgbg9O8OsUDgf6o38fnJmH2r6AgeunmVz7AcEIbzol9n1DCQIb0ArtFFsz-_vgYPNnmC-QyZ_FPa7hfvtluG2YtjA-QbC2_XQNaEBXLHE7XauCJpHbKcDvgca6G_p6Mr5--RfI2FoRDBX_H0zEElRx8ZFqW_o-R0vj3udpqcUsLW9NeLM4ttHG5--K56-I-Od0jZq6Dgd_Q9cDkcHtUY5WybaKvGqIPGkxs0nLHhjKdOIeaYg_MXdw-22-AA6A_c3MMCH90sy4qziC7XboEympUK5ulbUaPE8imih_SVC4rPFd7OHy37BFg_SxVBYv3J_Civx-Vvfg7r7Qr7_0g_7GpJihKpMp-tnmZt7GmlH34VHVzYW4HA6R6kTsrS5Mdfc_WqqPuc0RFF0uvDQrQioEnJ6iC28JiplLlpbDAT38UI8eM-twJ-Lc9twXsylm6L5040eq7MVTsj-MK8YUSelm5Q7qNiOcNbg4ltFTvcYs8M2vM7t
Y0TheuVb4KTQMdCffy4Ij1S3BdO4pSxIatVAm2Cs8tYNsuaTCLZV3qsjMKXbexicMK4ct9vliXxqDdkeuoDhFBd4JlR8SqU7DNRHPz93CFkF2fsqNjJl3rbq-rmIYHyRoQ3EOHcZcUrdmSV548_tE0esXnX3JHkve5q2j6k-a419s4kktxLYvqh8ALsHUldgZx5FOBjxs9DsEJBdLal1Kp-yt8paq0WK1eErAxktzVOK3vVznd4V6sJsFQTd955URthbozdWlwLC2puoN3k56-e2k5hFZ4dr8_mGt1_IPb9hWhXyy2nQrK4rpupfFLUnvnlZkNYsnN1CvMx0tLvArmbNyH_Q4X7qPlNbtyA3DSlDrA3SVY2p_d2_5l2d4Qw1OmDzXTc-WaFTTLT9jywivv6Z9L7p23bH748m8nt9l2w2aUn7EBNrduuqkdFNmYahfszQ_Z7ahb--ISvADLGjtl7aiLBfo33wV4F9Fs8dSE639wuYUFLH1Gi239TrXMMwrtVto_B4H0DmKw3rOEBxBHlbuWlyrlQNccoiTz_RpWOksz4j261tr2PL7GZLBkBILCMNqYN41Nq56107VB-Sfa-cL7n-dhwsfnUy5d9afJGJLZKzTdfSbTyyNonqT6yQ3YXh49Co7xGlrFk46PYQ7Cw8ODo-hR0S4PpNb_3gAvR72zQr_Kd0NRL2YbAPz7jdmQQ0zDMMYB2_s61oc0ITbOPfdYh7LcmMei5Zz_8n8_Q-9NYGl)) \ No newline at end of file diff --git a/examples/roaring_bitmap/host_bulk_example.cu b/examples/roaring_bitmap/host_bulk_example.cu index 33ca281bf..4e371eaa1 100644 --- a/examples/roaring_bitmap/host_bulk_example.cu +++ b/examples/roaring_bitmap/host_bulk_example.cu @@ -36,14 +36,11 @@ * In this example we load two 32-bit bitmaps and one 64-bit bitmap (portable format) from the * [RoaringBitmapFormatSpec](https://github.com/RoaringBitmap/RoaringFormatSpec) repository and * check if the bulk lookup API returns the correct results. 
Namely, we test the following files: - * - - * [examples/roaring_bitmap/bitmapwithoutruns.bin + * - [bitmapwithoutruns.bin * (32-bit)](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata/bitmapwithoutruns.bin) - * - - * [examples/roaring_bitmap/bitmapwithruns.bin + * - [bitmapwithruns.bin * (32-bit)](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata/bitmapwithruns.bin) - * - - * [examples/roaring_bitmap/portable_bitmap64.bin + * - [portable_bitmap64.bin * (64-bit)](https://github.com/RoaringBitmap/RoaringFormatSpec/blob/5177ad9/testdata64/portable_bitmap64.bin) * * @note This example requires the cmake option -DCUCO_DOWNLOAD_ROARING_TESTDATA=ON to be set. From 5d1b47056d0a6b035d98a5beba40198513c9e57b Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 14 Aug 2025 06:20:14 -0700 Subject: [PATCH 15/24] Compile benchmarks with -lineinfo --- benchmarks/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 916d674a3..9940c82b9 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -28,7 +28,7 @@ function(ConfigureBench BENCH_NAME) target_include_directories(${BENCH_NAME} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}") target_compile_options(${BENCH_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra - --compiler-options=-Werror -Wno-deprecated-gpu-targets --expt-extended-lambda) + --compiler-options=-Werror -Wno-deprecated-gpu-targets --expt-extended-lambda -lineinfo) # Add GCC-specific warning suppression only for GCC if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") target_compile_options(${BENCH_NAME} PRIVATE -Xcompiler -Wno-subobject-linkage) From aa56fd6398a340e78ce0a58dc60d778b9ff16b4a Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 14 Aug 2025 06:21:15 -0700 Subject: [PATCH 16/24] Use cub::DeviceTransform instead 
of thrust::transform --- .../roaring_bitmap/roaring_bitmap_impl.cuh | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh index 42752f2d6..82762dbe0 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -28,7 +29,6 @@ #include #include #include -#include namespace cuco::detail { @@ -74,17 +74,18 @@ class roaring_bitmap_impl { OutputIt contained, cuda::stream_ref stream = {}) const noexcept { - auto nosync_exec_policy = thrust::cuda::par_nosync.on(stream.get()); if (this->empty()) { + auto nosync_exec_policy = thrust::cuda::par_nosync.on(stream.get()); thrust::fill( nosync_exec_policy, contained, contained + cuda::std::distance(first, last), false); } else { - thrust::transform(nosync_exec_policy, - first, - last, - contained, - cuda::proclaim_return_type( - [*this] __device__(auto key) { return this->contains(key); })); + cub::DeviceTransform::Transform( + first, + contained, + cuda::std::distance(first, last), + cuda::proclaim_return_type( + [*this] __device__(auto key) { return this->contains(key); }), + stream.get()); } } From 144be8511a2df41bfb1e5d1dfd3ca4031543ee31 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Thu, 14 Aug 2025 15:23:45 -0700 Subject: [PATCH 17/24] Pass ParentT to CG type and pass CG objects by-value --- README.md | 4 +- examples/static_set/device_ref_example.cu | 4 +- examples/static_set/device_subsets_example.cu | 4 +- include/cuco/bloom_filter_ref.cuh | 10 +-- .../detail/bloom_filter/bloom_filter_impl.cuh | 16 ++--- .../detail/bloom_filter/bloom_filter_ref.inl | 9 ++- include/cuco/detail/bloom_filter/kernels.cuh | 10 +-- include/cuco/detail/dynamic_map_kernels.cuh | 12 ++-- 
.../detail/hyperloglog/hyperloglog_impl.cuh | 8 +-- .../detail/hyperloglog/hyperloglog_ref.inl | 4 +- .../cuco/detail/open_addressing/kernels.cuh | 26 ++++--- .../open_addressing_ref_impl.cuh | 49 +++++++------ include/cuco/detail/probe_sequence_impl.cuh | 20 +++--- .../probing_scheme/probing_scheme_impl.inl | 8 +-- include/cuco/detail/static_map.inl | 6 +- include/cuco/detail/static_map/kernels.cuh | 8 ++- .../cuco/detail/static_map/static_map_ref.inl | 61 ++++++++-------- include/cuco/detail/static_map_kernels.cuh | 10 +-- .../static_multimap/device_view_impl.inl | 52 ++++++++------ .../cuco/detail/static_multimap/kernels.cuh | 20 +++--- .../static_multimap/static_multimap.inl | 69 ++++++++++-------- .../static_multimap/static_multimap_ref.inl | 34 ++++----- .../static_multiset/static_multiset_ref.inl | 34 ++++----- .../cuco/detail/static_set/static_set_ref.inl | 36 +++++----- include/cuco/detail/utils.cuh | 2 +- include/cuco/hyperloglog_ref.cuh | 5 +- include/cuco/operator.hpp | 48 ++++++------- include/cuco/probing_scheme.cuh | 8 +-- include/cuco/static_map.cuh | 21 +++--- include/cuco/static_map_ref.cuh | 4 +- include/cuco/static_multimap.cuh | 71 +++++++++++-------- include/cuco/static_multimap_ref.cuh | 4 +- include/cuco/static_multiset_ref.cuh | 4 +- include/cuco/static_set_ref.cuh | 4 +- tests/static_multimap/for_each_test.cu | 5 +- tests/static_multiset/for_each_test.cu | 5 +- tests/utility/probing_scheme_test.cu | 3 +- 37 files changed, 369 insertions(+), 329 deletions(-) diff --git a/README.md b/README.md index c66f76f30..4c04ac88f 100644 --- a/README.md +++ b/README.md @@ -209,8 +209,8 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection #### Examples: - [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/host_bulk_example.cu) (see [live example in 
godbolt](https://godbolt.org/clientstate/eJx9VgtvIjcQ_ivTraqSu-UVKTqJPFSapCq6EzmF3J1OpSLGa1grxqZ-wFHEf--MvcvjHk0kYD3jmW9mvs_rbeaEc9Jol_X-2mayyHrdPFNMzwObi6yX8VCwLM-cCZbTc_vVWMMruDXLjZXz0kODn8F55_y8iR8XOQw_Du4Gfbh9eHz_8Nh_GjwMW7QhbnonudBOFBB0ISz4UkB_yTh-VZYcPgpLaOC81YEGOYyzyjbOzi5jlI0JsGAb0MZDcALDSAczqQSIL1wsPUgN3CyWSjLNBaylL2OqKk6EA5-rIGbqGfoz3LHEp9mxJzC_h05_pffLXru9Xq9bLMJuGTtvq-Ts2u8Gt_fD0X0Toe-3fdAK2wtW_BOkxcKnG2BLRMbZFPEqtgZjgc2tQJs3hHxtpZd6noMzM79mVsQ4hXTeymnwJ82rcWL9xw7YPqaxcf0RDEbjDH7vjwajPMb5NHj68-HDE3zqPz72h0-D-xE8POKwhncDGhU-_QH94Wd4Oxje5SCwdZhKfFlaqgKhSmqrKFIPR0KcwJiZBMstBZczyaGmEczNSliNZcFS2IVMhEOQRYyj5EJ65uPaN8XFVO2xHuufpeYqFAKueOCm7WgLnzjhWzyUN197FAw9ivYsaE6Rmbo5tvvSBufbhVhhlslKcG9sq_yeizJznJb6vtHhXAWSrPV1fmlwHIItTjbFOl30bL9K_Pgt8rZE78k0qJeJ-MKwwwIrSuaplWIGd2KBvfGWeYEdctTRiqmHJuC8KQKOm6JB__3AHYT3hK5kjcuApEKOqA3SbWUSE2fWLGLAuBkHE50CaZWmWhiancHhVWNS8kXAs8T5WP-MxIhZnrnRpCb3HCkIhApxvojNAQpOUmqP0pO6sTKyOBvrLa5jKsrwVmzgGmH5S2oSQLsN94ul34BTxifgVhAZhfZJTfTbrvD3OKNFqYXCFqyYCsK1qG7kZXoCV5qgcBN2TSicOG5ygZdVHl8yTx3Ac0UgWcFwHiwJEo8b-l4GDwXzjCqJODmNhKQBgiBOsMxJDQGLaHYPNQzDYkohUy9I6AgiNU-QBHCMRa_n5L9i4o_i6rCYxA3XcNH5tdPpHCLe4vmGWgdOB5HEBk2Zq5SPvr-AMgwnx4jWtIMFzHkITNZJsmLsTuvi8rsgDtGvk5ELqRp7VO3jOGfH2IirgdPAIgfiCYzdVYJh0Oc66nM11TT6ZheYixSM7UydqvsZm06qR4B7xl_hFG4owbaOmFc--4Fsvx3NbneEFCWKTSSUScf7EW07OXRzOM-h1cpB7mhD0nyvd3JoJBC0Z9-Y-Jrae9ehG1EGUzFH4p_lSRRCF_S7c9S8QWQFdk6phATFYJIymSup2jgqPPQSf34Y9ijmCHHSmUFKRr0E5X9cztQYdYOeeA6fFFR3rBT8BeTsgI80WekeCYh6icfSAWZ9JvxP_TFbbTpKhncBOdskZe7zlWyFZ5kQOm0jV8JcMRbdIuRJNCJv6yrJYGaNk1R15goGvTOIX0h0WRBb_Ga7q-FgyY3T4GewrVRh8Gi4usITaBQ4xxflT_BHzF5jbo0xRnYJuxTKCh-sBlIzruDdii4r-BqwhytYplecd88vQhfNZunT_SxrYr5r_vp19w00meXltVtM3nSg2URVe_zwWIkomootpvHSpuT0KCbnXOHiKt2wcAFFql-yXV7bUTsndmRGtvs7_v8HAzB-mg==)) -- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_ref_example.cu) (see [live example in 
godbolt](https://godbolt.org/clientstate/eJztWG1v4zYS_itTFcU5G_klQfdaeJPgfMkWNVokRZJtUdSFSlO0xYssqiJlxw3y3-8ZUrLsTbZogft4CZBYmuEzz7xwOPRTZJW12hQ2Gv_yFOk0Gp_EUS6KZS2WKhpHsk5FFEfW1JXk5-GbWUFv6NKU20ovM0c9eUSno9PTPv68jen6x-nVdEKXN7c_3NxO7qc31wNe4Bd9r6UqrEqpLlJVkcsUTUoh8a-RxPSjqpgNnQ5G1GOFWdTIZtHRO4-yNTWtxJYK46i2CjDa0kLnitSjVKUjXZA0qzLXopCKNtpl3lSD4-nQzw2ImTsBfYEVJZ4W-5ok3I46_2TOlePhcLPZDISnPTDVcpgHZTv8fnr5_vrufR_Ud8s-FDnCS5X6vdYVHJ9vSZRgJsUcfHOxIVORWFYKMmeY-abSThfLmKxZuI2olMdJtXWVntfuIHgtT_i_r4DwiQKBm9zR9G4W0b8nd9O72OP8NL3_9ubDPf00ub2dXN9P39_RzS2SdX015VTh6RuaXP9M302vr2JSCB1MqceyYi9AVXNYVRpieKfUAY2FCbRsqaReaEltGdHSrFVVwC0qVbXSoeBAMvU4uV5pJ5x_98I5b2o4K2bF57qQeZ0qOpO1NEPLS2RilRvIOrv4WCMV0EiHi7qQjCzyi325y6raumGq1rCSrJV0phpkr6nkZols5a8LLfKqUGSDF_aNgacguFbJsjJ1aV-qgF6qFgew2iCLSqy86vBNqKF_-dpuqFZqoSo2mahHgWQoOB-05pVWC7pSK4TRwbRCMC0HvynqLl4NVt9q2Jz8MLXdDt1TKiuzhgLShJ1W9M3G529nnzaZlhlJVNpc8T5sCtjBben8rvOIbFqagncZEruozMrLUOmBBYTpXpKHQ2BYVTmYZRYg_6C2loENXmXCZv59bZmNKXj9LtTkQ-3rUGF_khP2YQZGiBPigaxtS1WIlULlulu1iGn3YlqUtZsydxQCop8ky9zMRZ4ktDY6JYlsm1Wyn9ZAsxegmFN8iOJ5Yxu7dDy2-g-VOCqOZsUTXCW2adFCwH5J5_SyWt5xLIgDZx1vPxI1_JfLhJGwIlgdj5s3jbpXclwtwFyOx_wxTUpROc174KzRvuh5IdpmgjpWIk3muZEPvaOjBgfB_ueX4OvNUw52CXeXlHGX-H-lV4NH5NYvCw9D6qh0ADp9pMOfc-rtrWogpukjHo4psPFPRweIjIl6g2M9hjxDJMkHkjjwgyYV7G5Mb3q-YI7ZuPeItXjV8fm-K17wPCueGf1vl8je-5va_cXa8ZvA_r2KiT_CR3GjRf6_jv7ndTQ3Jm-Iwp-2Y6Uw5E-YNnl_VmML8uJBE4xKFIgFnZ_TCFZC4n7Bgl-BuW_iHZfhXyhSBAMjiy56XFVtBYQ--J3aMmjhGh_RRd-vSrclmxuHDl4pNG4-xFXhwhTCn6s1Ps8ifgkeOSaFtchrZQd0n0HcPJHNTJ2n3OWtynFSYpGtZdbYcZlw3OQxjykc8mSkrCseZDCm8X_ULqXCCW7xnmdXi4opJohk0lKAE_2TzofrejVnyOYMQFGCRNjsikeHg63S4Rb1KvELzunt6B-j0ahDvMRciBkJxxZ2jUaA5sI2ExN0v0DsRUoLwePAbiN0wCxNghTYo8Hbd6-S6NDPg1Aqnfd2rIb7OEf73PjgriUnrDvo_PiKEOdKAHkWtdjIVkhuqID-CQnrD1sfVQ4YtVH1oeeZCTR35_sZcnHBFp5axLjR2aXl6WWCnp_3-KLKEUo-p8MUtEvU0yimk5hOYxoMYtK-vMPENB4fjFyBBK_ZhSfspla7hfb7bTBXS5T_UexXDFSR8ufRXginYXTgMCx0hXhlIl-0A1A3R_hxSDmfvO74eMVE84SNvpe909cNWoXkpzuLXjnkRjRHAF1-uJpAUHU5-cRUcXZ2dnL6NYJ4-vXFxUWvO-kwfvVCloLinxO
NP0X7DrHn0ZAnJXSCOnefThE3xovQvQ6S1FZBpuQD9z6R58Fp7jaF2VDXQ9ENmpAP6No4FZrGRlEpLBd7N1R6gci5f255D-yajKLf2h78G4VwYWLfZYGWCuOlyOMm5G5j-LqyMr7ztehm_h_F26stAT6-u9kUl6d2bGVQgTvVwq907STa2rW8Cm2OZfmWr198bVl4X-cqE2vdcbvn22lq0Ef5vsqXPzgGVAJPGe5PTDZUBWt6Pb4v5aI8rJLg_ke1cVgVrdJHddEVQhwy2Qr2Eombt15s2ww0uYQvCi6pIixj1b2DEmoeNPFCdLu2glhgFr0DU63lZtvyDY37EdojDjp0F7d9et4NDjhKD8H5BA291CBNZ2fohHe1lLiWfkbfeOst58EMGJE_VBmqUq6uCuIzAG-iOOKvBnBKV90XHlGxlvLk9G19ArEpXfg2JOrD3rk8Pj75ivqiktm5XSVfjajfx1ng8MfBE5X2c7Ga-69Icj3fw5RS5ni5Dt9n4AVae_EQPcetHPk6kGPXRc-_-t__AoyhE5s=)) -- [One single storage for multiple sets](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_subsets_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJzNWYtOG0kW_ZU7XmllwNiYbDYZA9awQLLWRJANZEajIeopd5ftWtrdnqpuGw_i3_fcerTbj0TJalZaI2PcXXUf5577qOapYaQxKs9Mo_frU0MljV631UhFNi7FWDZ6jbhMRKPVMHmpY_7e2b_PaJ8u8tlSq_GkoGa8R8dHxy8O8etli65_GlwOzuni5sP7mw_nd4Ob6zZvsJveqVhmRiZUZonUVEwknc9EjA9_p0U_Sc3W0HH7iJq84L7h79039k6slGVe0lQsKcsLKo2EGGVopFJJ8jGWs4JURnE-naVKZLGkhSomVpWXY82hX7yQfFgIrBfYMcO3UX0liaIynV-Topj1Op3FYtEW1ux2rsed1C02nXeDi6vr26tDmF5t-5ilgJe0_L1UGo4PlyRmsCwWQ9ibigXlmsRYS9wrcrZ8oVWhsnGLTD4qFkJLKydRptBqWBZr4AU74X99AeATGYA7v6XB7X2D_nF-O7htWTk_D-7-efPxjn4-__Dh_PpucHVLNx8QrOvLAYcK397Q-fUv9OPg-rJFEtBBlXycafYCpiqGVSYOw1sp18wY5c4sM5OxGqmYAo1onM-lzuAWzaSeKkc4GJlYOamaqkIU9tqWc1ZV5z67z_6isjgtE0mncRnnHcNb4sjIItJy1I7LSX9zVSKwKukIrcWyv_PWqMxiVizStfvFRJem6CRyDiOiuYyLXLcnu5YgqGUsd98zscjaW1blOTCA6XMZjXVezszWEpGOc7BgMl23GfYmcrR2TeWIuhTrC7NyKrWKrdDOvuPhDzY_vD-mHAI1E8lHgXBKQOfWDLWSI7qUUwSigInS0CRfMC85zfJM0rBMHwh7yQAQDizuxTCgkDQt00JBGnnpHF2ONkgxteJVlqi5SkqRkkeA4z1Xwpt1aBSsRyjp_P3ArIrGXY71BSjyB5TANr2EOdANelqiiTRd0mIiM8gRKXPMZjxIC_oAZUpEIQir8tjqtIYFa62GiTATNrmFMjDTAoSIYaTJ09KuV9k8T-cAQwKtfMkaBBl8pAEQDwbzfywz6xsWeSBcrtxxkZLM9MwCywzPOOt5k5EzoS2Izr8NY9mv6gKvlyg-KFgZly6p21a41QLC0RB1K54oOXflpmRLrbqUy_VC2qKd5dlhvrAZyYgXy5lcIf4DKqv0FjuKcIVhxUnFDTZlVupZjsoHaqTLNg0KXsZVWWWFRB7bkmYshaB-Cq5a8Z4UIitogQqMNUluVwRlFf9XuS9
KZhrr5mJE8TgyzAd-ndHrE3x0Op1TQt_5eIlfqxyjt5xjXNhfQweomhjWOAGu3DAYSBenLR3DMn5AbbF6zqhb6bgup0OUKEjE2rjUWsIRk-agvJNqYbeSncItyde0ekHy0cmWZJnKKcQa1xOM1IUlwu-l1EuGwwX1QS4jjhyEAPEg5Ue5tPEMq2Y6H-IzMmhYU-k3AFjiKtrrIWOk0JFfdeqR5W6x--V2oRAJpFDEqROFInoaDOr3gzF3rI2ZqvPU0MVbGmuRlalAeJefVfHVL6vC1hlnPTkfqemc-pMUBOHztmmDqiV3bvYb1_ZQYTsd5yXCZqtBLVd9YQiR8F9DzBySgWbuXgVhq86__onT8253Bof6EzJ5Ux-uB538kQlAVLel1wsrTsLWakt4BXPXu-7K3G_Fmjtwr-cSBNREvkauDfyXktAcez1MWSKNinxFxG-WtiNbvlnGJu5VNtyic66C5GJ662qebVI7sz3ywERRvQCunLbTTY03130n7Iyeui160SIM5a9a9H2LuvjexYUurnRxqfv9syfW1XRWLH0Rw8gJI3niQwVyxYz_1txP7ht8EcmVYqyci7SUhrsPbvtvXO7LNOEmZFDEYhZgynji1BQTUXClx-yO9oRKF6N-8tCLkZ4_Z2VhbUfdr2rbymfJRkZ8IxgBHw-7J2tzjpthBhZFWFMDFyUyr4aTqjHWxjGy41jVBa-4gm93EjfuGFKQgvzzEm3AlNf62_Vvdc21xsptfko-eQy9z7lTao479z8bSa4jXmY-_DcQNL4JRtE4zYfgd0TzXAVtzUCy_UositITU5bz3OCcAgzHnL9bg6dFjmjVnajgKRFLx0hN_JlEMLhQtrb7xtBv2puYDSKfvEOUu4fmnj2cgeb0lsdD50ATI8PF2z0e_eTjhiqVPEJT0-6-VNP2I_CxXwbJI74c-OZpv-1RJ7T8us1aLEIh4rQ7o_1mBe0BK3A22bUOrWr5WX1zW0vkfBI5fHJtmq7WuT17XiVcc7Sy8eUOXWB-tayyhx74awdPNDm3kTfx0NTEGlJQiUavCG0eHwcHe_Tk6sq6ZW0fVoa_Zenzq_rk_Hi-z553cP08Tb0M5Jplup8CR3npDlhforOWNhy6BNE2Oc1ADYVxJ0rro280lsYSzF2d4ZDOOPIhL8QXzfkf5cII4fs_y4QNqv-pXP8i0xmLb-M576ix_IM0GO0w62OU5xKpRohy9WxkPbZ-4A9UoygyEzSQBIFh1msrytoGKc2am3SGfEAO-CUuO569iCXmSTerN4NdX5FIMP1f3DLX03HNXrewFhprOCTWQWtbNu1IQI9OnOuE3dlGA1MhmC6zFR5QVuRTFd_o5l-dpy3ad0q_O9tK7885_xnsKpsGzhgPJWJiYH1KR60d2cjtvUpFcrJDDHxIZhpAj5r3jdsyjqUx39Eba_GasHBEad_DxsaeD16oURypKU6nzZCCsHJ1vAldGGFAYXAdNVlPGtvxs3LKB6S_h9ZytTqu2Yd3E4E6JihRo5G0xzB-ziaNHTqQPzskus02u4w7BfG4Vk1R9k--GSFO0N5_OgaI9feLjfffNt4v197P9RTGhKSSoNppdU-W6mr7TceG2uK2H7-aMGgtHdyOekZgxQkdHKiKHHUx_PgCNTAaClQpP8fHwhTr6sOw60rDVDzIyAmRjzjUF6e7BuTtibdZB5opvleRnD8-5zzIMWJmsKsobJhXj2qFCTPpSD0ihlP73CmfYl6UYYudwIq8ECmjEIjGE22l0OPlz-yRsew6OlnhWVtUA7WOIf9GhNYwrss7OKsvh-O-cHgjf1Xs1Sd7mqo21YHhIXmXD5zU9lgJarsZmjiMoShXGDCLNzsZy4sgL3KSzsLqtiVCDd8L9yRv98Ot0vjH1ygCPn8rRWZ1gLXMXh0tnza0P4dUHmRopkDqD_8sLGyvntaP0Z0xiYTjBsdwpQVTUtje3D4WBJfqLKuYWQ0HK7_9E8yaV1-iw1aObR2
zaxee1tnQWnOC63FzD4yo2PHp2fMlGLmetCE3g6Ynl6QVAqtz79M2Ks8teqq9a0Y-r-Wme3jNj3hqD79r-FV90lQzyPaIXDv-2AZmr56envIJ9Ph1v99vrsR4HNpjWdSOEW_UZscJwrg9f60oXq9lUerMJjpcbLQaXDnQ3fXqX16NbB7H3eOXZRe3-Wmz_X9Y4xAUOIsPDrqv6FDoeHJmptGrIzo8RC8pDm1BTGRymIrp0P6TLFXDmsw4jlNcnLv_aOECpuzsofHcCvcRvLX7QLzx_Mn-_Af5iUl4)) +- [Device-ref APIs for individual operations](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJztWG1v4zYS_itTFcU5G_klQfdaeJPgfMkWNVokRZJtUdSFSlO0xYssqiJlxw3y3-8ZUrLsTbZov9ynS4DEEofPPPPCmaGfIqus1aaw0fiXp0in0fgkjnJRLGuxVNE4knUqojiypq4kPw_fzAp6Q5em3FZ6mTnqySM6HZ2e9vHnbUzXP06vphO6vLn94eZ2cj-9uR7wBr_pey1VYVVKdZGqilymaFIKiX_NSkw_qorZ0OlgRD0WmEXN2iw6eudRtqamldhSYRzVVgFGW1roXJF6lKp0pAuSZlXmWhRS0Ua7zKtqcDwd-rkBMXMnIC-wo8TTYl-ShNtR55_MuXI8HG42m4HwtAemWg7zIGyH308v31_fve-D-m7bhyKHe6lSv9e6guHzLYkSzKSYg28uNmQqEstKYc0ZZr6ptNPFMiZrFm4jKuVxUm1dpee1O3BeyxP27wvAfaKA4yZ3NL2bRfTvyd30LvY4P03vv735cE8_TW5vJ9f30_d3dHOLYF1fTTlUePqGJtc_03fT66uYFFwHVeqxrNgKUNXsVpUGH94pdUBjYQItWyqpF1pSm0a0NGtVFTCLSlWtdEg4kEw9Tq5X2gnn370wzqsazopZ8bkuZF6nis5kLc3Q8haZWOUGss4uPpZIBSTS4aIuJCOL_GJ_3WVVbd0wVWtoSdZKOlMNstdEcrNEtPLXFy3iqpBkgxf6jYGlILhWybIydWlfioBeqhYHsNogikqsvOjwTcihf_ncbqhWaqEqVpmoR4FgKBgfpOaVVgu6Uiu40UG1gjMtO79J6s5fDVbfauic_DC13QndEyors4YAwoSTVvTNxsdvp582mZYZSWTaXPE5bBLYwWzp_KnziKxamoJPGQK7qMzKryHTAwsspntBHg6BYVXloJZZgPyD2loGNniVCZv597VlNqbg_TtXk3e1z0OF80lO2IcZGMFP8Aeiti1VIVYKmetu1SKm3YtpUdZuytyRCPB-kixzMxd5ktDa6JQkom1WyX5YA81egGJO8SGK541j7NLx2Oo_VOKoOJoVTzCVWKdFCQH7JZ3Ty2x5x74gdpx1fPxI1LBfLhNGwo6gdTxu3jTiXshxtgBzOR7zxzQpReU0n4GzRjoOixkyLU3muZEPF73wRttk_3Xv6KhBhvv_-SUs8IQoB9-E603Kmpb4f6VXg0dE228LD0PqyHUAOn2kw59z6u3taiCm6SMejimw8U9HB4iMiQyEqT2GPINvybuWOBSDJjjsgJje9HwKHbNybxFL8a7j831T_MLzrHhm9L-dNHvvb2r3F7PJHwv793Io_ggf6Y6i-f_M-h9k1tyYvCEKe9qqlkKR70JtOP8s6xbklweNMypRwBd0fk4jaAmh_AUbfgXmvop3nJh_IW3hDIw1uuhxnrU5EWrld2rLoIVrbESlfb8q3ZZsbhyqfKVQ3LnRq8KFSYU_V2t8nkX8EjxyTBNrkdfKDug-w3LzRDYzdZ5yJ7AqRzfFJlvLrNHjMu
G4EWBmUxgEyEhZVzzsYJTj_8hmSoUT3AY8zy47FVNM4MmkpQAj-iedDdf1as6QTZ9AmoJEOP6Kx4uDw9PhFvUq8RvO6e3oH6PRqEO8xOyIOQqtDedIw0FzYZupCrJfwPcipYXgkWF3NDpgXk3CKrBHg7fvXiXRoZ-HRal03tuxGu7jHO1z4-ZeSw5Y1wz9iAsX50oAeRa12IhWCG7IgP4JCesbsvcqO4xar3rX81wFmrsZ4AyxuGANTy1i3MjswvL0MkDPz3t8keVwJffyMCntAvU0iukkptOYBoOYtE_vMFWNxwdjWSDBe3buCaeplW6h_XkbzNUS6X8U-x0DVaT8ebTnwmkYL9gNC13BX5nIF-2Q1M0afmRSzgevayivqGiecND3onf6ukKrEPx0p9ELh9iIpinQ5YerCRaqLiafmDzOzs5OTr-GE0-_vri46HW9DyNaL0QpCP450fhTtO_gex4feZpCJahz9-kQcWG8CNXrIEhtFmRKPnDtE3kejOZqU5gNdTUU1aBx-YCujVOhaGwUlcJysneDp18QOdfPLZ-BXZFR9Ftbg3-j4C5M9bso0FJhBBV53LjcbQxfaVbGV74W3cz_o_h4tSnADb2bX3HBakdbBhW4dy38TtdOq61ey7tQ5ngt3_IVja82C2_rXGVirTtu93yDTQ3qKN9p-YIIw4BK4CnDHYvJhqxgSS_Hd6pclIdZEsz_KDcOs6IV-igvukSIQyTbhb1A4nauF9s2Ak0sYYuCSaoI21h0r1FCzIMmfhHVrs0gXjCL3oGqVnNzbPkWx_UI5RGNDtXFbZ-ed4MDWukhOHfQUEsNwnR2hkp4V0uJq-tn9I3X3nIezIAR-abKUJVydVUQ9wC8ieKIvz5Al666L0WiYi3lyenb-gTLpnThG5OoD33n8vj45Cvqi0pm53aVfDWifh-9wOGPgyUq7ediNfdfo-R6vocppczxch2-88ALlPbiIXqO23XE62Adpy56_tX__hezoiBz)) +- [One single storage for multiple sets](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/device_subsets_example.cu) (see [live example in 
godbolt](https://godbolt.org/clientstate/eJzNWYtOG0kW_ZU7XmllwNiYbDYZA9awQLLWRJANZEajIeopd5ftWtrdnqpuGw_i3_fcerTbj0TJalZaI2PcXXUf5577qOapYaQxKs9Mo_frU0MljV631UhFNi7FWDZ6jbhMRKPVMHmpY_7e2b_PaJ8u8tlSq_GkoGa8R8dHxy8O8etli65_GlwOzuni5sP7mw_nd4Ob6zZvsJveqVhmRiZUZonUVEwknc9EjA9_p0U_Sc3W0HH7iJq84L7h79039k6slGVe0lQsKcsLKo2EGGVopFJJ8jGWs4JURnE-naVKZLGkhSomVpWXY82hX7yQfFgIrBfYMcO3UX0liaIynV-Topj1Op3FYtEW1ux2rsed1C02nXeDi6vr26tDmF5t-5ilgJe0_L1UGo4PlyRmsCwWQ9ibigXlmsRYS9wrcrZ8oVWhsnGLTD4qFkJLKydRptBqWBZr4AU74X99AeATGYA7v6XB7X2D_nF-O7htWTk_D-7-efPxjn4-__Dh_PpucHVLNx8QrOvLAYcK397Q-fUv9OPg-rJFEtBBlXycafYCpiqGVSYOw1sp18wY5c4sM5OxGqmYAo1onM-lzuAWzaSeKkc4GJlYOamaqkIU9tqWc1ZV5z67z_6isjgtE0mncRnnHcNb4sjIItJy1I7LSX9zVSKwKukIrcWyv_PWqMxiVizStfvFRJem6CRyDiOiuYyLXLcnu5YgqGUsd98zscjaW1blOTCA6XMZjXVezszWEpGOc7BgMl23GfYmcrR2TeWIuhTrC7NyKrWKrdDOvuPhDzY_vD-mHAI1E8lHgXBKQOfWDLWSI7qUUwSigInS0CRfMC85zfJM0rBMHwh7yQAQDizuxTCgkDQt00JBGnnpHF2ONkgxteJVlqi5SkqRkkeA4z1Xwpt1aBSsRyjp_P3ArIrGXY71BSjyB5TANr2EOdANelqiiTRd0mIiM8gRKXPMZjxIC_oAZUpEIQir8tjqtIYFa62GiTATNrmFMjDTAoSIYaTJ09KuV9k8T-cAQwKtfMkaBBl8pAEQDwbzfywz6xsWeSBcrtxxkZLM9MwCywzPOOt5k5EzoS2Izr8NY9mv6gKvlyg-KFgZly6p21a41QLC0RB1K54oOXflpmRLrbqUy_VC2qKd5dlhvrAZyYgXy5lcIf4DKqv0FjuKcIVhxUnFDTZlVupZjsoHaqTLNg0KXsZVWWWFRB7bkmYshaB-Cq5a8Z4UIitogQqMNUluVwRlFf9XuS9KZhrr5mJE8TgyzAd-ndHrE3x0Op1TQt_5eIlfqxyjt5xjXNhfQweomhjWOAGu3DAYSBenLR3DMn5AbbF6zqhb6bgup0OUKEjE2rjUWsIRk-agvJNqYbeSncItyde0ekHy0cmWZJnKKcQa1xOM1IUlwu-l1EuGwwX1QS4jjhyEAPEg5Ue5tPEMq2Y6H-IzMmhYU-k3AFjiKtrrIWOk0JFfdeqR5W6x--V2oRAJpFDEqROFInoaDOr3gzF3rI2ZqvPU0MVbGmuRlalAeJefVfHVL6vC1hlnPTkfqemc-pMUBOHztmmDqiV3bvYb1_ZQYTsd5yXCZqtBLVd9YQiR8F9DzBySgWbuXgVhq86__onT8253Bof6EzJ5Ux-uB538kQlAVLel1wsrTsLWakt4BXPXu-7K3G_Fmjtwr-cSBNREvkauDfyXktAcez1MWSKNinxFxG-WtiNbvlnGJu5VNtyic66C5GJ662qebVI7sz3ywERRvQCunLbTTY03130n7Iyeui160SIM5a9a9H2LuvjexYUurnRxqfv9syfW1XRWLH0Rw8gJI3niQwVyxYz_1txP7ht8EcmVYqyci7SUhrsPbvtvXO7LNOEmZFDEYhZgynji1BQTUXClx-yO9oRKF6N-8tCLkZ4_Z2VhbUfdr2rbymfJRkZ8IxgBHw-7J2tzjpthBhZFWFMDFyUyr4aTqjHWxjG
y41jVBa-4gm93EjfuGFKQgvzzEm3AlNf62_Vvdc21xsptfko-eQy9z7lTao479z8bSa4jXmY-_DcQNL4JRtE4zYfgd0TzXAVtzUCy_UositITU5bz3OCcAgzHnL9bg6dFjmjVnajgKRFLx0hN_JlEMLhQtraHxuBuurwdotI99JvuijJR_XJzzx7XQHx6ywOjc6mJIeLi7R4Pg_JxQ7lKHqG7aXdfqmn7EYjZL4PkEV8OfDu13_aoE4aAuhdaLEJp4kQ8o_1mBfYBK3A22bUOv2r5WX1zW0tUgSRyiOXaNF31c3v2vEq45ohmI849u8BEa3lmj0Hw146iaHtuI2_iMaqJNaSgEq1fERo_Pg4O9ujJVZp1y9o-0ByQliXUr-qT8-P5Pnvewf7zNPUykH2W-34uHOWlO3J9ieBa2nDoEtTbZDkDNRTGnTGtj771WGJLcHl1qkOC4xCITBFfNOd_lB0jhO__Pjc2yP-nsv-L3Gd0vo35vKPG-w_SYPzDeQDjPpdRNULcq-cn69H2h4JAPooiM0GTSRAqzgNtRVnbIKVZc5POkCHICr_E5cuzF7HEzOnm-Waw6ytSC6b_i9vqeoKu2esW1kJjDYfEOmhty68dKenRiXOdsDvbaGByBPdltsIDyop8quIb3fyr87RF-07pd2dbCf855z-DXWXTwBnjoURMDKxP6ai1Iz95BKiSk5zsEAMfkpkG0KPmfeO2jGNpzHf0xlq8JiwcY9r3sLGx54MXqhZHaooTbDMkJaxcHYFCp0YYUCpc103Wk8ZOBVk55UPU30OzuVod6ewDvolAZROUqNFI2qMaP4uTxg4myJ8dEt1mm13GnZR4pKsmLfsn34wQJ2jvPx0DxPr7xcb7bxvvl2vv53oKY4pSSVDttLqnT3W1_aZjQ21x249oTRi0lg5uRz0jsOKEDg5URY66GH7EgaoYDQWqlJ_1Y2GKdfVhIHalYSoeZOSEyEcc_IvTXUP09lTcrAPNFN-rSM4fn3Me5BgxM9hVFDbMtEe1woS5daQeEcOpfTaVTzFTyrDFTmlFXoiUUQhE46m3Uujx8uf6yFh2HZ2s8KwtqoFax5B_I0JrGNflHZzVl8NxXzi8kb8q9uqTPXFVm-rA8CC9ywdOanv0BLXdnE0cxlCUKwyYxZu9jeVFkBc5SWdhddsSoYbvhXvat_sBWGn8I24UAZ-_lSKzOuRaZq-On08b2p9DKg8ytFcg9Yd_Xha2V0_0x-jXmE3CkYRjuNKCuSlsb24fHYJLdZZVzKzGhZXf_ilnzasv0WErx7aO4rULT-tsaK05wfW4uQdGVOz49Oz5EoxcT9qQm0HTk0vSCoHV2fhpG5XnFj3V3jUjn9dy0z3g5sdAtQfkNfyqPmmqGWR7aK4dkWwDs1dPT0_5lHr8ut_vN1diPA7tsSxqB4s3arPjBGHcnr9WFK_Xsih1ZhMdLjZaDa4c6O569W-xRjaP4-7xy7KL2_xE2v7PrHEICpzFBwfdV3QodDw5M9Po1REdHqKXFIe2ICYyOUzFdGj_kZaqYU1mHMcpLs7df71wAXN39tB4boX7CN7afSDeeP5kf_4DPIxWUA==)) - [Using shared memory as storage](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/shared_memory_example.cu) (see [live example in 
godbolt](https://godbolt.org/clientstate/eJyVWA1vGzcS_SvEHgrIqSzZwRUtFNk41fbhhOTsg-0kKKpiTXEpiefVUuVypaiG__u9GXK1K9nppQ7iRPyYGb55fDPUU1LqsjS2KJPBr0-JyZLBaTfJZTGv5Fwng0RVmUy6SWkrp-hz_82kEG_EhV1tnZkvvOioI_H25O3fj_Hrh664_jS-HI_Exc3tf25uR_fjm-sebeBNH4zSRakzURWZdsIvtBitpMI_caYrPmlH0Yi3vRPRoQWTJM5NkqN3bGVrK7GUW1FYL6pSw4wpxczkWugvSq-8MIVQdrnKjSyUFhvjF-wq2uFwxC_RiJ16ifUSO1b4NGuvFNLvQqefhferQb-_2Wx6ksPuWTfv52Fx2f8wvri6vrs6Rui7bR-LHPAKp3-vjMPBp1shV4hMySnizeVGWCfk3GnMeUuRb5zxpph3RWlnfiOdZjuZKb0z08rvgVfHifO3FwA-WQC40Z0Y300S8fPobnzXZTufx_f_uvl4Lz6Pbm9H1_fjqztxc4tkXV-OKVX49E8xuv5FvB9fX3aFBnRwpb-sHJ0CoRqCVWcBwzut98KY2RBWudLKzIwSNY3E3K61K3AssdJuaQLhEGTGdnKzNF56HntxOHbVnxST4m-mUHmVaTFUlbL9kraotNS-p6rF-eGKTGJF1p9VhSLLMj9cYS1igYm1TufOVquyF4z034TU_YMpVS6Qgixd6qV121R_kQBAw2FYMnVGz8QlJgvAL73GAUo6cCRSEyOlNtgSwVZzLq9hE3sRUy4B8532t3qGWNJ0ntupzNNUrK3JsB9byVj6CDR13vHblS7kUsctg4H-4nXhUxoXa5mbLA0jlPz_90OgwsJy5bewvx2-MI5BtnwudosQTAG26vxoUjyRj35ffAYRjCu9kHluFZ2LkWifHbhYRzDVjHnAoR560cA9AxcWgNl0lZ0h1QCoysLfvLJVKaaVetS-pNEyt77sxu2bhVELdr5h-2utYMz8AQO5lVnJbtI0JjYVL84J4CodMGTDv7aR7PFs5-i3d5Q6dniNma7YaAoOLKiU5zNJ5SuZ705ip_9FIGLm7JKnHe6-dE4yEYSscPvrpWe7UOJI6vSMA-rsJTWEx6rIceDAFAXuvkEgEkjt0Nctz9Foyy9ma5-dl7ntiqfn5m_cfdScn_KlKuewHPadlpC5XNlH0RcX96PGDcODKase4e7l_RsMSMnTYCLldZ2Wm3EBYcTx_9A7_GrAqpKkBSmP_ku6bbSIjXAEJBNmZ6HDEzvk7raFWjhb1LZDjAh5KR-BV-VAxHov65RYSCo5hSkXQQyjpxKGOgfQMLfr9GcWEkGFC6yOpUkWW9GolPHbXnOFFnKtKQ5oS1GCzEthSEkwImsX0bBfSI9IVyvrcCfoELVN4kRB9aXDoNCNkhDrB3BEO_9w1CQojBDZkB8CzOmpKbI0JMq6shM0IqxrHfMKxbBOfZhEBjiMMJSFmCMT2eFX1SXSBB8RREA1MgIYBEJEtjO3cWoz45ZgI4l_llsClPN5WlI6LVcw_CpQFtFf0Moa1dA86DYVa1BRrkgsgo5TexHB7ArT071uPGVHoU4j0hBUA18vTsepr7BMebnPMSJw2CliQFQROdo_49q-CNUwoD0Bh02oQwikZGGYkuaixqLnoLYnXBxWYFJWDJUPu7MyDxWg9zUsEkTagL7Fsd1wNY_km8oytB1sqcHh4YCWLSfoVGAIilzMtYvmm3WRm3VFuBh9uhrdD4gatdBQ7pCcPWfoJxoXYciEjgLwUfOlgaWx7lCS2ltAulYev3IB6g2t5F7UcFMSX8DN9JiJTrj5jbde_YGpciSeBEpd4WedSXJ1e3tzOxDvcQ--y1gygiV4TCLpxPOkeKYIsAU0MkWH2oS6FIfcvud7hAX7ojRJWOfRHNYqT1BJNKo4-xpYcZnb61vxAOD2Ty-xg7nZ2ykzAcZ8cNiCPlSvA69
gMxjCfz7-zOsP2wwEeB7yQG3mK63F0_Hpc83yzyaDYiLvh0VdZJXj9tJZZGx-kGG2HPqFoApn4rQ2edFcfsF1SNCKV_bXihL2xt3_xr1cVktRVMspdKbuRer6U6sd3X5uBQ_6oIhvEGy2DpdT1i90PdR_erOkVs4ZejDUtiKG3AUMDXUCpycnJ-eEMhkE5DqIwEMTPnFE0bMFFYbOgA3vHhqetDvHs685qDG75AvFQUfARQm9WYYAg8E4kYaJfcM5dkuXxiXDiGw3zuK6yir3KWrsIq3FiHly3gAPacqaKKiDYQ9zpDFAv5IO5cXjOSnkFHreCo066L14mjad3HxLs4yfFmLfuIMeJdTkcDUr8eDUocv5S9vxqEFmfkdrmXrLqHzj9lcS8s2OWfTgPo09vXV_xXMNMjdrw9Y9PD9_Ub5INfECpNtIWWzVGsilhvTjRc-5a9rZmPvm-YBrYMKV4pUbzdUiXsnwdKLLsXFyRZJHjeTereS2XEjSw4eGGg8HDKp7crBo18zUzBoM6tlhi7MXwIEk1JaabDeaQaeEaq7RYHJ5cNByetmEiHe3lqV075hNj2eo-aVOgntj8EOX9FXAbm-HLvARl0godOWC6QKw45XumhrMD4zI7ddUtP0AEWch_yG_1Mek7enha4z7MzJ0Wlfq6blVXD9InHPRTl_onAUlg0SyfnXsuv2Dl_KwnbDz4XB4CkV7-9M5fO6_qF552b4LVSuTl6hqSre6OO6-UICTbhLl2jXfoiXFWqnTtz9Up5i2Kx--YkuOcX_P1Pffn_4ojqVTi7Nymf54Io6Pga4_5jAynR3ncjnl791yM23ZVErlGFyHL8kwgOdm8Zg8d-t5gLs3n-l18vwb__kfZVnMsw==)) - [Using set as mapping table to handle large keys or indeterministic sentinels](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_set/mapping_table_example.cu) (see [live example in 
godbolt](https://godbolt.org/clientstate/eJy1WQ1v20YS_SsDHe4iJZRkKck1UGyjOqdFjRb2IXZaFLYhr8iVRITkqsulFJ3g_35vZknqw3KdHnIOHIvk7OybmTcfS60buc7z2GR5Y3CzbsRRY9ALGonKpoWa6sagERaRagSN3BQ25Ovuy9uMXtKZma9sPJ05aoYt6h_13wR08ev5h_MhnV1-_Pflx-H1-eVFh2VF_pc41FmuIyqySFtyM03DuQrxp3wS0K_aMhDqd46oyQK3jfLZbaP1XrSsTEGpWlFmHBW5hpo4p0mcaNJfQj13FGcUmnSexCoLNS1jN5OtSj0Ch34vlZixU5BXWDHH1WRbkpSrofPPzLn5oNtdLpcdJbA7xk67iRfOu7-cn_1wcfVDG9DrZZ-yBJ4lq_8oYgvDxytScyAL1Rh4E7UkY0lNrcYzZxj50sYuzqYB5Wbilspq0RPFubPxuHA7zqtwwv5tAbhPZXDc8IrOr24b9K_h1flVIHp-O7_-6fLTNf02_PhxeHF9_sMVXX5EsC4-nHOocPUjDS9-p5_PLz4EpOE6bKW_zC1bAagxu1VH3odXWu_AmBgPK5_rMJ7EIVUMoqlZaJvBLJprm8aeawAZiZ4kTmOnnNx7ZJxs1b3NbrO_xVmYFJGm47AITTfnJeEo164TFrPTfYlIQSLqKmvV6vTgI1fME73zyM1skbtupBfYe7TQoTO2s686NvC0Vqnc7r70cf5e-JcitjBy5Di6I_1FwVsa6Go6fD-2sZ7QB53CVmeV0znNzJJDz0yeqXxGMIgUnEOJMZ-LOYmyipkG6RZnKqFIOVWrvWeH3PvlIp5TXoQzViOPBoONs-45jLt3AfseC-ZzYx3Ik6zoTfcdqOo0fdar3Af7mrNsEylSNgbpaaxDxdAPQLAamljSmRRkOENKgs7tYRa1r5ZqTs2z4VVLdBuwoow_cwibIhNybf2tJm79UWh4Lm8FTBrxxEzZiNODMqxc8FalBXlpQlsMqDj2zl9iyw5dG9KZeLUymndNlJ1q2TwPoEJzolqDNOf04tzkYKY6c5VBiQk_V7b6SuHZX-YtJRqwOvSTWeKDDXydqpUuTZFEkFGS-IqrmUCCK4AmldIV6RBMg3ejAplm6i1mqLlki8zFqUbhyh3T4zGqTeEdRlHMzlRJsgoOxYpDiCggWFHEuc6pCvZ4R0MsNEkSS2HGU5MUQoFUu5mJtuhRG1fWO9FrxXt4sIiRPgr8Bu5MJ1CuuFZHKIacB0VWcGvIE4MQohDy3hXLCclShK6wGtFjH1c6FioppP4pBqbtAhr8PdGeIp1BUco4AsRVjONXKgck2J526MI4XfpWEAnz4ICSdytxxoE9uVZNcAtlXc_UIkax8H4IUX7H9QacY776aqmMHARazrRHEWfzwomh5FZz0SvNS38RqkfaccXMPAZsHUcHkIA8lZsU92MwCBSMNgQQWLnUI6iPmFpbJQjGVTFlLnoKMDrpLXnBeb5CLBnETo3KRDnH0KJYCid8CSyrFrQ9qloD30_bEkjOttLjcGMZmA0xifNIKAIIh0ogSX3vVCo_CcT7du_el9DHnsrVRNdFS4VhyfV7VnbT7t3ds1CciaNrtWegkUkFF9OZ48I1CiSHmFtJwBRcafghENftHvaLMbzCC23hZ9ur2WKsct-4S2M3kfsesdU-fmVDYYRcsDYhZMfPCzs3XJG5-nXo3JW8gEanQVQBk0uzgRNTFA1RX5cbR0uMQ5CJfMyqzVQyBWI3SzeNuG56vpt9QvTaVSbUfmFVlcluaWpr_UrH_krpmi63vCEJUO7iNHYHOdGUcTNTKHTX6Li-DlC6GvlN1hCvr5rX2BEiL4UaLRrQiD-s-b8HWj-wrMdOG_RnMx1-RsgnNUpPuoipE2KLOjTwSmxrNnqulroOcNI_qfby5iazXFif6IkDMUCKnGsiVOovu5K2lPTT9ROiVqMiZvQCPtEvKgtkqtm
zwS_o8p_RqJxuRt5XPNvR2JikbMLGNltNUOZ1f-QYb0DVBSC1_BrWI46HO7vegaBJQm9I-w4puao8FKFUWcT0syCptImHNJ5PosFgqt3x0WlTQnkDRHctOjl5UsCKgO_6m5896d5z6np_SV3_OXX9v6Tu9XPqXu-o40MRCbdr-stT3H94_1y6SnWaFFlYFX9fex-V2U19kubsSfjNctkX1jqZ_eXXZPMWm1UB9Ac4LGAPEveSx0TZyw86k9hiWigpvM3gp4vtV3FWEPwPcXKxNBcZnmCNYMRCV6ObYu7N6F5A3h-IxzXPE2XXFKE_j8fCYLaQrSrX_8Mvw5AuXhPTQoNx5fgYx8sbHC3x4bHZfpEXCuigVO-x1E5KHFjS_4olT4F6Xa29Obr7U1wbwd7d10PbrOp_rfrXpeDd3S1i32ASMAc4zqmKsyYHo_I7uHo92x0c6lmxPLVgcs2qM041bPAIyT1_rpPEE7zgD7yQY86a_Uj3M_TtFBnBeFyUKRRQiMNWIF0i2BaTPnccmQKTWkBvPv1yevq-xHueqinXe95WxNC-_iOAZ8WUTZEn24akxuoKNpoOj-244kabItXy2I-TpfpNQ3liJuaXPkxUmGANT90y0G8dKXy9EJbTTPKQTnyYxTR_9j-GZ07X-45Z93pFQC_UCxzp0Hkfu2Td6wTUx-9r_L7pPDwEuwQ6VPj_35r7fdY8flrzW2j8J36_w--7b4r5G2ieqCT_Gnc8vN8LrbDupIxwh6-avhD7Vz2Dwc67Hok3Rb7P-DW-NNc5qHAI3ctAlfkeIzdwuC6z5r7OTtKZKXBgF1CL8u3WcmaS6vBnVTbV5SacvNWri_K9yfCqzg89mcRhrKUZKH9YKpPeZPKmrMzoocD0eV3iqTJTDkdyYD6YOFgeijakEKqFPzj7c9LWIYl1bfbYDJJoLG41gndGtdoTavdKH27FhTWN5s7isXd3h_9rtjqokmWERJoPmlyadt9hrf0N_YV74bHwgUM7cqdrCfhL6j8E3p1jrgJvj_5OiVEReMRhfoJ_W0QU7ZUt68dWPcnh-qc6k6wrS59f4rdNoF_ZEb87QCSPe8FmIhJ6nq53rzcbPDzJ1Xz_HP22ntT34uLXjGTNyRNJUgf-dH0UEAD2kYDIP598_D7WH-ibW8o6Y41-0OT3d1s3cS5ttlrb7Ciy-I9C-_03P88jaXLcW_vZv6VthL2gh9FhXMOAtdAjnFmaWyIVxkdqsiLdCdSJ7xH8yp1f0x3SEezvXVm5N0MhTta_5fDzAm8lD-SFPnLRlTWiitegGhdIxtKmwIwB6eg9_oiC9_TqVdzaGnQ_lgb7t1mEzVnj_isHL-7nP1_7brZsuInvNiMs_ymH3iM_uTSCBvfuONF28x1SI1uEYa__tujhsZk7_wVTow0fnISvXvW-o7ay4ewkT0ffHVG7jRLi2pLUkY7aiUrH8q1TEo-3dIZhmODmwn9PhBvoOtnnxkNQPUcW7TwHZRoPd_Lvv3bzBxQ=)) diff --git a/examples/static_set/device_ref_example.cu b/examples/static_set/device_ref_example.cu index 76edd041d..74346616a 100644 --- a/examples/static_set/device_ref_example.cu +++ b/examples/static_set/device_ref_example.cu @@ -42,7 +42,7 @@ __global__ void custom_cooperative_insert(SetRef set, InputIterator keys, std::s constexpr auto cg_size = SetRef::cg_size; - auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tile = 
cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * blockDim.x / cg_size; int64_t idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; @@ -60,7 +60,7 @@ __global__ void custom_contains(SetRef set, InputIterator keys, std::size_t n, O constexpr auto cg_size = SetRef::cg_size; - auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tile = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * blockDim.x / cg_size; int64_t idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; diff --git a/examples/static_set/device_subsets_example.cu b/examples/static_set/device_subsets_example.cu index 89e1f81cc..0e3758649 100644 --- a/examples/static_set/device_subsets_example.cu +++ b/examples/static_set/device_subsets_example.cu @@ -80,7 +80,7 @@ __global__ void insert(ref_type* set_refs) { namespace cg = cooperative_groups; - auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const tile = cg::tiled_partition(cg::this_thread_block()); // Get subset (or CG) index auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; @@ -105,7 +105,7 @@ __global__ void find(ref_type* set_refs) { namespace cg = cooperative_groups; - auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const tile = cg::tiled_partition(cg::this_thread_block()); auto const idx = (blockDim.x * blockIdx.x + threadIdx.x) / cg_size; auto raw_set_ref = *(set_refs + idx); diff --git a/include/cuco/bloom_filter_ref.cuh b/include/cuco/bloom_filter_ref.cuh index 2f3dcfa2b..706f2b539 100644 --- a/include/cuco/bloom_filter_ref.cuh +++ b/include/cuco/bloom_filter_ref.cuh @@ -91,7 +91,7 @@ class bloom_filter_ref { * @param group The Cooperative Group this operation is executed with */ template - __device__ constexpr void clear(CG const& group); + __device__ constexpr void clear(CG group); /** * @brief Erases all information from the filter. 
@@ -132,7 +132,7 @@ class bloom_filter_ref { * @param key The key to be added */ template - __device__ void add(CG const& group, ProbeKey const& key); + __device__ void add(CG group, ProbeKey const& key); /** * @brief Device function that adds all keys in the range `[first, last)` to the filter. @@ -148,7 +148,7 @@ class bloom_filter_ref { * @param last End of the sequence of keys */ template - __device__ void add(CG const& group, InputIt first, InputIt last); + __device__ void add(CG group, InputIt first, InputIt last); /** * @brief Adds all keys in the range `[first, last)` to the filter. @@ -255,11 +255,11 @@ class bloom_filter_ref { * @return `true` iff the key's fingerprint was present in the filter */ template - [[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const; + [[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const; // TODO // template - // __device__ void contains(CG const& group, InputIt first, InputIt last, OutputIt output_begin) + // __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) // const; /** diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh index bc194f7df..1cbc50a0d 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh +++ b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh @@ -95,7 +95,7 @@ class bloom_filter_impl { } template - __device__ constexpr void clear(CG const& group) + __device__ constexpr void clear(CG group) { for (int i = group.thread_rank(); i < num_blocks_ * words_per_block; i += group.size()) { words_[i] = 0; @@ -149,7 +149,7 @@ class bloom_filter_impl { } template - __device__ void add(CG const& group, ProbeKey const& key) + __device__ void add(CG group, ProbeKey const& key) { constexpr auto num_threads = tile_size_v; constexpr auto optimal_num_threads = add_optimal_cg_size(); @@ -166,7 +166,7 @@ class bloom_filter_impl { } template - __device__ 
void add(CG const& group, InputIt first, InputIt last) + __device__ void add(CG group, InputIt first, InputIt last) { namespace cg = cooperative_groups; @@ -208,7 +208,7 @@ class bloom_filter_impl { typename policy_type::hash_result_type hash_value; size_type block_index; - auto const worker_group = cg::tiled_partition(group); + auto const worker_group = cg::tiled_partition(group); auto const worker_offset = worker_num_threads * worker_group.meta_group_rank(); auto const group_iters = cuco::detail::int_div_ceil(num_keys, num_threads); @@ -229,7 +229,7 @@ class bloom_filter_impl { } template - __device__ void add_impl(CG const& group, HashValue const& hash_value, BlockIndex block_index) + __device__ void add_impl(CG group, HashValue const& hash_value, BlockIndex block_index) { constexpr auto num_threads = tile_size_v; @@ -327,7 +327,7 @@ class bloom_filter_impl { } template - [[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const + [[nodiscard]] __device__ bool contains(CG group, ProbeKey const& key) const { constexpr auto num_threads = tile_size_v; constexpr auto optimal_num_threads = contains_optimal_cg_size(); @@ -359,7 +359,7 @@ class bloom_filter_impl { // TODO // template - // __device__ void contains(CG const& group, InputIt first, InputIt last, OutputIt output_begin) + // __device__ void contains(CG group, InputIt first, InputIt last, OutputIt output_begin) // const; template @@ -432,7 +432,7 @@ class bloom_filter_impl { // [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const // [[nodiscard]] __host__ __device__ static uint32_t optimal_pattern_bits(size_t num_blocks) // template - // [[nodiscard]] __device__ constexpr auto make_copy(CG const& group, word_type* const + // [[nodiscard]] __device__ constexpr auto make_copy(CG group, word_type* const // memory_to_use, cuda_thread_scope scope = {}) const noexcept; private: diff --git a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl 
b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl index 96d2c0573..cb5a47cbc 100644 --- a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl +++ b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl @@ -39,7 +39,7 @@ __host__ __device__ constexpr bloom_filter_ref::bloo template template -__device__ constexpr void bloom_filter_ref::clear(CG const& group) +__device__ constexpr void bloom_filter_ref::clear(CG group) { impl_.clear(group); } @@ -66,15 +66,14 @@ __device__ void bloom_filter_ref::add(ProbeKey const template template -__device__ void bloom_filter_ref::add(CG const& group, - ProbeKey const& key) +__device__ void bloom_filter_ref::add(CG group, ProbeKey const& key) { impl_.add(group, key); } template template -__device__ void bloom_filter_ref::add(CG const& group, +__device__ void bloom_filter_ref::add(CG group, InputIt first, InputIt last) { @@ -125,7 +124,7 @@ template template template [[nodiscard]] __device__ bool bloom_filter_ref::contains( - CG const& group, ProbeKey const& key) const + CG group, ProbeKey const& key) const { return impl_.contains(group, key); } diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh index 91361b971..8af37fc84 100644 --- a/include/cuco/detail/bloom_filter/kernels.cuh +++ b/include/cuco/detail/bloom_filter/kernels.cuh @@ -44,7 +44,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add(InputIt first, if (tile_start >= n) { return; } auto const tile_stop = (tile_start + items_per_tile < n) ? 
tile_start + items_per_tile : n; - auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const tile = cg::tiled_partition(cg::this_thread_block()); ref.add(tile, first + tile_start, first + tile_stop); } @@ -63,7 +63,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add_if_n( auto const loop_stride = cuco::detail::grid_stride() / CGSize; auto idx = cuco::detail::global_thread_id() / CGSize; - [[maybe_unused]] auto const tile = cg::tiled_partition(cg::this_thread_block()); + [[maybe_unused]] auto const tile = + cg::tiled_partition(cg::this_thread_block()); while (idx < n) { if (pred(*(stencil + idx))) { @@ -94,7 +95,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first, auto const loop_stride = cuco::detail::grid_stride() / CGSize; auto idx = cuco::detail::global_thread_id() / CGSize; - [[maybe_unused]] auto const tile = cg::tiled_partition(cg::this_thread_block()); + [[maybe_unused]] auto const tile = + cg::tiled_partition(cg::this_thread_block()); if constexpr (CGSize == 1) { while (idx < n) { @@ -103,7 +105,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first, idx += loop_stride; } } else { - auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const tile = cg::tiled_partition(cg::this_thread_block()); while (idx < n) { typename cuda::std::iterator_traits::value_type const& key = *(first + idx); auto const found = pred(*(stencil + idx)) ? 
ref.contains(tile, key) : false; diff --git a/include/cuco/detail/dynamic_map_kernels.cuh b/include/cuco/detail/dynamic_map_kernels.cuh index 5b1f328fd..6d3f9b9c7 100644 --- a/include/cuco/detail/dynamic_map_kernels.cuh +++ b/include/cuco/detail/dynamic_map_kernels.cuh @@ -166,7 +166,7 @@ CUCO_KERNEL void insert(InputIt first, __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tile = cg::tiled_partition(cg::this_thread_block()); auto tid = blockDim.x * blockIdx.x + threadIdx.x; auto it = first + tid / tile_size; @@ -312,7 +312,7 @@ CUCO_KERNEL void erase(InputIt first, extern __shared__ unsigned long long submap_block_num_successes[]; auto block = cg::this_thread_block(); - auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tile = cg::tiled_partition(cg::this_thread_block()); auto tid = block_size * block.group_index().x + block.thread_rank(); auto it = first + tid / tile_size; @@ -456,9 +456,9 @@ CUCO_KERNEL void find(InputIt first, Hash hash, KeyEqual key_equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); - auto tid = blockDim.x * blockIdx.x + threadIdx.x; - auto key_idx = tid / tile_size; + auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tid = blockDim.x * blockIdx.x + threadIdx.x; + auto key_idx = tid / tile_size; auto empty_value_sentinel = submap_views[0].get_empty_value_sentinel(); __shared__ Value writeBuffer[block_size]; @@ -677,7 +677,7 @@ CUCO_KERNEL void contains(InputIt first, Hash hash, KeyEqual key_equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tile = cg::tiled_partition(cg::this_thread_block()); auto tid = blockDim.x * blockIdx.x + threadIdx.x; auto key_idx = tid / tile_size; __shared__ bool writeBuffer[block_size]; diff --git a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh index 8eb413207..2de3fbe12 
100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh @@ -106,7 +106,7 @@ class hyperloglog_impl { * @param group CUDA Cooperative group this operation is executed in */ template - __device__ constexpr void clear(CG const& group) noexcept + __device__ constexpr void clear(CG group) noexcept { for (int i = group.thread_rank(); i < this->sketch_.size(); i += group.size()) { new (&(this->sketch_[i])) register_type{}; @@ -280,8 +280,7 @@ class hyperloglog_impl { * @param other Other estimator reference to be merged into `*this` */ template - __device__ constexpr void merge(CG const& group, - hyperloglog_impl const& other) + __device__ constexpr void merge(CG group, hyperloglog_impl const& other) { // TODO find a better way to do error handling in device code // if (other.precision_ != this->precision_) { __trap(); } @@ -362,7 +361,8 @@ class hyperloglog_impl { } // warp reduce Z and V - auto const warp = cooperative_groups::tiled_partition<32>(group); + auto const warp = + cooperative_groups::tiled_partition<32, cooperative_groups::thread_block>(group); #if defined(CUCO_HAS_CG_REDUCE_UPDATE_ASYNC) cooperative_groups::reduce_update_async( warp, block_sum, thread_sum, cooperative_groups::plus()); diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.inl b/include/cuco/detail/hyperloglog/hyperloglog_ref.inl index 096b68bc9..1f60596a1 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.inl +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.inl @@ -25,7 +25,7 @@ __host__ __device__ constexpr hyperloglog_ref::hyperloglog_ref( template template -__device__ constexpr void hyperloglog_ref::clear(CG const& group) noexcept +__device__ constexpr void hyperloglog_ref::clear(CG group) noexcept { impl_.clear(group); } @@ -70,7 +70,7 @@ __host__ constexpr void hyperloglog_ref::add(InputIt first, template template __device__ constexpr void hyperloglog_ref::merge( - CG const& group, hyperloglog_ref 
const& other) + CG group, hyperloglog_ref const& other) { impl_.merge(group, other.impl_); } diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh index 62df3df40..79e10502c 100644 --- a/include/cuco/detail/open_addressing/kernels.cuh +++ b/include/cuco/detail/open_addressing/kernels.cuh @@ -84,7 +84,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n(InputIt first, if (ref.insert(insert_element)) { thread_num_successes++; }; } else { auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + cooperative_groups::tiled_partition( + cooperative_groups::this_thread_block()); if (ref.insert(tile, insert_element) && tile.thread_rank() == 0) { thread_num_successes++; } } } @@ -143,7 +144,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_if_n( ref.insert(insert_element); } else { auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + cooperative_groups::tiled_partition( + cooperative_groups::this_thread_block()); ref.insert(tile, insert_element); } } @@ -178,7 +180,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void erase(InputIt first, ref.erase(erase_element); } else { auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + cooperative_groups::tiled_partition( + cooperative_groups::this_thread_block()); ref.erase(tile, erase_element); } idx += loop_stride; @@ -218,7 +221,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void for_each_n(InputIt first, ref.for_each(key, callback_op); } else { auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + cooperative_groups::tiled_partition( + cooperative_groups::this_thread_block()); ref.for_each(tile, key, callback_op); } idx += loop_stride; @@ -288,7 +292,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first, block.sync(); if (idx < n) { *(output_begin + idx) = 
output_buffer[thread_idx]; } } else { - auto const tile = cg::tiled_partition(block); + auto const tile = cg::tiled_partition(block); if (idx < n) { typename cuda::std::iterator_traits::value_type const& key = *(first + idx); auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false; @@ -405,7 +409,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find_if_n(InputIt first, block.sync(); if (idx < n) { *(output_begin + idx) = output_buffer[thread_idx]; } } else { - auto const tile = cg::tiled_partition(block); + auto const tile = cg::tiled_partition(block); if (idx < n) { typename cuda::std::iterator_traits::value_type const& key = *(first + idx); auto const found = ref.find(tile, key); @@ -500,7 +504,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_and_find(InputIt first, *(inserted_begin + idx) = output_inserted_buffer[thread_idx]; } } else { - auto const tile = cg::tiled_partition(cg::this_thread_block()); + auto const tile = cg::tiled_partition(cg::this_thread_block()); if (idx < n) { typename cuda::std::iterator_traits::value_type const& insert_element{ *(first + idx)}; @@ -562,7 +566,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count(InputIt first, } } else { auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + cooperative_groups::tiled_partition( + cooperative_groups::this_thread_block()); if constexpr (IsOuter) { auto temp_count = ref.count(tile, key); if (tile.all(temp_count == 0) and tile.thread_rank() == 0) { ++temp_count; } @@ -621,7 +626,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void count_each(InputIt first, } } else { auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + cooperative_groups::tiled_partition( + cooperative_groups::this_thread_block()); if constexpr (IsOuter) { auto temp_count = ref.count(tile, key); if (tile.all(temp_count == 0) and tile.thread_rank() == 0) { ++temp_count; } @@ -758,7 +764,7 @@ CUCO_KERNEL 
__launch_bounds__(BlockSize) void rehash( auto constexpr cg_size = ContainerRef::cg_size; auto const block = cg::this_thread_block(); - auto const tile = cg::tiled_partition(block); + auto const tile = cg::tiled_partition(block); auto const thread_rank = block.thread_rank(); auto constexpr tiles_per_block = BlockSize / cg_size; // tile.meta_group_size() but constexpr diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index e6b9e5577..5d09093f7 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -313,7 +313,7 @@ class open_addressing_ref_impl { * the ownership of the memory */ template - __device__ void make_copy(CG const& g, value_type* const memory_to_use) const noexcept + __device__ void make_copy(CG g, value_type* const memory_to_use) const noexcept { auto const num_slots = this->capacity(); #if defined(CUCO_HAS_CUDA_BARRIER) @@ -348,7 +348,7 @@ class open_addressing_ref_impl { * @param tile The cooperative thread group used to initialize the container */ template - __device__ constexpr void initialize(CG const& tile) noexcept + __device__ constexpr void initialize(CG tile) noexcept { auto tid = tile.thread_rank(); auto const extent = static_cast(this->extent()); @@ -425,8 +425,8 @@ class open_addressing_ref_impl { * * @return True if the given element is successfully inserted */ - template - __device__ bool insert(cooperative_groups::thread_block_tile const& group, + template + __device__ bool insert(cooperative_groups::thread_block_tile group, Value const& value) noexcept { auto const val = this->heterogeneous_value(value); @@ -585,9 +585,9 @@ class open_addressing_ref_impl { * @return a pair consisting of an iterator to the element and a bool indicating whether the * insertion is successful or not. 
*/ - template + template __device__ cuda::std::pair insert_and_find( - cooperative_groups::thread_block_tile const& group, Value const& value) noexcept + cooperative_groups::thread_block_tile group, Value const& value) noexcept { #if __CUDA_ARCH__ < 700 // Spinning to ensure that the write to the value part took place requires @@ -727,8 +727,8 @@ class open_addressing_ref_impl { * * @return True if the given element is successfully erased */ - template - __device__ bool erase(cooperative_groups::thread_block_tile const& group, + template + __device__ bool erase(cooperative_groups::thread_block_tile group, ProbeKey const& key) noexcept { auto probing_iter = @@ -824,9 +824,10 @@ class open_addressing_ref_impl { * * @return A boolean indicating whether the probe key is present */ - template + template [[nodiscard]] __device__ bool contains( - cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + cooperative_groups::thread_block_tile group, + ProbeKey const& key) const noexcept { auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); @@ -907,9 +908,10 @@ class open_addressing_ref_impl { * * @return An iterator to the position at which the equivalent key is stored */ - template - [[nodiscard]] __device__ iterator find( - cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + template + [[nodiscard]] __device__ iterator + find(cooperative_groups::thread_block_tile group, + ProbeKey const& key) const noexcept { auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); @@ -1003,9 +1005,10 @@ class open_addressing_ref_impl { * * @return Number of occurrences found by the current thread */ - template - [[nodiscard]] __device__ size_type count( - cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + template + [[nodiscard]] __device__ size_type + count(cooperative_groups::thread_block_tile 
group, + ProbeKey const& key) const noexcept { auto probing_iter = probing_scheme_.template make_iterator(group, key, storage_ref_.extent()); @@ -1195,8 +1198,8 @@ class open_addressing_ref_impl { auto constexpr max_matches_per_step = flushing_tile_size * bucket_size; auto constexpr buffer_size = buffer_multiplier * max_matches_per_step + flushing_tile_size; - auto const flushing_tile = cg::tiled_partition(block); - auto const probing_tile = cg::tiled_partition(block); + auto const flushing_tile = cg::tiled_partition(block); + auto const probing_tile = cg::tiled_partition(block); auto const flushing_tile_id = flushing_tile.meta_group_rank(); auto const stride = probing_tile.meta_group_size(); @@ -1208,7 +1211,7 @@ class open_addressing_ref_impl { if (flushing_tile.thread_rank() == 0) { counters[flushing_tile_id] = 0; } flushing_tile.sync(); - auto flush_buffers = [&](auto const& tile) { + auto flush_buffers = [&](auto tile) { size_type offset = 0; auto const count = counters[flushing_tile_id]; auto const rank = tile.thread_rank(); @@ -1408,8 +1411,8 @@ class open_addressing_ref_impl { * @param key The key to search for * @param callback_op Function to apply to every matched slot */ - template - __device__ void for_each(cooperative_groups::thread_block_tile const& group, + template + __device__ void for_each(cooperative_groups::thread_block_tile group, ProbeKey const& key, CallbackOp&& callback_op) const noexcept { @@ -1472,8 +1475,8 @@ class open_addressing_ref_impl { * @param callback_op Function to apply to every matched slot * @param sync_op Function that is allowed to synchronize `group` inbetween probing buckets */ - template - __device__ void for_each(cooperative_groups::thread_block_tile const& group, + template + __device__ void for_each(cooperative_groups::thread_block_tile group, ProbeKey const& key, CallbackOp&& callback_op, SyncOp&& sync_op) const noexcept diff --git a/include/cuco/detail/probe_sequence_impl.cuh 
b/include/cuco/detail/probe_sequence_impl.cuh index a732363da..71285e3a8 100644 --- a/include/cuco/detail/probe_sequence_impl.cuh +++ b/include/cuco/detail/probe_sequence_impl.cuh @@ -206,9 +206,9 @@ class linear_probing_impl * @param k The key to get the slot for * @return Pointer to the initial slot for `k` */ - template - __device__ __forceinline__ iterator - initial_slot(cooperative_groups::thread_block_tile const& g, ProbeKey const& k) noexcept + template + __device__ __forceinline__ iterator initial_slot( + cooperative_groups::thread_block_tile g, ProbeKey const& k) noexcept { return const_cast(cuda::std::as_const(*this).initial_slot(g, k)); } @@ -224,9 +224,9 @@ class linear_probing_impl * @param k The key to get the slot for * @return Pointer to the initial slot for `k` */ - template + template __device__ __forceinline__ const_iterator initial_slot( - cooperative_groups::thread_block_tile const& g, ProbeKey const& k) const noexcept + cooperative_groups::thread_block_tile g, ProbeKey const& k) const noexcept { auto const hash_value = [&]() { auto const tmp = hash_(k); @@ -360,9 +360,9 @@ class double_hashing_impl * @param k The key to get the slot for * @return Pointer to the initial slot for `k` */ - template - __device__ __forceinline__ iterator - initial_slot(cooperative_groups::thread_block_tile const& g, ProbeKey const& k) noexcept + template + __device__ __forceinline__ iterator initial_slot( + cooperative_groups::thread_block_tile g, ProbeKey const& k) noexcept { return const_cast(cuda::std::as_const(*this).initial_slot(g, k)); } @@ -379,9 +379,9 @@ class double_hashing_impl * @param k The key to get the slot for * @return Pointer to the initial slot for `k` */ - template + template __device__ __forceinline__ const_iterator initial_slot( - cooperative_groups::thread_block_tile const& g, ProbeKey const& k) const noexcept + cooperative_groups::thread_block_tile g, ProbeKey const& k) const noexcept { std::size_t index; auto const hash_value = hash1_(k); 
diff --git a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl index 5f8a55ae5..9e88512af 100644 --- a/include/cuco/detail/probing_scheme/probing_scheme_impl.inl +++ b/include/cuco/detail/probing_scheme/probing_scheme_impl.inl @@ -116,9 +116,9 @@ __host__ __device__ constexpr auto linear_probing::make_iterator( } template -template +template __host__ __device__ constexpr auto linear_probing::make_iterator( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, ProbeKey const& probe_key, Extent upper_bound) const noexcept { @@ -182,9 +182,9 @@ __host__ __device__ constexpr auto double_hashing::make_it } template -template +template __host__ __device__ constexpr auto double_hashing::make_iterator( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, ProbeKey const& probe_key, Extent upper_bound) const noexcept { diff --git a/include/cuco/detail/static_map.inl b/include/cuco/detail/static_map.inl index a9e39d08b..cf9d66804 100644 --- a/include/cuco/detail/static_map.inl +++ b/include/cuco/detail/static_map.inl @@ -522,7 +522,7 @@ __device__ template template __device__ bool static_map::device_mutable_view::insert( - CG const& g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept + CG g, value_type const& insert_pair, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = this->initial_slot(g, insert_pair.first, hash); @@ -634,7 +634,7 @@ __device__ bool static_map::device_mutable_view::e template template __device__ bool static_map::device_mutable_view::erase( - CG const& g, key_type const& k, Hash hash, KeyEqual key_equal) noexcept + CG g, key_type const& k, Hash hash, KeyEqual key_equal) noexcept { auto current_slot = this->initial_slot(g, k, hash); value_type const insert_pair = @@ -834,7 +834,7 @@ __device__ bool static_map::device_view::contains( template template __device__ 
cuda::std::enable_if_t, bool> -static_map::device_view::contains(CG const& g, +static_map::device_view::contains(CG g, ProbeKey const& k, Hash hash, KeyEqual key_equal) const noexcept diff --git a/include/cuco/detail/static_map/kernels.cuh b/include/cuco/detail/static_map/kernels.cuh index ce9f68c5a..a1cdc5bf9 100644 --- a/include/cuco/detail/static_map/kernels.cuh +++ b/include/cuco/detail/static_map/kernels.cuh @@ -61,7 +61,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_assign(InputIt first, ref.insert_or_assign(insert_pair); } else { auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + cooperative_groups::tiled_partition( + cooperative_groups::this_thread_block()); ref.insert_or_assign(tile, insert_pair); } idx += loop_stride; @@ -119,7 +120,8 @@ __global__ void insert_or_apply( } } else { auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + cooperative_groups::tiled_partition( + cooperative_groups::this_thread_block()); if constexpr (HasInit) { ref.insert_or_apply(tile, insert_pair, init, op); } else { @@ -186,7 +188,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void insert_or_apply_shmem( auto const loop_stride = cuco::detail::grid_stride() / CGSize; auto idx = cuco::detail::global_thread_id() / CGSize; - auto warp = cg::tiled_partition<32>(block); + auto warp = cg::tiled_partition<32, cg::thread_block>(block); auto const warp_thread_idx = warp.thread_rank(); // Shared map initialization diff --git a/include/cuco/detail/static_map/static_map_ref.inl b/include/cuco/detail/static_map/static_map_ref.inl index 85849bdf9..367549e05 100644 --- a/include/cuco/detail/static_map/static_map_ref.inl +++ b/include/cuco/detail/static_map/static_map_ref.inl @@ -387,7 +387,7 @@ template __device__ constexpr auto static_map_ref::make_copy( - CG const& tile, + CG tile, typename StorageRef::value_type* const memory_to_use, cuda_thread_scope scope) const noexcept { @@ 
-412,7 +412,7 @@ template __device__ constexpr void static_map_ref::initialize( - CG const& tile) noexcept + CG tile) noexcept { this->impl_.initialize(tile); } @@ -465,8 +465,8 @@ class operator_impl< * * @return True if the given element is successfully inserted */ - template - __device__ bool insert(cooperative_groups::thread_block_tile const& group, + template + __device__ bool insert(cooperative_groups::thread_block_tile group, Value const& value) noexcept { auto& ref_ = static_cast(*this); @@ -556,8 +556,8 @@ class operator_impl< * @param group The Cooperative Group used to perform group insert * @param value The element to insert */ - template - __device__ void insert_or_assign(cooperative_groups::thread_block_tile const& group, + template + __device__ void insert_or_assign(cooperative_groups::thread_block_tile group, Value const& value) noexcept { ref_type& ref_ = static_cast(*this); @@ -753,8 +753,8 @@ class operator_impl< * @return Returns `true` if the given `value` is inserted successfully. */ - template - __device__ bool insert_or_apply(cooperative_groups::thread_block_tile const& group, + template + __device__ bool insert_or_apply(cooperative_groups::thread_block_tile group, Value const& value, Op op) { @@ -785,8 +785,8 @@ class operator_impl< * * @return Returns `true` if the given `value` is inserted successfully. */ - template - __device__ bool insert_or_apply(cooperative_groups::thread_block_tile const& group, + template + __device__ bool insert_or_apply(cooperative_groups::thread_block_tile group, Value const& value, Init init, Op op) @@ -843,9 +843,9 @@ class operator_impl< * @param op The callable object to perform binary operation between existing value at the slot * and the element to insert. 
*/ - template + template __device__ bool dispatch_insert_or_apply( - cooperative_groups::thread_block_tile const& group, + cooperative_groups::thread_block_tile group, Value const& value, Init init, Op op) @@ -953,10 +953,9 @@ class operator_impl< * * @return Returns `true` if the given `value` is inserted successfully. */ - template - __device__ bool insert_or_apply_impl(cooperative_groups::thread_block_tile const& group, - Value const& value, - Op op) + template + __device__ bool insert_or_apply_impl( + cooperative_groups::thread_block_tile group, Value const& value, Op op) { ref_type& ref_ = static_cast(*this); @@ -1149,9 +1148,9 @@ class operator_impl< * @return a pair consisting of an iterator to the element and a bool indicating whether the * insertion is successful or not. */ - template + template __device__ cuda::std::pair insert_and_find( - cooperative_groups::thread_block_tile const& group, Value const& value) noexcept + cooperative_groups::thread_block_tile group, Value const& value) noexcept { ref_type& ref_ = static_cast(*this); return ref_.impl_.insert_and_find(group, value); @@ -1203,8 +1202,8 @@ class operator_impl< * * @return True if the given element is successfully erased */ - template - __device__ bool erase(cooperative_groups::thread_block_tile const& group, + template + __device__ bool erase(cooperative_groups::thread_block_tile group, ProbeKey const& key) noexcept { auto& ref_ = static_cast(*this); @@ -1264,9 +1263,10 @@ class operator_impl< * * @return A boolean indicating whether the probe key is present */ - template + template [[nodiscard]] __device__ bool contains( - cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + cooperative_groups::thread_block_tile group, + ProbeKey const& key) const noexcept { auto const& ref_ = static_cast(*this); return ref_.impl_.contains(group, key); @@ -1327,9 +1327,10 @@ class operator_impl< * * @return An iterator to the position at which the equivalent key is stored 
*/ - template - [[nodiscard]] __device__ iterator find( - cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + template + [[nodiscard]] __device__ iterator + find(cooperative_groups::thread_block_tile group, + ProbeKey const& key) const noexcept { auto const& ref_ = static_cast(*this); return ref_.impl_.find(group, key); @@ -1395,8 +1396,8 @@ class operator_impl< * @param key The key to search for * @param callback_op Function to apply to the copy of the matched key-value pair */ - template - __device__ void for_each(cooperative_groups::thread_block_tile const& group, + template + __device__ void for_each(cooperative_groups::thread_block_tile group, ProbeKey const& key, CallbackOp&& callback_op) const noexcept { @@ -1452,8 +1453,8 @@ class operator_impl< * * @return Number of occurrences found by the current thread */ - template - __device__ size_type count(cooperative_groups::thread_block_tile const& group, + template + __device__ size_type count(cooperative_groups::thread_block_tile group, ProbeKey const& key) const noexcept { auto const& ref_ = static_cast(*this); @@ -1516,7 +1517,7 @@ class operator_impl< class OutputProbeIt, class OutputMatchIt, class AtomicCounter> - __device__ void retrieve(cooperative_groups::thread_block const& block, + __device__ void retrieve(cooperative_groups::thread_block block, InputProbeIt input_probe_begin, InputProbeIt input_probe_end, OutputProbeIt output_probe, diff --git a/include/cuco/detail/static_map_kernels.cuh b/include/cuco/detail/static_map_kernels.cuh index 74b3f1065..363931abb 100644 --- a/include/cuco/detail/static_map_kernels.cuh +++ b/include/cuco/detail/static_map_kernels.cuh @@ -147,7 +147,7 @@ CUCO_KERNEL void insert( __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tile = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * 
block_size / tile_size; int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; @@ -254,7 +254,7 @@ CUCO_KERNEL void erase( __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tile = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * block_size / tile_size; int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; @@ -324,7 +324,7 @@ CUCO_KERNEL void insert_if_n(InputIt first, __shared__ typename BlockReduce::TempStorage temp_storage; std::size_t thread_num_successes = 0; - auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tile = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * block_size / tile_size; int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; @@ -440,7 +440,7 @@ template (cg::this_thread_block()); + auto tile = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * block_size / tile_size; int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; #pragma nv_diagnostic push @@ -558,7 +558,7 @@ template (cg::this_thread_block()); + auto tile = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * block_size / tile_size; int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; __shared__ bool writeBuffer[block_size / tile_size]; diff --git a/include/cuco/detail/static_multimap/device_view_impl.inl b/include/cuco/detail/static_multimap/device_view_impl.inl index 4e3b2881f..f6af1b4c3 100644 --- a/include/cuco/detail/static_multimap/device_view_impl.inl +++ b/include/cuco/detail/static_multimap/device_view_impl.inl @@ -77,9 +77,9 @@ class static_multimap::device_view_ * @param k The key to get the slot for * @return Pointer to the initial slot for `k` */ - template + template __device__ __forceinline__ iterator - 
initial_slot(cooperative_groups::thread_block_tile const& g, + initial_slot(cooperative_groups::thread_block_tile g, ProbeKey const& k) noexcept { return probe_sequence_.initial_slot(g, k); @@ -96,9 +96,9 @@ class static_multimap::device_view_ * @param k The key to get the slot for * @return Pointer to the initial slot for `k` */ - template + template __device__ __forceinline__ const_iterator - initial_slot(cooperative_groups::thread_block_tile const& g, + initial_slot(cooperative_groups::thread_block_tile g, ProbeKey const& k) const noexcept { return probe_sequence_.initial_slot(g, k); @@ -483,7 +483,7 @@ class static_multimap::device_view_ * @param output_begin Beginning of the output sequence of key/value pairs */ template - __device__ __forceinline__ void flush_output_buffer(CG const& g, + __device__ __forceinline__ void flush_output_buffer(CG g, uint32_t const num_outputs, value_type* output_buffer, atomicT* num_matches, @@ -541,7 +541,7 @@ class static_multimap::device_view_ * pairs */ template - __device__ __forceinline__ void flush_output_buffer(CG const& g, + __device__ __forceinline__ void flush_output_buffer(CG g, uint32_t const num_outputs, value_type* probe_output_buffer, value_type* contained_output_buffer, @@ -584,9 +584,13 @@ class static_multimap::device_view_ * @param equal The binary function to compare input element and slot content for equality * @return A boolean indicating whether the key/value pair represented by `element` was inserted */ - template + template __device__ __forceinline__ cuda::std::enable_if_t contains( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, ProbeT const& element, Equal equal) const noexcept { @@ -650,9 +654,13 @@ class static_multimap::device_view_ * @param equal The binary function to compare input element and slot content for equality * @return A boolean indicating whether the key/value pair represented by `element` was inserted */ - template + template __device__ 
__forceinline__ cuda::std::enable_if_t contains( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, ProbeT const& element, Equal equal) const noexcept { @@ -706,7 +714,7 @@ class static_multimap::device_view_ */ template __device__ __forceinline__ cuda::std::enable_if_t count( - CG const& g, Key const& k, KeyEqual key_equal) noexcept + CG g, Key const& k, KeyEqual key_equal) noexcept { std::size_t count = 0; auto current_slot = initial_slot(g, k); @@ -756,7 +764,7 @@ class static_multimap::device_view_ */ template __device__ __forceinline__ cuda::std::enable_if_t count( - CG const& g, Key const& k, KeyEqual key_equal) noexcept + CG g, Key const& k, KeyEqual key_equal) noexcept { std::size_t count = 0; auto current_slot = initial_slot(g, k); @@ -804,7 +812,7 @@ class static_multimap::device_view_ */ template __device__ __forceinline__ cuda::std::enable_if_t pair_count( - CG const& g, value_type const& pair, PairEqual pair_equal) noexcept + CG g, value_type const& pair, PairEqual pair_equal) noexcept { std::size_t count = 0; auto key = pair.first; @@ -857,7 +865,7 @@ class static_multimap::device_view_ */ template __device__ __forceinline__ cuda::std::enable_if_t pair_count( - CG const& g, value_type const& pair, PairEqual pair_equal) noexcept + CG g, value_type const& pair, PairEqual pair_equal) noexcept { std::size_t count = 0; auto key = pair.first; @@ -923,8 +931,8 @@ class static_multimap::device_view_ typename atomicT, typename OutputIt, typename KeyEqual> - __device__ __forceinline__ void retrieve(FlushingCG const& flushing_cg, - ProbingCG const& probing_cg, + __device__ __forceinline__ void retrieve(FlushingCG flushing_cg, + ProbingCG probing_cg, Key const& k, uint32_t* flushing_cg_counter, value_type* output_buffer, @@ -1033,7 +1041,7 @@ class static_multimap::device_view_ typename atomicT, typename OutputIt, typename KeyEqual> - __device__ __forceinline__ void retrieve(CG const& g, + __device__ __forceinline__ void 
retrieve(CG g, Key const& k, uint32_t* cg_counter, value_type* output_buffer, @@ -1141,7 +1149,7 @@ class static_multimap::device_view_ typename OutputIt4, typename PairEqual> __device__ __forceinline__ cuda::std::enable_if_t pair_retrieve( - ProbingCG const& probing_cg, + ProbingCG probing_cg, value_type const& pair, OutputIt1 probe_key_begin, OutputIt2 probe_val_begin, @@ -1252,7 +1260,7 @@ class static_multimap::device_view_ typename OutputIt4, typename PairEqual> __device__ __forceinline__ cuda::std::enable_if_t pair_retrieve( - ProbingCG const& probing_cg, + ProbingCG probing_cg, value_type const& pair, OutputIt1 probe_key_begin, OutputIt2 probe_val_begin, @@ -1348,8 +1356,8 @@ class static_multimap::device_view_ typename OutputIt1, typename OutputIt2, typename PairEqual> - __device__ __forceinline__ void pair_retrieve(FlushingCG const& flushing_cg, - ProbingCG const& probing_cg, + __device__ __forceinline__ void pair_retrieve(FlushingCG flushing_cg, + ProbingCG probing_cg, value_type const& pair, uint32_t* flushing_cg_counter, value_type* probe_output_buffer, @@ -1476,7 +1484,7 @@ class static_multimap::device_view_ typename OutputIt1, typename OutputIt2, typename PairEqual> - __device__ __forceinline__ void pair_retrieve(CG const& g, + __device__ __forceinline__ void pair_retrieve(CG g, value_type const& pair, uint32_t* cg_counter, value_type* probe_output_buffer, diff --git a/include/cuco/detail/static_multimap/kernels.cuh b/include/cuco/detail/static_multimap/kernels.cuh index f43667a15..1e7b9d985 100644 --- a/include/cuco/detail/static_multimap/kernels.cuh +++ b/include/cuco/detail/static_multimap/kernels.cuh @@ -83,7 +83,7 @@ CUCO_KERNEL void initialize(pair_atomic_type* const slots, Key k, Value v, int64 template CUCO_KERNEL void insert(InputIt first, int64_t n, viewT view) { - auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tile = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * block_size / 
tile_size; int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; @@ -131,7 +131,7 @@ template CUCO_KERNEL void insert_if_n(InputIt first, StencilIt s, int64_t n, viewT view, Predicate pred) { - auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tile = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * block_size / tile_size; int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; @@ -178,7 +178,7 @@ template CUCO_KERNEL void contains(InputIt first, int64_t n, OutputIt output_begin, viewT view, Equal equal) { - auto tile = cg::tiled_partition(cg::this_thread_block()); + auto tile = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * block_size / tile_size; int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; __shared__ bool writeBuffer[block_size / tile_size]; @@ -237,7 +237,7 @@ template (cg::this_thread_block()); + auto tile = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * block_size / tile_size; int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; @@ -296,7 +296,7 @@ template (cg::this_thread_block()); + auto tile = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * block_size / tile_size; int64_t idx = (block_size * blockIdx.x + threadIdx.x) / tile_size; @@ -374,8 +374,9 @@ CUCO_KERNEL void retrieve(InputIt first, constexpr uint32_t num_flushing_cgs = block_size / flushing_cg_size; const uint32_t flushing_cg_id = threadIdx.x / flushing_cg_size; - auto flushing_cg = cg::tiled_partition(cg::this_thread_block()); - auto probing_cg = cg::tiled_partition(cg::this_thread_block()); + auto flushing_cg = + cg::tiled_partition(cg::this_thread_block()); + auto probing_cg = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * block_size / probing_cg_size; int64_t idx = (block_size * blockIdx.x + threadIdx.x) / probing_cg_size; @@ -488,8 
+489,9 @@ CUCO_KERNEL void pair_retrieve(InputIt first, constexpr uint32_t num_flushing_cgs = block_size / flushing_cg_size; const uint32_t flushing_cg_id = threadIdx.x / flushing_cg_size; - auto flushing_cg = cg::tiled_partition(cg::this_thread_block()); - auto probing_cg = cg::tiled_partition(cg::this_thread_block()); + auto flushing_cg = + cg::tiled_partition(cg::this_thread_block()); + auto probing_cg = cg::tiled_partition(cg::this_thread_block()); int64_t const loop_stride = gridDim.x * block_size / probing_cg_size; int64_t idx = (block_size * blockIdx.x + threadIdx.x) / probing_cg_size; diff --git a/include/cuco/detail/static_multimap/static_multimap.inl b/include/cuco/detail/static_multimap/static_multimap.inl index ad50e7eaa..50de45ad2 100644 --- a/include/cuco/detail/static_multimap/static_multimap.inl +++ b/include/cuco/detail/static_multimap/static_multimap.inl @@ -1126,9 +1126,10 @@ template +template __device__ __forceinline__ void static_multimap::device_mutable_view::insert( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, value_type const& insert_pair) noexcept { impl_.template insert(g, insert_pair); @@ -1181,7 +1182,7 @@ template __device__ __forceinline__ void static_multimap::device_view::flush_output_buffer( - CG const& g, + CG g, uint32_t const num_outputs, value_type* output_buffer, atomicT* num_matches, @@ -1198,7 +1199,7 @@ template __device__ __forceinline__ void static_multimap::device_view::flush_output_buffer( - CG const& g, + CG g, uint32_t const num_outputs, value_type* probe_output_buffer, value_type* contained_output_buffer, @@ -1220,10 +1221,10 @@ template -template +template __device__ __forceinline__ bool static_multimap::device_view::contains( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, ProbeKey const& k, KeyEqual key_equal) const noexcept { @@ -1236,10 +1237,10 @@ template -template +template __device__ __forceinline__ bool 
static_multimap::device_view::pair_contains( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, ProbePair const& p, PairEqual pair_equal) const noexcept { @@ -1252,10 +1253,10 @@ template -template +template __device__ __forceinline__ std::size_t static_multimap::device_view::count( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, Key const& k, KeyEqual key_equal) noexcept { @@ -1268,10 +1269,10 @@ template -template +template __device__ __forceinline__ std::size_t static_multimap::device_view::count_outer( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, Key const& k, KeyEqual key_equal) noexcept { @@ -1284,10 +1285,10 @@ template -template +template __device__ __forceinline__ std::size_t static_multimap::device_view::pair_count( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, value_type const& pair, PairEqual pair_equal) noexcept { @@ -1300,10 +1301,10 @@ template -template +template __device__ __forceinline__ std::size_t static_multimap::device_view::pair_count_outer( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, value_type const& pair, PairEqual pair_equal) noexcept { @@ -1320,11 +1321,12 @@ template + typename KeyEqual, + typename ParentCG> __device__ __forceinline__ void static_multimap::device_view::retrieve( - FlushingCG const& flushing_cg, - cooperative_groups::thread_block_tile const& probing_cg, + FlushingCG flushing_cg, + cooperative_groups::thread_block_tile probing_cg, Key const& k, uint32_t* flushing_cg_counter, value_type* output_buffer, @@ -1358,11 +1360,12 @@ template + typename KeyEqual, + typename ParentCG> __device__ __forceinline__ void static_multimap::device_view::retrieve_outer( - FlushingCG const& flushing_cg, - cooperative_groups::thread_block_tile const& probing_cg, + FlushingCG flushing_cg, + 
cooperative_groups::thread_block_tile probing_cg, Key const& k, uint32_t* flushing_cg_counter, value_type* output_buffer, @@ -1396,10 +1399,11 @@ template + typename PairEqual, + typename ParentCG> __device__ __forceinline__ void static_multimap::device_view::pair_retrieve( - cooperative_groups::thread_block_tile const& probing_cg, + cooperative_groups::thread_block_tile probing_cg, value_type const& pair, OutputIt1 probe_key_begin, OutputIt2 probe_val_begin, @@ -1427,11 +1431,12 @@ template + typename PairEqual, + typename ParentCG> __device__ __forceinline__ void static_multimap::device_view::pair_retrieve( - FlushingCG const& flushing_cg, - cooperative_groups::thread_block_tile const& probing_cg, + FlushingCG flushing_cg, + cooperative_groups::thread_block_tile probing_cg, value_type const& pair, uint32_t* flushing_cg_counter, value_type* probe_output_buffer, @@ -1476,10 +1481,11 @@ template + typename PairEqual, + typename ParentCG> __device__ __forceinline__ void static_multimap::device_view::pair_retrieve_outer( - cooperative_groups::thread_block_tile const& probing_cg, + cooperative_groups::thread_block_tile probing_cg, value_type const& pair, OutputIt1 probe_key_begin, OutputIt2 probe_val_begin, @@ -1507,11 +1513,12 @@ template + typename PairEqual, + typename ParentCG> __device__ __forceinline__ void static_multimap::device_view::pair_retrieve_outer( - FlushingCG const& flushing_cg, - cooperative_groups::thread_block_tile const& probing_cg, + FlushingCG flushing_cg, + cooperative_groups::thread_block_tile probing_cg, value_type const& pair, uint32_t* flushing_cg_counter, value_type* probe_output_buffer, diff --git a/include/cuco/detail/static_multimap/static_multimap_ref.inl b/include/cuco/detail/static_multimap/static_multimap_ref.inl index 8d1e6c126..ad856dd86 100644 --- a/include/cuco/detail/static_multimap/static_multimap_ref.inl +++ b/include/cuco/detail/static_multimap/static_multimap_ref.inl @@ -391,9 +391,7 @@ template __device__ constexpr auto 
static_multimap_ref::make_copy( - CG const& tile, - bucket_type* const memory_to_use, - cuda_thread_scope scope) const noexcept + CG tile, bucket_type* const memory_to_use, cuda_thread_scope scope) const noexcept { impl_.make_copy(tile, memory_to_use); return static_multimap_ref{ @@ -416,7 +414,7 @@ template __device__ constexpr void static_multimap_ref::initialize( - CG const& tile) noexcept + CG tile) noexcept { impl_.initialize(tile); } @@ -470,8 +468,8 @@ class operator_impl< * * @return True if the given element is successfully inserted */ - template - __device__ bool insert(cooperative_groups::thread_block_tile const& group, + template + __device__ bool insert(cooperative_groups::thread_block_tile group, Value const& value) noexcept { auto& ref_ = static_cast(*this); @@ -536,9 +534,10 @@ class operator_impl< * * @return A boolean indicating whether the probe key is present */ - template + template [[nodiscard]] __device__ bool contains( - cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + cooperative_groups::thread_block_tile group, + ProbeKey const& key) const noexcept { auto const& ref_ = static_cast(*this); return ref_.impl_.contains(group, key); @@ -603,8 +602,8 @@ class operator_impl< * @param key The key to search for * @param callback_op Function to call on every element found */ - template - __device__ void for_each(cooperative_groups::thread_block_tile const& group, + template + __device__ void for_each(cooperative_groups::thread_block_tile group, ProbeKey const& key, CallbackOp&& callback_op) const noexcept { @@ -641,8 +640,8 @@ class operator_impl< * @param callback_op Function to call on every element found * @param sync_op Function that is allowed to synchronize `group` inbetween probing buckets */ - template - __device__ void for_each(cooperative_groups::thread_block_tile const& group, + template + __device__ void for_each(cooperative_groups::thread_block_tile group, ProbeKey const& key, CallbackOp&& 
callback_op, SyncOp&& sync_op) const noexcept @@ -709,9 +708,10 @@ class operator_impl< * * @return An iterator to the position at which the equivalent key is stored */ - template - [[nodiscard]] __device__ const_iterator find( - cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + template + [[nodiscard]] __device__ const_iterator + find(cooperative_groups::thread_block_tile group, + ProbeKey const& key) const noexcept { auto const& ref_ = static_cast(*this); return ref_.impl_.find(group, key); @@ -765,8 +765,8 @@ class operator_impl< * * @return Number of occurrences found by the current thread */ - template - __device__ size_type count(cooperative_groups::thread_block_tile const& group, + template + __device__ size_type count(cooperative_groups::thread_block_tile group, ProbeKey const& key) const noexcept { auto const& ref_ = static_cast(*this); diff --git a/include/cuco/detail/static_multiset/static_multiset_ref.inl b/include/cuco/detail/static_multiset/static_multiset_ref.inl index 054f7bb80..1a618059d 100644 --- a/include/cuco/detail/static_multiset/static_multiset_ref.inl +++ b/include/cuco/detail/static_multiset/static_multiset_ref.inl @@ -331,9 +331,7 @@ template __device__ constexpr auto static_multiset_ref::make_copy( - CG const& tile, - bucket_type* const memory_to_use, - cuda_thread_scope scope) const noexcept + CG tile, bucket_type* const memory_to_use, cuda_thread_scope scope) const noexcept { auto const storage_ref = this->storage_ref().make_copy(tile, memory_to_use); return static_multiset_ref __device__ constexpr void static_multiset_ref::initialize( - CG const& tile) noexcept + CG tile) noexcept { this->storage_ref().initialize(tile, this->empty_key_sentinel()); } @@ -409,8 +407,8 @@ class operator_impl< * * @return True if the given element is successfully inserted */ - template - __device__ bool insert(cooperative_groups::thread_block_tile const& group, + template + __device__ bool 
insert(cooperative_groups::thread_block_tile group, Value const& value) noexcept { auto& ref_ = static_cast(*this); @@ -467,9 +465,10 @@ class operator_impl< * * @return A boolean indicating whether the probe key is present */ - template + template [[nodiscard]] __device__ bool contains( - cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + cooperative_groups::thread_block_tile group, + ProbeKey const& key) const noexcept { auto const& ref_ = static_cast(*this); return ref_.impl_.contains(group, key); @@ -530,9 +529,10 @@ class operator_impl< * * @return An iterator to the position at which the equivalent key is stored */ - template - [[nodiscard]] __device__ const_iterator find( - cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + template + [[nodiscard]] __device__ const_iterator + find(cooperative_groups::thread_block_tile group, + ProbeKey const& key) const noexcept { auto const& ref_ = static_cast(*this); return ref_.impl_.find(group, key); @@ -713,8 +713,8 @@ class operator_impl< * @param key The key to search for * @param callback_op Function to call on every element found */ - template - __device__ void for_each(cooperative_groups::thread_block_tile const& group, + template + __device__ void for_each(cooperative_groups::thread_block_tile group, ProbeKey const& key, CallbackOp&& callback_op) const noexcept { @@ -751,8 +751,8 @@ class operator_impl< * @param callback_op Function to call on every element found * @param sync_op Function that is allowed to synchronize `group` inbetween probing buckets */ - template - __device__ void for_each(cooperative_groups::thread_block_tile const& group, + template + __device__ void for_each(cooperative_groups::thread_block_tile group, ProbeKey const& key, CallbackOp&& callback_op, SyncOp&& sync_op) const noexcept @@ -810,8 +810,8 @@ class operator_impl< * * @return Number of occurrences found by the current thread */ - template - __device__ 
size_type count(cooperative_groups::thread_block_tile const& group, + template + __device__ size_type count(cooperative_groups::thread_block_tile group, ProbeKey const& key) const noexcept { auto const& ref_ = static_cast(*this); diff --git a/include/cuco/detail/static_set/static_set_ref.inl b/include/cuco/detail/static_set/static_set_ref.inl index 07a5c1cf6..35514fa38 100644 --- a/include/cuco/detail/static_set/static_set_ref.inl +++ b/include/cuco/detail/static_set/static_set_ref.inl @@ -331,7 +331,7 @@ template __device__ constexpr auto static_set_ref::make_copy( - CG const& tile, + CG tile, typename StorageRef::value_type* const memory_to_use, cuda_thread_scope scope) const noexcept { @@ -354,7 +354,7 @@ template __device__ constexpr void static_set_ref::initialize( - CG const& tile) noexcept + CG tile) noexcept { this->impl_.initialize(tile); } @@ -404,8 +404,8 @@ class operator_impl - __device__ bool insert(cooperative_groups::thread_block_tile const& group, + template + __device__ bool insert(cooperative_groups::thread_block_tile group, Value const& value) noexcept { auto& ref_ = static_cast(*this); @@ -472,9 +472,9 @@ class operator_impl + template __device__ cuda::std::pair insert_and_find( - cooperative_groups::thread_block_tile const& group, Value const& value) noexcept + cooperative_groups::thread_block_tile group, Value const& value) noexcept { ref_type& ref_ = static_cast(*this); return ref_.impl_.insert_and_find(group, value); @@ -524,8 +524,8 @@ class operator_impl - __device__ bool erase(cooperative_groups::thread_block_tile const& group, + template + __device__ bool erase(cooperative_groups::thread_block_tile group, ProbeKey const& key) noexcept { auto& ref_ = static_cast(*this); @@ -582,9 +582,10 @@ class operator_impl + template [[nodiscard]] __device__ bool contains( - cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + cooperative_groups::thread_block_tile group, + ProbeKey const& key) const noexcept { 
auto const& ref_ = static_cast(*this); return ref_.impl_.contains(group, key); @@ -643,9 +644,10 @@ class operator_impl - [[nodiscard]] __device__ const_iterator find( - cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept + template + [[nodiscard]] __device__ const_iterator + find(cooperative_groups::thread_block_tile group, + ProbeKey const& key) const noexcept { auto const& ref_ = static_cast(*this); return ref_.impl_.find(group, key); @@ -709,8 +711,8 @@ class operator_impl - __device__ void for_each(cooperative_groups::thread_block_tile const& group, + template + __device__ void for_each(cooperative_groups::thread_block_tile group, ProbeKey const& key, CallbackOp&& callback_op) const noexcept { @@ -764,8 +766,8 @@ class operator_impl - __device__ size_type count(cooperative_groups::thread_block_tile const& group, + template + __device__ size_type count(cooperative_groups::thread_block_tile group, ProbeKey const& key) const noexcept { auto const& ref_ = static_cast(*this); diff --git a/include/cuco/detail/utils.cuh b/include/cuco/detail/utils.cuh index 9c5f52f14..3acfbe12c 100644 --- a/include/cuco/detail/utils.cuh +++ b/include/cuco/detail/utils.cuh @@ -131,7 +131,7 @@ __host__ __device__ constexpr SizeType sanitize_hash(HashType hash) noexcept * @return Converted hash value */ template -__device__ constexpr SizeType sanitize_hash(CG const& group, HashType hash) noexcept +__device__ constexpr SizeType sanitize_hash(CG group, HashType hash) noexcept { auto const base_hash = sanitize_hash(hash); auto const max_size = cuda::std::numeric_limits::max(); diff --git a/include/cuco/hyperloglog_ref.cuh b/include/cuco/hyperloglog_ref.cuh index 8946fa8c1..83aad101a 100644 --- a/include/cuco/hyperloglog_ref.cuh +++ b/include/cuco/hyperloglog_ref.cuh @@ -75,7 +75,7 @@ class hyperloglog_ref { * @param group CUDA Cooperative group this operation is executed in */ template - __device__ constexpr void clear(CG const& group) noexcept; + 
__device__ constexpr void clear(CG group) noexcept; /** * @brief Asynchronously resets the estimator, i.e., clears the current count estimate. @@ -144,8 +144,7 @@ class hyperloglog_ref { * @param other Other estimator reference to be merged into `*this` */ template - __device__ constexpr void merge(CG const& group, - hyperloglog_ref const& other); + __device__ constexpr void merge(CG group, hyperloglog_ref const& other); /** * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator. diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp index f2bc89fd0..875635c01 100644 --- a/include/cuco/operator.hpp +++ b/include/cuco/operator.hpp @@ -56,8 +56,8 @@ inline namespace op { * template * __device__ bool insert(Value const& value) noexcept * - * template - * __device__ bool insert(cooperative_groups::thread_block_tile const& group, + * template + * __device__ bool insert(cooperative_groups::thread_block_tile group, * Value const& value) noexcept * ``` * @@ -85,9 +85,9 @@ struct insert_tag { * template * __device__ cuda::std::pair insert_and_find(Value const& value) noexcept * - * template + * template * __device__ cuda::std::pair insert_and_find( - * cooperative_groups::thread_block_tile const& group, Value const& value) noexcept + * cooperative_groups::thread_block_tile group, Value const& value) noexcept * ``` * * Where: @@ -114,8 +114,8 @@ struct insert_and_find_tag { * template * __device__ void insert_or_assign(Value const& value) noexcept * - * template - * __device__ void insert_or_assign(cooperative_groups::thread_block_tile const& group, + * template + * __device__ void insert_or_assign(cooperative_groups::thread_block_tile group, * Value const& value) noexcept * ``` * @@ -144,13 +144,13 @@ struct insert_or_assign_tag { * typename Op> * __device__ bool insert_or_apply(Value const& value, Init init, Op op) * - * template - * __device__ bool insert_or_apply(cooperative_groups::thread_block_tile const& group, + * 
template + * __device__ bool insert_or_apply(cooperative_groups::thread_block_tile group, * Value const& value, * Op op) * - * template - * __device__ bool insert_or_apply(cooperative_groups::thread_block_tile const& group, + * template + * __device__ bool insert_or_apply(cooperative_groups::thread_block_tile group, * Value const& value, * Init init, * Op op) @@ -182,13 +182,13 @@ struct insert_or_apply_tag { * template * __device__ bool erase(ProbeKey const& key) noexcept * - * template - * __device__ bool erase(cooperative_groups::thread_block_tile const& group, + * template + * __device__ bool erase(cooperative_groups::thread_block_tile group, * ProbeKey const& key) noexcept * ``` * * Where: - * @see @tparam ProbeKey Input key type which is convertible to the containser's 'key_type' + * @see @tparam ProbeKey Input key type which is convertible to the container's 'key_type' * * @see @param group The Cooperative Group used to perform this operation * @see @param key The key to search for @@ -207,9 +207,9 @@ struct erase_tag { * template * __device__ bool contains(ProbeKey const& key) const noexcept * - * template + * template * __device__ bool contains( - * cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const + * cooperative_groups::thread_block_tile group, ProbeKey const& key) const * noexcept * ``` * @@ -233,8 +233,8 @@ struct contains_tag { * template * __device__ size_type count(ProbeKey const& key) const noexcept * - * template - * __device__ size_type count(cooperative_groups::thread_block_tile const& group, + * template + * __device__ size_type count(cooperative_groups::thread_block_tile group, * ProbeKey const& key) const noexcept * ``` * @@ -258,9 +258,9 @@ struct count_tag { * template * __device__ const_iterator find(ProbeKey const& key) const noexcept * - * template + * template * __device__ const_iterator find( - * cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const + * 
cooperative_groups::thread_block_tile group, ProbeKey const& key) const * noexcept * ``` * @@ -295,7 +295,7 @@ struct find_tag { * class OutputProbeIt, * class OutputMatchIt, * class AtomicCounter> - * __device__ void retrieve(cooperative_groups::thread_block const& block, + * __device__ void retrieve(cooperative_groups::thread_block const& block, * InputProbeIt input_probe_begin, * InputProbeIt input_probe_end, * OutputProbeIt output_probe, @@ -349,13 +349,13 @@ struct retrieve_tag { * template * __device__ void for_each(ProbeKey const& key, CallbackOp&& callback_op) const noexcept * - * template - * __device__ void for_each(cooperative_groups::thread_block_tile const& group, + * template + * __device__ void for_each(cooperative_groups::thread_block_tile group, * ProbeKey const& key, * CallbackOp&& callback_op) const noexcept * - * template - * __device__ void for_each(cooperative_groups::thread_block_tile const& group, + * template + * __device__ void for_each(cooperative_groups::thread_block_tile group, * ProbeKey const& key, * CallbackOp&& callback_op, * SyncOp&& sync_op) const noexcept diff --git a/include/cuco/probing_scheme.cuh b/include/cuco/probing_scheme.cuh index 94022d249..6506c99f5 100644 --- a/include/cuco/probing_scheme.cuh +++ b/include/cuco/probing_scheme.cuh @@ -92,9 +92,9 @@ class linear_probing : private detail::probing_scheme_base { * @param upper_bound Upper bound of the iteration * @return An iterator whose value_type is convertible to slot index type */ - template + template __host__ __device__ constexpr auto make_iterator( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, ProbeKey const& probe_key, Extent upper_bound) const noexcept; @@ -189,9 +189,9 @@ class double_hashing : private detail::probing_scheme_base { * @param upper_bound Upper bound of the iteration * @return An iterator whose value_type is convertible to slot index type */ - template + template __host__ __device__ constexpr auto 
make_iterator( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, ProbeKey const& probe_key, Extent upper_bound) const noexcept; diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh index 1d7bdeafe..870eb6a4a 100644 --- a/include/cuco/static_map.cuh +++ b/include/cuco/static_map.cuh @@ -1646,7 +1646,7 @@ class static_map { * @return Pointer to the initial slot for `k` */ template - __device__ iterator initial_slot(CG const& g, ProbeKey const& k, Hash hash) noexcept + __device__ iterator initial_slot(CG g, ProbeKey const& k, Hash hash) noexcept { return &slots_[(hash(k) + g.thread_rank()) % capacity_]; } @@ -1666,7 +1666,7 @@ class static_map { * @return Pointer to the initial slot for `k` */ template - __device__ const_iterator initial_slot(CG const& g, ProbeKey const& k, Hash hash) const noexcept + __device__ const_iterator initial_slot(CG g, ProbeKey const& k, Hash hash) const noexcept { return &slots_[(hash(k) + g.thread_rank()) % capacity_]; } @@ -1706,7 +1706,7 @@ class static_map { * @return The next slot after `s` */ template - __device__ iterator next_slot(CG const& g, iterator s) noexcept + __device__ iterator next_slot(CG g, iterator s) noexcept { uint32_t index = s - slots_; return &slots_[(index + g.size()) % capacity_]; @@ -1724,7 +1724,7 @@ class static_map { * @return The next slot after `s` */ template - __device__ const_iterator next_slot(CG const& g, const_iterator s) const noexcept + __device__ const_iterator next_slot(CG g, const_iterator s) const noexcept { uint32_t index = s - slots_; return &slots_[(index + g.size()) % capacity_]; @@ -2015,7 +2015,7 @@ class static_map { */ template __device__ static device_mutable_view make_from_uninitialized_slots( - CG const& g, + CG g, pair_atomic_type* slots, std::size_t capacity, empty_key empty_key_sentinel, @@ -2046,7 +2046,7 @@ class static_map { */ template __device__ static device_mutable_view make_from_uninitialized_slots( - CG const& g, 
+ CG g, pair_atomic_type* slots, std::size_t capacity, empty_key empty_key_sentinel, @@ -2128,7 +2128,7 @@ class static_map { template , typename KeyEqual = cuda::std::equal_to> - __device__ bool insert(CG const& g, + __device__ bool insert(CG g, value_type const& insert_pair, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; @@ -2172,7 +2172,7 @@ class static_map { template , typename KeyEqual = cuda::std::equal_to> - __device__ bool erase(CG const& g, + __device__ bool erase(CG g, key_type const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) noexcept; @@ -2477,10 +2477,7 @@ class static_map { typename Hash = cuco::default_hash_function, typename KeyEqual = cuda::std::equal_to> __device__ cuda::std::enable_if_t, bool> contains( - CG const& g, - ProbeKey const& k, - Hash hash = Hash{}, - KeyEqual key_equal = KeyEqual{}) const noexcept; + CG g, ProbeKey const& k, Hash hash = Hash{}, KeyEqual key_equal = KeyEqual{}) const noexcept; }; // class device_view /** diff --git a/include/cuco/static_map_ref.cuh b/include/cuco/static_map_ref.cuh index 91da7cf4f..51c2769ca 100644 --- a/include/cuco/static_map_ref.cuh +++ b/include/cuco/static_map_ref.cuh @@ -292,7 +292,7 @@ class static_map_ref */ template [[nodiscard]] __device__ constexpr auto make_copy( - CG const& tile, + CG tile, typename StorageRef::value_type* const memory_to_use, cuda_thread_scope scope = {}) const noexcept; @@ -306,7 +306,7 @@ class static_map_ref * @param tile The cooperative thread group used to initialize the map */ template - __device__ constexpr void initialize(CG const& tile) noexcept; + __device__ constexpr void initialize(CG tile) noexcept; private: impl_type impl_; ///< Static map ref implementation diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index 1e86ee03c..9b8eaf8cb 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -1587,8 +1587,9 @@ class static_multimap { * @param g The Cooperative Group 
that performs the insert * @param insert_pair The pair to insert */ + template __device__ __forceinline__ void insert( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, value_type const& insert_pair) noexcept; private: @@ -1670,7 +1671,7 @@ class static_multimap { * @param output_begin Beginning of the output sequence of key/value pairs */ template - __device__ __forceinline__ void flush_output_buffer(CG const& g, + __device__ __forceinline__ void flush_output_buffer(CG g, uint32_t const num_outputs, value_type* output_buffer, atomicT* num_matches, @@ -1700,7 +1701,7 @@ class static_multimap { * pairs */ template - __device__ __forceinline__ void flush_output_buffer(CG const& g, + __device__ __forceinline__ void flush_output_buffer(CG g, uint32_t const num_outputs, value_type* probe_output_buffer, value_type* contained_output_buffer, @@ -1733,9 +1734,11 @@ class static_multimap { * @return A boolean indicating whether the key/value pair * containing `k` was inserted */ - template > + template , + typename ParentCG = void> __device__ __forceinline__ bool contains( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, ProbeKey const& k, KeyEqual key_equal = KeyEqual{}) const noexcept; @@ -1763,9 +1766,9 @@ class static_multimap { * for equality * @return A boolean indicating whether the input pair was inserted in the map */ - template + template __device__ __forceinline__ bool pair_contains( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, ProbePair const& p, PairEqual pair_equal) const noexcept; @@ -1782,9 +1785,9 @@ class static_multimap { * for equality * @return Number of matches found by the current thread */ - template > + template , typename ParentCG = void> __device__ __forceinline__ std::size_t count( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, Key const& k, KeyEqual key_equal = 
KeyEqual{}) noexcept; @@ -1802,9 +1805,9 @@ class static_multimap { * for equality * @return Number of matches found by the current thread */ - template > + template , typename ParentCG = void> __device__ __forceinline__ std::size_t count_outer( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, Key const& k, KeyEqual key_equal = KeyEqual{}) noexcept; @@ -1821,9 +1824,9 @@ class static_multimap { * for equality * @return Number of matches found by the current thread */ - template + template __device__ __forceinline__ std::size_t pair_count( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, value_type const& pair, PairEqual pair_equal) noexcept; @@ -1841,9 +1844,9 @@ class static_multimap { * for equality * @return Number of matches found by the current thread */ - template + template __device__ __forceinline__ std::size_t pair_count_outer( - cooperative_groups::thread_block_tile const& g, + cooperative_groups::thread_block_tile g, value_type const& pair, PairEqual pair_equal) noexcept; @@ -1874,10 +1877,11 @@ class static_multimap { typename FlushingCG, typename atomicT, typename OutputIt, - typename KeyEqual = cuda::std::equal_to> + typename KeyEqual = cuda::std::equal_to, + typename ParentCG = void> __device__ __forceinline__ void retrieve( - FlushingCG const& flushing_cg, - cooperative_groups::thread_block_tile const& probing_cg, + FlushingCG flushing_cg, + cooperative_groups::thread_block_tile probing_cg, Key const& k, uint32_t* flushing_cg_counter, value_type* output_buffer, @@ -1914,10 +1918,11 @@ class static_multimap { typename FlushingCG, typename atomicT, typename OutputIt, - typename KeyEqual = cuda::std::equal_to> + typename KeyEqual = cuda::std::equal_to, + typename ParentCG = void> __device__ __forceinline__ void retrieve_outer( - FlushingCG const& flushing_cg, - cooperative_groups::thread_block_tile const& probing_cg, + FlushingCG flushing_cg, + 
cooperative_groups::thread_block_tile probing_cg, Key const& k, uint32_t* flushing_cg_counter, value_type* output_buffer, @@ -1958,9 +1963,10 @@ class static_multimap { typename OutputIt2, typename OutputIt3, typename OutputIt4, - typename PairEqual> + typename PairEqual, + typename ParentCG> __device__ __forceinline__ void pair_retrieve( - cooperative_groups::thread_block_tile const& probing_cg, + cooperative_groups::thread_block_tile probing_cg, value_type const& pair, OutputIt1 probe_key_begin, OutputIt2 probe_val_begin, @@ -2002,10 +2008,11 @@ class static_multimap { typename atomicT, typename OutputIt1, typename OutputIt2, - typename PairEqual> + typename PairEqual, + typename ParentCG> __device__ __forceinline__ void pair_retrieve( - FlushingCG const& flushing_cg, - cooperative_groups::thread_block_tile const& probing_cg, + FlushingCG flushing_cg, + cooperative_groups::thread_block_tile probing_cg, value_type const& pair, uint32_t* warp_counter, value_type* probe_output_buffer, @@ -2050,9 +2057,10 @@ class static_multimap { typename OutputIt2, typename OutputIt3, typename OutputIt4, - typename PairEqual> + typename PairEqual, + typename ParentCG> __device__ __forceinline__ void pair_retrieve_outer( - cooperative_groups::thread_block_tile const& probing_cg, + cooperative_groups::thread_block_tile probing_cg, value_type const& pair, OutputIt1 probe_key_begin, OutputIt2 probe_val_begin, @@ -2094,10 +2102,11 @@ class static_multimap { typename atomicT, typename OutputIt1, typename OutputIt2, - typename PairEqual> + typename PairEqual, + typename ParentCG> __device__ __forceinline__ void pair_retrieve_outer( - FlushingCG const& flushing_cg, - cooperative_groups::thread_block_tile const& probing_cg, + FlushingCG flushing_cg, + cooperative_groups::thread_block_tile probing_cg, value_type const& pair, uint32_t* flushing_cg_counter, value_type* probe_output_buffer, diff --git a/include/cuco/static_multimap_ref.cuh b/include/cuco/static_multimap_ref.cuh index 
2a0c69fd7..cbe27802f 100644 --- a/include/cuco/static_multimap_ref.cuh +++ b/include/cuco/static_multimap_ref.cuh @@ -291,7 +291,7 @@ class static_multimap_ref */ template [[nodiscard]] __device__ constexpr auto make_copy( - CG const& tile, + CG tile, bucket_type* const memory_to_use, cuda_thread_scope scope = {}) const noexcept; @@ -305,7 +305,7 @@ class static_multimap_ref * @param tile The cooperative thread group used to initialize the map */ template - __device__ constexpr void initialize(CG const& tile) noexcept; + __device__ constexpr void initialize(CG tile) noexcept; private: impl_type impl_; ///< Static map ref implementation diff --git a/include/cuco/static_multiset_ref.cuh b/include/cuco/static_multiset_ref.cuh index 313295d80..e23024006 100644 --- a/include/cuco/static_multiset_ref.cuh +++ b/include/cuco/static_multiset_ref.cuh @@ -271,7 +271,7 @@ class static_multiset_ref */ template [[nodiscard]] __device__ constexpr auto make_copy( - CG const& tile, + CG tile, bucket_type* const memory_to_use, cuda_thread_scope scope = {}) const noexcept; @@ -285,7 +285,7 @@ class static_multiset_ref * @param tile The cooperative thread group used to initialize the set */ template - __device__ constexpr void initialize(CG const& tile) noexcept; + __device__ constexpr void initialize(CG tile) noexcept; private: impl_type impl_; diff --git a/include/cuco/static_set_ref.cuh b/include/cuco/static_set_ref.cuh index 40a037443..327c95d75 100644 --- a/include/cuco/static_set_ref.cuh +++ b/include/cuco/static_set_ref.cuh @@ -269,7 +269,7 @@ class static_set_ref */ template [[nodiscard]] __device__ constexpr auto make_copy( - CG const& tile, + CG tile, typename StorageRef::value_type* const memory_to_use, cuda_thread_scope scope = {}) const noexcept; @@ -283,7 +283,7 @@ class static_set_ref * @param tile The cooperative thread group used to initialize the set */ template - __device__ constexpr void initialize(CG const& tile) noexcept; + __device__ constexpr void initialize(CG 
tile) noexcept; private: impl_type impl_; diff --git a/tests/static_multimap/for_each_test.cu b/tests/static_multimap/for_each_test.cu index f7290707d..826f915d6 100644 --- a/tests/static_multimap/for_each_test.cu +++ b/tests/static_multimap/for_each_test.cu @@ -67,7 +67,8 @@ CUCO_KERNEL void for_each_check_cooperative(Ref ref, while (idx < n) { auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + cooperative_groups::tiled_partition( + cooperative_groups::this_thread_block()); auto const& key = *(first + idx); std::size_t thread_matches = 0; if constexpr (Synced) { @@ -80,7 +81,7 @@ CUCO_KERNEL void for_each_check_cooperative(Ref ref, thread_matches++; } }, - [] __device__(auto const& group) { group.sync(); }); + [] __device__(auto group) { group.sync(); }); } else { ref.for_each(tile, key, [&] __device__(auto const slot) { auto const [slot_key, slot_value] = slot; diff --git a/tests/static_multiset/for_each_test.cu b/tests/static_multiset/for_each_test.cu index b987ba660..6d663f439 100644 --- a/tests/static_multiset/for_each_test.cu +++ b/tests/static_multiset/for_each_test.cu @@ -66,7 +66,8 @@ CUCO_KERNEL void for_each_check_cooperative(Ref ref, while (idx < n) { auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + cooperative_groups::tiled_partition( + cooperative_groups::this_thread_block()); auto const& key = *(first + idx); std::size_t thread_matches = 0; if constexpr (Synced) { @@ -76,7 +77,7 @@ CUCO_KERNEL void for_each_check_cooperative(Ref ref, [&] __device__(auto const slot) { if (ref.key_eq()(key, slot)) { thread_matches++; } }, - [] __device__(auto const& group) { group.sync(); }); + [] __device__(auto group) { group.sync(); }); } else { ref.for_each(tile, key, [&] __device__(auto const slot) { if (ref.key_eq()(key, slot)) { thread_matches++; } diff --git a/tests/utility/probing_scheme_test.cu b/tests/utility/probing_scheme_test.cu index 0d232df08..39048946b 
100644 --- a/tests/utility/probing_scheme_test.cu +++ b/tests/utility/probing_scheme_test.cu @@ -66,7 +66,8 @@ __global__ void generate_cg_probing_sequence(Key key, if (tid < cg_size) { auto const tile = - cooperative_groups::tiled_partition(cooperative_groups::this_thread_block()); + cooperative_groups::tiled_partition( + cooperative_groups::this_thread_block()); auto iter = probing_scheme.template make_iterator(tile, key, upper_bound); From a1bc5447f5df0a0f1a9f81a4185658f29acb4a1a Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 15 Aug 2025 17:21:27 -0700 Subject: [PATCH 18/24] Use CMAKE_ARGS in build.sh and some more minor improvements --- ci/build.sh | 45 +++++++++++++++++++++++------------- cmake/roaring_testdata.cmake | 2 ++ 2 files changed, 31 insertions(+), 16 deletions(-) diff --git a/ci/build.sh b/ci/build.sh index 7ac9029e3..035a5a4c1 100755 --- a/ci/build.sh +++ b/ci/build.sh @@ -51,7 +51,12 @@ HOST_COMPILER=${CXX:-g++} # $CXX if set, otherwise `g++` CUDA_ARCHS=native # detect system's GPU architectures CXX_STANDARD=17 -EXTRA_CMAKE_OPTIONS=() +# Initialize CMAKE_ARGS from environment variable if available +if [ -n "${CMAKE_ARGS:-}" ]; then + read -ra CMAKE_ARGS <<< "$CMAKE_ARGS" +else + CMAKE_ARGS=() +fi function usage { echo "cuCollections build script" @@ -105,8 +110,15 @@ function usage { echo " Enables verbose mode for detailed output and builds with C++17 standard." echo " Build files will be written to /build/local and symlinked to /build/latest." echo - echo "Pass-through:" - echo " -- [CMake args...] Anything after -- is forwarded to CMake" + echo " Using CMAKE_ARGS Environment Variable:" + echo " $ CMAKE_ARGS=\"-DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_FEATURE=ON\" $0 -t" + echo " $ export CMAKE_ARGS=\"-DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_FEATURE=ON\"" + echo " $ $0 -t" + echo " Uses CMAKE_ARGS environment variable to pass additional CMake options." 
+ echo " Can be overridden by using -- followed by specific arguments." + echo + echo " Pass-through to CMake:" + echo " -- [CMake args...] Anything after -- is forwarded to CMake (overrides CMAKE_ARGS env var)" echo exit 1 } @@ -131,7 +143,7 @@ while [ "${#args[@]}" -ne 0 ]; do --arch) CUDA_ARCHS="${args[1]}"; args=("${args[@]:2}");; --std) CXX_STANDARD="${args[1]}"; args=("${args[@]:2}");; -v | -verbose | --verbose) VERBOSE=1; args=("${args[@]:1}");; - --) EXTRA_CMAKE_OPTIONS+=("${args[@]:1}"); break;; + --) CMAKE_ARGS=("${args[@]:1}"); break;; -h | -help | --help) usage ;; *) echo "Unrecognized option: ${args[0]}"; usage ;; esac @@ -162,14 +174,12 @@ if [ "$BUILD_TESTS" == "OFF" ] && [ "$BUILD_EXAMPLES" == "OFF" ] && [ "$BUILD_BE BUILD_BENCHMARKS=ON fi +BUILD_DIR="$BUILD_PREFIX/$BUILD_INFIX" # Trigger clean (re-)build if [ "$CLEAN_BUILD" -eq 1 ]; then rm -rf BUILD_DIR fi - -BUILD_DIR="$BUILD_PREFIX/$BUILD_INFIX" mkdir -p $BUILD_DIR -export BUILD_DIR # TODO remove # The most recent build will be symlinked to cuCollections/build/latest rm -f $BUILD_PREFIX/latest @@ -194,10 +204,10 @@ CMAKE_OPTIONS=" -DBUILD_BENCHMARKS=${BUILD_BENCHMARKS} \ " -echo "========================================" -echo "-- START: $(date)" +echo "[INFO]==============================================" +echo "-- TIMESTAMP: $(date -u +"%Y-%m-%d %H:%M:%S UTC")" echo "-- GIT_SHA: $(git rev-parse HEAD 2>/dev/null || echo 'Not a repository')" -echo "-- PWD: $(pwd)" +echo "-- SRC_DIR: $(dirname $(pwd))" echo "-- BUILD_DIR: ${BUILD_DIR}" echo "-- BUILD_TYPE: ${BUILD_TYPE}" echo "-- PARALLEL_LEVEL: ${PARALLEL_LEVEL}" @@ -206,23 +216,26 @@ echo "-- BUILD_TESTS: ${BUILD_TESTS}" echo "-- BUILD_EXAMPLES: ${BUILD_EXAMPLES}" echo "-- BUILD_BENCHMARKS: ${BUILD_BENCHMARKS}" -if [ ${#EXTRA_CMAKE_OPTIONS[@]} -gt 0 ]; then - echo "-- EXTRA_CMAKE_OPTIONS: ${EXTRA_CMAKE_OPTIONS[*]}" +if [ ${#CMAKE_ARGS[@]} -gt 0 ]; then + echo "-- CMAKE_ARGS: ${CMAKE_ARGS[*]}" else - echo "-- EXTRA_CMAKE_OPTIONS: (none)" + echo 
"-- CMAKE_ARGS: (none)" fi + # configure -cmake -S .. -B $BUILD_DIR $CMAKE_OPTIONS "${EXTRA_CMAKE_OPTIONS[@]}" -echo "========================================" +echo "[CONFIGURE]========================================" +cmake -S .. -B $BUILD_DIR $CMAKE_OPTIONS "${CMAKE_ARGS[@]}" if command -v sccache >/dev/null; then source "./sccache_stats.sh" start +else + echo "sccache stats: N/A" fi #build +echo "[BUILD]============================================" cmake --build $BUILD_DIR --parallel $PARALLEL_LEVEL -echo "========================================" echo "Build complete" if command -v sccache >/dev/null; then diff --git a/cmake/roaring_testdata.cmake b/cmake/roaring_testdata.cmake index 8dded834c..168519866 100644 --- a/cmake/roaring_testdata.cmake +++ b/cmake/roaring_testdata.cmake @@ -35,5 +35,7 @@ rapids_cmake_download_with_retry("${ROARING_FORMATSPEC_BASE}/testdata64/portable "${CUCO_ROARING_DATA_DIR}/portable_bitmap64.bin" "b5a553a759167f5f9ccb3fa21552d943b4c73235635b753376f4faf62067d178") +message(STATUS "Roaring Bitmap test data downloaded to: ${CUCO_ROARING_DATA_DIR}") + # Define macro only when data is available add_compile_definitions(CUCO_ROARING_DATA_DIR="${CUCO_ROARING_DATA_DIR}") \ No newline at end of file From 830ca6518e8a9c4d1f684cd0b2c55967b817fedb Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Fri, 15 Aug 2025 18:18:51 -0700 Subject: [PATCH 19/24] Use CMAKE_ARGS in build script and other minor improvements --- ci/build.sh | 48 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 10 deletions(-) diff --git a/ci/build.sh b/ci/build.sh index 3d244f334..1d3074fb9 100755 --- a/ci/build.sh +++ b/ci/build.sh @@ -37,6 +37,9 @@ resolve_path() { # Ensure the script is being executed in its containing directory cd "$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"; +# Determine repo root as the parent of the `ci` directory +REPO_ROOT="$(cd .. 
&& pwd)" + # Script defaults BUILD_TESTS=${BUILD_TESTS:-OFF} BUILD_EXAMPLES=${BUILD_EXAMPLES:-OFF} @@ -51,6 +54,13 @@ HOST_COMPILER=${CXX:-g++} # $CXX if set, otherwise `g++` CUDA_ARCHS=native # detect system's GPU architectures CXX_STANDARD=17 +# Initialize CMAKE_ARGS from environment variable if available +if [ -n "${CMAKE_ARGS:-}" ]; then + read -ra CMAKE_ARGS <<< "$CMAKE_ARGS" +else + CMAKE_ARGS=() +fi + function usage { echo "cuCollections build script" echo "Usage: $0 [OPTIONS]" @@ -103,6 +113,16 @@ function usage { echo " Enables verbose mode for detailed output and builds with C++17 standard." echo " Build files will be written to /build/local and symlinked to /build/latest." echo + echo " Using CMAKE_ARGS Environment Variable:" + echo " $ CMAKE_ARGS=\"-DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_FEATURE=ON\" $0 -t" + echo " $ export CMAKE_ARGS=\"-DCMAKE_VERBOSE_MAKEFILE=ON -DENABLE_FEATURE=ON\"" + echo " $ $0 -t" + echo " Uses CMAKE_ARGS environment variable to pass additional CMake options." + echo " Can be overridden by using -- followed by specific arguments." + echo + echo " Pass-through to CMake:" + echo " -- [CMake args...] 
Anything after -- is forwarded to CMake (overrides CMAKE_ARGS env var)" + echo exit 1 } @@ -126,6 +146,7 @@ while [ "${#args[@]}" -ne 0 ]; do --arch) CUDA_ARCHS="${args[1]}"; args=("${args[@]:2}");; --std) CXX_STANDARD="${args[1]}"; args=("${args[@]:2}");; -v | -verbose | --verbose) VERBOSE=1; args=("${args[@]:1}");; + --) CMAKE_ARGS=("${args[@]:1}"); break;; -h | -help | --help) usage ;; *) echo "Unrecognized option: ${args[0]}"; usage ;; esac @@ -156,14 +177,12 @@ if [ "$BUILD_TESTS" == "OFF" ] && [ "$BUILD_EXAMPLES" == "OFF" ] && [ "$BUILD_BE BUILD_BENCHMARKS=ON fi +BUILD_DIR="$BUILD_PREFIX/$BUILD_INFIX" # Trigger clean (re-)build if [ "$CLEAN_BUILD" -eq 1 ]; then rm -rf BUILD_DIR fi - -BUILD_DIR="$BUILD_PREFIX/$BUILD_INFIX" mkdir -p $BUILD_DIR -export BUILD_DIR # TODO remove # The most recent build will be symlinked to cuCollections/build/latest rm -f $BUILD_PREFIX/latest @@ -186,12 +205,13 @@ CMAKE_OPTIONS=" -DBUILD_TESTS=${BUILD_TESTS} \ -DBUILD_EXAMPLES=${BUILD_EXAMPLES} \ -DBUILD_BENCHMARKS=${BUILD_BENCHMARKS} \ + ${CMAKE_ARGS[*]} " -echo "========================================" -echo "-- START: $(date)" -echo "-- GIT_SHA: $(git rev-parse HEAD 2>/dev/null || echo 'Not a repository')" -echo "-- PWD: $(pwd)" +echo "[INFO]==============================================" +echo "-- TIMESTAMP: $(date -u +"%Y-%m-%d %H:%M:%S UTC")" +echo "-- GIT_SHA: $(git rev-parse HEAD 2>/dev/null || echo 'N/A')" +echo "-- SRC_DIR: ${REPO_ROOT}" echo "-- BUILD_DIR: ${BUILD_DIR}" echo "-- BUILD_TYPE: ${BUILD_TYPE}" echo "-- PARALLEL_LEVEL: ${PARALLEL_LEVEL}" @@ -200,21 +220,29 @@ echo "-- BUILD_TESTS: ${BUILD_TESTS}" echo "-- BUILD_EXAMPLES: ${BUILD_EXAMPLES}" echo "-- BUILD_BENCHMARKS: ${BUILD_BENCHMARKS}" +if [ ${#CMAKE_ARGS[@]} -gt 0 ]; then + echo "-- CMAKE_ARGS: ${CMAKE_ARGS[*]}" +else + echo "-- CMAKE_ARGS: (none)" +fi + # configure +echo "[CONFIGURE]========================================" cmake -S .. 
-B $BUILD_DIR $CMAKE_OPTIONS -echo "========================================" if command -v sccache >/dev/null; then source "./sccache_stats.sh" start +else + echo "sccache stats: N/A" fi #build +echo "[BUILD]============================================" cmake --build $BUILD_DIR --parallel $PARALLEL_LEVEL -echo "========================================" echo "Build complete" if command -v sccache >/dev/null; then source "./sccache_stats.sh" end else echo "sccache stats: N/A" -fi +fi \ No newline at end of file From 01169d805a2889077fc5128604f05c26f6e44cb1 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 18 Aug 2025 15:47:54 -0700 Subject: [PATCH 20/24] Address comments from code review --- README.md | 2 +- examples/roaring_bitmap/host_bulk_example.cu | 8 ++-- .../roaring_bitmap/roaring_bitmap_impl.cuh | 39 +++++++++------- .../roaring_bitmap/roaring_bitmap_storage.cuh | 11 ++--- include/cuco/detail/roaring_bitmap/util.cuh | 44 +++++++++++++------ include/cuco/roaring_bitmap.cuh | 5 ++- tests/roaring_bitmap/contains_test.cu | 5 +-- 7 files changed, 65 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index ea40b39b8..8608e3680 100644 --- a/README.md +++ b/README.md @@ -266,4 +266,4 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection `cuco::roaring_bitmap` implements a Roaring bitmap following the [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec). 
#### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WA1v2zYT_iv3qsAgN7blpB_ZnI_NjdPNWF97sNMVQ1MIlETbhGVRI6k4XpD__h5JfSbK2nV75wCxRR6fu-M9dzzqzpFUSsYT6Qw_3jkscoaHXScmySojK-oMnTCLiNN1JM9EqJ-959cJPIcLnu4FW60VuGEHjgZHr2D662Q8GcHFbP7LbD66msymfS1qxN-xkCaSRpAlERWg1hRGKQnxK5_pwq9UaDvgqD8AVwtcO_nctdM5MSh7nsGW7CHhCjJJEYZJWLKYAr0NaaqAJRDybRozkoQUdkytjaocx5gDv-UgPFAE5QmuSPFpWZcEokrT9WetVDr0vN1u1yfG7D4XKy-2wtJ7N7m4nC4ue2h6uex9EuPGgqC_Z0yg48EeSIqWhSRAe2OyAy6ArATFOcW15TvBFEtWXZB8qXZEUIMTMakECzLV2LzCTvS_LoDbRxLcuNECJotrB96MFpNF1-B8mFz9NHt_BR9G8_loejW5XMBsjsGajic6VPj0FkbT3-DnyXTcBYpbh6robSq0F2gq09tKI7uHC0obZiy5NUumNGRLFkJBIFjxGyoSdAtSKrbMUg2NjAxOzLZMEWXGHjlnVHnXyXXyjCVhnEUUTsMs5J7gRCCiHzC1JWk_zNbnj2QyxWKm9p4ShCnZX6fp-UOkiHhSRV6I_yK6PH9ykiWqfVLtU-pbBQ0BtRaZVF5Eb9AR_4aGiov-uk0k5iskRNw-mSUM906SuA5Rl1ti4CnZNtYy3jKoCZKsGkMW0gB6zy1jfzCZtEYAP8jijU9vCcac4v7a6UAwuoQx3WK00GtFMWZSxzjPnWZYkIUaBUkYc77JUgMMo18msqoKk8SmcK4JdpgYnGA-7Di8OOohEFgwwxhkN4XXL2vD4KZcKJNQyMAtUR1YCr7V1hj8j3Nr0hsj_daILJCin1yd0BIzeoU8z4I-Fg2vIVs8VWs6mMoplwx3bV_yFytBuAFm_dfuFs6inyivMoHE1nMhFwJ3HMdkFiMfYUq2NN53tcu4kcoILXkc853OFR0JOTQqevDROqurGc-UyBLZD1hiJl27S52vccgLYh54rw6Pj0n0naeNiIgiXquyzmNT_j07HhlRBD1n2uuXlR2WHv-gHa9feq3qOiWJf8DTiMJVncd50c9DvyUbzJFUVznojS_eX8z88ezD9N1sNPbns9F8Mv3Rv7pcXI1HV6Oz2VSfBgGWUqrKRDFFUFHExqzD-oB1J0H-wM90f4W_MYsDzmPLRhftHg5txiPtMFW_ybPF16zyU6LWaPwd4gLJUNeKJlRns7-hewln8PGT24HeOdgqNBw2ythpoRIMAGjuGyX6pMBWAEujVo4WMOlLtNG_KZZ0oTabYVF9ceSr804BBOB5cIGlCz38PaOYZMae4mSp6HCHvLwvOFFlSgHxjxFwfjka__eyv42e6aGeHivUGBfyDWlzyph-UkhrF9wWMdjgbg9O8OsUDgf6o38fnJmH2r6AgeunmVz7AcEIbzol9n1DCQIb0ArtFFsz-_vgYPNnmC-QyZ_FPa7hfvtluG2YtjA-QbC2_XQNaEBXLHE7XauCJpHbKcDvgca6G_p6Mr5--RfI2FoRDBX_H0zEElRx8ZFqW_o-R0vj3udpqcUsLW9NeLM4ttHG5--K56-I-Od0jZq6Dgd_Q9cDkcHtUY5WybaKvGqIPGkxs0nLHhjKdOIeaYg_MXdw-22-AA6A_c3MMCH90sy4qziC7XboEympUK5ulbUaPE8imih_SVC4rPFd7OHy37BFg_SxVBYv3J_Civx-Vvfg7r7Qr7_0g_7GpJi
hKpMp-tnmZt7GmlH34VHVzYW4HA6R6kTsrS5Mdfc_WqqPuc0RFF0uvDQrQioEnJ6iC28JiplLlpbDAT38UI8eM-twJ-Lc9twXsylm6L5040eq7MVTsj-MK8YUSelm5Q7qNiOcNbg4ltFTvcYs8M2vM7tY0TheuVb4KTQMdCffy4Ij1S3BdO4pSxIatVAm2Cs8tYNsuaTCLZV3qsjMKXbexicMK4ct9vliXxqDdkeuoDhFBd4JlR8SqU7DNRHPz93CFkF2fsqNjJl3rbq-rmIYHyRoQ3EOHcZcUrdmSV548_tE0esXnX3JHkve5q2j6k-a419s4kktxLYvqh8ALsHUldgZx5FOBjxs9DsEJBdLal1Kp-yt8paq0WK1eErAxktzVOK3vVznd4V6sJsFQTd955URthbozdWlwLC2puoN3k56-e2k5hFZ4dr8_mGt1_IPb9hWhXyy2nQrK4rpupfFLUnvnlZkNYsnN1CvMx0tLvArmbNyH_Q4X7qPlNbtyA3DSlDrA3SVY2p_d2_5l2d4Qw1OmDzXTc-WaFTTLT9jywivv6Z9L7p23bH748m8nt9l2w2aUn7EBNrduuqkdFNmYahfszQ_Z7ahb--ISvADLGjtl7aiLBfo33wV4F9Fs8dSE639wuYUFLH1Gi239TrXMMwrtVto_B4H0DmKw3rOEBxBHlbuWlyrlQNccoiTz_RpWOksz4j261tr2PL7GZLBkBILCMNqYN41Nq56107VB-Sfa-cL7n-dhwsfnUy5d9afJGJLZKzTdfSbTyyNonqT6yQ3YXh49Co7xGlrFk46PYQ7Cw8ODo-hR0S4PpNb_3gAvR72zQr_Kd0NRL2YbAPz7jdmQQ0zDMMYB2_s61oc0ITbOPfdYh7LcmMei5Zz_8n8_Q-9NYGl)) \ No newline at end of file +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in 
godbolt](https://godbolt.org/clientstate/eJy9WAtv27YW_ivnqsAgN7aVpI9szmNz43Qz1msPtrtiaAqBkmibsCxqJBXHC_Lf7yGpZ6KsXbc7B4gt8vA7D37n8FB3jqRSMp5IZ_DxzmGRMzjqOjFJVhlZUWfghFlEnK4jeSZC_ew9v07gOVzydC_Yaq3ADTtwfHj8Cia_jkfjIVxOZ79MZ8PFeDrpa1Ej_o6FNJE0giyJqAC1pjBMSYhf-UwXfqVC2wHH_UNwtcC1k89dO51Tg7LnGWzJHhKuIJMUYZiEJYsp0NuQpgpYAiHfpjEjSUhhx9TaqMpxjDnwWw7CA0VQnuCKFJ-WdUkgqjRdf9ZKpQPP2-12fWLM7nOx8mIrLL1348uryfyqh6aXy94nMQYWBP09YwIdD_ZAUrQsJAHaG5MdcAFkJSjOKa4t3wmmWLLqguRLtSOCGpyISSVYkKlG8Ao70f-6AIaPJBi44RzG82sH3gzn43nX4HwYL36avl_Ah-FsNpwsxldzmM5wsyajsd4qfHoLw8lv8PN4MuoCxdChKnqbCu0Fmsp0WGlkYzintGHGkluzZEpDtmQhFASCFb-hIkG3IKViyyzV0MjI4MRsyxRRZuyRc0aVd51cJ89YEsZZROEszELuCU4EIvoBU1uS9sNsffFIJlMsZmrvKUGYkv11ml48RIqIJ1XkhfgvosuLJydZoton1T6lvlXQEFBrkUnlRfQGHfFvaKi46K_bRGK-QkLE7ZNZwjB2ksR1iLqcZr7cS0W3jeVL5AMlzTHGWwY1b5JVY8hqMnq855bIP5gEWyOAH2Txxqe3BKlAMex2OhCMLmFEt7iJGAxFcSul3vo8pZq7heTUKMjNmPNNlhpgGP4yllWxGCc2s3NNsMN84QTTZMfhxXEPgcCCGSIh6Sm8flkbBjflQpk8Q2JuierAUvCttsbgf5xZk94Y6bdGZI7M_eTqPJeY6Cukfxb0sZZ4DdniqVrTwQxPuWQYtX1JaywQ4QaY9V-7WziLfqK8ygTyXc-FXAiMOI7JLEaawoRsabzvapcxkMoILXkc851OIbPhA6OiBx-ts7rI8UyJLJH9gCVm0rVR6nyNQ14Q88B7dXRyQqLvPG1ERBTxWpV1Hpvy79nxyIhi03OmvX5Z2WHp8Q_a8fql16quU5L4BzykKCzqPM7Pgnzrt2SDOZLq4ge90eX7y6k_mn6YvJsOR_5sOpyNJz_6i6v5YjRcDM-nE31IBFhhqSoTxdRGzP40xqzDsoHlKEH-wM90v8DfmMUB57Flo4t2DwY245F2mKrf5Nnia1b5KVFrNP4OcYFkqGtFE6qz2d_QvYRz-PjJ7UDvAmxxGgwa1e2sUAkGADT3jRJ9gGCHgBVTK0cLmPQl2ujfFEu6UJvNsNa-OPbVRacAAvA8uMTShR7-nlFMMmNPceBUdLhDXt4XnKgypYD4xwg4uxqO_nvV30bP9FBPjxVqjAt5QNqcMqafFtLaBbdFDDYY7cNT_DqDo0P90b8Pzs1DLS5g4PppJtd-QHCHN50S-76hBIENaIV2hh2b_X1wsPkzzBfI5M_intRwv_0y3DZMWxifIFhbPF0DGtAVS9xO16qgSeR2CvB7oLFukr6ejK9f_gUytlYEQ8X_BxOxBFVcfKTalr7P0dK493laajFLy1uzvVkc293G5--K56_Y8c_pGjZ1HR3-DV0PRA5vj3O0SrZV5FVD5EmLmU1a9sBQphP3WEP8ibmHt9_mC-AA2N_MDLOlX5oZdxVHsAsPfSIlFcrVHbRWg-dJRBPlLwkKlzW-iz1c_hu2aJA-lsrihfEprMivbXUP7u4L_fpLP-hvTIopqjKZop9tbuZtrBl1Hx5V3VyIy8EAqU7E3urCVHf_o6X6mNscQdHlwkuzIqRCwNkZuvCWoJi5e2k5HNDDD_XoMbMOIxHntue-mKCYofvSjR-psvdRyf6g5VFq0PQIcsSAVY27_W0mH_nYqcI
zxA4w1FXHdMtb7LWx9jTPwDJ6BUuq64Pp3VOWJDRqIU2wV3huB9lySYVbWlNTPqPYexuvcGN5rl3PmTjjHkWuoDhFBV4WlR8Sqc7CNRHPL9zCFkF2fsqNjJl3rbq-rmO4Q0jRhuIcOoy5pG7Nkrz05jeK0vG8ty8jYOnbvHdUHUpz_ItNPK1tsu2M6keASzB5JfbGcaTTAY8b_XIB6cWS2h51SkrkTVWjyWrxlIDdL81Sid_21p3fFuqb3SwJuu27qIyw1cBwTEda87am6g3eT3r5_aTmEVnh2vwGYq3X8g-v3laFfLLedCsrium6l8U9SUdPK7KaxZMB1OtMT4sL_ErmvIyDHudL95HSuh25YVgLap2ArnNM7e_uLf_yHG-owQmT6brt2RKNavrlZ2wZ4QXYNPBF3657dn80npXVrN54g6aUHzGBdreuOi3dlFkY6vcvzc-5benbe6IS_ABLWvu1rSjMBfo3XwX4V9HswdREa7-yOQVFbMVGy23FzjUM8lptqxeGicTpmpiR3IQHRbu2oYXE93AIAzjCyWf6IKyUlcdD-82tdb_yqxmywLARKwfDMmDePjZueddO1QLkn2vnC65-nYcLHx1KuXfWnyRiS6Sq03X0u1CsiaJ6t-skN2F4dPwqO8JpaxZOOj2EOw8PDo5OoEdEuD6XW__kEHo9bJkV_lO6EYh6MdkG5m1wzIIaZhiGMQ7e2Be4OKCZtnHuu8U81uPGPFYr5_6T-fsfn-2NIg==)) \ No newline at end of file diff --git a/examples/roaring_bitmap/host_bulk_example.cu b/examples/roaring_bitmap/host_bulk_example.cu index 4e371eaa1..46d5481cb 100644 --- a/examples/roaring_bitmap/host_bulk_example.cu +++ b/examples/roaring_bitmap/host_bulk_example.cu @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -95,10 +96,9 @@ bool check(std::string const& bitmap_file_path) } // Get file size - file.seekg(0, std::ios::end); - std::streamsize file_size = file.tellg(); - file.seekg(0, std::ios::beg); + auto file_size = std::filesystem::file_size(bitmap_file_path); + // Allocate host memory for the bitmap file thrust::universal_host_pinned_vector buffer(file_size); // Read file into memory @@ -130,7 +130,7 @@ int main() success &= check(data_dir + "/bitmapwithruns.bin"); success &= check(data_dir + "/portable_bitmap64.bin"); - std::cout << "success: " << (success ? "true" : "false") << std::endl; + std::cout << "success: " << std::boolalpha << success << std::endl; return success ? 
0 : 1; #else diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh index 82762dbe0..b69ebf0dd 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh @@ -27,8 +27,7 @@ #include #include #include -#include -#include +#include namespace cuco::detail { @@ -75,9 +74,12 @@ class roaring_bitmap_impl { cuda::stream_ref stream = {}) const noexcept { if (this->empty()) { - auto nosync_exec_policy = thrust::cuda::par_nosync.on(stream.get()); - thrust::fill( - nosync_exec_policy, contained, contained + cuda::std::distance(first, last), false); + cub::DeviceTransform::Transform( + thrust::constant_iterator(false), + contained, + cuda::std::distance(first, last), + cuda::proclaim_return_type([] __device__(auto /* dummy */) { return false; }), + stream.get()); } else { cub::DeviceTransform::Transform( first, @@ -176,8 +178,7 @@ class roaring_bitmap_impl { index * sizeof(cuda::std::uint32_t)); } cuda::std::byte const* container = storage_ref_.data() + offset; - if (storage_ref_.metadata().has_run and - (storage_ref_.run_container_bitmap()[index / 8] & (1 << (index % 8)))) { + if (storage_ref_.metadata().has_run and check_bit(storage_ref_.run_container_bitmap(), index)) { return this->contains_run_container(container, lower); } else { cuda::std::uint32_t card; @@ -188,7 +189,7 @@ class roaring_bitmap_impl { card = 1u + misaligned_load( storage_ref_.key_cards() + (index * 2 + 1) * sizeof(cuda::std::uint16_t)); } - if (card <= 4096) { + if (card <= storage_ref_type::metadata_type::max_array_container_card) { return this->contains_array_container(container, lower, card); } else { return this->contains_bitset_container(container, lower, card); @@ -313,17 +314,21 @@ class roaring_bitmap_impl { OutputIt contained, cuda::stream_ref stream = {}) const noexcept { - auto nosync_exec_policy = thrust::cuda::par_nosync.on(stream.get()); 
if (this->empty()) { - thrust::fill( - nosync_exec_policy, contained, contained + cuda::std::distance(first, last), false); + cub::DeviceTransform::Transform( + thrust::constant_iterator(false), + contained, + cuda::std::distance(first, last), + cuda::proclaim_return_type([] __device__(auto /* dummy */) { return false; }), + stream.get()); } else { - thrust::transform(nosync_exec_policy, - first, - last, - contained, - cuda::proclaim_return_type( - [*this] __device__(auto key) { return this->contains(key); })); + cub::DeviceTransform::Transform( + first, + contained, + cuda::std::distance(first, last), + cuda::proclaim_return_type( + [*this] __device__(auto key) { return this->contains(key); }), + stream.get()); } } diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh index 349f1bb83..c2736fe54 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh @@ -45,8 +45,7 @@ class roaring_bitmap_storage_ref { metadata_type const& metadata) : metadata_{metadata}, data_{bitmap}, - run_container_bitmap_{ - reinterpret_cast(bitmap + metadata.run_container_bitmap)}, + run_container_bitmap_{bitmap + metadata.run_container_bitmap}, key_cards_{bitmap + metadata.key_cards}, container_offsets_{bitmap + metadata.container_offsets} { @@ -64,7 +63,7 @@ class roaring_bitmap_storage_ref { __host__ __device__ cuda::std::size_t size_bytes() const noexcept { return metadata_.size_bytes; } - __host__ __device__ cuda::std::uint8_t const* run_container_bitmap() const noexcept + __host__ __device__ cuda::std::byte const* run_container_bitmap() const noexcept { return run_container_bitmap_; } @@ -79,7 +78,7 @@ class roaring_bitmap_storage_ref { private: metadata_type metadata_; cuda::std::byte const* data_; - cuda::std::uint8_t const* run_container_bitmap_; + cuda::std::byte const* run_container_bitmap_; cuda::std::byte 
const* key_cards_; cuda::std::byte const* container_offsets_; }; @@ -208,10 +207,6 @@ class roaring_bitmap_storage { metadata_.num_buckets * sizeof(cuda::std::pair), cudaMemcpyHostToDevice, stream.get())); - // stream.wait(); - // clear intermediate data - // bucket_metadata.clear(); - // buckets_h.clear(); } ref_type ref() const noexcept { return ref_; } diff --git a/include/cuco/detail/roaring_bitmap/util.cuh b/include/cuco/detail/roaring_bitmap/util.cuh index 01892e73a..a3cc04ae7 100644 --- a/include/cuco/detail/roaring_bitmap/util.cuh +++ b/include/cuco/detail/roaring_bitmap/util.cuh @@ -42,6 +42,14 @@ __host__ __device__ __forceinline__ T misaligned_load(cuda::std::byte const* ptr return value; } +__host__ __device__ __forceinline__ bool check_bit(cuda::std::byte const* bitmap, + cuda::std::uint32_t index) +{ + // check if the bit at index is set + return static_cast(bitmap[index / 8]) & + (cuda::std::uint8_t(1) << (index % 8)); +} + template struct roaring_bitmap_metadata { static_assert(cuco::dependent_false, "T must be either uint32_t or uint64_t"); @@ -49,8 +57,10 @@ struct roaring_bitmap_metadata { template <> struct roaring_bitmap_metadata { + static constexpr cuda::std::uint32_t max_array_container_card = 4096; + cuda::std::size_t size_bytes = 0; - cuda::std::uint32_t num_keys = 0; + cuda::std::size_t num_keys = 0; cuda::std::uint32_t run_container_bitmap = 0; cuda::std::uint32_t key_cards = 0; cuda::std::uint32_t container_offsets = 0; @@ -63,14 +73,18 @@ struct roaring_bitmap_metadata { constexpr cuda::std::uint32_t serial_cookie_no_runcontainer = 12346; constexpr cuda::std::uint32_t serial_cookie = 12347; // constexpr cuda::std::uint32_t frozen_cookie = 13766; // not implemented - constexpr cuda::std::int32_t no_offset_threshold = 4; + constexpr cuda::std::int32_t no_offset_threshold = 4; + constexpr cuda::std::int32_t max_containers = 1 << 16; + constexpr cuda::std::uint32_t cookie_mask = 0xFFFF; + constexpr cuda::std::uint32_t cookie_shift = 16; + 
constexpr cuda::std::uint32_t bitset_container_bytes = 8192; cuda::std::byte const* buf = bitmap; cuda::std::uint32_t cookie; cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t)); buf += sizeof(cuda::std::uint32_t); - if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) { + if ((cookie & cookie_mask) != serial_cookie && cookie != serial_cookie_no_runcontainer) { valid = false; NV_IF_TARGET( NV_IS_HOST, @@ -80,13 +94,15 @@ struct roaring_bitmap_metadata { return; } - if ((cookie & 0xFFFF) == serial_cookie) - num_containers = (cookie >> 16) + 1; + if ((cookie & cookie_mask) == serial_cookie) + // upper 16 bits of cookie are the number of containers - 1 + num_containers = (cookie >> cookie_shift) + 1; else { + // following 4 bytes are the number of containers cuda::std::memcpy(&num_containers, buf, sizeof(cuda::std::uint32_t)); buf += sizeof(cuda::std::uint32_t); } - if (num_containers < 0 or num_containers > (1 << 16)) { + if (num_containers < 0 or num_containers > max_containers) { valid = false; NV_IF_TARGET( NV_IS_HOST, @@ -95,14 +111,16 @@ struct roaring_bitmap_metadata { return; } - has_run = (cookie & 0xFFFF) == serial_cookie; + has_run = (cookie & cookie_mask) == serial_cookie; if (has_run) { - cuda::std::size_t s = (num_containers + 7) / 8; + cuda::std::size_t s = (num_containers + 7) / 8; // ceil bytes to store run container bitmap run_container_bitmap = cuda::std::distance(bitmap, buf); buf += s; } - key_cards = cuda::std::distance(bitmap, buf); + key_cards = cuda::std::distance(bitmap, buf); + // if the current address is aligned to 2 bytes, then all containers are aligned to at least 2 + // bytes bool const aligned_16 = (reinterpret_cast(bitmap + key_cards) % sizeof(cuda::std::uint16_t)) == 0; buf += num_containers * 2 * sizeof(cuda::std::uint16_t); @@ -136,16 +154,14 @@ struct roaring_bitmap_metadata { cuda::std::byte const* end = bitmap + misaligned_load( bitmap + container_offsets + (num_containers - 1) * 
sizeof(cuda::std::uint32_t)); - if (has_run and (static_cast( - (bitmap + run_container_bitmap)[(num_containers - 1) / 8]) & - (cuda::std::uint8_t(1) << ((num_containers - 1) % 8)))) { + if (has_run and check_bit(bitmap + run_container_bitmap, num_containers - 1)) { cuda::std::uint16_t const num_runs = misaligned_load(end); end += sizeof(cuda::std::uint16_t) + num_runs * 2 * sizeof(cuda::std::uint16_t); } else { - if (card <= 4096) { // TODO check if this is correct + if (card <= max_array_container_card) { end += card * sizeof(cuda::std::uint16_t); } else { - end += 8192; // fixed size bitset container + end += bitset_container_bytes; // fixed size bitset container } } diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh index 4ca3fb8a2..a4be0175d 100644 --- a/include/cuco/roaring_bitmap.cuh +++ b/include/cuco/roaring_bitmap.cuh @@ -74,7 +74,8 @@ class roaring_bitmap { * `contains_async`. * * @tparam InputIt Device-accessible random access input iterator of keys convertible to `T` - * @tparam OutputIt Device-accessible random access output iterator to `bool` + * @tparam OutputIt Device-accessible random access output iterator whose `value_type` is + * constructible from `bool` * * @param first Beginning of the sequence of keys * @param last End of the sequence of keys @@ -153,7 +154,7 @@ class roaring_bitmap { [[nodiscard]] ref_type ref() const noexcept; private: - storage_type storage_; + storage_type storage_; ///< Storage type }; } // namespace cuco diff --git a/tests/roaring_bitmap/contains_test.cu b/tests/roaring_bitmap/contains_test.cu index db3b9cd33..4a30e12b4 100644 --- a/tests/roaring_bitmap/contains_test.cu +++ b/tests/roaring_bitmap/contains_test.cu @@ -26,6 +26,7 @@ #include +#include #include #include #include @@ -70,9 +71,7 @@ bool check(std::string const& bitmap_file_path) std::ifstream file(bitmap_file_path, std::ios::binary); if (!file.is_open()) { return false; } - file.seekg(0, std::ios::end); - std::streamsize 
file_size = file.tellg(); - file.seekg(0, std::ios::beg); + auto file_size = std::filesystem::file_size(bitmap_file_path); thrust::universal_host_pinned_vector buffer(file_size); From 4e68e8adac1c075d4989ec3afb61459f1a6df3d4 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 18 Aug 2025 16:18:35 -0700 Subject: [PATCH 21/24] Code simplifications --- .../roaring_bitmap/roaring_bitmap_impl.cuh | 70 +++++++++---------- include/cuco/detail/roaring_bitmap/util.cuh | 10 ++- 2 files changed, 38 insertions(+), 42 deletions(-) diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh index b69ebf0dd..16b2001ee 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh @@ -113,12 +113,12 @@ class roaring_bitmap_impl { // linear search #pragma unroll for (cuda::std::uint32_t i = 0; i < storage_ref_.metadata().num_containers; i++) { + cuda::std::byte const* key_ptr = + storage_ref_.key_cards() + (i * 2) * sizeof(cuda::std::uint16_t); if constexpr (Aligned) { - key = aligned_load(storage_ref_.key_cards() + - (i * 2) * sizeof(cuda::std::uint16_t)); + key = aligned_load(key_ptr); } else { - key = misaligned_load(storage_ref_.key_cards() + - (i * 2) * sizeof(cuda::std::uint16_t)); + key = misaligned_load(key_ptr); } if (key == upper) { return this->contains_container(lower, i); } if (key > upper) { return false; } @@ -129,12 +129,12 @@ class roaring_bitmap_impl { cuda::std::uint32_t right = storage_ref_.metadata().num_containers; while (left < right) { cuda::std::uint32_t mid = left + (right - left) / 2; + cuda::std::byte const* key_ptr = + storage_ref_.key_cards() + (mid * 2) * sizeof(cuda::std::uint16_t); if constexpr (Aligned) { - key = aligned_load(storage_ref_.key_cards() + - (mid * 2) * sizeof(cuda::std::uint16_t)); + key = aligned_load(key_ptr); } else { - key = 
misaligned_load(storage_ref_.key_cards() + - (mid * 2) * sizeof(cuda::std::uint16_t)); + key = misaligned_load(key_ptr); } if (key == upper) { @@ -170,29 +170,29 @@ class roaring_bitmap_impl { __device__ bool contains_container(cuda::std::uint16_t lower, cuda::std::uint32_t index) const { cuda::std::uint32_t offset; + cuda::std::byte const* offset_ptr = + storage_ref_.container_offsets() + index * sizeof(cuda::std::uint32_t); if (offsets_aligned_) { - offset = aligned_load(storage_ref_.container_offsets() + - index * sizeof(cuda::std::uint32_t)); + offset = aligned_load(offset_ptr); } else { - offset = misaligned_load(storage_ref_.container_offsets() + - index * sizeof(cuda::std::uint32_t)); + offset = misaligned_load(offset_ptr); } cuda::std::byte const* container = storage_ref_.data() + offset; if (storage_ref_.metadata().has_run and check_bit(storage_ref_.run_container_bitmap(), index)) { return this->contains_run_container(container, lower); } else { cuda::std::uint32_t card; + cuda::std::byte const* card_ptr = + storage_ref_.key_cards() + (index * 2 + 1) * sizeof(cuda::std::uint16_t); if constexpr (Aligned) { - card = 1u + aligned_load( - storage_ref_.key_cards() + (index * 2 + 1) * sizeof(cuda::std::uint16_t)); + card = 1u + aligned_load(card_ptr); } else { - card = 1u + misaligned_load( - storage_ref_.key_cards() + (index * 2 + 1) * sizeof(cuda::std::uint16_t)); + card = 1u + misaligned_load(card_ptr); } if (card <= storage_ref_type::metadata_type::max_array_container_card) { return this->contains_array_container(container, lower, card); } else { - return this->contains_bitset_container(container, lower, card); + return this->contains_bitset_container(container, lower); } } } @@ -206,10 +206,11 @@ class roaring_bitmap_impl { // Use linear search for small arrays, binary search for larger ones if (card < binary_search_threshold) { for (cuda::std::uint32_t i = 0; i < card; i++) { + cuda::std::byte const* elem_ptr = container + i * sizeof(cuda::std::uint16_t); 
if constexpr (Aligned) { - elem = aligned_load(container + i * sizeof(cuda::std::uint16_t)); + elem = aligned_load(elem_ptr); } else { - elem = misaligned_load(container + i * sizeof(cuda::std::uint16_t)); + elem = misaligned_load(elem_ptr); } if (elem == lower) { return true; } } @@ -219,12 +220,12 @@ class roaring_bitmap_impl { cuda::std::uint32_t right = card; while (left < right) { - cuda::std::uint32_t mid = left + (right - left) / 2; + cuda::std::uint32_t mid = left + (right - left) / 2; + cuda::std::byte const* elem_ptr = container + mid * sizeof(cuda::std::uint16_t); if constexpr (Aligned) { - elem = aligned_load(container + mid * sizeof(cuda::std::uint16_t)); + elem = aligned_load(elem_ptr); } else { - elem = - misaligned_load(container + mid * sizeof(cuda::std::uint16_t)); + elem = misaligned_load(elem_ptr); } if (elem == lower) { return true; @@ -239,11 +240,9 @@ class roaring_bitmap_impl { } __device__ bool contains_bitset_container(cuda::std::byte const* container, - cuda::std::uint16_t lower, - cuda::std::uint32_t card) const + cuda::std::uint16_t lower) const { - return static_cast(container[lower / 8]) & - (cuda::std::uint8_t(1) << (lower % 8)); + return check_bit(container, lower); } template @@ -262,19 +261,18 @@ class roaring_bitmap_impl { cuda::std::uint32_t end; for (cuda::std::uint32_t i = 0; i < num_runs; i++) { + // the first 16 bits of the run container denotes the number of runs + // followed by the sequence of runs as (start, end) U16 pairs + cuda::std::byte const* start_ptr = container + (i * 2 + 1) * sizeof(cuda::std::uint16_t); // TODO load start+end in one instruction if constexpr (Aligned) { - start = - aligned_load(container + (i * 2 + 1) * sizeof(cuda::std::uint16_t)); - end = - static_cast(start) + - aligned_load(container + (i * 2 + 2) * sizeof(cuda::std::uint16_t)); + start = aligned_load(start_ptr); + end = static_cast(start) + + aligned_load(start_ptr + sizeof(cuda::std::uint16_t)); } else { - start = misaligned_load(container 
+ - (i * 2 + 1) * sizeof(cuda::std::uint16_t)); + start = misaligned_load(start_ptr); end = static_cast(start) + - misaligned_load(container + - (i * 2 + 2) * sizeof(cuda::std::uint16_t)); + misaligned_load(start_ptr + sizeof(cuda::std::uint16_t)); } if (start <= lower && end >= lower) { return true; } if (start > lower) { break; } diff --git a/include/cuco/detail/roaring_bitmap/util.cuh b/include/cuco/detail/roaring_bitmap/util.cuh index a3cc04ae7..c59f65fad 100644 --- a/include/cuco/detail/roaring_bitmap/util.cuh +++ b/include/cuco/detail/roaring_bitmap/util.cuh @@ -138,14 +138,12 @@ struct roaring_bitmap_metadata { cuda::std::uint32_t card = 0; for (cuda::std::int32_t i = 0; i < num_containers; i++) { + cuda::std::byte const* card_ptr = + bitmap + key_cards + (i * 2 + 1) * sizeof(cuda::std::uint16_t); if (aligned_16) { - card = aligned_load(bitmap + key_cards + - (i * 2 + 1) * sizeof(cuda::std::uint16_t)) + - 1u; + card = 1u + aligned_load(card_ptr); } else { - card = misaligned_load(bitmap + key_cards + - (i * 2 + 1) * sizeof(cuda::std::uint16_t)) + - 1u; + card = 1u + misaligned_load(card_ptr); } num_keys += card; } From 1dfc1399c063ee61e071e61076ba55c6efbaf745 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Mon, 18 Aug 2025 16:44:12 -0700 Subject: [PATCH 22/24] Resolve merge conflict in build.sh --- ci/build.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ci/build.sh b/ci/build.sh index e826033cf..1d3074fb9 100755 --- a/ci/build.sh +++ b/ci/build.sh @@ -72,9 +72,9 @@ function usage { echo " --prefix: Build directory prefix (Defaults to /build)" echo " -i/--infix: Build directory infix (Defaults to local)" echo " -d/--debug: Debug build" - echo " -p/--parallel: Build parallelism (Defaults to $PARALLEL_LEVEL if set, otherwise the system's number of CPU cores)" - echo " --cuda: CUDA compiler (Defaults to $CUDACXX if set, otherwise nvcc)" - echo " --cxx: Host compiler (Defaults to $CXX 
if set, otherwise g++)" + echo " -p/--parallel: Build parallelism (Defaults to \$PARALLEL_LEVEL if set, otherwise the system's number of CPU cores)" + echo " --cuda: CUDA compiler (Defaults to \$CUDACXX if set, otherwise nvcc)" + echo " --cxx: Host compiler (Defaults to \$CXX if set, otherwise g++)" echo " --arch: Target CUDA arches, e.g. \"60-real;70;80-virtual\" (Defaults to the system's native GPU archs)" echo " --std: CUDA/C++ standard (Defaults to 17)" echo " -v/-verbose/--verbose: Enable shell echo for debugging" From 245592e8f59d0556c8863b02d440c5379d0c1e03 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 19 Aug 2025 15:26:18 -0700 Subject: [PATCH 23/24] Use std::fs::file_size in benchmark --- benchmarks/roaring_bitmap/contains_bench.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/roaring_bitmap/contains_bench.cu b/benchmarks/roaring_bitmap/contains_bench.cu index da66bea4d..ac7ddc55f 100644 --- a/benchmarks/roaring_bitmap/contains_bench.cu +++ b/benchmarks/roaring_bitmap/contains_bench.cu @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -43,9 +44,7 @@ void roaring_bitmap_contains(nvbench::state& state, nvbench::type_list) if (!file.is_open()) { state.skip("Bitmap file not found"); } // Get file size - file.seekg(0, std::ios::end); - std::streamsize file_size = file.tellg(); - file.seekg(0, std::ios::beg); + auto const file_size = std::filesystem::file_size(bitmap_file); thrust::universal_host_pinned_vector buffer(file_size); From 67d19ec0a63c67bf339cb5d4ef698ef004e0bc33 Mon Sep 17 00:00:00 2001 From: Daniel Juenger <2955913+sleeepyjack@users.noreply.github.com> Date: Tue, 19 Aug 2025 15:46:32 -0700 Subject: [PATCH 24/24] Move to experimental namespace --- README.md | 4 ++-- benchmarks/roaring_bitmap/contains_bench.cu | 2 +- examples/roaring_bitmap/host_bulk_example.cu | 3 ++- .../detail/roaring_bitmap/roaring_bitmap.inl | 4 ++-- 
.../roaring_bitmap/roaring_bitmap_impl.cuh | 4 ++-- .../roaring_bitmap/roaring_bitmap_ref.inl | 4 ++-- .../roaring_bitmap/roaring_bitmap_storage.cuh | 22 ++++++++++--------- include/cuco/detail/roaring_bitmap/util.cuh | 4 ++-- include/cuco/roaring_bitmap.cuh | 4 ++-- include/cuco/roaring_bitmap_ref.cuh | 4 ++-- tests/roaring_bitmap/contains_test.cu | 3 ++- 11 files changed, 31 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 8608e3680..ae00028e9 100644 --- a/README.md +++ b/README.md @@ -263,7 +263,7 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection ### roaring_bitmap -`cuco::roaring_bitmap` implements a Roaring bitmap following the [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec). +`cuco::experimental::roaring_bitmap` implements a Roaring bitmap following the [Roaring bitmap format specification](https://github.com/RoaringBitmap/RoaringFormatSpec). #### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in 
godbolt](https://godbolt.org/clientstate/eJy9WAtv27YW_ivnqsAgN7aVpI9szmNz43Qz1msPtrtiaAqBkmibsCxqJBXHC_Lf7yGpZ6KsXbc7B4gt8vA7D37n8FB3jqRSMp5IZ_DxzmGRMzjqOjFJVhlZUWfghFlEnK4jeSZC_ew9v07gOVzydC_Yaq3ADTtwfHj8Cia_jkfjIVxOZ79MZ8PFeDrpa1Ej_o6FNJE0giyJqAC1pjBMSYhf-UwXfqVC2wHH_UNwtcC1k89dO51Tg7LnGWzJHhKuIJMUYZiEJYsp0NuQpgpYAiHfpjEjSUhhx9TaqMpxjDnwWw7CA0VQnuCKFJ-WdUkgqjRdf9ZKpQPP2-12fWLM7nOx8mIrLL1348uryfyqh6aXy94nMQYWBP09YwIdD_ZAUrQsJAHaG5MdcAFkJSjOKa4t3wmmWLLqguRLtSOCGpyISSVYkKlG8Ao70f-6AIaPJBi44RzG82sH3gzn43nX4HwYL36avl_Ah-FsNpwsxldzmM5wsyajsd4qfHoLw8lv8PN4MuoCxdChKnqbCu0Fmsp0WGlkYzintGHGkluzZEpDtmQhFASCFb-hIkG3IKViyyzV0MjI4MRsyxRRZuyRc0aVd51cJ89YEsZZROEszELuCU4EIvoBU1uS9sNsffFIJlMsZmrvKUGYkv11ml48RIqIJ1XkhfgvosuLJydZoton1T6lvlXQEFBrkUnlRfQGHfFvaKi46K_bRGK-QkLE7ZNZwjB2ksR1iLqcZr7cS0W3jeVL5AMlzTHGWwY1b5JVY8hqMnq855bIP5gEWyOAH2Txxqe3BKlAMex2OhCMLmFEt7iJGAxFcSul3vo8pZq7heTUKMjNmPNNlhpgGP4yllWxGCc2s3NNsMN84QTTZMfhxXEPgcCCGSIh6Sm8flkbBjflQpk8Q2JuierAUvCttsbgf5xZk94Y6bdGZI7M_eTqPJeY6Cukfxb0sZZ4DdniqVrTwQxPuWQYtX1JaywQ4QaY9V-7WziLfqK8ygTyXc-FXAiMOI7JLEaawoRsabzvapcxkMoILXkc851OIbPhA6OiBx-ts7rI8UyJLJH9gCVm0rVR6nyNQ14Q88B7dXRyQqLvPG1ERBTxWpV1Hpvy79nxyIhi03OmvX5Z2WHp8Q_a8fql16quU5L4BzykKCzqPM7Pgnzrt2SDOZLq4ge90eX7y6k_mn6YvJsOR_5sOpyNJz_6i6v5YjRcDM-nE31IBFhhqSoTxdRGzP40xqzDsoHlKEH-wM90v8DfmMUB57Flo4t2DwY245F2mKrf5Nnia1b5KVFrNP4OcYFkqGtFE6qz2d_QvYRz-PjJ7UDvAmxxGgwa1e2sUAkGADT3jRJ9gGCHgBVTK0cLmPQl2ujfFEu6UJvNsNa-OPbVRacAAvA8uMTShR7-nlFMMmNPceBUdLhDXt4XnKgypYD4xwg4uxqO_nvV30bP9FBPjxVqjAt5QNqcMqafFtLaBbdFDDYY7cNT_DqDo0P90b8Pzs1DLS5g4PppJtd-QHCHN50S-76hBIENaIV2hh2b_X1wsPkzzBfI5M_intRwv_0y3DZMWxifIFhbPF0DGtAVS9xO16qgSeR2CvB7oLFukr6ejK9f_gUytlYEQ8X_BxOxBFVcfKTalr7P0dK493laajFLy1uzvVkc293G5--K56_Y8c_pGjZ1HR3-DV0PRA5vj3O0SrZV5FVD5EmLmU1a9sBQphP3WEP8ibmHt9_mC-AA2N_MDLOlX5oZdxVHsAsPfSIlFcrVHbRWg-dJRBPlLwkKlzW-iz1c_hu2aJA-lsrihfEprMivbXUP7u4L_fpLP-hvTIopqjKZop9tbuZtrBl1Hx5V3VyIy8EAqU7E3urCVHf_o6X6mNscQdHlwkuzIqRCwNkZuvCWoJi5e2k5HNDDD_XoMbMOIxHntue-mKCYofvSjR-psvdRyf6g5VFq0PQIcsSAVY27_W0mH_nYqcI
zxA4w1FXHdMtb7LWx9jTPwDJ6BUuq64Pp3VOWJDRqIU2wV3huB9lySYVbWlNTPqPYexuvcGN5rl3PmTjjHkWuoDhFBV4WlR8Sqc7CNRHPL9zCFkF2fsqNjJl3rbq-rmO4Q0jRhuIcOoy5pG7Nkrz05jeK0vG8ty8jYOnbvHdUHUpz_ItNPK1tsu2M6keASzB5JfbGcaTTAY8b_XIB6cWS2h51SkrkTVWjyWrxlIDdL81Sid_21p3fFuqb3SwJuu27qIyw1cBwTEda87am6g3eT3r5_aTmEVnh2vwGYq3X8g-v3laFfLLedCsrium6l8U9SUdPK7KaxZMB1OtMT4sL_ErmvIyDHudL95HSuh25YVgLap2ArnNM7e_uLf_yHG-owQmT6brt2RKNavrlZ2wZ4QXYNPBF3657dn80npXVrN54g6aUHzGBdreuOi3dlFkY6vcvzc-5benbe6IS_ABLWvu1rSjMBfo3XwX4V9HswdREa7-yOQVFbMVGy23FzjUM8lptqxeGicTpmpiR3IQHRbu2oYXE93AIAzjCyWf6IKyUlcdD-82tdb_yqxmywLARKwfDMmDePjZueddO1QLkn2vnC65-nYcLHx1KuXfWnyRiS6Sq03X0u1CsiaJ6t-skN2F4dPwqO8JpaxZOOj2EOw8PDo5OoEdEuD6XW__kEHo9bJkV_lO6EYh6MdkG5m1wzIIaZhiGMQ7e2Be4OKCZtnHuu8U81uPGPFYr5_6T-fsfn-2NIg==)) \ No newline at end of file +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/roaring_bitmap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJy9WA1v2zYT_iv3qsAgN7aVpB_ZnI_NjdPNWF97sN0VQ1MIlETbhGVRI6k4XpD__h5JfSbK2nV75wCxRR7vnjs-dzzqzpFUSsYT6Qw-3jkscgZHXScmySojK-oMnDCLiNN1JM9EqJ-959cJPIdLnu4FW60VuGEHjg-PX8Hk1_FoPITL6eyX6Wy4GE8nfS1qxN-xkCaSRpAlERWg1hSGKQnxK5_pwq9UaBxw3D8EVwtcO_nctdM5NVr2PIMt2UPCFWSSohomYcliCvQ2pKkClkDIt2nMSBJS2DG1NqZyPQYO_JYr4YEiKE9wRYpPy7okEFVC15-1UunA83a7XZ8Y2H0uVl5shaX3bnx5NZlf9RB6uex9EmNgQdDfMybQ8WAPJEVkIQkQb0x2wAWQlaA4p7hGvhNMsWTVBcmXakcENXoiJpVgQaYawStwov91AQwfSTBwwzmM59cOvBnOx_Ou0fNhvPhp-n4BH4az2XCyGF_NYTrDzZqMxnqr8OktDCe_wc_jyagLFEOHpuhtKrQXCJXpsNLIxnBOaQPGkltYMqUhW7IQCgLBit9QkaBbkFKxZZZqCDIyemK2ZYooM_bIOWPKu06uk2csCeMsonAWZiH3BCcCNfoBU1uS9sNsffFIJlMsZmrvKUGYkv11ml481BQRT6rIC_FfRJcXT06yRLVPqn1KfWugIaDWIpPKi-gNOuLf0FBx0V-3icR8hYSI2yezhGHsJInrKupymvlyLxXdNpYvkQ-UNMcYbxnUvElWjSFrydjxnlsi_2ASbI0K_CCLNz69JUgFimG304FgdAkjusVNxGAoilsp9dbnKdXcLSSn1oLcjDnfZKlRDMNfxrIqFuPEZnZuCXaYL5xgmuw4vDjuoSKwygyRkPQUXr-sDYObcqFMniExt0R1YCn4VqMx-j_OLKQ3RvqtEZkjcz-5Os8lJvoK6Z8FfawlXkO2eKrWdDDDUy4ZRm1f0hoLRLgBZv3X7hbOop8orzKBfNdzIRcCI45jMouRpjAhWxrvu9plDKQyQk
sex3ynU8hs-MCY6MFH66wucjxTIktkP2CJmXRtlDpf45AXxDzwXh2dnJDoO0-DiIgiXquxzmMo_x6ORyCKTc-Z9vplhcPS4x_E8fql12quU5L4BzykKCzqPM7Pgnzrt2SDOZLq4ge90eX7y6k_mn6YvJsOR_5sOpyNJz_6i6v5YjRcDM-nE31IBFhhqSoTxdRGzP40xqzDsoHlKEH-wM90v8DfmMUB57Flo4u4BwOb8Ug7TNVv8mzxNav8lKg1gr9DvUAytLWiCdXZ7G_oXsI5fPzkdqB3AbY4DQaN6nZWmASjADT3jRF9gGCHgBVTG0cETPoSMfo3xZIu1GYzrLUvjn110SkUAXgeXGLpQg9_zygmmcFTHDgVHe6Ql_cFJ6pMKVT8YwScXQ1H_73qb6NneqinxwozxoU8IG1OGeinhbR2wW0Rgw1G-_AUv87g6FB_9O-Dc_NQiwsYdf00k2s_ILjDm06p-75hBBUbpZW2M-zY7O-Dg82f6XyBTP6s3pOa3m-_TG-bTlsYnyBYWzxdozSgK5a4na41QZPI7RTK74HGukn6ejK-fvkXyNhaEQwV_x9MxBJUcfGRaVv6PkdL497naanFLC1vzfZmcWx3G5-_K56_Ysc_Z2vYtHV0-DdsPRA5vD3OtVWyrSKvGiJPImY2adkDoEwn7rFW8SdwD2-_zRfAAbC_mRlmS780M-4qjmAXHvpESiqUqztobQbPk4gmyl8SFC5rfBd7uPw3bBGQPpbK4oXxKVDk17a6B3f3hX39pR_0NybFFE2ZTNHPNjfzNtaMug-Pqm4uxOVggFQnYm9tYaq7_9FSfcxtjkrR5cJLsyKkQsDZGbrwlqCYuXtpORzQww_t6DGzDiMR59hzX0xQzNB96caPVNn7qGR_0PIoNdr0CHLEKKsad_vbTD7ysVOFZ4gdYKirjumWt9hrY-1pnoFl9AqWVNcH07unLElo1EKaYK_w3A6y5ZIKt0RTMz6j2Hsbr3BjeW5dz5k44x5FrqA4RQVeFpUfEqnOwjURzy_cAosgOz_lRsbMu9ZcX9cx3CGkaMNwrjqMuaRuDUleevMbRel43tuXEbD0xWpPBdsif0mMCBq3kKpfaY67doe_FPZpbeNtt1Q_FlyCCS2xX44jnSJ4BOkXDkg5ltT2rVPSJG-0Go1Xi_cE7B5q5kr8tjfx_AZRJ0CzTOhW8KICYSuE4Z2OvuZyzdQbvLP08jtLzSOywrX5rcSi1_IPr-PWhHyyBnUrFMV03cvi7qSjpw1Zy-LJAOp1ps_FBX4lc17GQY_zpfvIaB1HDgzrQ6070LWPqf3dveVknvcNMzhhsl-3QluitZoe-hlbRngpNk190cvrPt4fjWdlhas346Ap5UdMIO7WVaelmzILQ_1Opvk5t21-e59UKj_AMtd-lSuKdaH9m69S-Fe12cOqqa39GucUFLFVHJHbKp5bGOT121Y0DBOJ0zUxIzmEB4W8tqGFxPdwCAM4wsln-nCsjJVHRvttrnW_8usassCwESsHwzJg3kg2bn7XTtUW5J9r5wuug52HCx8dVLl31p8kYkukqtN19PtRrJOiet_rJDdheHT8KjvCaQsLJ50eqjsPDw6OTqBHRLg-l1v_5BB6PSysCv8p3RxEvZhsA_OGOGZBTWcYhjEO3tiXujigmbZx7rvFPNboxjxWK-f- \ No newline at end of file diff --git a/benchmarks/roaring_bitmap/contains_bench.cu b/benchmarks/roaring_bitmap/contains_bench.cu index ac7ddc55f..20cb27a27 100644 --- a/benchmarks/roaring_bitmap/contains_bench.cu +++ b/benchmarks/roaring_bitmap/contains_bench.cu @@ -51,7 +51,7 @@ void 
roaring_bitmap_contains(nvbench::state& state, nvbench::type_list) file.read(reinterpret_cast(thrust::raw_pointer_cast(buffer.data())), file_size); file.close(); - cuco::roaring_bitmap roaring_bitmap(thrust::raw_pointer_cast(buffer.data())); + cuco::experimental::roaring_bitmap roaring_bitmap(thrust::raw_pointer_cast(buffer.data())); thrust::device_vector items(num_items); diff --git a/examples/roaring_bitmap/host_bulk_example.cu b/examples/roaring_bitmap/host_bulk_example.cu index 46d5481cb..e70e6db3a 100644 --- a/examples/roaring_bitmap/host_bulk_example.cu +++ b/examples/roaring_bitmap/host_bulk_example.cu @@ -106,7 +106,8 @@ bool check(std::string const& bitmap_file_path) file.close(); // Create roaring bitmap from the file - cuco::roaring_bitmap roaring_bitmap(thrust::raw_pointer_cast(buffer.data())); + cuco::experimental::roaring_bitmap roaring_bitmap( + thrust::raw_pointer_cast(buffer.data())); // Generate query keys (all should be contained in the bitmap) auto keys = generate_keys(); diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl index 7159cc6ae..ff8dc3d13 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap.inl @@ -19,7 +19,7 @@ #include #include -namespace cuco { +namespace cuco::experimental { template roaring_bitmap::roaring_bitmap(cuda::std::byte const* bitmap, @@ -85,4 +85,4 @@ typename roaring_bitmap::ref_type roaring_bitmap::re { return ref_type{storage_.ref()}; } -} // namespace cuco \ No newline at end of file +} // namespace cuco::experimental \ No newline at end of file diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh index 16b2001ee..7276dfae8 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh @@ -29,7 +29,7 @@ #include #include 
-namespace cuco::detail { +namespace cuco::experimental::detail { // primary template template @@ -372,4 +372,4 @@ class roaring_bitmap_impl { storage_ref_type storage_ref_; }; -} // namespace cuco::detail \ No newline at end of file +} // namespace cuco::experimental::detail \ No newline at end of file diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl index 9536bb79f..01738ac7f 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_ref.inl @@ -22,7 +22,7 @@ #include #include -namespace cuco { +namespace cuco::experimental { template __host__ __device__ roaring_bitmap_ref::roaring_bitmap_ref(storage_ref_type const& storage_ref) @@ -87,4 +87,4 @@ __host__ __device__ cuda::std::size_t roaring_bitmap_ref::size_bytes() const return impl_.size_bytes(); } -} // namespace cuco \ No newline at end of file +} // namespace cuco::experimental \ No newline at end of file diff --git a/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh index c2736fe54..4c33f5ee4 100644 --- a/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh +++ b/include/cuco/detail/roaring_bitmap/roaring_bitmap_storage.cuh @@ -30,7 +30,7 @@ #include #include -namespace cuco::detail { +namespace cuco::experimental::detail { template struct roaring_bitmap_storage_ref { @@ -140,8 +140,8 @@ class roaring_bitmap_storage { : allocator_{alloc}, metadata_{bitmap}, data_{allocator_.allocate(metadata_.size_bytes), - detail::custom_deleter{metadata_.size_bytes, - allocator_}}, + cuco::detail::custom_deleter{metadata_.size_bytes, + allocator_}}, ref_{data_.get(), metadata_} { CUCO_CUDA_TRY(cudaMemcpyAsync( @@ -153,7 +153,8 @@ class roaring_bitmap_storage { private: allocator_type allocator_; typename ref_type::metadata_type metadata_; - std::unique_ptr> data_; + std::unique_ptr> + data_; 
ref_type ref_; }; @@ -186,10 +187,10 @@ class roaring_bitmap_storage { return typename ref_type::metadata_type{bitmap, bucket_metadata}; }(bucket_metadata_)}, data_{allocator_.allocate(metadata_.size_bytes), - detail::custom_deleter{metadata_.size_bytes, - allocator_}}, + cuco::detail::custom_deleter{metadata_.size_bytes, + allocator_}}, buckets_{bucket_allocator_.allocate(metadata_.num_buckets), - detail::custom_deleter{ + cuco::detail::custom_deleter{ metadata_.num_buckets, bucket_allocator_}}, ref_{data_.get(), metadata_, buckets_.get()} { @@ -217,11 +218,12 @@ class roaring_bitmap_storage { std::vector bucket_metadata_; std::vector> buckets_h_; typename ref_type::metadata_type metadata_; - std::unique_ptr> data_; + std::unique_ptr> + data_; std::unique_ptr, - custom_deleter> + cuco::detail::custom_deleter> buckets_; ref_type ref_; }; -} // namespace cuco::detail \ No newline at end of file +} // namespace cuco::experimental::detail \ No newline at end of file diff --git a/include/cuco/detail/roaring_bitmap/util.cuh b/include/cuco/detail/roaring_bitmap/util.cuh index c59f65fad..1807b471d 100644 --- a/include/cuco/detail/roaring_bitmap/util.cuh +++ b/include/cuco/detail/roaring_bitmap/util.cuh @@ -26,7 +26,7 @@ #include #include -namespace cuco::detail { +namespace cuco::experimental::detail { template __host__ __device__ __forceinline__ T aligned_load(cuda::std::byte const* ptr) @@ -237,4 +237,4 @@ struct roaring_bitmap_metadata { valid = true; } }; -} // namespace cuco::detail \ No newline at end of file +} // namespace cuco::experimental::detail \ No newline at end of file diff --git a/include/cuco/roaring_bitmap.cuh b/include/cuco/roaring_bitmap.cuh index a4be0175d..e7c083bf3 100644 --- a/include/cuco/roaring_bitmap.cuh +++ b/include/cuco/roaring_bitmap.cuh @@ -23,7 +23,7 @@ #include #include -namespace cuco { +namespace cuco::experimental { /** * @brief GPU-accelerated container that owns a serialized Roaring bitmap. 
@@ -157,6 +157,6 @@ class roaring_bitmap { storage_type storage_; ///< Storage type }; -} // namespace cuco +} // namespace cuco::experimental #include \ No newline at end of file diff --git a/include/cuco/roaring_bitmap_ref.cuh b/include/cuco/roaring_bitmap_ref.cuh index 88b704c28..071640a3b 100644 --- a/include/cuco/roaring_bitmap_ref.cuh +++ b/include/cuco/roaring_bitmap_ref.cuh @@ -21,7 +21,7 @@ #include #include -namespace cuco { +namespace cuco::experimental { /** * @brief Non-owning reference to a Roaring bitmap stored in its serialized format. @@ -145,6 +145,6 @@ class roaring_bitmap_ref { impl_type impl_; }; -} // namespace cuco +} // namespace cuco::experimental #include \ No newline at end of file diff --git a/tests/roaring_bitmap/contains_test.cu b/tests/roaring_bitmap/contains_test.cu index 4a30e12b4..42e0db3c8 100644 --- a/tests/roaring_bitmap/contains_test.cu +++ b/tests/roaring_bitmap/contains_test.cu @@ -78,7 +78,8 @@ bool check(std::string const& bitmap_file_path) file.read(reinterpret_cast(thrust::raw_pointer_cast(buffer.data())), file_size); file.close(); - cuco::roaring_bitmap roaring_bitmap(thrust::raw_pointer_cast(buffer.data())); + cuco::experimental::roaring_bitmap roaring_bitmap( + thrust::raw_pointer_cast(buffer.data())); auto keys = generate_keys(); thrust::device_vector contained(keys.size(), false);