Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
972 changes: 375 additions & 597 deletions Cargo.lock

Large diffs are not rendered by default.

17 changes: 5 additions & 12 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "wgpu_sort"
version = "0.1.0"
edition = "2021"
edition = "2024"
authors = ["Simon Niedermayr", "Josef Stumpfegger"]
license = "BSD-2-Clause"
description = " WebGPU/wgpu Radix Key-Value Sort "
Expand All @@ -12,31 +12,24 @@ keywords = ["wgpu", "gpu", "sort","radxi","wgpu"]
categories = ["rendering","algorithms"]
readme = "README.md"


[package.metadata.docs.rs]
all-features = true

[dependencies]
wgpu = { version = "0.20" }
wgpu = { version = "27.0" }
bytemuck = { version = "1.13.0", features = ["derive"] }
futures-intrusive = "0.5.0"

log = "0.4"
env_logger = "0.11"


[dev-dependencies]
rand = "0.8.5"
pollster = { version = "0.3.0", features = ["macro"] }
env_logger = "0.11"
rand = "0.9"
pollster = { version = "0.4.0", features = ["macro"] }
float-ord = "0.3.2"
criterion = { version = "0.4", features = ["html_reports"] }


[[bench]]
name = "sort"
harness = false


[[example]]
name = "sort"

Expand Down
192 changes: 103 additions & 89 deletions benches/sort.rs
Original file line number Diff line number Diff line change
@@ -1,102 +1,116 @@
use std::{num::NonZeroU32, time::Duration};

use wgpu_sort::{utils::{download_buffer, guess_workgroup_size}, GPUSorter, SortBuffers};

struct SortStuff{
device:wgpu::Device,
queue:wgpu::Queue,
query_set:wgpu::QuerySet,
query_buffer:wgpu::Buffer,
use wgpu_sort::{
GPUSorter, SortBuffers,
utils::{download_buffer, guess_workgroup_size},
};

/// GPU handles shared by the benchmark: the device and queue plus the
/// timestamp query set and the buffer its results are resolved into.
struct SortStuff {
/// Device used to create command encoders, buffers and the query set.
device: wgpu::Device,
/// Queue the recorded command buffers are submitted to.
queue: wgpu::Queue,
/// Timestamp query set written before and after the timed work.
query_set: wgpu::QuerySet,
/// Buffer the timestamp queries are resolved into and read back from.
query_buffer: wgpu::Buffer,
}

async fn setup()-> SortStuff{
let instance = wgpu::Instance::new(wgpu::InstanceDescriptor::default());

let adapter = wgpu::util::initialize_adapter_from_env_or_default(&instance, None)
.await
.unwrap();

let (device, queue) = adapter
.request_device(
&wgpu::DeviceDescriptor {
required_features: wgpu::Features::TIMESTAMP_QUERY,
required_limits: wgpu::Limits{
max_buffer_size:1<<30,
max_storage_buffer_binding_size:1<<30,
..Default::default()
},
label: None,
},
None,
)
.await
.unwrap();

let capacity = 2;
let query_set = device.create_query_set(&wgpu::QuerySetDescriptor {
label: Some("time stamp query set"),
ty: wgpu::QueryType::Timestamp,
count: capacity,
});


let query_buffer = device.create_buffer(&wgpu::BufferDescriptor {
label: Some("query set buffer"),
size: capacity as u64 * std::mem::size_of::<u64>() as u64,
usage: wgpu::BufferUsages::QUERY_RESOLVE | wgpu::BufferUsages::COPY_SRC,
mapped_at_creation: false,
});

return SortStuff{device,queue,query_set,query_buffer}

/// Builds the GPU context used by the benchmark.
///
/// Obtains an adapter through
/// `wgpu::util::initialize_adapter_from_env_or_default`, requests a device
/// with `TIMESTAMP_QUERY` enabled and the buffer / storage-binding size limits
/// raised to `1 << 30` bytes, then creates a two-slot timestamp query set and
/// a buffer large enough to resolve both slots into.
///
/// # Panics
/// Panics if no adapter is available or the device request fails.
async fn setup() -> SortStuff {
    let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor::default());
    let adapter = wgpu::util::initialize_adapter_from_env_or_default(&instance, None)
        .await
        .unwrap();

    // Large inputs need more than the default buffer limits.
    let limits = wgpu::Limits {
        max_buffer_size: 1 << 30,
        max_storage_buffer_binding_size: 1 << 30,
        ..Default::default()
    };
    let device_desc = wgpu::DeviceDescriptor {
        required_features: wgpu::Features::TIMESTAMP_QUERY,
        required_limits: limits,
        experimental_features: Default::default(),
        memory_hints: Default::default(),
        label: None,
        trace: Default::default(),
    };
    let (device, queue) = adapter.request_device(&device_desc).await.unwrap();

    // One timestamp before the timed work and one after it.
    let slot_count = 2;
    let query_set_desc = wgpu::QuerySetDescriptor {
        label: Some("time stamp query set"),
        ty: wgpu::QueryType::Timestamp,
        count: slot_count,
    };
    let query_set = device.create_query_set(&query_set_desc);

    // Each resolved timestamp occupies one u64.
    let resolve_bytes = slot_count as u64 * std::mem::size_of::<u64>() as u64;
    let query_buffer_desc = wgpu::BufferDescriptor {
        label: Some("query set buffer"),
        size: resolve_bytes,
        usage: wgpu::BufferUsages::QUERY_RESOLVE | wgpu::BufferUsages::COPY_SRC,
        mapped_at_creation: false,
    };
    let query_buffer = device.create_buffer(&query_buffer_desc);

    SortStuff {
        device,
        queue,
        query_set,
        query_buffer,
    }
}

async fn sort(context:&SortStuff,sorter:&GPUSorter,buffers:&SortBuffers,n:u32,iters:u32) -> Duration {

let mut encoder = context.device.create_command_encoder(&wgpu::CommandEncoderDescriptor {
label: None,
});

encoder.write_timestamp(&context.query_set, 0);

for _ in 0..iters{
sorter.sort(&mut encoder,&context.queue,buffers,Some(n));
}

encoder.write_timestamp(&context.query_set, 1);
encoder.resolve_query_set(
&context.query_set,
0..2,
&context.query_buffer,
0,
);
let idx = context.queue.submit([encoder.finish()]);
context.device.poll(wgpu::Maintain::WaitForSubmissionIndex(idx));

let timestamps : Vec<u64> = pollster::block_on(download_buffer(&context.query_buffer, &context.device, &context.queue, ..));
let diff_ticks = timestamps[1] - timestamps[0];
let period = context.queue.get_timestamp_period();
let diff_time = Duration::from_nanos((diff_ticks as f32 * period / iters as f32) as u64);

return diff_time;
/// Records `iters` back-to-back sorts into a single command buffer, brackets
/// them with two GPU timestamp queries and returns the average GPU time of
/// one sort.
///
/// * `context` - device, queue and timestamp query resources.
/// * `sorter`, `buffers` - the sorter and the buffers it operates on.
/// * `n` - element count passed to every `GPUSorter::sort` call.
/// * `iters` - number of sorts recorded between the two timestamps; a value
///   of `0` is treated as `1` when averaging so the result stays finite.
///
/// # Panics
/// Panics if polling the device fails or if fewer than two timestamps are
/// read back.
async fn sort(
    context: &SortStuff,
    sorter: &GPUSorter,
    buffers: &SortBuffers,
    n: u32,
    iters: u32,
) -> Duration {
    let mut encoder = context
        .device
        .create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });

    encoder.write_timestamp(&context.query_set, 0);

    for _ in 0..iters {
        sorter.sort(&mut encoder, &context.queue, buffers, Some(n));
    }

    encoder.write_timestamp(&context.query_set, 1);
    encoder.resolve_query_set(&context.query_set, 0..2, &context.query_buffer, 0);

    // Block until the submitted work has finished before reading the timestamps.
    let idx = context.queue.submit([encoder.finish()]);
    context
        .device
        .poll(wgpu::PollType::Wait {
            submission_index: Some(idx),
            timeout: None,
        })
        .unwrap();

    // We are already inside an async fn: await the download instead of
    // nesting a second executor with `pollster::block_on`.
    let timestamps: Vec<u64> = download_buffer(
        &context.query_buffer,
        &context.device,
        &context.queue,
        ..,
    )
    .await;

    // Saturate instead of underflowing should the two timestamps ever come
    // back out of order.
    let diff_ticks = timestamps[1].saturating_sub(timestamps[0]);

    // Convert in f64: the tick count of a long run exceeds what f32's 24-bit
    // mantissa can represent exactly.
    let period = f64::from(context.queue.get_timestamp_period());
    let nanos_per_sort = diff_ticks as f64 * period / f64::from(iters.max(1));

    Duration::from_nanos(nanos_per_sort as u64)
}



/// Benchmark entry point.
///
/// Creates the GPU context with `setup`, determines a subgroup size with
/// `guess_workgroup_size`, builds a `GPUSorter`, and for each input size
/// allocates sort buffers and prints the duration returned by `sort` for
/// 10000 iterations.
#[pollster::main]
async fn main() {
let context = setup().await;

let context = setup().await;
let subgroup_size = guess_workgroup_size(&context.device, &context.queue)
.await
.expect("could not find a valid subgroup size");

let subgroup_size = guess_workgroup_size(&context.device, &context.queue).await.expect("could not find a valid subgroup size");
let sorter = GPUSorter::new(&context.device, subgroup_size);

let sorter = GPUSorter::new(&context.device, subgroup_size);


for n in [10_000,100_000,1_000_000,8_000_000,20_000_000]{
let buffers = sorter.create_sort_buffers(&context.device, NonZeroU32::new(n).unwrap());
let d = sort(&context,&sorter, &buffers,n,10000).await;
println!("{n}: {d:?}");
}
}

for n in [10_000, 100_000, 1_000_000, 8_000_000, 20_000_000] {
let buffers = sorter.create_sort_buffers(&context.device, NonZeroU32::new(n).unwrap());
let d = sort(&context, &sorter, &buffers, n, 10000).await;
println!("{n}: {d:?}");
}
}
8 changes: 5 additions & 3 deletions examples/sort.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use wgpu_sort::{utils::{download_buffer, guess_workgroup_size, upload_to_buffer}

#[pollster::main]
async fn main(){
let instance = wgpu::Instance::new(wgpu::InstanceDescriptor::default());
let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor::default());

let adapter = wgpu::util::initialize_adapter_from_env_or_default(&instance, None)
.await
Expand All @@ -17,9 +17,11 @@ async fn main(){
&wgpu::DeviceDescriptor {
required_features: wgpu::Features::empty(),
required_limits: wgpu::Limits::default(),
experimental_features: Default::default(),
memory_hints: Default::default(),
label: None,
trace: Default::default(),
},
None,
)
.await
.unwrap();
Expand Down Expand Up @@ -60,7 +62,7 @@ async fn main(){

// wait for sorter to finish
let idx = queue.submit([encoder.finish()]);
device.poll(wgpu::Maintain::WaitForSubmissionIndex(idx));
device.poll(wgpu::PollType::Wait { submission_index: Some(idx), timeout: None }).unwrap();

// keys buffer has padding at the end
// so we only download the "valid" data
Expand Down
8 changes: 5 additions & 3 deletions examples/sort_indirect.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use wgpu_sort::{utils::{download_buffer, guess_workgroup_size, upload_to_buffer}

#[pollster::main]
async fn main(){
let instance = wgpu::Instance::new(wgpu::InstanceDescriptor::default());
let instance = wgpu::Instance::new(&wgpu::InstanceDescriptor::default());

let adapter = wgpu::util::initialize_adapter_from_env_or_default(&instance, None)
.await
Expand All @@ -20,9 +20,11 @@ async fn main(){
&wgpu::DeviceDescriptor {
required_features: wgpu::Features::empty(),
required_limits: wgpu::Limits::default(),
experimental_features: Default::default(),
memory_hints: Default::default(),
label: None,
trace: Default::default(),
},
None,
)
.await
.unwrap();
Expand Down Expand Up @@ -78,7 +80,7 @@ async fn main(){

// wait for sorter to finish
let idx = queue.submit([encoder.finish()]);
device.poll(wgpu::Maintain::WaitForSubmissionIndex(idx));
device.poll(wgpu::PollType::Wait { submission_index: Some(idx), timeout: None }).unwrap();

// keys buffer has padding at the end
// so we only download the "valid" data
Expand Down
Loading