diff --git a/.gitignore b/.gitignore
index fc89f1b..490a548 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .idea/
+.vscode/
 Cargo.lock
 target
diff --git a/Cargo.toml b/Cargo.toml
index d37c2de..258e1a9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -47,6 +47,9 @@ atomic-polyfill = [ "dep:atomic-polyfill", "once_cell/atomic-polyfill"]
 # Nightly-only support for AES intrinsics on 32-bit ARM
 nightly-arm-aes = []
 
+# Nightly-only support for VAES intrinsics with 256-bit SIMD registers
+vaes = []
+
 [[bench]]
 name = "ahash"
 path = "tests/bench.rs"
diff --git a/smhasher/ahash-cbindings/Cargo.toml b/smhasher/ahash-cbindings/Cargo.toml
index 49513b3..5ac69e0 100644
--- a/smhasher/ahash-cbindings/Cargo.toml
+++ b/smhasher/ahash-cbindings/Cargo.toml
@@ -17,4 +17,7 @@ lto = 'fat'
 debug-assertions = false
 
 [dependencies]
-ahash = { path = "../../", default-features = false }
\ No newline at end of file
+ahash = { path = "../../", default-features = false }
+
+[features]
+vaes = ["ahash/vaes"]
diff --git a/smhasher/ahash-cbindings/install.sh b/smhasher/ahash-cbindings/install.sh
index 2effe93..7f9142f 100755
--- a/smhasher/ahash-cbindings/install.sh
+++ b/smhasher/ahash-cbindings/install.sh
@@ -1 +1,9 @@
-RUSTFLAGS="-C opt-level=3 -C target-cpu=native -C codegen-units=1" cargo build --release && sudo cp target/release/libahash_c.a /usr/local/lib/
+
+# Check whether the arguments contain "vaes"
+if [[ $* == *vaes* ]]; then
+    export CARGO_OPTS="--features=vaes"
+else
+    export CARGO_OPTS=""
+fi
+
+RUSTFLAGS="-C opt-level=3 -C target-cpu=native -C codegen-units=1" cargo build ${CARGO_OPTS} --release && sudo cp target/release/libahash_c.a /usr/local/lib/
diff --git a/smhasher/ahash-cbindings/src/lib.rs b/smhasher/ahash-cbindings/src/lib.rs
index e828f12..9c8fb05 100644
--- a/smhasher/ahash-cbindings/src/lib.rs
+++ b/smhasher/ahash-cbindings/src/lib.rs
@@ -1,6 +1,5 @@
 use ahash::*;
 use core::slice;
-use std::hash::{BuildHasher};
 
 #[no_mangle]
 pub extern "C" fn ahash64(buf: *const (), len: usize, seed: u64) -> u64 {
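The `vaes` cargo feature above is an opt-in only: the `cfg_if` gates added in `src/operations.rs` further down also require `target_feature = "vaes"` (supplied in `install.sh` by `-C target-cpu=native` on a VAES-capable CPU) and the nightly `stdsimd` feature enabled in `src/lib.rs`. A minimal sketch, not part of the patch, mirroring that triple gate from inside the crate (`VAES_PATH` is a hypothetical name used only for illustration):

```rust
// Minimal sketch: the same condition the cfg_if branches in src/operations.rs
// test before selecting the __m256i implementation. Crate-internal, because
// `feature = "vaes"` refers to ahash's own cargo feature.
#[cfg(all(target_arch = "x86_64", target_feature = "vaes", feature = "vaes", not(miri)))]
const VAES_PATH: &str = "hardware: _mm256_aesenc_epi128 on __m256i";
#[cfg(not(all(target_arch = "x86_64", target_feature = "vaes", feature = "vaes", not(miri))))]
const VAES_PATH: &str = "fallback: two aesenc calls per [u128; 2] pair";
```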
diff --git a/src/aes_hash.rs b/src/aes_hash.rs
index 0b9a1d4..ed7052d 100644
--- a/src/aes_hash.rs
+++ b/src/aes_hash.rs
@@ -22,6 +22,55 @@ pub struct AHasher {
     key: u128,
 }
 
+#[cfg(any(
+    all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "aes", not(miri)),
+    all(target_arch = "aarch64", target_feature = "aes", not(miri)),
+    all(feature = "nightly-arm-aes", target_arch = "arm", target_feature = "aes", not(miri)),
+))]
+fn hash_batch_128b(data: &mut &[u8], hasher: &mut AHasher) {
+    let tail = read_last4_vec256(data);
+    let mut current = [
+        aesenc_vec256(convert_u128_to_vec256(hasher.key), tail[0]),
+        aesdec_vec256(convert_u128_to_vec256(hasher.key), tail[1]),
+        aesenc_vec256(convert_u128_to_vec256(hasher.key), tail[2]),
+        aesdec_vec256(convert_u128_to_vec256(hasher.key), tail[3]),
+    ];
+    let mut sum: [Vector256; 2] = [convert_u128_to_vec256(hasher.key), convert_u128_to_vec256(!hasher.key)];
+    sum[0] = add_by_64s_vec256(sum[0], tail[0]);
+    sum[1] = add_by_64s_vec256(sum[1], tail[1]);
+    sum[0] = shuffle_and_add_vec256(sum[0], tail[2]);
+    sum[1] = shuffle_and_add_vec256(sum[1], tail[3]);
+    while data.len() > 128 {
+        let (blocks, rest) = read4_vec256(data);
+        current[0] = aesdec_vec256(current[0], blocks[0]);
+        current[1] = aesdec_vec256(current[1], blocks[1]);
+        current[2] = aesdec_vec256(current[2], blocks[2]);
+        current[3] = aesdec_vec256(current[3], blocks[3]);
+        sum[0] = shuffle_and_add_vec256(sum[0], blocks[0]);
+        sum[1] = shuffle_and_add_vec256(sum[1], blocks[1]);
+        sum[0] = shuffle_and_add_vec256(sum[0], blocks[2]);
+        sum[1] = shuffle_and_add_vec256(sum[1], blocks[3]);
+        *data = rest;
+    }
+    let current = [
+        convert_vec256_to_u128(current[0]),
+        convert_vec256_to_u128(current[1]),
+        convert_vec256_to_u128(current[2]),
+        convert_vec256_to_u128(current[3]),
+    ];
+    let sum = [convert_vec256_to_u128(sum[0]), convert_vec256_to_u128(sum[1])];
+    hasher.hash_in_2(
+        aesenc(current[0][0], current[0][1]),
+        aesenc(current[1][0], current[1][1]),
+    );
+    hasher.hash_in(add_by_64s(sum[0][0].convert(), sum[0][1].convert()).convert());
+    hasher.hash_in_2(
+        aesenc(current[2][0], current[2][1]),
+        aesenc(current[3][0], current[3][1]),
+    );
+    hasher.hash_in(add_by_64s(sum[1][0].convert(), sum[1][1].convert()).convert());
+}
+
 impl AHasher {
     /// Creates a new hasher keyed to the provided keys.
     ///
@@ -47,6 +96,7 @@ impl AHasher {
     ///
     /// println!("Hash is {:x}!", hasher.finish());
     /// ```
+    #[allow(unused)]
     #[inline]
     pub(crate) fn new_with_keys(key1: u128, key2: u128) -> Self {
         let pi: [u128; 2] = PI.convert();
@@ -160,6 +210,9 @@ impl Hasher for AHasher {
             self.hash_in(value.convert());
         } else {
             if data.len() > 32 {
+                if data.len() > 128 {
+                    return hash_batch_128b(&mut data, self);
+                }
                 if data.len() > 64 {
                     let tail = data.read_last_u128x4();
                     let mut current: [u128; 4] = [self.key; 4];
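`hash_batch_128b` mixes the *last* 128 bytes in first (`read_last4_vec256`), then consumes 128-byte blocks from the front while more than 128 bytes remain, so the final partial block is covered by the already-mixed tail; this is the same overlap scheme the existing `> 64` path uses with `read_last_u128x4`. A hedged test sketch, not part of the patch, walking lengths on both sides of the new threshold through public API only (std's `BuildHasher::hash_one`, stable since Rust 1.71), to check that the `split_at` calls in the batch path never panic and results stay deterministic:

```rust
use std::hash::BuildHasher;

#[test]
fn lengths_around_batch_threshold() {
    // Fixed seeds so both calls use the same key material.
    let s = ahash::RandomState::with_seeds(1, 2, 3, 4);
    for len in [64usize, 127, 128, 129, 160, 256, 1024] {
        let data = vec![0xA5u8; len];
        // Exercises the non-batch path (<= 128 bytes) and the batch path (> 128).
        assert_eq!(s.hash_one(&data[..]), s.hash_one(&data[..]));
    }
}
```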
diff --git a/src/convert.rs b/src/convert.rs
index 712eae1..7541824 100644
--- a/src/convert.rs
+++ b/src/convert.rs
@@ -19,6 +19,7 @@ macro_rules! convert {
     };
 }
 
+convert!([u128; 8], [u8; 128]);
 convert!([u128; 4], [u64; 8]);
 convert!([u128; 4], [u32; 16]);
 convert!([u128; 4], [u16; 32]);
@@ -79,12 +80,14 @@ pub(crate) trait ReadFromSlice {
     fn read_u128(&self) -> (u128, &[u8]);
     fn read_u128x2(&self) -> ([u128; 2], &[u8]);
     fn read_u128x4(&self) -> ([u128; 4], &[u8]);
+    fn read_u128x8(&self) -> ([u128; 8], &[u8]);
     fn read_last_u16(&self) -> u16;
     fn read_last_u32(&self) -> u32;
     fn read_last_u64(&self) -> u64;
     fn read_last_u128(&self) -> u128;
     fn read_last_u128x2(&self) -> [u128; 2];
     fn read_last_u128x4(&self) -> [u128; 4];
+    fn read_last_u128x8(&self) -> [u128; 8];
 }
 
 impl ReadFromSlice for [u8] {
@@ -124,6 +127,12 @@
         (as_array!(value, 64).convert(), rest)
     }
 
+    #[inline(always)]
+    fn read_u128x8(&self) -> ([u128; 8], &[u8]) {
+        let (value, rest) = self.split_at(128);
+        (as_array!(value, 128).convert(), rest)
+    }
+
     #[inline(always)]
     fn read_last_u16(&self) -> u16 {
         let (_, value) = self.split_at(self.len() - 2);
@@ -159,4 +168,9 @@
         let (_, value) = self.split_at(self.len() - 64);
         as_array!(value, 64).convert()
     }
+
+    fn read_last_u128x8(&self) -> [u128; 8] {
+        let (_, value) = self.split_at(self.len() - 128);
+        as_array!(value, 128).convert()
+    }
 }
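The new 128-byte readers mirror the existing `x4` pair, and like the rest of the trait they panic (via `split_at`) on slices shorter than 128 bytes, which is why every caller checks `data.len() > 128` first. A hedged usage sketch, not part of the patch (`ReadFromSlice` is `pub(crate)`, so this only compiles inside ahash):

```rust
use crate::convert::ReadFromSlice;

let bytes = [1u8; 150];
let (head, rest) = bytes.read_u128x8(); // head: [u128; 8], the first 128 bytes
assert_eq!(rest.len(), 22);             // 150 - 128 bytes are left over
let _tail = bytes.read_last_u128x8();   // the final 128 bytes; overlaps head here
```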
diff --git a/src/lib.rs b/src/lib.rs
index 2086513..5f7551e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -99,6 +99,7 @@ Note the import of [HashMapExt]. This is needed for the constructor.
 #![cfg_attr(all(not(test), not(feature = "std")), no_std)]
 #![cfg_attr(feature = "specialize", feature(min_specialization))]
 #![cfg_attr(feature = "nightly-arm-aes", feature(stdarch_arm_neon_intrinsics))]
+#![cfg_attr(feature = "vaes", feature(stdsimd))]
 
 #[macro_use]
 mod convert;
diff --git a/src/operations.rs b/src/operations.rs
index a420587..911202c 100644
--- a/src/operations.rs
+++ b/src/operations.rs
@@ -180,6 +180,237 @@ pub(crate) fn add_in_length(enc: &mut u128, len: u64) {
     }
 }
 
+#[cfg(any(
+    all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "aes", not(miri)),
+    all(target_arch = "aarch64", target_feature = "aes", not(miri)),
+    all(feature = "nightly-arm-aes", target_arch = "arm", target_feature = "aes", not(miri)),
+))]
+mod vaes {
+    use super::*;
+    cfg_if::cfg_if! {
+        if #[cfg(all(
+            target_arch = "x86_64",
+            target_feature = "vaes",
+            feature = "vaes",
+            not(miri)
+        ))] {
+            pub type Vector256 = core::arch::x86_64::__m256i;
+        }
+        else {
+            pub type Vector256 = [u128; 2];
+        }
+    }
+
+    #[inline(always)]
+    pub(crate) fn aesenc_vec256(value: Vector256, xor: Vector256) -> Vector256 {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                target_arch = "x86_64",
+                target_feature = "vaes",
+                feature = "vaes",
+                not(miri)
+            ))] {
+                use core::arch::x86_64::*;
+                unsafe {
+                    _mm256_aesenc_epi128(value, xor)
+                }
+            }
+            else {
+                [
+                    aesenc(value[0], xor[0]),
+                    aesenc(value[1], xor[1]),
+                ]
+            }
+        }
+    }
+
+    #[inline(always)]
+    pub(crate) fn aesdec_vec256(value: Vector256, xor: Vector256) -> Vector256 {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                target_arch = "x86_64",
+                target_feature = "vaes",
+                feature = "vaes",
+                not(miri)
+            ))] {
+                use core::arch::x86_64::*;
+                unsafe {
+                    _mm256_aesdec_epi128(value, xor)
+                }
+            }
+            else {
+                [
+                    aesdec(value[0], xor[0]),
+                    aesdec(value[1], xor[1]),
+                ]
+            }
+        }
+    }
+
+    #[inline(always)]
+    pub(crate) fn add_by_64s_vec256(a: Vector256, b: Vector256) -> Vector256 {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                target_arch = "x86_64",
+                target_feature = "vaes",
+                feature = "vaes",
+                not(miri)
+            ))] {
+                use core::arch::x86_64::*;
+                unsafe { _mm256_add_epi64(a, b) }
+            }
+            else {
+                [
+                    transmute!(add_by_64s(transmute!(a[0]), transmute!(b[0]))),
+                    transmute!(add_by_64s(transmute!(a[1]), transmute!(b[1]))),
+                ]
+            }
+        }
+    }
+
+    #[inline(always)]
+    pub(crate) fn shuffle_vec256(value: Vector256) -> Vector256 {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                target_arch = "x86_64",
+                target_feature = "vaes",
+                feature = "vaes",
+                not(miri)
+            ))] {
+                unsafe {
+                    use core::arch::x86_64::*;
+                    let mask = convert_u128_to_vec256(SHUFFLE_MASK);
+                    _mm256_shuffle_epi8(value, mask)
+                }
+            }
+            else {
+                [
+                    shuffle(value[0]),
+                    shuffle(value[1]),
+                ]
+            }
+        }
+    }
+
+    pub(crate) fn shuffle_and_add_vec256(base: Vector256, to_add: Vector256) -> Vector256 {
+        add_by_64s_vec256(shuffle_vec256(base), to_add)
+    }
+
+    // We specialize this routine because sometimes the compiler is not able to
+    // optimize it properly.
+    pub(crate) fn read4_vec256(data: &[u8]) -> ([Vector256; 4], &[u8]) {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                target_arch = "x86_64",
+                target_feature = "vaes",
+                feature = "vaes",
+                not(miri)
+            ))] {
+                use core::arch::x86_64::*;
+                let (arr, rem) = data.split_at(128);
+                let arr = unsafe {
+                    [
+                        _mm256_loadu_si256(arr.as_ptr().cast::<__m256i>()),
+                        _mm256_loadu_si256(arr.as_ptr().add(32).cast::<__m256i>()),
+                        _mm256_loadu_si256(arr.as_ptr().add(64).cast::<__m256i>()),
+                        _mm256_loadu_si256(arr.as_ptr().add(96).cast::<__m256i>()),
+                    ]
+                };
+                (arr, rem)
+            }
+            else {
+                let (arr, slice) = data.read_u128x8();
+                (transmute!(arr), slice)
+            }
+        }
+    }
+
+    // We specialize this routine because sometimes the compiler is not able to
+    // optimize it properly.
+    pub(crate) fn read_last4_vec256(data: &[u8]) -> [Vector256; 4] {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                target_arch = "x86_64",
+                target_feature = "vaes",
+                feature = "vaes",
+                not(miri)
+            ))] {
+                use core::arch::x86_64::*;
+                let (_, arr) = data.split_at(data.len() - 128);
+                let arr = unsafe {
+                    [
+                        _mm256_loadu_si256(arr.as_ptr().cast::<__m256i>()),
+                        _mm256_loadu_si256(arr.as_ptr().add(32).cast::<__m256i>()),
+                        _mm256_loadu_si256(arr.as_ptr().add(64).cast::<__m256i>()),
+                        _mm256_loadu_si256(arr.as_ptr().add(96).cast::<__m256i>()),
+                    ]
+                };
+                arr
+            }
+            else {
+                let arr = data.read_last_u128x8();
+                transmute!(arr)
+            }
+        }
+    }
+
+    // We specialize this routine because sometimes the compiler is not able to
+    // optimize it properly.
+    pub(crate) fn convert_u128_to_vec256(x: u128) -> Vector256 {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                target_arch = "x86_64",
+                target_feature = "vaes",
+                feature = "vaes",
+                not(miri)
+            ))] {
+                use core::arch::x86_64::*;
+                unsafe {
+                    _mm256_set_epi64x(
+                        (x >> 64) as i64,
+                        x as i64,
+                        (x >> 64) as i64,
+                        x as i64,
+                    )
+                }
+            }
+            else {
+                transmute!([x, x])
+            }
+        }
+    }
+
+    // We specialize this routine because sometimes the compiler is not able to
+    // optimize it properly.
+    pub(crate) fn convert_vec256_to_u128(x: Vector256) -> [u128; 2] {
+        cfg_if::cfg_if! {
+            if #[cfg(all(
+                target_arch = "x86_64",
+                target_feature = "vaes",
+                feature = "vaes",
+                not(miri)
+            ))] {
+                use core::arch::x86_64::*;
+                unsafe {
+                    [
+                        transmute!(_mm256_extracti128_si256(x, 0)),
+                        transmute!(_mm256_extracti128_si256(x, 1)),
+                    ]
+                }
+            }
+            else {
+                x
+            }
+        }
+    }
+}
+
+#[cfg(any(
+    all(any(target_arch = "x86", target_arch = "x86_64"), target_feature = "aes", not(miri)),
+    all(target_arch = "aarch64", target_feature = "aes", not(miri)),
+    all(feature = "nightly-arm-aes", target_arch = "arm", target_feature = "aes", not(miri)),
+))]
+pub(crate) use vaes::*;
+
 #[cfg(test)]
 mod test {
     use super::*;
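`Vector256` is `__m256i` on a nightly x86_64 VAES build and `[u128; 2]` everywhere else, so the conversion helpers are the seam where the two representations must agree: `_mm256_set_epi64x` broadcasts `x` into both 128-bit lanes, and `_mm256_extracti128_si256` pulls them back out, exactly like the `[x, x]` fallback. A sketch of that round-trip property, not part of the patch (the helpers are `pub(crate)`, so this would sit next to the existing tests in `operations.rs`):

```rust
#[test]
fn u128_vec256_round_trip() {
    let x: u128 = 0x0f0e_0d0c_0b0a_0908_0706_0504_0302_0100;
    let v = convert_u128_to_vec256(x); // broadcast into both 128-bit lanes
    assert_eq!(convert_vec256_to_u128(v), [x, x]); // lanes come back unchanged
}
```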
diff --git a/tests/bench.rs b/tests/bench.rs
index e038ba4..24a08a6 100644
--- a/tests/bench.rs
+++ b/tests/bench.rs
@@ -56,10 +56,11 @@ fn seahash<H: Hash>(b: &H) -> u64 {
     hasher.finish()
 }
 
-const STRING_LENGTHS: [u32; 12] = [1, 3, 4, 7, 8, 15, 16, 24, 33, 68, 132, 1024];
+const STRING_LENGTHS: &'static [u32; 12] = &[1, 3, 4, 7, 8, 15, 16, 24, 33, 68, 132, 1024];
+const WIDER_STRINGS_LENGTHS: &'static [u32] = &[1, 64, 1024, 4096, 5261, 16384, 19997];
 
-fn gen_strings() -> Vec<String> {
-    STRING_LENGTHS
+fn gen_strings(lengths: &[u32]) -> Vec<String> {
+    lengths
         .iter()
         .map(|len| {
             let mut string = String::default();
@@ -83,7 +84,12 @@ macro_rules! bench_inputs {
         $group.bench_function("u32", |b| b.iter_batched(|| rng.gen::<u32>(), |v| $hash(&v), size));
         $group.bench_function("u64", |b| b.iter_batched(|| rng.gen::<u64>(), |v| $hash(&v), size));
         $group.bench_function("u128", |b| b.iter_batched(|| rng.gen::<u128>(), |v| $hash(&v), size));
-        $group.bench_with_input("strings", &gen_strings(), |b, s| b.iter(|| $hash(black_box(s))));
+        $group.bench_with_input("strings", &gen_strings(STRING_LENGTHS), |b, s| {
+            b.iter(|| $hash(black_box(s)))
+        });
+        $group.bench_with_input("wide-strings", &gen_strings(WIDER_STRINGS_LENGTHS), |b, s| {
+            b.iter(|| $hash(black_box(s)))
+        });
     };
 }
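The new `wide-strings` group feeds lengths well past the 128-byte threshold, including non-power-of-two sizes such as 5261 and 19997, so the `hash_batch_128b` path and its remainder handling are actually measured. For one concrete instantiation, the new macro arm expands to roughly the following (shown with the `seahash` helper whose signature appears in the hunk context above):

```rust
// Approximate expansion for $hash = seahash; `group`, `gen_strings`, and
// `black_box` are the names already used in this bench file.
group.bench_with_input("wide-strings", &gen_strings(WIDER_STRINGS_LENGTHS), |b, s| {
    b.iter(|| seahash(black_box(s)))
});
```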