From 8fff48911c5db538080ff1560b515f7e16269185 Mon Sep 17 00:00:00 2001 From: silvanshade Date: Sat, 10 Feb 2024 12:20:17 -0700 Subject: [PATCH 1/5] Update cpufeatures dependency --- Cargo.lock | 4 ++-- aes/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 96dff53b..eafa190b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -92,9 +92,9 @@ dependencies = [ [[package]] name = "cpufeatures" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce420fe07aecd3e67c5f910618fe65e94158f6dcc0adf44e00d69ce2bdfe0fd0" +checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" dependencies = [ "libc", ] diff --git a/aes/Cargo.toml b/aes/Cargo.toml index ff892e84..438f5afd 100644 --- a/aes/Cargo.toml +++ b/aes/Cargo.toml @@ -18,7 +18,7 @@ cipher = "=0.5.0-pre.2" zeroize = { version = "1.5.6", optional = true, default_features = false, features = ["aarch64"] } [target.'cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "x86"))'.dependencies] -cpufeatures = "0.2" +cpufeatures = "0.2.12" [dev-dependencies] cipher = { version = "=0.5.0-pre.2", features = ["dev"] } From dff5efa42e215c81c35df23d775d69df73a4e4aa Mon Sep 17 00:00:00 2001 From: silvanshade Date: Thu, 1 Feb 2024 15:36:10 -0700 Subject: [PATCH 2/5] Implement VAES AVX and AVX512 backends for aes --- aes/src/armv8.rs | 37 +- aes/src/autodetect.rs | 126 ++--- aes/src/hazmat.rs | 2 +- aes/src/lib.rs | 51 +- aes/src/ni.rs | 363 ------------- aes/src/soft.rs | 34 +- aes/src/x86.rs | 529 +++++++++++++++++++ aes/src/x86/ni.rs | 34 ++ aes/src/{ => x86}/ni/aes128.rs | 30 +- aes/src/{ => x86}/ni/aes192.rs | 34 +- aes/src/{ => x86}/ni/aes256.rs | 30 +- aes/src/{ => x86}/ni/hazmat.rs | 20 +- aes/src/{ => x86}/ni/test_expand.rs | 0 aes/src/{ => x86}/ni/utils.rs | 36 +- aes/src/x86/vaes256.rs | 3 + aes/src/x86/vaes256/aes128.rs | 533 +++++++++++++++++++ aes/src/x86/vaes256/aes192.rs | 603 ++++++++++++++++++++++ aes/src/x86/vaes256/aes256.rs | 673 ++++++++++++++++++++++++ aes/src/x86/vaes512.rs | 3 + aes/src/x86/vaes512/aes128.rs | 602 ++++++++++++++++++++++ aes/src/x86/vaes512/aes192.rs | 684 +++++++++++++++++++++++++ aes/src/x86/vaes512/aes256.rs | 766 ++++++++++++++++++++++++++++ 22 files changed, 4590 insertions(+), 603 deletions(-) delete mode 100644 aes/src/ni.rs create mode 100644 aes/src/x86.rs create mode 100644 aes/src/x86/ni.rs rename aes/src/{ => x86}/ni/aes128.rs (80%) rename aes/src/{ => x86}/ni/aes192.rs (83%) rename aes/src/{ => x86}/ni/aes256.rs (85%) rename aes/src/{ => x86}/ni/hazmat.rs (77%) rename aes/src/{ => x86}/ni/test_expand.rs (100%) rename aes/src/{ => x86}/ni/utils.rs (64%) create mode 100644 aes/src/x86/vaes256.rs create mode 100644 aes/src/x86/vaes256/aes128.rs create mode 100644 aes/src/x86/vaes256/aes192.rs create mode 100644 aes/src/x86/vaes256/aes256.rs create mode 100644 aes/src/x86/vaes512.rs create mode 100644 aes/src/x86/vaes512/aes128.rs create mode 100644 aes/src/x86/vaes512/aes192.rs create mode 100644 aes/src/x86/vaes512/aes256.rs diff --git a/aes/src/armv8.rs b/aes/src/armv8.rs index 4ac12959..8fe68c12 100644 --- a/aes/src/armv8.rs +++ b/aes/src/armv8.rs @@ -31,6 +31,13 @@ use cipher::{ use core::arch::aarch64::*; use core::fmt; +pub(crate) mod features { + cpufeatures::new!(features_aes, "aes"); + pub(crate) mod aes { + pub use super::features_aes::*; + } +} + macro_rules! define_aes_impl { ( $name:ident, @@ -50,18 +57,6 @@ macro_rules! 
define_aes_impl { decrypt: $name_dec, } - impl $name { - #[inline(always)] - pub(crate) fn get_enc_backend(&self) -> $name_back_enc<'_> { - self.encrypt.get_enc_backend() - } - - #[inline(always)] - pub(crate) fn get_dec_backend(&self) -> $name_back_dec<'_> { - self.decrypt.get_dec_backend() - } - } - impl BlockCipher for $name {} impl KeySizeUser for $name { @@ -132,13 +127,6 @@ macro_rules! define_aes_impl { round_keys: [uint8x16_t; $rounds], } - impl $name_enc { - #[inline(always)] - pub(crate) fn get_enc_backend(&self) -> $name_back_enc<'_> { - $name_back_enc(self) - } - } - impl BlockCipher for $name_enc {} impl KeySizeUser for $name_enc { @@ -160,7 +148,7 @@ macro_rules! define_aes_impl { impl BlockCipherEncrypt for $name_enc { fn encrypt_with_backend(&self, f: impl BlockClosure) { - f.call(&mut self.get_enc_backend()) + f.call(&mut $name_back_enc(self)) } } @@ -194,13 +182,6 @@ macro_rules! define_aes_impl { round_keys: [uint8x16_t; $rounds], } - impl $name_dec { - #[inline(always)] - pub(crate) fn get_dec_backend(&self) -> $name_back_dec<'_> { - $name_back_dec(self) - } - } - impl BlockCipher for $name_dec {} impl KeySizeUser for $name_dec { @@ -235,7 +216,7 @@ macro_rules! define_aes_impl { impl BlockCipherDecrypt for $name_dec { fn decrypt_with_backend(&self, f: impl BlockClosure) { - f.call(&mut self.get_dec_backend()); + f.call(&mut $name_back_dec(self)); } } diff --git a/aes/src/autodetect.rs b/aes/src/autodetect.rs index 3cf17fc4..22df02ee 100644 --- a/aes/src/autodetect.rs +++ b/aes/src/autodetect.rs @@ -11,12 +11,10 @@ use core::fmt; use core::mem::ManuallyDrop; #[cfg(target_arch = "aarch64")] -use crate::armv8 as intrinsics; +use crate::armv8 as arch; #[cfg(any(target_arch = "x86_64", target_arch = "x86"))] -use crate::ni as intrinsics; - -cpufeatures::new!(aes_intrinsics, "aes"); +use crate::x86 as arch; macro_rules! define_aes_impl { ( @@ -28,21 +26,21 @@ macro_rules! define_aes_impl { $doc:expr $(,)? ) => { mod $module { - use super::{intrinsics, soft}; + use super::{arch, soft}; use core::mem::ManuallyDrop; pub(super) union Inner { - pub(super) intrinsics: ManuallyDrop, + pub(super) arch: ManuallyDrop, pub(super) soft: ManuallyDrop, } pub(super) union InnerEnc { - pub(super) intrinsics: ManuallyDrop, + pub(super) arch: ManuallyDrop, pub(super) soft: ManuallyDrop, } pub(super) union InnerDec { - pub(super) intrinsics: ManuallyDrop, + pub(super) arch: ManuallyDrop, pub(super) soft: ManuallyDrop, } } @@ -51,7 +49,7 @@ macro_rules! define_aes_impl { #[doc = "block cipher"] pub struct $name { inner: $module::Inner, - token: aes_intrinsics::InitToken, + token: arch::features::aes::InitToken, } impl KeySizeUser for $name { @@ -69,9 +67,7 @@ macro_rules! define_aes_impl { use core::ops::Deref; let inner = if enc.token.get() { $module::Inner { - intrinsics: ManuallyDrop::new(unsafe { - enc.inner.intrinsics.deref().into() - }), + arch: ManuallyDrop::new(unsafe { enc.inner.arch.deref().into() }), } } else { $module::Inner { @@ -89,11 +85,11 @@ macro_rules! define_aes_impl { impl KeyInit for $name { #[inline] fn new(key: &Key) -> Self { - let (token, aesni_present) = aes_intrinsics::init_get(); + let (token, aes_features) = arch::features::aes::init_get(); - let inner = if aesni_present { + let inner = if aes_features { $module::Inner { - intrinsics: ManuallyDrop::new(intrinsics::$name::new(key)), + arch: ManuallyDrop::new(arch::$name::new(key)), } } else { $module::Inner { @@ -109,7 +105,7 @@ macro_rules! 
define_aes_impl { fn clone(&self) -> Self { let inner = if self.token.get() { $module::Inner { - intrinsics: unsafe { self.inner.intrinsics.clone() }, + arch: unsafe { self.inner.arch.clone() }, } } else { $module::Inner { @@ -132,38 +128,20 @@ macro_rules! define_aes_impl { impl BlockCipherEncrypt for $name { fn encrypt_with_backend(&self, f: impl BlockClosure) { - unsafe { - if self.token.get() { - #[target_feature(enable = "aes")] - unsafe fn inner( - state: &intrinsics::$name, - f: impl BlockClosure, - ) { - f.call(&mut state.get_enc_backend()); - } - inner(&self.inner.intrinsics, f); - } else { - f.call(&mut self.inner.soft.get_enc_backend()); - } + if self.token.get() { + unsafe { &self.inner.arch }.encrypt_with_backend(f) + } else { + unsafe { &self.inner.soft }.encrypt_with_backend(f) } } } impl BlockCipherDecrypt for $name { fn decrypt_with_backend(&self, f: impl BlockClosure) { - unsafe { - if self.token.get() { - #[target_feature(enable = "aes")] - unsafe fn inner( - state: &intrinsics::$name, - f: impl BlockClosure, - ) { - f.call(&mut state.get_dec_backend()); - } - inner(&self.inner.intrinsics, f); - } else { - f.call(&mut self.inner.soft.get_dec_backend()); - } + if self.token.get() { + unsafe { &self.inner.arch }.decrypt_with_backend(f) + } else { + unsafe { &self.inner.soft }.decrypt_with_backend(f) } } } @@ -184,7 +162,7 @@ macro_rules! define_aes_impl { #[inline] fn drop(&mut self) { if self.token.get() { - unsafe { ManuallyDrop::drop(&mut self.inner.intrinsics) }; + unsafe { ManuallyDrop::drop(&mut self.inner.arch) }; } else { unsafe { ManuallyDrop::drop(&mut self.inner.soft) }; }; @@ -198,7 +176,7 @@ macro_rules! define_aes_impl { #[doc = "block cipher (encrypt-only)"] pub struct $name_enc { inner: $module::InnerEnc, - token: aes_intrinsics::InitToken, + token: arch::features::aes::InitToken, } impl KeySizeUser for $name_enc { @@ -208,11 +186,11 @@ macro_rules! define_aes_impl { impl KeyInit for $name_enc { #[inline] fn new(key: &Key) -> Self { - let (token, aesni_present) = aes_intrinsics::init_get(); + let (token, aes_features) = arch::features::aes::init_get(); - let inner = if aesni_present { + let inner = if aes_features { $module::InnerEnc { - intrinsics: ManuallyDrop::new(intrinsics::$name_enc::new(key)), + arch: ManuallyDrop::new(arch::$name_enc::new(key)), } } else { $module::InnerEnc { @@ -228,7 +206,7 @@ macro_rules! define_aes_impl { fn clone(&self) -> Self { let inner = if self.token.get() { $module::InnerEnc { - intrinsics: unsafe { self.inner.intrinsics.clone() }, + arch: unsafe { self.inner.arch.clone() }, } } else { $module::InnerEnc { @@ -251,19 +229,10 @@ macro_rules! define_aes_impl { impl BlockCipherEncrypt for $name_enc { fn encrypt_with_backend(&self, f: impl BlockClosure) { - unsafe { - if self.token.get() { - #[target_feature(enable = "aes")] - unsafe fn inner( - state: &intrinsics::$name_enc, - f: impl BlockClosure, - ) { - f.call(&mut state.get_enc_backend()); - } - inner(&self.inner.intrinsics, f); - } else { - f.call(&mut self.inner.soft.get_enc_backend()); - } + if self.token.get() { + unsafe { &self.inner.arch }.encrypt_with_backend(f) + } else { + unsafe { &self.inner.soft }.encrypt_with_backend(f) } } } @@ -284,7 +253,7 @@ macro_rules! define_aes_impl { #[inline] fn drop(&mut self) { if self.token.get() { - unsafe { ManuallyDrop::drop(&mut self.inner.intrinsics) }; + unsafe { ManuallyDrop::drop(&mut self.inner.arch) }; } else { unsafe { ManuallyDrop::drop(&mut self.inner.soft) }; }; @@ -298,7 +267,7 @@ macro_rules! 
define_aes_impl { #[doc = "block cipher (decrypt-only)"] pub struct $name_dec { inner: $module::InnerDec, - token: aes_intrinsics::InitToken, + token: arch::features::aes::InitToken, } impl KeySizeUser for $name_dec { @@ -317,9 +286,7 @@ macro_rules! define_aes_impl { use core::ops::Deref; let inner = if enc.token.get() { $module::InnerDec { - intrinsics: ManuallyDrop::new(unsafe { - enc.inner.intrinsics.deref().into() - }), + arch: ManuallyDrop::new(unsafe { enc.inner.arch.deref().into() }), } } else { $module::InnerDec { @@ -337,11 +304,11 @@ macro_rules! define_aes_impl { impl KeyInit for $name_dec { #[inline] fn new(key: &Key) -> Self { - let (token, aesni_present) = aes_intrinsics::init_get(); + let (token, aes_features) = arch::features::aes::init_get(); - let inner = if aesni_present { + let inner = if aes_features { $module::InnerDec { - intrinsics: ManuallyDrop::new(intrinsics::$name_dec::new(key)), + arch: ManuallyDrop::new(arch::$name_dec::new(key)), } } else { $module::InnerDec { @@ -357,7 +324,7 @@ macro_rules! define_aes_impl { fn clone(&self) -> Self { let inner = if self.token.get() { $module::InnerDec { - intrinsics: unsafe { self.inner.intrinsics.clone() }, + arch: unsafe { self.inner.arch.clone() }, } } else { $module::InnerDec { @@ -380,19 +347,10 @@ macro_rules! define_aes_impl { impl BlockCipherDecrypt for $name_dec { fn decrypt_with_backend(&self, f: impl BlockClosure) { - unsafe { - if self.token.get() { - #[target_feature(enable = "aes")] - unsafe fn inner( - state: &intrinsics::$name_dec, - f: impl BlockClosure, - ) { - f.call(&mut state.get_dec_backend()); - } - inner(&self.inner.intrinsics, f); - } else { - f.call(&mut self.inner.soft.get_dec_backend()); - } + if self.token.get() { + unsafe { &self.inner.arch }.decrypt_with_backend(f) + } else { + unsafe { &self.inner.soft }.decrypt_with_backend(f) } } } @@ -413,7 +371,7 @@ macro_rules! define_aes_impl { #[inline] fn drop(&mut self) { if self.token.get() { - unsafe { ManuallyDrop::drop(&mut self.inner.intrinsics) }; + unsafe { ManuallyDrop::drop(&mut self.inner.arch) }; } else { unsafe { ManuallyDrop::drop(&mut self.inner.soft) }; }; diff --git a/aes/src/hazmat.rs b/aes/src/hazmat.rs index 3d4def91..e22047fb 100644 --- a/aes/src/hazmat.rs +++ b/aes/src/hazmat.rs @@ -17,7 +17,7 @@ use crate::{soft::fixslice::hazmat as soft, Block, Block8}; use crate::armv8::hazmat as intrinsics; #[cfg(all(any(target_arch = "x86_64", target_arch = "x86"), not(aes_force_soft)))] -use crate::ni::hazmat as intrinsics; +use crate::x86::ni::hazmat as intrinsics; #[cfg(all( any(target_arch = "x86", target_arch = "x86_64", target_arch = "aarch64"), diff --git a/aes/src/lib.rs b/aes/src/lib.rs index 0f8bab50..c0abd046 100644 --- a/aes/src/lib.rs +++ b/aes/src/lib.rs @@ -35,18 +35,26 @@ //! runtime. On other platforms the `aes` target feature must be enabled via //! RUSTFLAGS. //! -//! ## `x86`/`x86_64` intrinsics (AES-NI) +//! ## `x86`/`x86_64` intrinsics (AES-NI and VAES) //! By default this crate uses runtime detection on `i686`/`x86_64` targets -//! in order to determine if AES-NI is available, and if it is not, it will -//! fallback to using a constant-time software implementation. +//! in order to determine if AES-NI and VAES are available, and if they are +//! not, it will fallback to using a constant-time software implementation. +//! +//! Passing `RUSTFLAGS=-C target-feature=+aes,+ssse3` explicitly at +//! compile-time will override runtime detection and ensure that AES-NI is +//! 
used, while passing `RUSTFLAGS=-C target-feature=+aes,+avx512f,+ssse3,+vaes`
+//! will ensure that AES-NI and VAES are always used.
 //!
-//! Passing `RUSTFLAGS=-C target-feature=+aes,+ssse3` explicitly at compile-time
-//! will override runtime detection and ensure that AES-NI is always used.
 //! Programs built in this manner will crash with an illegal instruction on
-//! CPUs which do not have AES-NI enabled.
+//! CPUs which do not have AES-NI and VAES enabled.
+//!
+//! Note: It is possible to disable the use of AVX512 for the VAES backend
+//! and limit it to AVX (256-bit) by specifying `--cfg disable_avx512`.
+//! For CPUs which support VAES but not AVX512, the 256-bit VAES backend will
+//! be selected automatically without needing to specify this flag.
 //!
 //! Note: runtime detection is not possible on SGX targets. Please use the
-//! afforementioned `RUSTFLAGS` to leverage AES-NI on these targets.
+//! aforementioned `RUSTFLAGS` to leverage AES-NI and VAES on these targets.
 //!
 //! # Examples
 //! ```
@@ -136,8 +144,8 @@ cfg_if! {
         any(target_arch = "x86", target_arch = "x86_64"),
         not(aes_force_soft)
     ))] {
+        mod x86;
         mod autodetect;
-        mod ni;
         pub use autodetect::*;
     } else {
         pub use soft::*;
@@ -145,15 +153,20 @@ cfg_if! {
 }
 
 pub use cipher;
+// use cipher::consts::U32;
 use cipher::{
     array::Array,
-    consts::{U16, U8},
+    consts::{U16, U30, U64, U8},
 };
 
 /// 128-bit AES block
 pub type Block = Array<u8, U16>;
 /// Eight 128-bit AES blocks
 pub type Block8 = Array<Block, U8>;
+/// Thirty 128-bit AES blocks
+pub type Block30 = Array<Block, U30>;
+/// Sixty-four 128-bit AES blocks
+pub type Block64 = Array<Block, U64>;
 
 #[cfg(test)]
 mod tests {
@@ -193,19 +206,19 @@ mod tests {
         #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), not(aes_force_soft)))]
         {
-            use super::ni;
+            use super::x86;
 
             cpufeatures::new!(aes_intrinsics, "aes");
             if aes_intrinsics::get() {
-                test_for(ni::Aes128::new(&key_128));
-                test_for(ni::Aes128Enc::new(&key_128));
-                test_for(ni::Aes128Dec::new(&key_128));
-                test_for(ni::Aes192::new(&key_192));
-                test_for(ni::Aes192Enc::new(&key_192));
-                test_for(ni::Aes192Dec::new(&key_192));
-                test_for(ni::Aes256::new(&key_256));
-                test_for(ni::Aes256Enc::new(&key_256));
-                test_for(ni::Aes256Dec::new(&key_256));
+                test_for(x86::Aes128::new(&key_128));
+                test_for(x86::Aes128Enc::new(&key_128));
+                test_for(x86::Aes128Dec::new(&key_128));
+                test_for(x86::Aes192::new(&key_192));
+                test_for(x86::Aes192Enc::new(&key_192));
+                test_for(x86::Aes192Dec::new(&key_192));
+                test_for(x86::Aes256::new(&key_256));
+                test_for(x86::Aes256Enc::new(&key_256));
+                test_for(x86::Aes256Dec::new(&key_256));
             }
         }
diff --git a/aes/src/ni.rs b/aes/src/ni.rs
deleted file mode 100644
index 95117aad..00000000
--- a/aes/src/ni.rs
+++ /dev/null
@@ -1,363 +0,0 @@
-//! AES block ciphers implementation using AES-NI instruction set.
-//!
-//! Ciphers functionality is accessed using `BlockCipher` trait from the
-//! [`cipher`](https://docs.rs/cipher) crate.
-//!
-//! # Vulnerability
-//! Lazy FP state restory vulnerability can allow local process to leak content
-//! of the FPU register, in which round keys are stored. This vulnerability
-//! can be mitigated at the operating system level by installing relevant
-//! patches. (i.e. keep your OS updated!) More info:
-//! - [Intel advisory](https://www.intel.com/content/www/us/en/security-center/advisory/intel-sa-00145.html)
-//! - [Wikipedia](https://en.wikipedia.org/wiki/Lazy_FP_state_restore)
-//!
-//! # Related documents
-//! 
- [Intel AES-NI whitepaper](https://software.intel.com/sites/default/files/article/165683/aes-wp-2012-09-22-v01.pdf) -//! - [Use of the AES Instruction Set](https://www.cosic.esat.kuleuven.be/ecrypt/AESday/slides/Use_of_the_AES_Instruction_Set.pdf) - -#[macro_use] -mod utils; - -mod aes128; -mod aes192; -mod aes256; - -#[cfg(test)] -mod test_expand; - -#[cfg(feature = "hazmat")] -pub(crate) mod hazmat; - -#[cfg(target_arch = "x86")] -use core::arch::x86 as arch; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64 as arch; - -use crate::{Block, Block8}; -use cipher::{ - consts::{U16, U24, U32, U8}, - inout::InOut, - AlgorithmName, BlockBackend, BlockCipher, BlockCipherDecrypt, BlockCipherEncrypt, BlockClosure, - BlockSizeUser, Key, KeyInit, KeySizeUser, ParBlocksSizeUser, -}; -use core::fmt; - -macro_rules! define_aes_impl { - ( - $name:tt, - $name_enc:ident, - $name_dec:ident, - $name_back_enc:ident, - $name_back_dec:ident, - $module:tt, - $key_size:ty, - $doc:expr $(,)? - ) => { - #[doc=$doc] - #[doc = "block cipher"] - #[derive(Clone)] - pub struct $name { - encrypt: $name_enc, - decrypt: $name_dec, - } - - impl $name { - #[inline(always)] - pub(crate) fn get_enc_backend(&self) -> $name_back_enc<'_> { - self.encrypt.get_enc_backend() - } - - #[inline(always)] - pub(crate) fn get_dec_backend(&self) -> $name_back_dec<'_> { - self.decrypt.get_dec_backend() - } - } - - impl BlockCipher for $name {} - - impl KeySizeUser for $name { - type KeySize = $key_size; - } - - impl KeyInit for $name { - #[inline] - fn new(key: &Key) -> Self { - let encrypt = $name_enc::new(key); - let decrypt = $name_dec::from(&encrypt); - Self { encrypt, decrypt } - } - } - - impl From<$name_enc> for $name { - #[inline] - fn from(encrypt: $name_enc) -> $name { - let decrypt = (&encrypt).into(); - Self { encrypt, decrypt } - } - } - - impl From<&$name_enc> for $name { - #[inline] - fn from(encrypt: &$name_enc) -> $name { - let decrypt = encrypt.into(); - let encrypt = encrypt.clone(); - Self { encrypt, decrypt } - } - } - - impl BlockSizeUser for $name { - type BlockSize = U16; - } - - impl BlockCipherEncrypt for $name { - fn encrypt_with_backend(&self, f: impl BlockClosure) { - self.encrypt.encrypt_with_backend(f) - } - } - - impl BlockCipherDecrypt for $name { - fn decrypt_with_backend(&self, f: impl BlockClosure) { - self.decrypt.decrypt_with_backend(f) - } - } - - impl fmt::Debug for $name { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { - f.write_str(concat!(stringify!($name), " { .. }")) - } - } - - impl AlgorithmName for $name { - fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(stringify!($name)) - } - } - - #[cfg(feature = "zeroize")] - impl zeroize::ZeroizeOnDrop for $name {} - - #[doc=$doc] - #[doc = "block cipher (encrypt-only)"] - #[derive(Clone)] - pub struct $name_enc { - round_keys: $module::RoundKeys, - } - - impl $name_enc { - #[inline(always)] - pub(crate) fn get_enc_backend(&self) -> $name_back_enc<'_> { - $name_back_enc(self) - } - } - - impl BlockCipher for $name_enc {} - - impl KeySizeUser for $name_enc { - type KeySize = $key_size; - } - - impl KeyInit for $name_enc { - #[inline] - fn new(key: &Key) -> Self { - // SAFETY: we enforce that this code is called only when - // target features required by `expand` were properly checked. 
- Self { - round_keys: unsafe { $module::expand_key(key.as_ref()) }, - } - } - } - - impl BlockSizeUser for $name_enc { - type BlockSize = U16; - } - - impl BlockCipherEncrypt for $name_enc { - fn encrypt_with_backend(&self, f: impl BlockClosure) { - f.call(&mut self.get_enc_backend()) - } - } - - impl fmt::Debug for $name_enc { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { - f.write_str(concat!(stringify!($name_enc), " { .. }")) - } - } - - impl AlgorithmName for $name_enc { - fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(stringify!($name_enc)) - } - } - - impl Drop for $name_enc { - #[inline] - fn drop(&mut self) { - #[cfg(feature = "zeroize")] - zeroize::Zeroize::zeroize(&mut self.round_keys); - } - } - - #[cfg(feature = "zeroize")] - impl zeroize::ZeroizeOnDrop for $name_enc {} - - #[doc=$doc] - #[doc = "block cipher (decrypt-only)"] - #[derive(Clone)] - pub struct $name_dec { - round_keys: $module::RoundKeys, - } - - impl $name_dec { - #[inline(always)] - pub(crate) fn get_dec_backend(&self) -> $name_back_dec<'_> { - $name_back_dec(self) - } - } - - impl BlockCipher for $name_dec {} - - impl KeySizeUser for $name_dec { - type KeySize = $key_size; - } - - impl KeyInit for $name_dec { - #[inline] - fn new(key: &Key) -> Self { - $name_enc::new(key).into() - } - } - - impl From<$name_enc> for $name_dec { - #[inline] - fn from(enc: $name_enc) -> $name_dec { - Self::from(&enc) - } - } - - impl From<&$name_enc> for $name_dec { - #[inline] - fn from(enc: &$name_enc) -> $name_dec { - let round_keys = unsafe { $module::inv_expanded_keys(&enc.round_keys) }; - Self { round_keys } - } - } - - impl BlockSizeUser for $name_dec { - type BlockSize = U16; - } - - impl BlockCipherDecrypt for $name_dec { - fn decrypt_with_backend(&self, f: impl BlockClosure) { - f.call(&mut self.get_dec_backend()); - } - } - - impl fmt::Debug for $name_dec { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { - f.write_str(concat!(stringify!($name_dec), " { .. 
}")) - } - } - - impl AlgorithmName for $name_dec { - fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str(stringify!($name_dec)) - } - } - - impl Drop for $name_dec { - #[inline] - fn drop(&mut self) { - #[cfg(feature = "zeroize")] - zeroize::Zeroize::zeroize(&mut self.round_keys); - } - } - - #[cfg(feature = "zeroize")] - impl zeroize::ZeroizeOnDrop for $name_dec {} - - pub(crate) struct $name_back_enc<'a>(&'a $name_enc); - - impl<'a> BlockSizeUser for $name_back_enc<'a> { - type BlockSize = U16; - } - - impl<'a> ParBlocksSizeUser for $name_back_enc<'a> { - type ParBlocksSize = U8; - } - - impl<'a> BlockBackend for $name_back_enc<'a> { - #[inline(always)] - fn proc_block(&mut self, block: InOut<'_, '_, Block>) { - unsafe { - $module::encrypt1(&self.0.round_keys, block); - } - } - - #[inline(always)] - fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, Block8>) { - unsafe { - $module::encrypt8(&self.0.round_keys, blocks); - } - } - } - - pub(crate) struct $name_back_dec<'a>(&'a $name_dec); - - impl<'a> BlockSizeUser for $name_back_dec<'a> { - type BlockSize = U16; - } - - impl<'a> ParBlocksSizeUser for $name_back_dec<'a> { - type ParBlocksSize = U8; - } - - impl<'a> BlockBackend for $name_back_dec<'a> { - #[inline(always)] - fn proc_block(&mut self, block: InOut<'_, '_, Block>) { - unsafe { - $module::decrypt1(&self.0.round_keys, block); - } - } - - #[inline(always)] - fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, Block8>) { - unsafe { - $module::decrypt8(&self.0.round_keys, blocks); - } - } - } - }; -} - -define_aes_impl!( - Aes128, - Aes128Enc, - Aes128Dec, - Aes128BackEnc, - Aes128BackDec, - aes128, - U16, - "AES-128", -); - -define_aes_impl!( - Aes192, - Aes192Enc, - Aes192Dec, - Aes192BackEnc, - Aes192BackDec, - aes192, - U24, - "AES-192", -); - -define_aes_impl!( - Aes256, - Aes256Enc, - Aes256Dec, - Aes256BackEnc, - Aes256BackDec, - aes256, - U32, - "AES-256", -); diff --git a/aes/src/soft.rs b/aes/src/soft.rs index 50daf7df..6d34d4bc 100644 --- a/aes/src/soft.rs +++ b/aes/src/soft.rs @@ -43,18 +43,6 @@ macro_rules! define_aes_impl { keys: $fixslice_keys, } - impl $name { - #[inline(always)] - pub(crate) fn get_enc_backend(&self) -> $name_back_enc<'_> { - $name_back_enc(self) - } - - #[inline(always)] - pub(crate) fn get_dec_backend(&self) -> $name_back_dec<'_> { - $name_back_dec(self) - } - } - impl KeySizeUser for $name { type KeySize = $key_size; } @@ -76,13 +64,13 @@ macro_rules! define_aes_impl { impl BlockCipherEncrypt for $name { fn encrypt_with_backend(&self, f: impl BlockClosure) { - f.call(&mut self.get_enc_backend()) + f.call(&mut $name_back_enc(self)) } } impl BlockCipherDecrypt for $name { fn decrypt_with_backend(&self, f: impl BlockClosure) { - f.call(&mut self.get_dec_backend()) + f.call(&mut $name_back_dec(self)) } } @@ -130,13 +118,6 @@ macro_rules! define_aes_impl { inner: $name, } - impl $name_enc { - #[inline(always)] - pub(crate) fn get_enc_backend(&self) -> $name_back_enc<'_> { - self.inner.get_enc_backend() - } - } - impl BlockCipher for $name_enc {} impl KeySizeUser for $name_enc { @@ -157,7 +138,7 @@ macro_rules! define_aes_impl { impl BlockCipherEncrypt for $name_enc { fn encrypt_with_backend(&self, f: impl BlockClosure) { - f.call(&mut self.get_enc_backend()) + f.call(&mut $name_back_enc(&self.inner)) } } @@ -183,13 +164,6 @@ macro_rules! 
define_aes_impl { inner: $name, } - impl $name_dec { - #[inline(always)] - pub(crate) fn get_dec_backend(&self) -> $name_back_dec<'_> { - self.inner.get_dec_backend() - } - } - impl BlockCipher for $name_dec {} impl KeySizeUser for $name_dec { @@ -226,7 +200,7 @@ macro_rules! define_aes_impl { impl BlockCipherDecrypt for $name_dec { fn decrypt_with_backend(&self, f: impl BlockClosure) { - f.call(&mut self.get_dec_backend()); + f.call(&mut $name_back_dec(&self.inner)); } } diff --git a/aes/src/x86.rs b/aes/src/x86.rs new file mode 100644 index 00000000..1285a407 --- /dev/null +++ b/aes/src/x86.rs @@ -0,0 +1,529 @@ +pub(crate) mod ni; +#[cfg(target_arch = "x86_64")] +pub(crate) mod vaes256; +#[cfg(target_arch = "x86_64")] +pub(crate) mod vaes512; + +#[cfg(target_arch = "x86")] +use core::arch::x86 as arch; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64 as arch; + +use self::arch::*; +use crate::{Block, Block8}; +#[cfg(target_arch = "x86_64")] +use crate::{Block30, Block64}; +#[cfg(target_arch = "x86_64")] +use cipher::consts::{U30, U64}; +use cipher::{ + consts::{U16, U24, U32, U8}, + inout::InOut, + AlgorithmName, BlockBackend, BlockCipher, BlockCipherDecrypt, BlockCipherEncrypt, BlockClosure, + BlockSizeUser, Key, KeyInit, KeySizeUser, ParBlocksSizeUser, +}; +use core::fmt; + +pub(crate) mod features { + cpufeatures::new!(features_aes, "aes"); + cpufeatures::new!(features_avx, "avx"); + cpufeatures::new!(features_avx512f, "avx512f"); + cpufeatures::new!(features_vaes, "vaes"); + pub(crate) mod aes { + pub use super::features_aes::*; + } + #[cfg(target_arch = "x86_64")] + pub(crate) mod avx { + pub use super::features_avx::*; + } + #[cfg(target_arch = "x86_64")] + pub(crate) mod avx512f { + pub use super::features_avx512f::*; + } + #[cfg(target_arch = "x86_64")] + pub(crate) mod vaes { + pub use super::features_vaes::*; + } +} + +#[derive(Clone)] +enum Backend { + Ni, + #[cfg(target_arch = "x86_64")] + Vaes256, + #[cfg(target_arch = "x86_64")] + Vaes512, +} + +#[derive(Clone)] +struct Features { + #[cfg(target_arch = "x86_64")] + avx: self::features::avx::InitToken, + #[cfg(target_arch = "x86_64")] + avx512f: self::features::avx512f::InitToken, + #[cfg(target_arch = "x86_64")] + vaes: self::features::vaes::InitToken, +} + +impl Features { + fn new() -> Self { + Self { + #[cfg(target_arch = "x86_64")] + avx: self::features::avx::init(), + #[cfg(target_arch = "x86_64")] + avx512f: self::features::avx512f::init(), + #[cfg(target_arch = "x86_64")] + vaes: self::features::vaes::init(), + } + } + + fn backend(&self) -> Backend { + #[allow(unused_mut)] + let mut backend = Backend::Ni; + #[cfg(target_arch = "x86_64")] + if !cfg!(disable_avx512) && self.avx512f.get() && self.vaes.get() { + backend = self::Backend::Vaes512; + } + #[cfg(target_arch = "x86_64")] + if !cfg!(disable_avx256) && self.avx.get() && self.vaes.get() { + backend = self::Backend::Vaes256; + } + backend + } +} + +type RoundKeys = [__m128i; ROUNDS]; +#[cfg(target_arch = "x86_64")] +type Simd256RoundKeys = [__m256i; ROUNDS]; +#[cfg(target_arch = "x86_64")] +type Simd512RoundKeys = [__m512i; ROUNDS]; + +macro_rules! define_aes_impl { + ( + $name:tt, + $name_enc:ident, + $name_dec:ident, + $name_backend:ident, + $module:tt, + $key_size:ty, + $rounds:tt, + $doc:expr $(,)? 
+ ) => { + mod $name_backend { + use super::*; + pub(crate) mod mode { + pub(crate) struct Encrypt; + pub(crate) struct Decrypt; + } + #[derive(Clone)] + pub(crate) struct Ni<'a, Mode> { + pub(crate) mode: core::marker::PhantomData, + pub(crate) keys: &'a RoundKeys<$rounds>, + } + #[derive(Clone)] + #[cfg(target_arch = "x86_64")] + pub(crate) struct Vaes256<'a, Mode> { + pub(crate) mode: core::marker::PhantomData, + pub(crate) keys: &'a RoundKeys<$rounds>, + pub(crate) simd_256_keys: Option>, + } + #[cfg(target_arch = "x86_64")] + pub(crate) struct Vaes512<'a, Mode> { + pub(crate) mode: core::marker::PhantomData, + pub(crate) keys: &'a RoundKeys<$rounds>, + pub(crate) simd_512_keys: Option>, + } + } + + #[doc=$doc] + #[doc = "block cipher"] + #[derive(Clone)] + pub struct $name { + encrypt: $name_enc, + decrypt: $name_dec, + } + + #[cfg(feature = "zeroize")] + impl zeroize::ZeroizeOnDrop for $name {} + + impl BlockCipher for $name {} + + impl KeySizeUser for $name { + type KeySize = $key_size; + } + + impl KeyInit for $name { + #[inline] + fn new(key: &Key) -> Self { + let encrypt = $name_enc::new(key); + let decrypt = $name_dec::from(&encrypt); + Self { encrypt, decrypt } + } + } + + impl From<$name_enc> for $name { + #[inline] + fn from(encrypt: $name_enc) -> $name { + let decrypt = (&encrypt).into(); + Self { encrypt, decrypt } + } + } + + impl From<&$name_enc> for $name { + #[inline] + fn from(encrypt: &$name_enc) -> $name { + let decrypt = encrypt.into(); + let encrypt = encrypt.clone(); + Self { encrypt, decrypt } + } + } + + impl BlockSizeUser for $name { + type BlockSize = U16; + } + + impl BlockCipherEncrypt for $name { + #[inline] + fn encrypt_with_backend(&self, f: impl BlockClosure) { + self.encrypt.encrypt_with_backend(f) + } + } + + impl BlockCipherDecrypt for $name { + #[inline] + fn decrypt_with_backend(&self, f: impl BlockClosure) { + self.decrypt.decrypt_with_backend(f) + } + } + + impl fmt::Debug for $name { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + f.write_str(concat!(stringify!($name), " { .. }")) + } + } + + impl AlgorithmName for $name { + fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(stringify!($name)) + } + } + + #[doc=$doc] + #[doc = "block cipher (encrypt-only)"] + #[derive(Clone)] + pub struct $name_enc { + round_keys: RoundKeys<$rounds>, + features: Features, + } + + impl Drop for $name_enc { + fn drop(&mut self) { + #[cfg(feature = "zeroize")] + zeroize::Zeroize::zeroize(&mut self.round_keys); + } + } + #[cfg(feature = "zeroize")] + impl zeroize::ZeroizeOnDrop for $name_enc {} + + impl BlockCipher for $name_enc {} + + impl KeySizeUser for $name_enc { + type KeySize = $key_size; + } + + impl KeyInit for $name_enc { + #[inline] + fn new(key: &Key) -> Self { + // SAFETY: we enforce that this code is called only when + // target features required by `expand` were properly checked. 
+ Self { + round_keys: unsafe { self::ni::$module::expand_key(key.as_ref()) }, + features: Features::new(), + } + } + } + + impl BlockSizeUser for $name_enc { + type BlockSize = U16; + } + + impl BlockCipherEncrypt for $name_enc { + #[inline] + fn encrypt_with_backend(&self, f: impl BlockClosure) { + let mode = core::marker::PhantomData::; + let keys = &self.round_keys; + match self.features.backend() { + self::Backend::Ni => f.call(&mut $name_backend::Ni { mode, keys }), + #[cfg(target_arch = "x86_64")] + self::Backend::Vaes256 => f.call(&mut $name_backend::Vaes256 { + mode, + keys, + simd_256_keys: None, + }), + #[cfg(target_arch = "x86_64")] + self::Backend::Vaes512 => f.call(&mut $name_backend::Vaes512 { + mode, + keys, + simd_512_keys: None, + }), + } + } + } + + impl fmt::Debug for $name_enc { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + f.write_str(concat!(stringify!($name_enc), " { .. }")) + } + } + + impl AlgorithmName for $name_enc { + fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(stringify!($name_enc)) + } + } + + #[doc=$doc] + #[doc = "block cipher (decrypt-only)"] + #[derive(Clone)] + pub struct $name_dec { + round_keys: RoundKeys<$rounds>, + features: Features, + } + + impl Drop for $name_dec { + fn drop(&mut self) { + #[cfg(feature = "zeroize")] + zeroize::Zeroize::zeroize(&mut self.round_keys); + } + } + + impl BlockCipher for $name_dec {} + #[cfg(feature = "zeroize")] + impl zeroize::ZeroizeOnDrop for $name_dec {} + + impl KeySizeUser for $name_dec { + type KeySize = $key_size; + } + + impl KeyInit for $name_dec { + #[inline] + fn new(key: &Key) -> Self { + $name_enc::new(key).into() + } + } + + impl From<$name_enc> for $name_dec { + #[inline] + fn from(enc: $name_enc) -> $name_dec { + Self::from(&enc) + } + } + + impl From<&$name_enc> for $name_dec { + #[inline] + fn from(enc: &$name_enc) -> $name_dec { + Self { + round_keys: unsafe { self::ni::$module::inv_expanded_keys(&enc.round_keys) }, + features: enc.features.clone(), + } + } + } + + impl BlockSizeUser for $name_dec { + type BlockSize = U16; + } + + impl BlockCipherDecrypt for $name_dec { + #[inline] + fn decrypt_with_backend(&self, f: impl BlockClosure) { + let mode = core::marker::PhantomData::; + let keys = &self.round_keys; + match self.features.backend() { + self::Backend::Ni => f.call(&mut $name_backend::Ni { mode, keys }), + #[cfg(target_arch = "x86_64")] + self::Backend::Vaes256 => f.call(&mut $name_backend::Vaes256 { + mode, + keys, + simd_256_keys: None, + }), + #[cfg(target_arch = "x86_64")] + self::Backend::Vaes512 => f.call(&mut $name_backend::Vaes512 { + mode, + keys, + simd_512_keys: None, + }), + } + } + } + + impl fmt::Debug for $name_dec { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { + f.write_str(concat!(stringify!($name_dec), " { .. 
}")) + } + } + + impl AlgorithmName for $name_dec { + fn write_alg_name(f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(stringify!($name_dec)) + } + } + + impl<'a, Mode> BlockSizeUser for $name_backend::Ni<'a, Mode> { + type BlockSize = U16; + } + #[cfg(target_arch = "x86_64")] + impl<'a, Mode> BlockSizeUser for $name_backend::Vaes256<'a, Mode> { + type BlockSize = U16; + } + #[cfg(target_arch = "x86_64")] + impl<'a, Mode> BlockSizeUser for $name_backend::Vaes512<'a, Mode> { + type BlockSize = U16; + } + + impl<'a, Mode> ParBlocksSizeUser for $name_backend::Ni<'a, Mode> { + type ParBlocksSize = U8; + } + #[cfg(target_arch = "x86_64")] + impl<'a, Mode> ParBlocksSizeUser for $name_backend::Vaes256<'a, Mode> { + type ParBlocksSize = U30; + } + #[cfg(target_arch = "x86_64")] + impl<'a, Mode> ParBlocksSizeUser for $name_backend::Vaes512<'a, Mode> { + type ParBlocksSize = U64; + } + + impl<'a> BlockBackend for $name_backend::Ni<'a, self::$name_backend::mode::Encrypt> { + #[inline] + fn proc_block(&mut self, block: InOut<'_, '_, Block>) { + unsafe { + self::ni::$module::encrypt1(self.keys, block); + } + } + #[inline] + fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, Block8>) { + unsafe { + self::ni::$module::encrypt8(self.keys, blocks); + } + } + } + #[cfg(target_arch = "x86_64")] + impl<'a> BlockBackend for $name_backend::Vaes256<'a, self::$name_backend::mode::Encrypt> { + #[inline] + fn proc_block(&mut self, block: InOut<'_, '_, Block>) { + unsafe { + self::ni::$module::encrypt1(self.keys, block); + } + } + #[inline] + fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, Block30>) { + unsafe { + let simd_256_keys = self.simd_256_keys.get_or_insert_with(|| { + self::vaes256::$module::parallelize_keys(&self.keys) + }); + self::vaes256::$module::encrypt30(simd_256_keys, blocks); + } + } + } + #[cfg(target_arch = "x86_64")] + impl<'a> BlockBackend for $name_backend::Vaes512<'a, self::$name_backend::mode::Encrypt> { + #[inline] + fn proc_block(&mut self, block: InOut<'_, '_, Block>) { + unsafe { + self::ni::$module::encrypt1(self.keys, block); + } + } + #[inline] + fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, Block64>) { + unsafe { + let simd_512_keys = self.simd_512_keys.get_or_insert_with(|| { + self::vaes512::$module::parallelize_keys(&self.keys) + }); + self::vaes512::$module::encrypt64(simd_512_keys, blocks); + } + } + } + + impl<'a> BlockBackend for $name_backend::Ni<'a, self::$name_backend::mode::Decrypt> { + #[inline] + fn proc_block(&mut self, block: InOut<'_, '_, Block>) { + unsafe { + self::ni::$module::decrypt1(self.keys, block); + } + } + #[inline] + fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, Block8>) { + unsafe { + self::ni::$module::decrypt8(self.keys, blocks); + } + } + } + #[cfg(target_arch = "x86_64")] + impl<'a> BlockBackend for $name_backend::Vaes256<'a, self::$name_backend::mode::Decrypt> { + #[inline] + fn proc_block(&mut self, block: InOut<'_, '_, Block>) { + unsafe { + self::ni::$module::decrypt1(self.keys, block); + } + } + #[inline] + fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, Block30>) { + unsafe { + let simd_256_keys = self.simd_256_keys.get_or_insert_with(|| { + self::vaes256::$module::parallelize_keys(&self.keys) + }); + self::vaes256::$module::decrypt30(simd_256_keys, blocks); + } + } + } + #[cfg(target_arch = "x86_64")] + impl<'a> BlockBackend for $name_backend::Vaes512<'a, self::$name_backend::mode::Decrypt> { + #[inline] + fn proc_block(&mut self, block: InOut<'_, '_, Block>) { + unsafe { + 
self::ni::$module::decrypt1(self.keys, block); + } + } + #[inline] + fn proc_par_blocks(&mut self, blocks: InOut<'_, '_, Block64>) { + unsafe { + let simd_512_keys = self.simd_512_keys.get_or_insert_with(|| { + self::vaes512::$module::parallelize_keys(&self.keys) + }); + self::vaes512::$module::decrypt64(simd_512_keys, blocks); + } + } + } + }; +} + +define_aes_impl!( + Aes128, + Aes128Enc, + Aes128Dec, + aes128_backend, + aes128, + U16, + 11, + "AES-128", +); + +define_aes_impl!( + Aes192, + Aes192Enc, + Aes192Dec, + aes192_backend, + aes192, + U24, + 13, + "AES-192", +); + +define_aes_impl!( + Aes256, + Aes256Enc, + Aes256Dec, + aes256_backend, + aes256, + U32, + 15, + "AES-256", +); diff --git a/aes/src/x86/ni.rs b/aes/src/x86/ni.rs new file mode 100644 index 00000000..aadea99a --- /dev/null +++ b/aes/src/x86/ni.rs @@ -0,0 +1,34 @@ +//! AES block ciphers implementation using AES-NI instruction set. +//! +//! Ciphers functionality is accessed using `BlockCipher` trait from the +//! [`cipher`](https://docs.rs/cipher) crate. +//! +//! # Vulnerability +//! Lazy FP state restory vulnerability can allow local process to leak content +//! of the FPU register, in which round keys are stored. This vulnerability +//! can be mitigated at the operating system level by installing relevant +//! patches. (i.e. keep your OS updated!) More info: +//! - [Intel advisory](https://www.intel.com/content/www/us/en/security-center/advisory/intel-sa-00145.html) +//! - [Wikipedia](https://en.wikipedia.org/wiki/Lazy_FP_state_restore) +//! +//! # Related documents +//! - [Intel AES-NI whitepaper](https://software.intel.com/sites/default/files/article/165683/aes-wp-2012-09-22-v01.pdf) +//! - [Use of the AES Instruction Set](https://www.cosic.esat.kuleuven.be/ecrypt/AESday/slides/Use_of_the_AES_Instruction_Set.pdf) + +#[macro_use] +mod utils; + +pub(crate) mod aes128; +pub(crate) mod aes192; +pub(crate) mod aes256; + +#[cfg(test)] +mod test_expand; + +#[cfg(feature = "hazmat")] +pub(crate) mod hazmat; + +#[cfg(target_arch = "x86")] +use core::arch::x86 as arch; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64 as arch; diff --git a/aes/src/ni/aes128.rs b/aes/src/x86/ni/aes128.rs similarity index 80% rename from aes/src/ni/aes128.rs rename to aes/src/x86/ni/aes128.rs index b0836a16..07b7beb3 100644 --- a/aes/src/ni/aes128.rs +++ b/aes/src/x86/ni/aes128.rs @@ -1,16 +1,14 @@ -use super::{arch::*, utils::*}; +use super::utils::*; +use crate::x86::{arch::*, RoundKeys}; use crate::{Block, Block8}; use cipher::inout::InOut; use core::mem; -/// AES-128 round keys -pub(super) type RoundKeys = [__m128i; 11]; - #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn encrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) { +pub(crate) unsafe fn encrypt1(keys: &RoundKeys<11>, block: InOut<'_, '_, Block>) { let (in_ptr, out_ptr) = block.into_raw(); - let mut b = _mm_loadu_si128(in_ptr as *const __m128i); + let mut b = _mm_loadu_si128(in_ptr.cast()); b = _mm_xor_si128(b, keys[0]); b = _mm_aesenc_si128(b, keys[1]); b = _mm_aesenc_si128(b, keys[2]); @@ -22,12 +20,12 @@ pub(super) unsafe fn encrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) { b = _mm_aesenc_si128(b, keys[8]); b = _mm_aesenc_si128(b, keys[9]); b = _mm_aesenclast_si128(b, keys[10]); - _mm_storeu_si128(out_ptr as *mut __m128i, b); + _mm_storeu_si128(out_ptr.cast(), b); } #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn encrypt8(keys: &RoundKeys, blocks: InOut<'_, '_, Block8>) { +pub(crate) unsafe fn encrypt8(keys: &RoundKeys<11>, 
blocks: InOut<'_, '_, Block8>) { let (in_ptr, out_ptr) = blocks.into_raw(); let mut b = load8(in_ptr); xor8(&mut b, keys[0]); @@ -46,9 +44,9 @@ pub(super) unsafe fn encrypt8(keys: &RoundKeys, blocks: InOut<'_, '_, Block8>) { #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn decrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) { +pub(crate) unsafe fn decrypt1(keys: &RoundKeys<11>, block: InOut<'_, '_, Block>) { let (in_ptr, out_ptr) = block.into_raw(); - let mut b = _mm_loadu_si128(in_ptr as *const __m128i); + let mut b = _mm_loadu_si128(in_ptr.cast()); b = _mm_xor_si128(b, keys[10]); b = _mm_aesdec_si128(b, keys[9]); b = _mm_aesdec_si128(b, keys[8]); @@ -60,12 +58,12 @@ pub(super) unsafe fn decrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) { b = _mm_aesdec_si128(b, keys[2]); b = _mm_aesdec_si128(b, keys[1]); b = _mm_aesdeclast_si128(b, keys[0]); - _mm_storeu_si128(out_ptr as *mut __m128i, b); + _mm_storeu_si128(out_ptr.cast(), b); } #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn decrypt8(keys: &RoundKeys, blocks: InOut<'_, '_, Block8>) { +pub(crate) unsafe fn decrypt8(keys: &RoundKeys<11>, blocks: InOut<'_, '_, Block8>) { let (in_ptr, out_ptr) = blocks.into_raw(); let mut b = load8(in_ptr); xor8(&mut b, keys[10]); @@ -104,12 +102,12 @@ macro_rules! expand_round { #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn expand_key(key: &[u8; 16]) -> RoundKeys { +pub(crate) unsafe fn expand_key(key: &[u8; 16]) -> RoundKeys<11> { // SAFETY: `RoundKeys` is a `[__m128i; 11]` which can be initialized // with all zeroes. - let mut keys: RoundKeys = mem::zeroed(); + let mut keys: RoundKeys<11> = mem::zeroed(); - let k = _mm_loadu_si128(key.as_ptr() as *const __m128i); + let k = _mm_loadu_si128(key.as_ptr().cast()); keys[0] = k; expand_round!(keys, 1, 0x01); @@ -128,7 +126,7 @@ pub(super) unsafe fn expand_key(key: &[u8; 16]) -> RoundKeys { #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn inv_expanded_keys(keys: &RoundKeys) -> RoundKeys { +pub(crate) unsafe fn inv_expanded_keys(keys: &RoundKeys<11>) -> RoundKeys<11> { [ keys[0], _mm_aesimc_si128(keys[1]), diff --git a/aes/src/ni/aes192.rs b/aes/src/x86/ni/aes192.rs similarity index 83% rename from aes/src/ni/aes192.rs rename to aes/src/x86/ni/aes192.rs index eee1f211..2058e2c7 100644 --- a/aes/src/ni/aes192.rs +++ b/aes/src/x86/ni/aes192.rs @@ -1,16 +1,14 @@ -use super::{arch::*, utils::*}; +use super::utils::*; +use crate::x86::{arch::*, RoundKeys}; use crate::{Block, Block8}; use cipher::inout::InOut; use core::{mem, ptr}; -/// AES-192 round keys -pub(super) type RoundKeys = [__m128i; 13]; - #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn encrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) { +pub(crate) unsafe fn encrypt1(keys: &RoundKeys<13>, block: InOut<'_, '_, Block>) { let (in_ptr, out_ptr) = block.into_raw(); - let mut b = _mm_loadu_si128(in_ptr as *const __m128i); + let mut b = _mm_loadu_si128(in_ptr.cast()); b = _mm_xor_si128(b, keys[0]); b = _mm_aesenc_si128(b, keys[1]); b = _mm_aesenc_si128(b, keys[2]); @@ -24,12 +22,12 @@ pub(super) unsafe fn encrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) { b = _mm_aesenc_si128(b, keys[10]); b = _mm_aesenc_si128(b, keys[11]); b = _mm_aesenclast_si128(b, keys[12]); - _mm_storeu_si128(out_ptr as *mut __m128i, b); + _mm_storeu_si128(out_ptr.cast(), b); } #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn encrypt8(keys: &RoundKeys, blocks: InOut<'_, '_, Block8>) { +pub(crate) unsafe fn 
encrypt8(keys: &RoundKeys<13>, blocks: InOut<'_, '_, Block8>) { let (in_ptr, out_ptr) = blocks.into_raw(); let mut b = load8(in_ptr); xor8(&mut b, keys[0]); @@ -50,9 +48,9 @@ pub(super) unsafe fn encrypt8(keys: &RoundKeys, blocks: InOut<'_, '_, Block8>) { #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn decrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) { +pub(crate) unsafe fn decrypt1(keys: &RoundKeys<13>, block: InOut<'_, '_, Block>) { let (in_ptr, out_ptr) = block.into_raw(); - let mut b = _mm_loadu_si128(in_ptr as *const __m128i); + let mut b = _mm_loadu_si128(in_ptr.cast()); b = _mm_xor_si128(b, keys[12]); b = _mm_aesdec_si128(b, keys[11]); b = _mm_aesdec_si128(b, keys[10]); @@ -66,12 +64,12 @@ pub(super) unsafe fn decrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) { b = _mm_aesdec_si128(b, keys[2]); b = _mm_aesdec_si128(b, keys[1]); b = _mm_aesdeclast_si128(b, keys[0]); - _mm_storeu_si128(out_ptr as *mut __m128i, b); + _mm_storeu_si128(out_ptr.cast(), b); } #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn decrypt8(keys: &RoundKeys, blocks: InOut<'_, '_, Block8>) { +pub(crate) unsafe fn decrypt8(keys: &RoundKeys<13>, blocks: InOut<'_, '_, Block8>) { let (in_ptr, out_ptr) = blocks.into_raw(); let mut b = load8(in_ptr); xor8(&mut b, keys[12]); @@ -123,21 +121,21 @@ macro_rules! shuffle { #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn expand_key(key: &[u8; 24]) -> RoundKeys { +pub(crate) unsafe fn expand_key(key: &[u8; 24]) -> RoundKeys<13> { // SAFETY: `RoundKeys` is a `[__m128i; 13]` which can be initialized // with all zeroes. - let mut keys: RoundKeys = mem::zeroed(); + let mut keys: RoundKeys<13> = mem::zeroed(); // we are being extra pedantic here to remove out-of-bound access. // this should be optimized out into movups, movsd sequence // note that unaligned load MUST be used here, even though we read // from the array (compiler missoptimizes aligned load) let (k0, k1l) = { let mut t = [0u8; 32]; - ptr::write(t.as_mut_ptr() as *mut [u8; 24], *key); + ptr::write(t.as_mut_ptr().cast(), *key); ( - _mm_loadu_si128(t.as_ptr() as *const __m128i), - _mm_loadu_si128(t.as_ptr().offset(16) as *const __m128i), + _mm_loadu_si128(t.as_ptr().cast()), + _mm_loadu_si128(t.as_ptr().offset(16).cast()), ) }; @@ -178,7 +176,7 @@ pub(super) unsafe fn expand_key(key: &[u8; 24]) -> RoundKeys { #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn inv_expanded_keys(keys: &RoundKeys) -> RoundKeys { +pub(crate) unsafe fn inv_expanded_keys(keys: &RoundKeys<13>) -> RoundKeys<13> { [ keys[0], _mm_aesimc_si128(keys[1]), diff --git a/aes/src/ni/aes256.rs b/aes/src/x86/ni/aes256.rs similarity index 85% rename from aes/src/ni/aes256.rs rename to aes/src/x86/ni/aes256.rs index bea090ab..11f68b53 100644 --- a/aes/src/ni/aes256.rs +++ b/aes/src/x86/ni/aes256.rs @@ -1,16 +1,14 @@ -use super::{arch::*, utils::*}; +use super::utils::*; +use crate::x86::{arch::*, RoundKeys}; use crate::{Block, Block8}; use cipher::inout::InOut; use core::mem; -/// AES-192 round keys -pub(super) type RoundKeys = [__m128i; 15]; - #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn encrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) { +pub(crate) unsafe fn encrypt1(keys: &RoundKeys<15>, block: InOut<'_, '_, Block>) { let (in_ptr, out_ptr) = block.into_raw(); - let mut b = _mm_loadu_si128(in_ptr as *const __m128i); + let mut b = _mm_loadu_si128(in_ptr.cast()); b = _mm_xor_si128(b, keys[0]); b = _mm_aesenc_si128(b, keys[1]); b = _mm_aesenc_si128(b, 
keys[2]); @@ -26,12 +24,12 @@ pub(super) unsafe fn encrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) { b = _mm_aesenc_si128(b, keys[12]); b = _mm_aesenc_si128(b, keys[13]); b = _mm_aesenclast_si128(b, keys[14]); - _mm_storeu_si128(out_ptr as *mut __m128i, b); + _mm_storeu_si128(out_ptr.cast(), b); } #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn encrypt8(keys: &RoundKeys, blocks: InOut<'_, '_, Block8>) { +pub(crate) unsafe fn encrypt8(keys: &RoundKeys<15>, blocks: InOut<'_, '_, Block8>) { let (in_ptr, out_ptr) = blocks.into_raw(); let mut b = load8(in_ptr); xor8(&mut b, keys[0]); @@ -54,9 +52,9 @@ pub(super) unsafe fn encrypt8(keys: &RoundKeys, blocks: InOut<'_, '_, Block8>) { #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn decrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) { +pub(crate) unsafe fn decrypt1(keys: &RoundKeys<15>, block: InOut<'_, '_, Block>) { let (in_ptr, out_ptr) = block.into_raw(); - let mut b = _mm_loadu_si128(in_ptr as *const __m128i); + let mut b = _mm_loadu_si128(in_ptr.cast()); b = _mm_xor_si128(b, keys[14]); b = _mm_aesdec_si128(b, keys[13]); b = _mm_aesdec_si128(b, keys[12]); @@ -72,12 +70,12 @@ pub(super) unsafe fn decrypt1(keys: &RoundKeys, block: InOut<'_, '_, Block>) { b = _mm_aesdec_si128(b, keys[2]); b = _mm_aesdec_si128(b, keys[1]); b = _mm_aesdeclast_si128(b, keys[0]); - _mm_storeu_si128(out_ptr as *mut __m128i, b); + _mm_storeu_si128(out_ptr.cast(), b); } #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn decrypt8(keys: &RoundKeys, blocks: InOut<'_, '_, Block8>) { +pub(crate) unsafe fn decrypt8(keys: &RoundKeys<15>, blocks: InOut<'_, '_, Block8>) { let (in_ptr, out_ptr) = blocks.into_raw(); let mut b = load8(in_ptr); xor8(&mut b, keys[14]); @@ -153,12 +151,12 @@ macro_rules! expand_round_last { } #[inline(always)] -pub(super) unsafe fn expand_key(key: &[u8; 32]) -> RoundKeys { +pub(crate) unsafe fn expand_key(key: &[u8; 32]) -> RoundKeys<15> { // SAFETY: `RoundKeys` is a `[__m128i; 15]` which can be initialized // with all zeroes. - let mut keys: RoundKeys = mem::zeroed(); + let mut keys: RoundKeys<15> = mem::zeroed(); - let kp = key.as_ptr() as *const __m128i; + let kp = key.as_ptr().cast(); keys[0] = _mm_loadu_si128(kp); keys[1] = _mm_loadu_si128(kp.add(1)); @@ -175,7 +173,7 @@ pub(super) unsafe fn expand_key(key: &[u8; 32]) -> RoundKeys { #[inline] #[target_feature(enable = "aes")] -pub(super) unsafe fn inv_expanded_keys(keys: &RoundKeys) -> RoundKeys { +pub(crate) unsafe fn inv_expanded_keys(keys: &RoundKeys<15>) -> RoundKeys<15> { [ keys[0], _mm_aesimc_si128(keys[1]), diff --git a/aes/src/ni/hazmat.rs b/aes/src/x86/ni/hazmat.rs similarity index 77% rename from aes/src/ni/hazmat.rs rename to aes/src/x86/ni/hazmat.rs index a2a735a3..a5395dc5 100644 --- a/aes/src/ni/hazmat.rs +++ b/aes/src/x86/ni/hazmat.rs @@ -14,10 +14,10 @@ use crate::{Block, Block8}; #[target_feature(enable = "aes")] pub(crate) unsafe fn cipher_round(block: &mut Block, round_key: &Block) { // Safety: `loadu` and `storeu` support unaligned access - let b = _mm_loadu_si128(block.as_ptr() as *const __m128i); - let k = _mm_loadu_si128(round_key.as_ptr() as *const __m128i); + let b = _mm_loadu_si128(block.as_ptr().cast()); + let k = _mm_loadu_si128(round_key.as_ptr().cast()); let out = _mm_aesenc_si128(b, k); - _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, out); + _mm_storeu_si128(block.as_mut_ptr().cast(), out); } /// AES cipher (encrypt) round function: parallel version. 
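For reference, the `cipher_round`/`equiv_inv_cipher_round` intrinsics in this module back the public `aes::hazmat` API (gated behind the `hazmat` crate feature). A minimal usage sketch, not part of this patch and assuming the `hazmat` feature is enabled:

```rust
// Sketch only: drives a single AES round through the public hazmat API,
// which dispatches to the AES-NI intrinsics above (or the soft fallback)
// based on runtime CPU feature detection.
use aes::hazmat::cipher_round;
use aes::Block;

fn one_round(state: &mut Block, round_key: &Block) {
    // One application of SubBytes, ShiftRows, MixColumns and AddRoundKey.
    cipher_round(state, round_key);
}
```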
@@ -37,10 +37,10 @@ pub(crate) unsafe fn cipher_round_par(blocks: &mut Block8, round_keys: &Block8) #[target_feature(enable = "aes")] pub(crate) unsafe fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) { // Safety: `loadu` and `storeu` support unaligned access - let b = _mm_loadu_si128(block.as_ptr() as *const __m128i); - let k = _mm_loadu_si128(round_key.as_ptr() as *const __m128i); + let b = _mm_loadu_si128(block.as_ptr().cast()); + let k = _mm_loadu_si128(round_key.as_ptr().cast()); let out = _mm_aesdec_si128(b, k); - _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, out); + _mm_storeu_si128(block.as_mut_ptr().cast(), out); } /// AES cipher (encrypt) round function: parallel version. @@ -60,21 +60,21 @@ pub(crate) unsafe fn equiv_inv_cipher_round_par(blocks: &mut Block8, round_keys: #[target_feature(enable = "aes")] pub(crate) unsafe fn mix_columns(block: &mut Block) { // Safety: `loadu` and `storeu` support unaligned access - let mut state = _mm_loadu_si128(block.as_ptr() as *const __m128i); + let mut state = _mm_loadu_si128(block.as_ptr().cast()); // Emulate mix columns by performing three inverse mix columns operations state = _mm_aesimc_si128(state); state = _mm_aesimc_si128(state); state = _mm_aesimc_si128(state); - _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, state); + _mm_storeu_si128(block.as_mut_ptr().cast(), state); } /// AES inverse mix columns function. #[target_feature(enable = "aes")] pub(crate) unsafe fn inv_mix_columns(block: &mut Block) { // Safety: `loadu` and `storeu` support unaligned access - let b = _mm_loadu_si128(block.as_ptr() as *const __m128i); + let b = _mm_loadu_si128(block.as_ptr().cast()); let out = _mm_aesimc_si128(b); - _mm_storeu_si128(block.as_mut_ptr() as *mut __m128i, out); + _mm_storeu_si128(block.as_mut_ptr().cast(), out); } diff --git a/aes/src/ni/test_expand.rs b/aes/src/x86/ni/test_expand.rs similarity index 100% rename from aes/src/ni/test_expand.rs rename to aes/src/x86/ni/test_expand.rs diff --git a/aes/src/ni/utils.rs b/aes/src/x86/ni/utils.rs similarity index 64% rename from aes/src/ni/utils.rs rename to aes/src/x86/ni/utils.rs index 1bd6522d..d6c97ca4 100644 --- a/aes/src/ni/utils.rs +++ b/aes/src/x86/ni/utils.rs @@ -20,16 +20,16 @@ pub(crate) fn check(a: &[__m128i], b: &[[u64; 2]]) { #[inline(always)] pub(crate) fn load8(blocks: *const Block8) -> U128x8 { unsafe { - let p = blocks as *const Block; + let p = blocks.cast::(); [ - _mm_loadu_si128(p.add(0) as *const __m128i), - _mm_loadu_si128(p.add(1) as *const __m128i), - _mm_loadu_si128(p.add(2) as *const __m128i), - _mm_loadu_si128(p.add(3) as *const __m128i), - _mm_loadu_si128(p.add(4) as *const __m128i), - _mm_loadu_si128(p.add(5) as *const __m128i), - _mm_loadu_si128(p.add(6) as *const __m128i), - _mm_loadu_si128(p.add(7) as *const __m128i), + _mm_loadu_si128(p.add(0).cast()), + _mm_loadu_si128(p.add(1).cast()), + _mm_loadu_si128(p.add(2).cast()), + _mm_loadu_si128(p.add(3).cast()), + _mm_loadu_si128(p.add(4).cast()), + _mm_loadu_si128(p.add(5).cast()), + _mm_loadu_si128(p.add(6).cast()), + _mm_loadu_si128(p.add(7).cast()), ] } } @@ -37,15 +37,15 @@ pub(crate) fn load8(blocks: *const Block8) -> U128x8 { #[inline(always)] pub(crate) fn store8(blocks: *mut Block8, b: U128x8) { unsafe { - let p = blocks as *mut Block; - _mm_storeu_si128(p.add(0) as *mut __m128i, b[0]); - _mm_storeu_si128(p.add(1) as *mut __m128i, b[1]); - _mm_storeu_si128(p.add(2) as *mut __m128i, b[2]); - _mm_storeu_si128(p.add(3) as *mut __m128i, b[3]); - _mm_storeu_si128(p.add(4) as *mut 
__m128i, b[4]); - _mm_storeu_si128(p.add(5) as *mut __m128i, b[5]); - _mm_storeu_si128(p.add(6) as *mut __m128i, b[6]); - _mm_storeu_si128(p.add(7) as *mut __m128i, b[7]); + let p = blocks.cast::(); + _mm_storeu_si128(p.add(0).cast(), b[0]); + _mm_storeu_si128(p.add(1).cast(), b[1]); + _mm_storeu_si128(p.add(2).cast(), b[2]); + _mm_storeu_si128(p.add(3).cast(), b[3]); + _mm_storeu_si128(p.add(4).cast(), b[4]); + _mm_storeu_si128(p.add(5).cast(), b[5]); + _mm_storeu_si128(p.add(6).cast(), b[6]); + _mm_storeu_si128(p.add(7).cast(), b[7]); } } diff --git a/aes/src/x86/vaes256.rs b/aes/src/x86/vaes256.rs new file mode 100644 index 00000000..abb566f0 --- /dev/null +++ b/aes/src/x86/vaes256.rs @@ -0,0 +1,3 @@ +pub(super) mod aes128; +pub(super) mod aes192; +pub(super) mod aes256; diff --git a/aes/src/x86/vaes256/aes128.rs b/aes/src/x86/vaes256/aes128.rs new file mode 100644 index 00000000..8056d21a --- /dev/null +++ b/aes/src/x86/vaes256/aes128.rs @@ -0,0 +1,533 @@ +use crate::x86::{arch::*, RoundKeys, Simd256RoundKeys}; +use crate::Block30; +use cipher::inout::InOut; +use core::arch::asm; + +#[inline] +pub(crate) unsafe fn parallelize_keys(keys: &RoundKeys<11>) -> Simd256RoundKeys<11> { + [ + _mm256_broadcastsi128_si256(keys[0]), + _mm256_broadcastsi128_si256(keys[1]), + _mm256_broadcastsi128_si256(keys[2]), + _mm256_broadcastsi128_si256(keys[3]), + _mm256_broadcastsi128_si256(keys[4]), + _mm256_broadcastsi128_si256(keys[5]), + _mm256_broadcastsi128_si256(keys[6]), + _mm256_broadcastsi128_si256(keys[7]), + _mm256_broadcastsi128_si256(keys[8]), + _mm256_broadcastsi128_si256(keys[9]), + _mm256_broadcastsi128_si256(keys[10]), + ] +} + +#[target_feature(enable = "avx")] +#[inline] +pub(crate) unsafe fn encrypt30( + simd_256_keys: &Simd256RoundKeys<11>, + blocks: InOut<'_, '_, Block30>, +) { + let (iptr, optr) = blocks.into_raw(); + let iptr = iptr.cast::<__m256i>(); + let optr = optr.cast::<__m256i>(); + + // load plain-data + let mut data0 = iptr.add(0).read_unaligned(); + let mut data1 = iptr.add(1).read_unaligned(); + let mut data2 = iptr.add(2).read_unaligned(); + let mut data3 = iptr.add(3).read_unaligned(); + let mut data4 = iptr.add(4).read_unaligned(); + let mut data5 = iptr.add(5).read_unaligned(); + let mut data6 = iptr.add(6).read_unaligned(); + let mut data7 = iptr.add(7).read_unaligned(); + let mut data8 = iptr.add(8).read_unaligned(); + let mut data9 = iptr.add(9).read_unaligned(); + let mut data10 = iptr.add(10).read_unaligned(); + let mut data11 = iptr.add(11).read_unaligned(); + let mut data12 = iptr.add(12).read_unaligned(); + let mut data13 = iptr.add(13).read_unaligned(); + let mut data14 = iptr.add(14).read_unaligned(); + + asm! 
{ + // aes-128 round 0 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 0 * 32]", + "vpxord ymm1 , ymm1 , ymm0", + "vpxord ymm2 , ymm2 , ymm0", + "vpxord ymm3 , ymm3 , ymm0", + "vpxord ymm4 , ymm4 , ymm0", + "vpxord ymm5 , ymm5 , ymm0", + "vpxord ymm6 , ymm6 , ymm0", + "vpxord ymm7 , ymm7 , ymm0", + "vpxord ymm8 , ymm8 , ymm0", + "vpxord ymm9 , ymm9 , ymm0", + "vpxord ymm10, ymm10, ymm0", + "vpxord ymm11, ymm11, ymm0", + "vpxord ymm12, ymm12, ymm0", + "vpxord ymm13, ymm13, ymm0", + "vpxord ymm14, ymm14, ymm0", + "vpxord ymm15, ymm15, ymm0", + // aes-128 round 1 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 1 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 2 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 2 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 3 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 3 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 4 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 4 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 5 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 5 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 6 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 6 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , 
ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 7 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 7 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 8 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 8 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 9 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 9 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 10 encrypt + "vmovdqu ymm0, [{simd_256_keys} + 10 * 32]", + "vaesenclast ymm1 , ymm1 , ymm0", + "vaesenclast ymm2 , ymm2 , ymm0", + "vaesenclast ymm3 , ymm3 , ymm0", + "vaesenclast ymm4 , ymm4 , ymm0", + "vaesenclast ymm5 , ymm5 , ymm0", + "vaesenclast ymm6 , ymm6 , ymm0", + "vaesenclast ymm7 , ymm7 , ymm0", + "vaesenclast ymm8 , ymm8 , ymm0", + "vaesenclast ymm9 , ymm9 , ymm0", + "vaesenclast ymm10, ymm10, ymm0", + "vaesenclast ymm11, ymm11, ymm0", + "vaesenclast ymm12, ymm12, ymm0", + "vaesenclast ymm13, ymm13, ymm0", + "vaesenclast ymm14, ymm14, ymm0", + "vaesenclast ymm15, ymm15, ymm0", + + simd_256_keys = in(reg) simd_256_keys.as_ptr(), + + out("ymm0") _, + inout("ymm1") data0, + inout("ymm2") data1, + inout("ymm3") data2, + inout("ymm4") data3, + inout("ymm5") data4, + inout("ymm6") data5, + inout("ymm7") data6, + inout("ymm8") data7, + inout("ymm9") data8, + inout("ymm10") data9, + inout("ymm11") data10, + inout("ymm12") data11, + inout("ymm13") data12, + inout("ymm14") data13, + inout("ymm15") data14, + + options(pure, readonly, nostack, preserves_flags), + }; + + // save cipher-data + optr.add(0).write_unaligned(data0); + optr.add(1).write_unaligned(data1); + optr.add(2).write_unaligned(data2); + optr.add(3).write_unaligned(data3); + optr.add(4).write_unaligned(data4); + optr.add(5).write_unaligned(data5); + optr.add(6).write_unaligned(data6); + optr.add(7).write_unaligned(data7); + optr.add(8).write_unaligned(data8); + optr.add(9).write_unaligned(data9); + optr.add(10).write_unaligned(data10); + 
optr.add(11).write_unaligned(data11); + optr.add(12).write_unaligned(data12); + optr.add(13).write_unaligned(data13); + optr.add(14).write_unaligned(data14); +} + +#[target_feature(enable = "avx")] +#[inline] +pub(crate) unsafe fn decrypt30( + simd_256_keys: &Simd256RoundKeys<11>, + blocks: InOut<'_, '_, Block30>, +) { + let (iptr, optr) = blocks.into_raw(); + let iptr = iptr.cast::<__m256i>(); + let optr = optr.cast::<__m256i>(); + + // load cipher-data + let mut data0 = iptr.add(0).read_unaligned(); + let mut data1 = iptr.add(1).read_unaligned(); + let mut data2 = iptr.add(2).read_unaligned(); + let mut data3 = iptr.add(3).read_unaligned(); + let mut data4 = iptr.add(4).read_unaligned(); + let mut data5 = iptr.add(5).read_unaligned(); + let mut data6 = iptr.add(6).read_unaligned(); + let mut data7 = iptr.add(7).read_unaligned(); + let mut data8 = iptr.add(8).read_unaligned(); + let mut data9 = iptr.add(9).read_unaligned(); + let mut data10 = iptr.add(10).read_unaligned(); + let mut data11 = iptr.add(11).read_unaligned(); + let mut data12 = iptr.add(12).read_unaligned(); + let mut data13 = iptr.add(13).read_unaligned(); + let mut data14 = iptr.add(14).read_unaligned(); + + asm! { + // aes-128 round 10 decrypt + "vmovdqu ymm0, [{simd_256_keys} + 10 * 32]", + "vpxord ymm1 , ymm1 , ymm0", + "vpxord ymm2 , ymm2 , ymm0", + "vpxord ymm3 , ymm3 , ymm0", + "vpxord ymm4 , ymm4 , ymm0", + "vpxord ymm5 , ymm5 , ymm0", + "vpxord ymm6 , ymm6 , ymm0", + "vpxord ymm7 , ymm7 , ymm0", + "vpxord ymm8 , ymm8 , ymm0", + "vpxord ymm9 , ymm9 , ymm0", + "vpxord ymm10, ymm10, ymm0", + "vpxord ymm11, ymm11, ymm0", + "vpxord ymm12, ymm12, ymm0", + "vpxord ymm13, ymm13, ymm0", + "vpxord ymm14, ymm14, ymm0", + "vpxord ymm15, ymm15, ymm0", + // aes-128 round 9 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 9 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 8 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 8 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 7 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 7 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 6 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 6 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + 
"vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 5 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 5 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 4 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 4 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 3 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 3 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 2 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 2 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 1 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 1 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-128 round 0 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 0 * 32]", + "vaesdeclast ymm1 , ymm1 , ymm0", + "vaesdeclast ymm2 , ymm2 , ymm0", + "vaesdeclast ymm3 , ymm3 , ymm0", + "vaesdeclast ymm4 , ymm4 , ymm0", + "vaesdeclast ymm5 , ymm5 , ymm0", + "vaesdeclast ymm6 , ymm6 , ymm0", + "vaesdeclast ymm7 , ymm7 , ymm0", + "vaesdeclast ymm8 , ymm8 , 
ymm0", + "vaesdeclast ymm9 , ymm9 , ymm0", + "vaesdeclast ymm10, ymm10, ymm0", + "vaesdeclast ymm11, ymm11, ymm0", + "vaesdeclast ymm12, ymm12, ymm0", + "vaesdeclast ymm13, ymm13, ymm0", + "vaesdeclast ymm14, ymm14, ymm0", + "vaesdeclast ymm15, ymm15, ymm0", + + simd_256_keys = in(reg) simd_256_keys.as_ptr(), + + out("ymm0") _, + inout("ymm1") data0, + inout("ymm2") data1, + inout("ymm3") data2, + inout("ymm4") data3, + inout("ymm5") data4, + inout("ymm6") data5, + inout("ymm7") data6, + inout("ymm8") data7, + inout("ymm9") data8, + inout("ymm10") data9, + inout("ymm11") data10, + inout("ymm12") data11, + inout("ymm13") data12, + inout("ymm14") data13, + inout("ymm15") data14, + + options(pure, readonly, nostack, preserves_flags), + }; + + // save plain-data + optr.add(0).write_unaligned(data0); + optr.add(1).write_unaligned(data1); + optr.add(2).write_unaligned(data2); + optr.add(3).write_unaligned(data3); + optr.add(4).write_unaligned(data4); + optr.add(5).write_unaligned(data5); + optr.add(6).write_unaligned(data6); + optr.add(7).write_unaligned(data7); + optr.add(8).write_unaligned(data8); + optr.add(9).write_unaligned(data9); + optr.add(10).write_unaligned(data10); + optr.add(11).write_unaligned(data11); + optr.add(12).write_unaligned(data12); + optr.add(13).write_unaligned(data13); + optr.add(14).write_unaligned(data14); +} diff --git a/aes/src/x86/vaes256/aes192.rs b/aes/src/x86/vaes256/aes192.rs new file mode 100644 index 00000000..a7557cbd --- /dev/null +++ b/aes/src/x86/vaes256/aes192.rs @@ -0,0 +1,603 @@ +use crate::x86::{arch::*, RoundKeys, Simd256RoundKeys}; +use crate::Block30; +use cipher::inout::InOut; +use core::arch::asm; + +#[inline] +pub(crate) unsafe fn parallelize_keys(keys: &RoundKeys<13>) -> Simd256RoundKeys<13> { + [ + _mm256_broadcastsi128_si256(keys[0]), + _mm256_broadcastsi128_si256(keys[1]), + _mm256_broadcastsi128_si256(keys[2]), + _mm256_broadcastsi128_si256(keys[3]), + _mm256_broadcastsi128_si256(keys[4]), + _mm256_broadcastsi128_si256(keys[5]), + _mm256_broadcastsi128_si256(keys[6]), + _mm256_broadcastsi128_si256(keys[7]), + _mm256_broadcastsi128_si256(keys[8]), + _mm256_broadcastsi128_si256(keys[9]), + _mm256_broadcastsi128_si256(keys[10]), + _mm256_broadcastsi128_si256(keys[11]), + _mm256_broadcastsi128_si256(keys[12]), + ] +} + +#[target_feature(enable = "avx")] +#[inline] +pub(crate) unsafe fn encrypt30( + simd_256_keys: &Simd256RoundKeys<13>, + blocks: InOut<'_, '_, Block30>, +) { + let (iptr, optr) = blocks.into_raw(); + let iptr = iptr.cast::<__m256i>(); + let optr = optr.cast::<__m256i>(); + + // load plain-data + let mut data0 = iptr.add(0).read_unaligned(); + let mut data1 = iptr.add(1).read_unaligned(); + let mut data2 = iptr.add(2).read_unaligned(); + let mut data3 = iptr.add(3).read_unaligned(); + let mut data4 = iptr.add(4).read_unaligned(); + let mut data5 = iptr.add(5).read_unaligned(); + let mut data6 = iptr.add(6).read_unaligned(); + let mut data7 = iptr.add(7).read_unaligned(); + let mut data8 = iptr.add(8).read_unaligned(); + let mut data9 = iptr.add(9).read_unaligned(); + let mut data10 = iptr.add(10).read_unaligned(); + let mut data11 = iptr.add(11).read_unaligned(); + let mut data12 = iptr.add(12).read_unaligned(); + let mut data13 = iptr.add(13).read_unaligned(); + let mut data14 = iptr.add(14).read_unaligned(); + + asm! 
{ + // aes-128 round 0 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 0 * 32]", + "vpxord ymm1 , ymm1 , ymm0", + "vpxord ymm2 , ymm2 , ymm0", + "vpxord ymm3 , ymm3 , ymm0", + "vpxord ymm4 , ymm4 , ymm0", + "vpxord ymm5 , ymm5 , ymm0", + "vpxord ymm6 , ymm6 , ymm0", + "vpxord ymm7 , ymm7 , ymm0", + "vpxord ymm8 , ymm8 , ymm0", + "vpxord ymm9 , ymm9 , ymm0", + "vpxord ymm10, ymm10, ymm0", + "vpxord ymm11, ymm11, ymm0", + "vpxord ymm12, ymm12, ymm0", + "vpxord ymm13, ymm13, ymm0", + "vpxord ymm14, ymm14, ymm0", + "vpxord ymm15, ymm15, ymm0", + // aes-128 round 1 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 1 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 2 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 2 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 3 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 3 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 4 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 4 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 5 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 5 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 6 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 6 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , 
ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 7 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 7 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 8 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 8 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 9 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 9 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 10 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 10 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 11 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 11 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 12 encrypt + "vmovdqu ymm0, [{simd_256_keys} + 12 * 32]", + "vaesenclast ymm1 , ymm1 , ymm0", + "vaesenclast ymm2 , ymm2 , ymm0", + "vaesenclast ymm3 , ymm3 , ymm0", + "vaesenclast ymm4 , ymm4 , ymm0", + "vaesenclast ymm5 , ymm5 , ymm0", + "vaesenclast ymm6 , ymm6 , ymm0", + "vaesenclast ymm7 , ymm7 , ymm0", + "vaesenclast ymm8 , ymm8 , ymm0", + "vaesenclast ymm9 , ymm9 , ymm0", + "vaesenclast ymm10, ymm10, ymm0", + 
"vaesenclast ymm11, ymm11, ymm0", + "vaesenclast ymm12, ymm12, ymm0", + "vaesenclast ymm13, ymm13, ymm0", + "vaesenclast ymm14, ymm14, ymm0", + "vaesenclast ymm15, ymm15, ymm0", + + simd_256_keys = in(reg) simd_256_keys.as_ptr(), + + out("ymm0") _, + inout("ymm1") data0, + inout("ymm2") data1, + inout("ymm3") data2, + inout("ymm4") data3, + inout("ymm5") data4, + inout("ymm6") data5, + inout("ymm7") data6, + inout("ymm8") data7, + inout("ymm9") data8, + inout("ymm10") data9, + inout("ymm11") data10, + inout("ymm12") data11, + inout("ymm13") data12, + inout("ymm14") data13, + inout("ymm15") data14, + + options(pure, readonly, nostack, preserves_flags), + }; + + // save cipher-data + optr.add(0).write_unaligned(data0); + optr.add(1).write_unaligned(data1); + optr.add(2).write_unaligned(data2); + optr.add(3).write_unaligned(data3); + optr.add(4).write_unaligned(data4); + optr.add(5).write_unaligned(data5); + optr.add(6).write_unaligned(data6); + optr.add(7).write_unaligned(data7); + optr.add(8).write_unaligned(data8); + optr.add(9).write_unaligned(data9); + optr.add(10).write_unaligned(data10); + optr.add(11).write_unaligned(data11); + optr.add(12).write_unaligned(data12); + optr.add(13).write_unaligned(data13); + optr.add(14).write_unaligned(data14); +} + +#[target_feature(enable = "avx")] +#[inline] +pub(crate) unsafe fn decrypt30( + simd_256_keys: &Simd256RoundKeys<13>, + blocks: InOut<'_, '_, Block30>, +) { + let (iptr, optr) = blocks.into_raw(); + let iptr = iptr.cast::<__m256i>(); + let optr = optr.cast::<__m256i>(); + + // load cipher-data + let mut data0 = iptr.add(0).read_unaligned(); + let mut data1 = iptr.add(1).read_unaligned(); + let mut data2 = iptr.add(2).read_unaligned(); + let mut data3 = iptr.add(3).read_unaligned(); + let mut data4 = iptr.add(4).read_unaligned(); + let mut data5 = iptr.add(5).read_unaligned(); + let mut data6 = iptr.add(6).read_unaligned(); + let mut data7 = iptr.add(7).read_unaligned(); + let mut data8 = iptr.add(8).read_unaligned(); + let mut data9 = iptr.add(9).read_unaligned(); + let mut data10 = iptr.add(10).read_unaligned(); + let mut data11 = iptr.add(11).read_unaligned(); + let mut data12 = iptr.add(12).read_unaligned(); + let mut data13 = iptr.add(13).read_unaligned(); + let mut data14 = iptr.add(14).read_unaligned(); + + asm! 
{ + // aes-192 round 12 decrypt + "vmovdqu ymm0, [{simd_256_keys} + 12 * 32]", + "vpxord ymm1 , ymm1 , ymm0", + "vpxord ymm2 , ymm2 , ymm0", + "vpxord ymm3 , ymm3 , ymm0", + "vpxord ymm4 , ymm4 , ymm0", + "vpxord ymm5 , ymm5 , ymm0", + "vpxord ymm6 , ymm6 , ymm0", + "vpxord ymm7 , ymm7 , ymm0", + "vpxord ymm8 , ymm8 , ymm0", + "vpxord ymm9 , ymm9 , ymm0", + "vpxord ymm10, ymm10, ymm0", + "vpxord ymm11, ymm11, ymm0", + "vpxord ymm12, ymm12, ymm0", + "vpxord ymm13, ymm13, ymm0", + "vpxord ymm14, ymm14, ymm0", + "vpxord ymm15, ymm15, ymm0", + // aes-192 round 11 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 11 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 10 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 10 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 9 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 9 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 8 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 8 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 7 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 7 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 6 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 6 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 
, ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 5 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 5 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 4 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 4 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 3 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 3 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 2 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 2 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 1 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 1 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 0 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 0 * 32]", + "vaesdeclast ymm1 , ymm1 , ymm0", + "vaesdeclast ymm2 , ymm2 , ymm0", + "vaesdeclast ymm3 , ymm3 , ymm0", + "vaesdeclast ymm4 , ymm4 , ymm0", + "vaesdeclast ymm5 , ymm5 , ymm0", + "vaesdeclast ymm6 , ymm6 , ymm0", + "vaesdeclast ymm7 , ymm7 , ymm0", + "vaesdeclast ymm8 , ymm8 , ymm0", + "vaesdeclast ymm9 , ymm9 , ymm0", + "vaesdeclast ymm10, ymm10, ymm0", + 
"vaesdeclast ymm11, ymm11, ymm0", + "vaesdeclast ymm12, ymm12, ymm0", + "vaesdeclast ymm13, ymm13, ymm0", + "vaesdeclast ymm14, ymm14, ymm0", + "vaesdeclast ymm15, ymm15, ymm0", + + simd_256_keys = in(reg) simd_256_keys.as_ptr(), + + out("ymm0") _, + inout("ymm1") data0, + inout("ymm2") data1, + inout("ymm3") data2, + inout("ymm4") data3, + inout("ymm5") data4, + inout("ymm6") data5, + inout("ymm7") data6, + inout("ymm8") data7, + inout("ymm9") data8, + inout("ymm10") data9, + inout("ymm11") data10, + inout("ymm12") data11, + inout("ymm13") data12, + inout("ymm14") data13, + inout("ymm15") data14, + + options(pure, readonly, nostack, preserves_flags), + }; + + // save plain-data + optr.add(0).write_unaligned(data0); + optr.add(1).write_unaligned(data1); + optr.add(2).write_unaligned(data2); + optr.add(3).write_unaligned(data3); + optr.add(4).write_unaligned(data4); + optr.add(5).write_unaligned(data5); + optr.add(6).write_unaligned(data6); + optr.add(7).write_unaligned(data7); + optr.add(8).write_unaligned(data8); + optr.add(9).write_unaligned(data9); + optr.add(10).write_unaligned(data10); + optr.add(11).write_unaligned(data11); + optr.add(12).write_unaligned(data12); + optr.add(13).write_unaligned(data13); + optr.add(14).write_unaligned(data14); +} diff --git a/aes/src/x86/vaes256/aes256.rs b/aes/src/x86/vaes256/aes256.rs new file mode 100644 index 00000000..3149e615 --- /dev/null +++ b/aes/src/x86/vaes256/aes256.rs @@ -0,0 +1,673 @@ +use crate::x86::{arch::*, RoundKeys, Simd256RoundKeys}; +use crate::Block30; +use cipher::inout::InOut; +use core::arch::asm; + +#[inline] +pub(crate) unsafe fn parallelize_keys(keys: &RoundKeys<15>) -> Simd256RoundKeys<15> { + [ + _mm256_broadcastsi128_si256(keys[0]), + _mm256_broadcastsi128_si256(keys[1]), + _mm256_broadcastsi128_si256(keys[2]), + _mm256_broadcastsi128_si256(keys[3]), + _mm256_broadcastsi128_si256(keys[4]), + _mm256_broadcastsi128_si256(keys[5]), + _mm256_broadcastsi128_si256(keys[6]), + _mm256_broadcastsi128_si256(keys[7]), + _mm256_broadcastsi128_si256(keys[8]), + _mm256_broadcastsi128_si256(keys[9]), + _mm256_broadcastsi128_si256(keys[10]), + _mm256_broadcastsi128_si256(keys[11]), + _mm256_broadcastsi128_si256(keys[12]), + _mm256_broadcastsi128_si256(keys[13]), + _mm256_broadcastsi128_si256(keys[14]), + ] +} + +#[target_feature(enable = "avx")] +#[inline] +pub(crate) unsafe fn encrypt30( + simd_256_keys: &Simd256RoundKeys<15>, + blocks: InOut<'_, '_, Block30>, +) { + let (iptr, optr) = blocks.into_raw(); + let iptr = iptr.cast::<__m256i>(); + let optr = optr.cast::<__m256i>(); + + // load plain-data + let mut data0 = iptr.add(0).read_unaligned(); + let mut data1 = iptr.add(1).read_unaligned(); + let mut data2 = iptr.add(2).read_unaligned(); + let mut data3 = iptr.add(3).read_unaligned(); + let mut data4 = iptr.add(4).read_unaligned(); + let mut data5 = iptr.add(5).read_unaligned(); + let mut data6 = iptr.add(6).read_unaligned(); + let mut data7 = iptr.add(7).read_unaligned(); + let mut data8 = iptr.add(8).read_unaligned(); + let mut data9 = iptr.add(9).read_unaligned(); + let mut data10 = iptr.add(10).read_unaligned(); + let mut data11 = iptr.add(11).read_unaligned(); + let mut data12 = iptr.add(12).read_unaligned(); + let mut data13 = iptr.add(13).read_unaligned(); + let mut data14 = iptr.add(14).read_unaligned(); + + asm! 
{ + // aes-128 round 0 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 0 * 32]", + "vpxord ymm1 , ymm1 , ymm0", + "vpxord ymm2 , ymm2 , ymm0", + "vpxord ymm3 , ymm3 , ymm0", + "vpxord ymm4 , ymm4 , ymm0", + "vpxord ymm5 , ymm5 , ymm0", + "vpxord ymm6 , ymm6 , ymm0", + "vpxord ymm7 , ymm7 , ymm0", + "vpxord ymm8 , ymm8 , ymm0", + "vpxord ymm9 , ymm9 , ymm0", + "vpxord ymm10, ymm10, ymm0", + "vpxord ymm11, ymm11, ymm0", + "vpxord ymm12, ymm12, ymm0", + "vpxord ymm13, ymm13, ymm0", + "vpxord ymm14, ymm14, ymm0", + "vpxord ymm15, ymm15, ymm0", + // aes-128 round 1 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 1 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 2 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 2 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 3 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 3 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 4 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 4 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 5 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 5 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 6 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 6 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , 
ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 7 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 7 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 8 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 8 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-128 round 9 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 9 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 10 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 10 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 11 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 11 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 12 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 12 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc 
ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 13 encrypt + "vmovdqu ymm0 , [{simd_256_keys} + 13 * 32]", + "vaesenc ymm1 , ymm1 , ymm0", + "vaesenc ymm2 , ymm2 , ymm0", + "vaesenc ymm3 , ymm3 , ymm0", + "vaesenc ymm4 , ymm4 , ymm0", + "vaesenc ymm5 , ymm5 , ymm0", + "vaesenc ymm6 , ymm6 , ymm0", + "vaesenc ymm7 , ymm7 , ymm0", + "vaesenc ymm8 , ymm8 , ymm0", + "vaesenc ymm9 , ymm9 , ymm0", + "vaesenc ymm10, ymm10, ymm0", + "vaesenc ymm11, ymm11, ymm0", + "vaesenc ymm12, ymm12, ymm0", + "vaesenc ymm13, ymm13, ymm0", + "vaesenc ymm14, ymm14, ymm0", + "vaesenc ymm15, ymm15, ymm0", + // aes-192 round 14 encrypt + "vmovdqu ymm0, [{simd_256_keys} + 14 * 32]", + "vaesenclast ymm1 , ymm1 , ymm0", + "vaesenclast ymm2 , ymm2 , ymm0", + "vaesenclast ymm3 , ymm3 , ymm0", + "vaesenclast ymm4 , ymm4 , ymm0", + "vaesenclast ymm5 , ymm5 , ymm0", + "vaesenclast ymm6 , ymm6 , ymm0", + "vaesenclast ymm7 , ymm7 , ymm0", + "vaesenclast ymm8 , ymm8 , ymm0", + "vaesenclast ymm9 , ymm9 , ymm0", + "vaesenclast ymm10, ymm10, ymm0", + "vaesenclast ymm11, ymm11, ymm0", + "vaesenclast ymm12, ymm12, ymm0", + "vaesenclast ymm13, ymm13, ymm0", + "vaesenclast ymm14, ymm14, ymm0", + "vaesenclast ymm15, ymm15, ymm0", + + simd_256_keys = in(reg) simd_256_keys.as_ptr(), + + out("ymm0") _, + inout("ymm1") data0, + inout("ymm2") data1, + inout("ymm3") data2, + inout("ymm4") data3, + inout("ymm5") data4, + inout("ymm6") data5, + inout("ymm7") data6, + inout("ymm8") data7, + inout("ymm9") data8, + inout("ymm10") data9, + inout("ymm11") data10, + inout("ymm12") data11, + inout("ymm13") data12, + inout("ymm14") data13, + inout("ymm15") data14, + + options(pure, readonly, nostack, preserves_flags), + }; + + // save cipher-data + optr.add(0).write_unaligned(data0); + optr.add(1).write_unaligned(data1); + optr.add(2).write_unaligned(data2); + optr.add(3).write_unaligned(data3); + optr.add(4).write_unaligned(data4); + optr.add(5).write_unaligned(data5); + optr.add(6).write_unaligned(data6); + optr.add(7).write_unaligned(data7); + optr.add(8).write_unaligned(data8); + optr.add(9).write_unaligned(data9); + optr.add(10).write_unaligned(data10); + optr.add(11).write_unaligned(data11); + optr.add(12).write_unaligned(data12); + optr.add(13).write_unaligned(data13); + optr.add(14).write_unaligned(data14); +} + +#[target_feature(enable = "avx")] +#[inline] +pub(crate) unsafe fn decrypt30( + simd_256_keys: &Simd256RoundKeys<15>, + blocks: InOut<'_, '_, Block30>, +) { + let (iptr, optr) = blocks.into_raw(); + let iptr = iptr.cast::<__m256i>(); + let optr = optr.cast::<__m256i>(); + + // load cipher-data + let mut data0 = iptr.add(0).read_unaligned(); + let mut data1 = iptr.add(1).read_unaligned(); + let mut data2 = iptr.add(2).read_unaligned(); + let mut data3 = iptr.add(3).read_unaligned(); + let mut data4 = iptr.add(4).read_unaligned(); + let mut data5 = iptr.add(5).read_unaligned(); + let mut data6 = iptr.add(6).read_unaligned(); + let mut data7 = iptr.add(7).read_unaligned(); + let mut data8 = iptr.add(8).read_unaligned(); + let mut data9 = iptr.add(9).read_unaligned(); + let mut data10 = iptr.add(10).read_unaligned(); + let mut data11 = iptr.add(11).read_unaligned(); + let mut data12 = iptr.add(12).read_unaligned(); + let mut data13 = iptr.add(13).read_unaligned(); + let mut data14 = iptr.add(14).read_unaligned(); + + asm! 
{ + // aes-192 round 14 decrypt + "vmovdqu ymm0, [{simd_256_keys} + 14 * 32]", + "vpxord ymm1 , ymm1 , ymm0", + "vpxord ymm2 , ymm2 , ymm0", + "vpxord ymm3 , ymm3 , ymm0", + "vpxord ymm4 , ymm4 , ymm0", + "vpxord ymm5 , ymm5 , ymm0", + "vpxord ymm6 , ymm6 , ymm0", + "vpxord ymm7 , ymm7 , ymm0", + "vpxord ymm8 , ymm8 , ymm0", + "vpxord ymm9 , ymm9 , ymm0", + "vpxord ymm10, ymm10, ymm0", + "vpxord ymm11, ymm11, ymm0", + "vpxord ymm12, ymm12, ymm0", + "vpxord ymm13, ymm13, ymm0", + "vpxord ymm14, ymm14, ymm0", + "vpxord ymm15, ymm15, ymm0", + // aes-192 round 13 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 13 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 12 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 12 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 11 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 11 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 10 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 10 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 9 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 9 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 8 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 8 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , 
ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 7 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 7 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 6 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 6 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 5 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 5 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 4 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 4 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 3 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 3 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 2 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 2 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec 
ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 1 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 1 * 32]", + "vaesdec ymm1 , ymm1 , ymm0", + "vaesdec ymm2 , ymm2 , ymm0", + "vaesdec ymm3 , ymm3 , ymm0", + "vaesdec ymm4 , ymm4 , ymm0", + "vaesdec ymm5 , ymm5 , ymm0", + "vaesdec ymm6 , ymm6 , ymm0", + "vaesdec ymm7 , ymm7 , ymm0", + "vaesdec ymm8 , ymm8 , ymm0", + "vaesdec ymm9 , ymm9 , ymm0", + "vaesdec ymm10, ymm10, ymm0", + "vaesdec ymm11, ymm11, ymm0", + "vaesdec ymm12, ymm12, ymm0", + "vaesdec ymm13, ymm13, ymm0", + "vaesdec ymm14, ymm14, ymm0", + "vaesdec ymm15, ymm15, ymm0", + // aes-192 round 0 decrypt + "vmovdqu ymm0 , [{simd_256_keys} + 0 * 32]", + "vaesdeclast ymm1 , ymm1 , ymm0", + "vaesdeclast ymm2 , ymm2 , ymm0", + "vaesdeclast ymm3 , ymm3 , ymm0", + "vaesdeclast ymm4 , ymm4 , ymm0", + "vaesdeclast ymm5 , ymm5 , ymm0", + "vaesdeclast ymm6 , ymm6 , ymm0", + "vaesdeclast ymm7 , ymm7 , ymm0", + "vaesdeclast ymm8 , ymm8 , ymm0", + "vaesdeclast ymm9 , ymm9 , ymm0", + "vaesdeclast ymm10, ymm10, ymm0", + "vaesdeclast ymm11, ymm11, ymm0", + "vaesdeclast ymm12, ymm12, ymm0", + "vaesdeclast ymm13, ymm13, ymm0", + "vaesdeclast ymm14, ymm14, ymm0", + "vaesdeclast ymm15, ymm15, ymm0", + + simd_256_keys = in(reg) simd_256_keys.as_ptr(), + + out("ymm0") _, + inout("ymm1") data0, + inout("ymm2") data1, + inout("ymm3") data2, + inout("ymm4") data3, + inout("ymm5") data4, + inout("ymm6") data5, + inout("ymm7") data6, + inout("ymm8") data7, + inout("ymm9") data8, + inout("ymm10") data9, + inout("ymm11") data10, + inout("ymm12") data11, + inout("ymm13") data12, + inout("ymm14") data13, + inout("ymm15") data14, + + options(pure, readonly, nostack, preserves_flags), + }; + + // save plain-data + optr.add(0).write_unaligned(data0); + optr.add(1).write_unaligned(data1); + optr.add(2).write_unaligned(data2); + optr.add(3).write_unaligned(data3); + optr.add(4).write_unaligned(data4); + optr.add(5).write_unaligned(data5); + optr.add(6).write_unaligned(data6); + optr.add(7).write_unaligned(data7); + optr.add(8).write_unaligned(data8); + optr.add(9).write_unaligned(data9); + optr.add(10).write_unaligned(data10); + optr.add(11).write_unaligned(data11); + optr.add(12).write_unaligned(data12); + optr.add(13).write_unaligned(data13); + optr.add(14).write_unaligned(data14); +} diff --git a/aes/src/x86/vaes512.rs b/aes/src/x86/vaes512.rs new file mode 100644 index 00000000..abb566f0 --- /dev/null +++ b/aes/src/x86/vaes512.rs @@ -0,0 +1,3 @@ +pub(super) mod aes128; +pub(super) mod aes192; +pub(super) mod aes256; diff --git a/aes/src/x86/vaes512/aes128.rs b/aes/src/x86/vaes512/aes128.rs new file mode 100644 index 00000000..a732e001 --- /dev/null +++ b/aes/src/x86/vaes512/aes128.rs @@ -0,0 +1,602 @@ +use crate::x86::{arch::*, RoundKeys, Simd512RoundKeys}; +use crate::Block64; +use cipher::inout::InOut; +use core::{arch::asm, mem::MaybeUninit}; + +#[inline] +pub(crate) unsafe fn parallelize_keys(keys: &RoundKeys<11>) -> Simd512RoundKeys<11> { + let mut v512: [MaybeUninit<__m512i>; 11] = MaybeUninit::uninit().assume_init(); + asm! 
{ + "vbroadcasti32x4 zmm0 , [{keys} + 0 * 16]", + "vbroadcasti32x4 zmm1 , [{keys} + 1 * 16]", + "vbroadcasti32x4 zmm2 , [{keys} + 2 * 16]", + "vbroadcasti32x4 zmm3 , [{keys} + 3 * 16]", + "vbroadcasti32x4 zmm4 , [{keys} + 4 * 16]", + "vbroadcasti32x4 zmm5 , [{keys} + 5 * 16]", + "vbroadcasti32x4 zmm6 , [{keys} + 6 * 16]", + "vbroadcasti32x4 zmm7 , [{keys} + 7 * 16]", + "vbroadcasti32x4 zmm8 , [{keys} + 8 * 16]", + "vbroadcasti32x4 zmm9 , [{keys} + 9 * 16]", + "vbroadcasti32x4 zmm10, [{keys} + 10 * 16]", + + "vmovdqu32 [{optr} + 0 * 64], zmm0", + "vmovdqu32 [{optr} + 1 * 64], zmm1", + "vmovdqu32 [{optr} + 2 * 64], zmm2", + "vmovdqu32 [{optr} + 3 * 64], zmm3", + "vmovdqu32 [{optr} + 4 * 64], zmm4", + "vmovdqu32 [{optr} + 5 * 64], zmm5", + "vmovdqu32 [{optr} + 6 * 64], zmm6", + "vmovdqu32 [{optr} + 7 * 64], zmm7", + "vmovdqu32 [{optr} + 8 * 64], zmm8", + "vmovdqu32 [{optr} + 9 * 64], zmm9", + "vmovdqu32 [{optr} + 10 * 64], zmm10", + + keys = in(reg) keys.as_ptr(), + optr = in(reg) v512.as_mut_ptr().cast::<__m512i>(), + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + + options(nostack, preserves_flags), + }; + core::mem::transmute(v512) +} + +#[inline] +pub(crate) unsafe fn encrypt64(keys: &Simd512RoundKeys<11>, blocks: InOut<'_, '_, Block64>) { + let (iptr, optr) = blocks.into_raw(); + asm! { + // load keys + "vmovdqu32 zmm0 , [{keys} + 0 * 64]", + "vmovdqu32 zmm1 , [{keys} + 1 * 64]", + "vmovdqu32 zmm2 , [{keys} + 2 * 64]", + "vmovdqu32 zmm3 , [{keys} + 3 * 64]", + "vmovdqu32 zmm4 , [{keys} + 4 * 64]", + "vmovdqu32 zmm5 , [{keys} + 5 * 64]", + "vmovdqu32 zmm6 , [{keys} + 6 * 64]", + "vmovdqu32 zmm7 , [{keys} + 7 * 64]", + "vmovdqu32 zmm8 , [{keys} + 8 * 64]", + "vmovdqu32 zmm9 , [{keys} + 9 * 64]", + "vmovdqu32 zmm10, [{keys} + 10 * 64]", + // load plain-data + "vmovdqu32 zmm16, [{iptr} + 0 * 64]", + "vmovdqu32 zmm17, [{iptr} + 1 * 64]", + "vmovdqu32 zmm18, [{iptr} + 2 * 64]", + "vmovdqu32 zmm19, [{iptr} + 3 * 64]", + "vmovdqu32 zmm20, [{iptr} + 4 * 64]", + "vmovdqu32 zmm21, [{iptr} + 5 * 64]", + "vmovdqu32 zmm22, [{iptr} + 6 * 64]", + "vmovdqu32 zmm23, [{iptr} + 7 * 64]", + "vmovdqu32 zmm24, [{iptr} + 8 * 64]", + "vmovdqu32 zmm25, [{iptr} + 9 * 64]", + "vmovdqu32 zmm26, [{iptr} + 10 * 64]", + "vmovdqu32 zmm27, [{iptr} + 11 * 64]", + "vmovdqu32 zmm28, [{iptr} + 12 * 64]", + "vmovdqu32 zmm29, [{iptr} + 13 * 64]", + "vmovdqu32 zmm30, [{iptr} + 14 * 64]", + "vmovdqu32 zmm31, [{iptr} + 15 * 64]", + // aes-128 round 0 encrypt + "vpxord zmm16, zmm16, zmm0", + "vpxord zmm17, zmm17, zmm0", + "vpxord zmm18, zmm18, zmm0", + "vpxord zmm19, zmm19, zmm0", + "vpxord zmm20, zmm20, zmm0", + "vpxord zmm21, zmm21, zmm0", + "vpxord zmm22, zmm22, zmm0", + "vpxord zmm23, zmm23, zmm0", + "vpxord zmm24, zmm24, zmm0", + "vpxord zmm25, zmm25, zmm0", + "vpxord zmm26, zmm26, zmm0", + "vpxord zmm27, zmm27, zmm0", + "vpxord zmm28, zmm28, zmm0", + "vpxord zmm29, zmm29, zmm0", + "vpxord zmm30, zmm30, zmm0", + "vpxord zmm31, zmm31, zmm0", + // aes-128 round 1 encrypt + "vaesenc zmm16, zmm16, zmm1", + "vaesenc zmm17, zmm17, zmm1", + "vaesenc zmm18, zmm18, zmm1", + "vaesenc zmm19, zmm19, zmm1", + "vaesenc zmm20, zmm20, zmm1", + "vaesenc zmm21, zmm21, zmm1", + "vaesenc zmm22, zmm22, zmm1", + "vaesenc zmm23, zmm23, zmm1", + "vaesenc zmm24, zmm24, zmm1", + "vaesenc zmm25, zmm25, zmm1", + "vaesenc zmm26, zmm26, zmm1", + "vaesenc zmm27, zmm27, zmm1", + "vaesenc zmm28, zmm28, zmm1", + 
"vaesenc zmm29, zmm29, zmm1", + "vaesenc zmm30, zmm30, zmm1", + "vaesenc zmm31, zmm31, zmm1", + // aes-128 round 2 encrypt + "vaesenc zmm16, zmm16, zmm2", + "vaesenc zmm17, zmm17, zmm2", + "vaesenc zmm18, zmm18, zmm2", + "vaesenc zmm19, zmm19, zmm2", + "vaesenc zmm20, zmm20, zmm2", + "vaesenc zmm21, zmm21, zmm2", + "vaesenc zmm22, zmm22, zmm2", + "vaesenc zmm23, zmm23, zmm2", + "vaesenc zmm24, zmm24, zmm2", + "vaesenc zmm25, zmm25, zmm2", + "vaesenc zmm26, zmm26, zmm2", + "vaesenc zmm27, zmm27, zmm2", + "vaesenc zmm28, zmm28, zmm2", + "vaesenc zmm29, zmm29, zmm2", + "vaesenc zmm30, zmm30, zmm2", + "vaesenc zmm31, zmm31, zmm2", + // aes-128 round 3 encrypt + "vaesenc zmm16, zmm16, zmm3", + "vaesenc zmm17, zmm17, zmm3", + "vaesenc zmm18, zmm18, zmm3", + "vaesenc zmm19, zmm19, zmm3", + "vaesenc zmm20, zmm20, zmm3", + "vaesenc zmm21, zmm21, zmm3", + "vaesenc zmm22, zmm22, zmm3", + "vaesenc zmm23, zmm23, zmm3", + "vaesenc zmm24, zmm24, zmm3", + "vaesenc zmm25, zmm25, zmm3", + "vaesenc zmm26, zmm26, zmm3", + "vaesenc zmm27, zmm27, zmm3", + "vaesenc zmm28, zmm28, zmm3", + "vaesenc zmm29, zmm29, zmm3", + "vaesenc zmm30, zmm30, zmm3", + "vaesenc zmm31, zmm31, zmm3", + // aes-128 round 4 encrypt + "vaesenc zmm16, zmm16, zmm4", + "vaesenc zmm17, zmm17, zmm4", + "vaesenc zmm18, zmm18, zmm4", + "vaesenc zmm19, zmm19, zmm4", + "vaesenc zmm20, zmm20, zmm4", + "vaesenc zmm21, zmm21, zmm4", + "vaesenc zmm22, zmm22, zmm4", + "vaesenc zmm23, zmm23, zmm4", + "vaesenc zmm24, zmm24, zmm4", + "vaesenc zmm25, zmm25, zmm4", + "vaesenc zmm26, zmm26, zmm4", + "vaesenc zmm27, zmm27, zmm4", + "vaesenc zmm28, zmm28, zmm4", + "vaesenc zmm29, zmm29, zmm4", + "vaesenc zmm30, zmm30, zmm4", + "vaesenc zmm31, zmm31, zmm4", + // aes-128 round 5 encrypt + "vaesenc zmm16, zmm16, zmm5", + "vaesenc zmm17, zmm17, zmm5", + "vaesenc zmm18, zmm18, zmm5", + "vaesenc zmm19, zmm19, zmm5", + "vaesenc zmm20, zmm20, zmm5", + "vaesenc zmm21, zmm21, zmm5", + "vaesenc zmm22, zmm22, zmm5", + "vaesenc zmm23, zmm23, zmm5", + "vaesenc zmm24, zmm24, zmm5", + "vaesenc zmm25, zmm25, zmm5", + "vaesenc zmm26, zmm26, zmm5", + "vaesenc zmm27, zmm27, zmm5", + "vaesenc zmm28, zmm28, zmm5", + "vaesenc zmm29, zmm29, zmm5", + "vaesenc zmm30, zmm30, zmm5", + "vaesenc zmm31, zmm31, zmm5", + // aes-128 round 6 encrypt + "vaesenc zmm16, zmm16, zmm6", + "vaesenc zmm17, zmm17, zmm6", + "vaesenc zmm18, zmm18, zmm6", + "vaesenc zmm19, zmm19, zmm6", + "vaesenc zmm20, zmm20, zmm6", + "vaesenc zmm21, zmm21, zmm6", + "vaesenc zmm22, zmm22, zmm6", + "vaesenc zmm23, zmm23, zmm6", + "vaesenc zmm24, zmm24, zmm6", + "vaesenc zmm25, zmm25, zmm6", + "vaesenc zmm26, zmm26, zmm6", + "vaesenc zmm27, zmm27, zmm6", + "vaesenc zmm28, zmm28, zmm6", + "vaesenc zmm29, zmm29, zmm6", + "vaesenc zmm30, zmm30, zmm6", + "vaesenc zmm31, zmm31, zmm6", + // aes-128 round 7 encrypt + "vaesenc zmm16, zmm16, zmm7", + "vaesenc zmm17, zmm17, zmm7", + "vaesenc zmm18, zmm18, zmm7", + "vaesenc zmm19, zmm19, zmm7", + "vaesenc zmm20, zmm20, zmm7", + "vaesenc zmm21, zmm21, zmm7", + "vaesenc zmm22, zmm22, zmm7", + "vaesenc zmm23, zmm23, zmm7", + "vaesenc zmm24, zmm24, zmm7", + "vaesenc zmm25, zmm25, zmm7", + "vaesenc zmm26, zmm26, zmm7", + "vaesenc zmm27, zmm27, zmm7", + "vaesenc zmm28, zmm28, zmm7", + "vaesenc zmm29, zmm29, zmm7", + "vaesenc zmm30, zmm30, zmm7", + "vaesenc zmm31, zmm31, zmm7", + // aes-128 round 8 encrypt + "vaesenc zmm16, zmm16, zmm8", + "vaesenc zmm17, zmm17, zmm8", + "vaesenc zmm18, zmm18, zmm8", + "vaesenc zmm19, zmm19, zmm8", + "vaesenc zmm20, zmm20, zmm8", + "vaesenc zmm21, zmm21, 
zmm8", + "vaesenc zmm22, zmm22, zmm8", + "vaesenc zmm23, zmm23, zmm8", + "vaesenc zmm24, zmm24, zmm8", + "vaesenc zmm25, zmm25, zmm8", + "vaesenc zmm26, zmm26, zmm8", + "vaesenc zmm27, zmm27, zmm8", + "vaesenc zmm28, zmm28, zmm8", + "vaesenc zmm29, zmm29, zmm8", + "vaesenc zmm30, zmm30, zmm8", + "vaesenc zmm31, zmm31, zmm8", + // aes-128 round 9 encrypt + "vaesenc zmm16, zmm16, zmm9", + "vaesenc zmm17, zmm17, zmm9", + "vaesenc zmm18, zmm18, zmm9", + "vaesenc zmm19, zmm19, zmm9", + "vaesenc zmm20, zmm20, zmm9", + "vaesenc zmm21, zmm21, zmm9", + "vaesenc zmm22, zmm22, zmm9", + "vaesenc zmm23, zmm23, zmm9", + "vaesenc zmm24, zmm24, zmm9", + "vaesenc zmm25, zmm25, zmm9", + "vaesenc zmm26, zmm26, zmm9", + "vaesenc zmm27, zmm27, zmm9", + "vaesenc zmm28, zmm28, zmm9", + "vaesenc zmm29, zmm29, zmm9", + "vaesenc zmm30, zmm30, zmm9", + "vaesenc zmm31, zmm31, zmm9", + // aes-128 round 10 encrypt + "vaesenclast zmm16, zmm16, zmm10", + "vaesenclast zmm17, zmm17, zmm10", + "vaesenclast zmm18, zmm18, zmm10", + "vaesenclast zmm19, zmm19, zmm10", + "vaesenclast zmm20, zmm20, zmm10", + "vaesenclast zmm21, zmm21, zmm10", + "vaesenclast zmm22, zmm22, zmm10", + "vaesenclast zmm23, zmm23, zmm10", + "vaesenclast zmm24, zmm24, zmm10", + "vaesenclast zmm25, zmm25, zmm10", + "vaesenclast zmm26, zmm26, zmm10", + "vaesenclast zmm27, zmm27, zmm10", + "vaesenclast zmm28, zmm28, zmm10", + "vaesenclast zmm29, zmm29, zmm10", + "vaesenclast zmm30, zmm30, zmm10", + "vaesenclast zmm31, zmm31, zmm10", + // save cipher-data + "vmovdqu32 [{optr} + 0 * 64], zmm16", + "vmovdqu32 [{optr} + 1 * 64], zmm17", + "vmovdqu32 [{optr} + 2 * 64], zmm18", + "vmovdqu32 [{optr} + 3 * 64], zmm19", + "vmovdqu32 [{optr} + 4 * 64], zmm20", + "vmovdqu32 [{optr} + 5 * 64], zmm21", + "vmovdqu32 [{optr} + 6 * 64], zmm22", + "vmovdqu32 [{optr} + 7 * 64], zmm23", + "vmovdqu32 [{optr} + 8 * 64], zmm24", + "vmovdqu32 [{optr} + 9 * 64], zmm25", + "vmovdqu32 [{optr} + 10 * 64], zmm26", + "vmovdqu32 [{optr} + 11 * 64], zmm27", + "vmovdqu32 [{optr} + 12 * 64], zmm28", + "vmovdqu32 [{optr} + 13 * 64], zmm29", + "vmovdqu32 [{optr} + 14 * 64], zmm30", + "vmovdqu32 [{optr} + 15 * 64], zmm31", + + keys = in(reg) keys.as_ptr(), + iptr = in(reg) iptr, + optr = in(reg) optr, + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + + out("zmm16") _, + out("zmm17") _, + out("zmm18") _, + out("zmm19") _, + out("zmm20") _, + out("zmm21") _, + out("zmm22") _, + out("zmm23") _, + out("zmm24") _, + out("zmm25") _, + out("zmm26") _, + out("zmm27") _, + out("zmm28") _, + out("zmm29") _, + out("zmm30") _, + out("zmm31") _, + + options(nostack, preserves_flags), + }; +} + +#[inline] +pub(crate) unsafe fn decrypt64(keys: &Simd512RoundKeys<11>, blocks: InOut<'_, '_, Block64>) { + let (iptr, optr) = blocks.into_raw(); + asm! 
{ + // load keys + "vmovdqu32 zmm10, [{keys} + 10 * 64]", + "vmovdqu32 zmm9 , [{keys} + 9 * 64]", + "vmovdqu32 zmm8 , [{keys} + 8 * 64]", + "vmovdqu32 zmm7 , [{keys} + 7 * 64]", + "vmovdqu32 zmm6 , [{keys} + 6 * 64]", + "vmovdqu32 zmm5 , [{keys} + 5 * 64]", + "vmovdqu32 zmm4 , [{keys} + 4 * 64]", + "vmovdqu32 zmm3 , [{keys} + 3 * 64]", + "vmovdqu32 zmm2 , [{keys} + 2 * 64]", + "vmovdqu32 zmm1 , [{keys} + 1 * 64]", + "vmovdqu32 zmm0 , [{keys} + 0 * 64]", + // load cipher-data + "vmovdqu32 zmm16, [{iptr} + 0 * 64]", + "vmovdqu32 zmm17, [{iptr} + 1 * 64]", + "vmovdqu32 zmm18, [{iptr} + 2 * 64]", + "vmovdqu32 zmm19, [{iptr} + 3 * 64]", + "vmovdqu32 zmm20, [{iptr} + 4 * 64]", + "vmovdqu32 zmm21, [{iptr} + 5 * 64]", + "vmovdqu32 zmm22, [{iptr} + 6 * 64]", + "vmovdqu32 zmm23, [{iptr} + 7 * 64]", + "vmovdqu32 zmm24, [{iptr} + 8 * 64]", + "vmovdqu32 zmm25, [{iptr} + 9 * 64]", + "vmovdqu32 zmm26, [{iptr} + 10 * 64]", + "vmovdqu32 zmm27, [{iptr} + 11 * 64]", + "vmovdqu32 zmm28, [{iptr} + 12 * 64]", + "vmovdqu32 zmm29, [{iptr} + 13 * 64]", + "vmovdqu32 zmm30, [{iptr} + 14 * 64]", + "vmovdqu32 zmm31, [{iptr} + 15 * 64]", + // aes-128 round 10 decrypt + "vpxord zmm16, zmm16, zmm10", + "vpxord zmm17, zmm17, zmm10", + "vpxord zmm18, zmm18, zmm10", + "vpxord zmm19, zmm19, zmm10", + "vpxord zmm20, zmm20, zmm10", + "vpxord zmm21, zmm21, zmm10", + "vpxord zmm22, zmm22, zmm10", + "vpxord zmm23, zmm23, zmm10", + "vpxord zmm24, zmm24, zmm10", + "vpxord zmm25, zmm25, zmm10", + "vpxord zmm26, zmm26, zmm10", + "vpxord zmm27, zmm27, zmm10", + "vpxord zmm28, zmm28, zmm10", + "vpxord zmm29, zmm29, zmm10", + "vpxord zmm30, zmm30, zmm10", + "vpxord zmm31, zmm31, zmm10", + // aes-128 round 9 decrypt + "vaesdec zmm16, zmm16, zmm9", + "vaesdec zmm17, zmm17, zmm9", + "vaesdec zmm18, zmm18, zmm9", + "vaesdec zmm19, zmm19, zmm9", + "vaesdec zmm20, zmm20, zmm9", + "vaesdec zmm21, zmm21, zmm9", + "vaesdec zmm22, zmm22, zmm9", + "vaesdec zmm23, zmm23, zmm9", + "vaesdec zmm24, zmm24, zmm9", + "vaesdec zmm25, zmm25, zmm9", + "vaesdec zmm26, zmm26, zmm9", + "vaesdec zmm27, zmm27, zmm9", + "vaesdec zmm28, zmm28, zmm9", + "vaesdec zmm29, zmm29, zmm9", + "vaesdec zmm30, zmm30, zmm9", + "vaesdec zmm31, zmm31, zmm9", + // aes-128 round 8 decrypt + "vaesdec zmm16, zmm16, zmm8", + "vaesdec zmm17, zmm17, zmm8", + "vaesdec zmm18, zmm18, zmm8", + "vaesdec zmm19, zmm19, zmm8", + "vaesdec zmm20, zmm20, zmm8", + "vaesdec zmm21, zmm21, zmm8", + "vaesdec zmm22, zmm22, zmm8", + "vaesdec zmm23, zmm23, zmm8", + "vaesdec zmm24, zmm24, zmm8", + "vaesdec zmm25, zmm25, zmm8", + "vaesdec zmm26, zmm26, zmm8", + "vaesdec zmm27, zmm27, zmm8", + "vaesdec zmm28, zmm28, zmm8", + "vaesdec zmm29, zmm29, zmm8", + "vaesdec zmm30, zmm30, zmm8", + "vaesdec zmm31, zmm31, zmm8", + // aes-128 round 7 decrypt + "vaesdec zmm16, zmm16, zmm7", + "vaesdec zmm17, zmm17, zmm7", + "vaesdec zmm18, zmm18, zmm7", + "vaesdec zmm19, zmm19, zmm7", + "vaesdec zmm20, zmm20, zmm7", + "vaesdec zmm21, zmm21, zmm7", + "vaesdec zmm22, zmm22, zmm7", + "vaesdec zmm23, zmm23, zmm7", + "vaesdec zmm24, zmm24, zmm7", + "vaesdec zmm25, zmm25, zmm7", + "vaesdec zmm26, zmm26, zmm7", + "vaesdec zmm27, zmm27, zmm7", + "vaesdec zmm28, zmm28, zmm7", + "vaesdec zmm29, zmm29, zmm7", + "vaesdec zmm30, zmm30, zmm7", + "vaesdec zmm31, zmm31, zmm7", + // aes-128 round 6 decrypt + "vaesdec zmm16, zmm16, zmm6", + "vaesdec zmm17, zmm17, zmm6", + "vaesdec zmm18, zmm18, zmm6", + "vaesdec zmm19, zmm19, zmm6", + "vaesdec zmm20, zmm20, zmm6", + "vaesdec zmm21, zmm21, zmm6", + "vaesdec zmm22, zmm22, zmm6", + "vaesdec 
zmm23, zmm23, zmm6", + "vaesdec zmm24, zmm24, zmm6", + "vaesdec zmm25, zmm25, zmm6", + "vaesdec zmm26, zmm26, zmm6", + "vaesdec zmm27, zmm27, zmm6", + "vaesdec zmm28, zmm28, zmm6", + "vaesdec zmm29, zmm29, zmm6", + "vaesdec zmm30, zmm30, zmm6", + "vaesdec zmm31, zmm31, zmm6", + // aes-128 round 5 decrypt + "vaesdec zmm16, zmm16, zmm5", + "vaesdec zmm17, zmm17, zmm5", + "vaesdec zmm18, zmm18, zmm5", + "vaesdec zmm19, zmm19, zmm5", + "vaesdec zmm20, zmm20, zmm5", + "vaesdec zmm21, zmm21, zmm5", + "vaesdec zmm22, zmm22, zmm5", + "vaesdec zmm23, zmm23, zmm5", + "vaesdec zmm24, zmm24, zmm5", + "vaesdec zmm25, zmm25, zmm5", + "vaesdec zmm26, zmm26, zmm5", + "vaesdec zmm27, zmm27, zmm5", + "vaesdec zmm28, zmm28, zmm5", + "vaesdec zmm29, zmm29, zmm5", + "vaesdec zmm30, zmm30, zmm5", + "vaesdec zmm31, zmm31, zmm5", + // aes-128 round 4 decrypt + "vaesdec zmm16, zmm16, zmm4", + "vaesdec zmm17, zmm17, zmm4", + "vaesdec zmm18, zmm18, zmm4", + "vaesdec zmm19, zmm19, zmm4", + "vaesdec zmm20, zmm20, zmm4", + "vaesdec zmm21, zmm21, zmm4", + "vaesdec zmm22, zmm22, zmm4", + "vaesdec zmm23, zmm23, zmm4", + "vaesdec zmm24, zmm24, zmm4", + "vaesdec zmm25, zmm25, zmm4", + "vaesdec zmm26, zmm26, zmm4", + "vaesdec zmm27, zmm27, zmm4", + "vaesdec zmm28, zmm28, zmm4", + "vaesdec zmm29, zmm29, zmm4", + "vaesdec zmm30, zmm30, zmm4", + "vaesdec zmm31, zmm31, zmm4", + // aes-128 round 3 decrypt + "vaesdec zmm16, zmm16, zmm3", + "vaesdec zmm17, zmm17, zmm3", + "vaesdec zmm18, zmm18, zmm3", + "vaesdec zmm19, zmm19, zmm3", + "vaesdec zmm20, zmm20, zmm3", + "vaesdec zmm21, zmm21, zmm3", + "vaesdec zmm22, zmm22, zmm3", + "vaesdec zmm23, zmm23, zmm3", + "vaesdec zmm24, zmm24, zmm3", + "vaesdec zmm25, zmm25, zmm3", + "vaesdec zmm26, zmm26, zmm3", + "vaesdec zmm27, zmm27, zmm3", + "vaesdec zmm28, zmm28, zmm3", + "vaesdec zmm29, zmm29, zmm3", + "vaesdec zmm30, zmm30, zmm3", + "vaesdec zmm31, zmm31, zmm3", + // aes-128 round 2 decrypt + "vaesdec zmm16, zmm16, zmm2", + "vaesdec zmm17, zmm17, zmm2", + "vaesdec zmm18, zmm18, zmm2", + "vaesdec zmm19, zmm19, zmm2", + "vaesdec zmm20, zmm20, zmm2", + "vaesdec zmm21, zmm21, zmm2", + "vaesdec zmm22, zmm22, zmm2", + "vaesdec zmm23, zmm23, zmm2", + "vaesdec zmm24, zmm24, zmm2", + "vaesdec zmm25, zmm25, zmm2", + "vaesdec zmm26, zmm26, zmm2", + "vaesdec zmm27, zmm27, zmm2", + "vaesdec zmm28, zmm28, zmm2", + "vaesdec zmm29, zmm29, zmm2", + "vaesdec zmm30, zmm30, zmm2", + "vaesdec zmm31, zmm31, zmm2", + // aes-128 round 1 decrypt + "vaesdec zmm16, zmm16, zmm1", + "vaesdec zmm17, zmm17, zmm1", + "vaesdec zmm18, zmm18, zmm1", + "vaesdec zmm19, zmm19, zmm1", + "vaesdec zmm20, zmm20, zmm1", + "vaesdec zmm21, zmm21, zmm1", + "vaesdec zmm22, zmm22, zmm1", + "vaesdec zmm23, zmm23, zmm1", + "vaesdec zmm24, zmm24, zmm1", + "vaesdec zmm25, zmm25, zmm1", + "vaesdec zmm26, zmm26, zmm1", + "vaesdec zmm27, zmm27, zmm1", + "vaesdec zmm28, zmm28, zmm1", + "vaesdec zmm29, zmm29, zmm1", + "vaesdec zmm30, zmm30, zmm1", + "vaesdec zmm31, zmm31, zmm1", + // aes-128 round 0 decrypt + "vaesdeclast zmm16, zmm16, zmm0", + "vaesdeclast zmm17, zmm17, zmm0", + "vaesdeclast zmm18, zmm18, zmm0", + "vaesdeclast zmm19, zmm19, zmm0", + "vaesdeclast zmm20, zmm20, zmm0", + "vaesdeclast zmm21, zmm21, zmm0", + "vaesdeclast zmm22, zmm22, zmm0", + "vaesdeclast zmm23, zmm23, zmm0", + "vaesdeclast zmm24, zmm24, zmm0", + "vaesdeclast zmm25, zmm25, zmm0", + "vaesdeclast zmm26, zmm26, zmm0", + "vaesdeclast zmm27, zmm27, zmm0", + "vaesdeclast zmm28, zmm28, zmm0", + "vaesdeclast zmm29, zmm29, zmm0", + "vaesdeclast zmm30, zmm30, zmm0", + 
"vaesdeclast zmm31, zmm31, zmm0", + // save plain-data + "vmovdqu32 [{optr} + 0 * 64], zmm16", + "vmovdqu32 [{optr} + 1 * 64], zmm17", + "vmovdqu32 [{optr} + 2 * 64], zmm18", + "vmovdqu32 [{optr} + 3 * 64], zmm19", + "vmovdqu32 [{optr} + 4 * 64], zmm20", + "vmovdqu32 [{optr} + 5 * 64], zmm21", + "vmovdqu32 [{optr} + 6 * 64], zmm22", + "vmovdqu32 [{optr} + 7 * 64], zmm23", + "vmovdqu32 [{optr} + 8 * 64], zmm24", + "vmovdqu32 [{optr} + 9 * 64], zmm25", + "vmovdqu32 [{optr} + 10 * 64], zmm26", + "vmovdqu32 [{optr} + 11 * 64], zmm27", + "vmovdqu32 [{optr} + 12 * 64], zmm28", + "vmovdqu32 [{optr} + 13 * 64], zmm29", + "vmovdqu32 [{optr} + 14 * 64], zmm30", + "vmovdqu32 [{optr} + 15 * 64], zmm31", + + keys = in(reg) keys.as_ptr(), + iptr = in(reg) iptr, + optr = in(reg) optr, + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + + out("zmm16") _, + out("zmm17") _, + out("zmm18") _, + out("zmm19") _, + out("zmm20") _, + out("zmm21") _, + out("zmm22") _, + out("zmm23") _, + out("zmm24") _, + out("zmm25") _, + out("zmm26") _, + out("zmm27") _, + out("zmm28") _, + out("zmm29") _, + out("zmm30") _, + out("zmm31") _, + + options(nostack, preserves_flags), + }; +} diff --git a/aes/src/x86/vaes512/aes192.rs b/aes/src/x86/vaes512/aes192.rs new file mode 100644 index 00000000..9c002011 --- /dev/null +++ b/aes/src/x86/vaes512/aes192.rs @@ -0,0 +1,684 @@ +use crate::x86::{arch::*, RoundKeys, Simd512RoundKeys}; +use crate::Block64; +use cipher::inout::InOut; +use core::{arch::asm, mem::MaybeUninit}; + +#[inline] +pub(crate) unsafe fn parallelize_keys(keys: &RoundKeys<13>) -> Simd512RoundKeys<13> { + let mut v512: [MaybeUninit<__m512i>; 13] = MaybeUninit::uninit().assume_init(); + asm! { + "vbroadcasti32x4 zmm0 , [{keys} + 0 * 16]", + "vbroadcasti32x4 zmm1 , [{keys} + 1 * 16]", + "vbroadcasti32x4 zmm2 , [{keys} + 2 * 16]", + "vbroadcasti32x4 zmm3 , [{keys} + 3 * 16]", + "vbroadcasti32x4 zmm4 , [{keys} + 4 * 16]", + "vbroadcasti32x4 zmm5 , [{keys} + 5 * 16]", + "vbroadcasti32x4 zmm6 , [{keys} + 6 * 16]", + "vbroadcasti32x4 zmm7 , [{keys} + 7 * 16]", + "vbroadcasti32x4 zmm8 , [{keys} + 8 * 16]", + "vbroadcasti32x4 zmm9 , [{keys} + 9 * 16]", + "vbroadcasti32x4 zmm10, [{keys} + 10 * 16]", + "vbroadcasti32x4 zmm11, [{keys} + 11 * 16]", + "vbroadcasti32x4 zmm12, [{keys} + 12 * 16]", + + "vmovdqu32 [{optr} + 0 * 64], zmm0", + "vmovdqu32 [{optr} + 1 * 64], zmm1", + "vmovdqu32 [{optr} + 2 * 64], zmm2", + "vmovdqu32 [{optr} + 3 * 64], zmm3", + "vmovdqu32 [{optr} + 4 * 64], zmm4", + "vmovdqu32 [{optr} + 5 * 64], zmm5", + "vmovdqu32 [{optr} + 6 * 64], zmm6", + "vmovdqu32 [{optr} + 7 * 64], zmm7", + "vmovdqu32 [{optr} + 8 * 64], zmm8", + "vmovdqu32 [{optr} + 9 * 64], zmm9", + "vmovdqu32 [{optr} + 10 * 64], zmm10", + "vmovdqu32 [{optr} + 11 * 64], zmm11", + "vmovdqu32 [{optr} + 12 * 64], zmm12", + + keys = in(reg) keys.as_ptr(), + optr = in(reg) v512.as_mut_ptr().cast::<__m512i>(), + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + out("zmm11") _, + out("zmm12") _, + + options(nostack, preserves_flags), + }; + core::mem::transmute(v512) +} + +#[inline] +pub(crate) unsafe fn encrypt64(keys: &Simd512RoundKeys<13>, blocks: InOut<'_, '_, Block64>) { + let (iptr, optr) = blocks.into_raw(); + asm! 
{ + // load keys + "vmovdqu32 zmm0 , [{keys} + 0 * 64]", + "vmovdqu32 zmm1 , [{keys} + 1 * 64]", + "vmovdqu32 zmm2 , [{keys} + 2 * 64]", + "vmovdqu32 zmm3 , [{keys} + 3 * 64]", + "vmovdqu32 zmm4 , [{keys} + 4 * 64]", + "vmovdqu32 zmm5 , [{keys} + 5 * 64]", + "vmovdqu32 zmm6 , [{keys} + 6 * 64]", + "vmovdqu32 zmm7 , [{keys} + 7 * 64]", + "vmovdqu32 zmm8 , [{keys} + 8 * 64]", + "vmovdqu32 zmm9 , [{keys} + 9 * 64]", + "vmovdqu32 zmm10, [{keys} + 10 * 64]", + "vmovdqu32 zmm11, [{keys} + 11 * 64]", + "vmovdqu32 zmm12, [{keys} + 12 * 64]", + // load plain-data + "vmovdqu32 zmm16, [{iptr} + 0 * 64]", + "vmovdqu32 zmm17, [{iptr} + 1 * 64]", + "vmovdqu32 zmm18, [{iptr} + 2 * 64]", + "vmovdqu32 zmm19, [{iptr} + 3 * 64]", + "vmovdqu32 zmm20, [{iptr} + 4 * 64]", + "vmovdqu32 zmm21, [{iptr} + 5 * 64]", + "vmovdqu32 zmm22, [{iptr} + 6 * 64]", + "vmovdqu32 zmm23, [{iptr} + 7 * 64]", + "vmovdqu32 zmm24, [{iptr} + 8 * 64]", + "vmovdqu32 zmm25, [{iptr} + 9 * 64]", + "vmovdqu32 zmm26, [{iptr} + 10 * 64]", + "vmovdqu32 zmm27, [{iptr} + 11 * 64]", + "vmovdqu32 zmm28, [{iptr} + 12 * 64]", + "vmovdqu32 zmm29, [{iptr} + 13 * 64]", + "vmovdqu32 zmm30, [{iptr} + 14 * 64]", + "vmovdqu32 zmm31, [{iptr} + 15 * 64]", + // aes-192 round 0 encrypt + "vpxord zmm16, zmm16, zmm0", + "vpxord zmm17, zmm17, zmm0", + "vpxord zmm18, zmm18, zmm0", + "vpxord zmm19, zmm19, zmm0", + "vpxord zmm20, zmm20, zmm0", + "vpxord zmm21, zmm21, zmm0", + "vpxord zmm22, zmm22, zmm0", + "vpxord zmm23, zmm23, zmm0", + "vpxord zmm24, zmm24, zmm0", + "vpxord zmm25, zmm25, zmm0", + "vpxord zmm26, zmm26, zmm0", + "vpxord zmm27, zmm27, zmm0", + "vpxord zmm28, zmm28, zmm0", + "vpxord zmm29, zmm29, zmm0", + "vpxord zmm30, zmm30, zmm0", + "vpxord zmm31, zmm31, zmm0", + // aes-192 round 1 encrypt + "vaesenc zmm16, zmm16, zmm1", + "vaesenc zmm17, zmm17, zmm1", + "vaesenc zmm18, zmm18, zmm1", + "vaesenc zmm19, zmm19, zmm1", + "vaesenc zmm20, zmm20, zmm1", + "vaesenc zmm21, zmm21, zmm1", + "vaesenc zmm22, zmm22, zmm1", + "vaesenc zmm23, zmm23, zmm1", + "vaesenc zmm24, zmm24, zmm1", + "vaesenc zmm25, zmm25, zmm1", + "vaesenc zmm26, zmm26, zmm1", + "vaesenc zmm27, zmm27, zmm1", + "vaesenc zmm28, zmm28, zmm1", + "vaesenc zmm29, zmm29, zmm1", + "vaesenc zmm30, zmm30, zmm1", + "vaesenc zmm31, zmm31, zmm1", + // aes-192 round 2 encrypt + "vaesenc zmm16, zmm16, zmm2", + "vaesenc zmm17, zmm17, zmm2", + "vaesenc zmm18, zmm18, zmm2", + "vaesenc zmm19, zmm19, zmm2", + "vaesenc zmm20, zmm20, zmm2", + "vaesenc zmm21, zmm21, zmm2", + "vaesenc zmm22, zmm22, zmm2", + "vaesenc zmm23, zmm23, zmm2", + "vaesenc zmm24, zmm24, zmm2", + "vaesenc zmm25, zmm25, zmm2", + "vaesenc zmm26, zmm26, zmm2", + "vaesenc zmm27, zmm27, zmm2", + "vaesenc zmm28, zmm28, zmm2", + "vaesenc zmm29, zmm29, zmm2", + "vaesenc zmm30, zmm30, zmm2", + "vaesenc zmm31, zmm31, zmm2", + // aes-192 round 3 encrypt + "vaesenc zmm16, zmm16, zmm3", + "vaesenc zmm17, zmm17, zmm3", + "vaesenc zmm18, zmm18, zmm3", + "vaesenc zmm19, zmm19, zmm3", + "vaesenc zmm20, zmm20, zmm3", + "vaesenc zmm21, zmm21, zmm3", + "vaesenc zmm22, zmm22, zmm3", + "vaesenc zmm23, zmm23, zmm3", + "vaesenc zmm24, zmm24, zmm3", + "vaesenc zmm25, zmm25, zmm3", + "vaesenc zmm26, zmm26, zmm3", + "vaesenc zmm27, zmm27, zmm3", + "vaesenc zmm28, zmm28, zmm3", + "vaesenc zmm29, zmm29, zmm3", + "vaesenc zmm30, zmm30, zmm3", + "vaesenc zmm31, zmm31, zmm3", + // aes-192 round 4 encrypt + "vaesenc zmm16, zmm16, zmm4", + "vaesenc zmm17, zmm17, zmm4", + "vaesenc zmm18, zmm18, zmm4", + "vaesenc zmm19, zmm19, zmm4", + "vaesenc zmm20, zmm20, zmm4", + "vaesenc 
zmm21, zmm21, zmm4", + "vaesenc zmm22, zmm22, zmm4", + "vaesenc zmm23, zmm23, zmm4", + "vaesenc zmm24, zmm24, zmm4", + "vaesenc zmm25, zmm25, zmm4", + "vaesenc zmm26, zmm26, zmm4", + "vaesenc zmm27, zmm27, zmm4", + "vaesenc zmm28, zmm28, zmm4", + "vaesenc zmm29, zmm29, zmm4", + "vaesenc zmm30, zmm30, zmm4", + "vaesenc zmm31, zmm31, zmm4", + // aes-192 round 5 encrypt + "vaesenc zmm16, zmm16, zmm5", + "vaesenc zmm17, zmm17, zmm5", + "vaesenc zmm18, zmm18, zmm5", + "vaesenc zmm19, zmm19, zmm5", + "vaesenc zmm20, zmm20, zmm5", + "vaesenc zmm21, zmm21, zmm5", + "vaesenc zmm22, zmm22, zmm5", + "vaesenc zmm23, zmm23, zmm5", + "vaesenc zmm24, zmm24, zmm5", + "vaesenc zmm25, zmm25, zmm5", + "vaesenc zmm26, zmm26, zmm5", + "vaesenc zmm27, zmm27, zmm5", + "vaesenc zmm28, zmm28, zmm5", + "vaesenc zmm29, zmm29, zmm5", + "vaesenc zmm30, zmm30, zmm5", + "vaesenc zmm31, zmm31, zmm5", + // aes-192 round 6 encrypt + "vaesenc zmm16, zmm16, zmm6", + "vaesenc zmm17, zmm17, zmm6", + "vaesenc zmm18, zmm18, zmm6", + "vaesenc zmm19, zmm19, zmm6", + "vaesenc zmm20, zmm20, zmm6", + "vaesenc zmm21, zmm21, zmm6", + "vaesenc zmm22, zmm22, zmm6", + "vaesenc zmm23, zmm23, zmm6", + "vaesenc zmm24, zmm24, zmm6", + "vaesenc zmm25, zmm25, zmm6", + "vaesenc zmm26, zmm26, zmm6", + "vaesenc zmm27, zmm27, zmm6", + "vaesenc zmm28, zmm28, zmm6", + "vaesenc zmm29, zmm29, zmm6", + "vaesenc zmm30, zmm30, zmm6", + "vaesenc zmm31, zmm31, zmm6", + // aes-192 round 7 encrypt + "vaesenc zmm16, zmm16, zmm7", + "vaesenc zmm17, zmm17, zmm7", + "vaesenc zmm18, zmm18, zmm7", + "vaesenc zmm19, zmm19, zmm7", + "vaesenc zmm20, zmm20, zmm7", + "vaesenc zmm21, zmm21, zmm7", + "vaesenc zmm22, zmm22, zmm7", + "vaesenc zmm23, zmm23, zmm7", + "vaesenc zmm24, zmm24, zmm7", + "vaesenc zmm25, zmm25, zmm7", + "vaesenc zmm26, zmm26, zmm7", + "vaesenc zmm27, zmm27, zmm7", + "vaesenc zmm28, zmm28, zmm7", + "vaesenc zmm29, zmm29, zmm7", + "vaesenc zmm30, zmm30, zmm7", + "vaesenc zmm31, zmm31, zmm7", + // aes-192 round 8 encrypt + "vaesenc zmm16, zmm16, zmm8", + "vaesenc zmm17, zmm17, zmm8", + "vaesenc zmm18, zmm18, zmm8", + "vaesenc zmm19, zmm19, zmm8", + "vaesenc zmm20, zmm20, zmm8", + "vaesenc zmm21, zmm21, zmm8", + "vaesenc zmm22, zmm22, zmm8", + "vaesenc zmm23, zmm23, zmm8", + "vaesenc zmm24, zmm24, zmm8", + "vaesenc zmm25, zmm25, zmm8", + "vaesenc zmm26, zmm26, zmm8", + "vaesenc zmm27, zmm27, zmm8", + "vaesenc zmm28, zmm28, zmm8", + "vaesenc zmm29, zmm29, zmm8", + "vaesenc zmm30, zmm30, zmm8", + "vaesenc zmm31, zmm31, zmm8", + // aes-192 round 9 encrypt + "vaesenc zmm16, zmm16, zmm9", + "vaesenc zmm17, zmm17, zmm9", + "vaesenc zmm18, zmm18, zmm9", + "vaesenc zmm19, zmm19, zmm9", + "vaesenc zmm20, zmm20, zmm9", + "vaesenc zmm21, zmm21, zmm9", + "vaesenc zmm22, zmm22, zmm9", + "vaesenc zmm23, zmm23, zmm9", + "vaesenc zmm24, zmm24, zmm9", + "vaesenc zmm25, zmm25, zmm9", + "vaesenc zmm26, zmm26, zmm9", + "vaesenc zmm27, zmm27, zmm9", + "vaesenc zmm28, zmm28, zmm9", + "vaesenc zmm29, zmm29, zmm9", + "vaesenc zmm30, zmm30, zmm9", + "vaesenc zmm31, zmm31, zmm9", + // aes-192 round 10 encrypt + "vaesenc zmm16, zmm16, zmm10", + "vaesenc zmm17, zmm17, zmm10", + "vaesenc zmm18, zmm18, zmm10", + "vaesenc zmm19, zmm19, zmm10", + "vaesenc zmm20, zmm20, zmm10", + "vaesenc zmm21, zmm21, zmm10", + "vaesenc zmm22, zmm22, zmm10", + "vaesenc zmm23, zmm23, zmm10", + "vaesenc zmm24, zmm24, zmm10", + "vaesenc zmm25, zmm25, zmm10", + "vaesenc zmm26, zmm26, zmm10", + "vaesenc zmm27, zmm27, zmm10", + "vaesenc zmm28, zmm28, zmm10", + "vaesenc zmm29, zmm29, zmm10", + "vaesenc 
zmm30, zmm30, zmm10", + "vaesenc zmm31, zmm31, zmm10", + // aes-192 round 11 encrypt + "vaesenc zmm16, zmm16, zmm11", + "vaesenc zmm17, zmm17, zmm11", + "vaesenc zmm18, zmm18, zmm11", + "vaesenc zmm19, zmm19, zmm11", + "vaesenc zmm20, zmm20, zmm11", + "vaesenc zmm21, zmm21, zmm11", + "vaesenc zmm22, zmm22, zmm11", + "vaesenc zmm23, zmm23, zmm11", + "vaesenc zmm24, zmm24, zmm11", + "vaesenc zmm25, zmm25, zmm11", + "vaesenc zmm26, zmm26, zmm11", + "vaesenc zmm27, zmm27, zmm11", + "vaesenc zmm28, zmm28, zmm11", + "vaesenc zmm29, zmm29, zmm11", + "vaesenc zmm30, zmm30, zmm11", + "vaesenc zmm31, zmm31, zmm11", + // aes-192 round 12 encrypt + "vaesenclast zmm16, zmm16, zmm12", + "vaesenclast zmm17, zmm17, zmm12", + "vaesenclast zmm18, zmm18, zmm12", + "vaesenclast zmm19, zmm19, zmm12", + "vaesenclast zmm20, zmm20, zmm12", + "vaesenclast zmm21, zmm21, zmm12", + "vaesenclast zmm22, zmm22, zmm12", + "vaesenclast zmm23, zmm23, zmm12", + "vaesenclast zmm24, zmm24, zmm12", + "vaesenclast zmm25, zmm25, zmm12", + "vaesenclast zmm26, zmm26, zmm12", + "vaesenclast zmm27, zmm27, zmm12", + "vaesenclast zmm28, zmm28, zmm12", + "vaesenclast zmm29, zmm29, zmm12", + "vaesenclast zmm30, zmm30, zmm12", + "vaesenclast zmm31, zmm31, zmm12", + // save cipher-data + "vmovdqu32 [{optr} + 0 * 64], zmm16", + "vmovdqu32 [{optr} + 1 * 64], zmm17", + "vmovdqu32 [{optr} + 2 * 64], zmm18", + "vmovdqu32 [{optr} + 3 * 64], zmm19", + "vmovdqu32 [{optr} + 4 * 64], zmm20", + "vmovdqu32 [{optr} + 5 * 64], zmm21", + "vmovdqu32 [{optr} + 6 * 64], zmm22", + "vmovdqu32 [{optr} + 7 * 64], zmm23", + "vmovdqu32 [{optr} + 8 * 64], zmm24", + "vmovdqu32 [{optr} + 9 * 64], zmm25", + "vmovdqu32 [{optr} + 10 * 64], zmm26", + "vmovdqu32 [{optr} + 11 * 64], zmm27", + "vmovdqu32 [{optr} + 12 * 64], zmm28", + "vmovdqu32 [{optr} + 13 * 64], zmm29", + "vmovdqu32 [{optr} + 14 * 64], zmm30", + "vmovdqu32 [{optr} + 15 * 64], zmm31", + + keys = in(reg) keys.as_ptr(), + iptr = in(reg) iptr, + optr = in(reg) optr, + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + out("zmm11") _, + out("zmm12") _, + + out("zmm16") _, + out("zmm17") _, + out("zmm18") _, + out("zmm19") _, + out("zmm20") _, + out("zmm21") _, + out("zmm22") _, + out("zmm23") _, + out("zmm24") _, + out("zmm25") _, + out("zmm26") _, + out("zmm27") _, + out("zmm28") _, + out("zmm29") _, + out("zmm30") _, + out("zmm31") _, + + options(nostack, preserves_flags), + }; +} + +#[inline] +pub(crate) unsafe fn decrypt64(keys: &Simd512RoundKeys<13>, blocks: InOut<'_, '_, Block64>) { + let (iptr, optr) = blocks.into_raw(); + asm! 
{ + // load keys + "vmovdqu32 zmm12, [{keys} + 12 * 64]", + "vmovdqu32 zmm11, [{keys} + 11 * 64]", + "vmovdqu32 zmm10, [{keys} + 10 * 64]", + "vmovdqu32 zmm9 , [{keys} + 9 * 64]", + "vmovdqu32 zmm8 , [{keys} + 8 * 64]", + "vmovdqu32 zmm7 , [{keys} + 7 * 64]", + "vmovdqu32 zmm6 , [{keys} + 6 * 64]", + "vmovdqu32 zmm5 , [{keys} + 5 * 64]", + "vmovdqu32 zmm4 , [{keys} + 4 * 64]", + "vmovdqu32 zmm3 , [{keys} + 3 * 64]", + "vmovdqu32 zmm2 , [{keys} + 2 * 64]", + "vmovdqu32 zmm1 , [{keys} + 1 * 64]", + "vmovdqu32 zmm0 , [{keys} + 0 * 64]", + // load cipher-data + "vmovdqu32 zmm16, [{iptr} + 0 * 64]", + "vmovdqu32 zmm17, [{iptr} + 1 * 64]", + "vmovdqu32 zmm18, [{iptr} + 2 * 64]", + "vmovdqu32 zmm19, [{iptr} + 3 * 64]", + "vmovdqu32 zmm20, [{iptr} + 4 * 64]", + "vmovdqu32 zmm21, [{iptr} + 5 * 64]", + "vmovdqu32 zmm22, [{iptr} + 6 * 64]", + "vmovdqu32 zmm23, [{iptr} + 7 * 64]", + "vmovdqu32 zmm24, [{iptr} + 8 * 64]", + "vmovdqu32 zmm25, [{iptr} + 9 * 64]", + "vmovdqu32 zmm26, [{iptr} + 10 * 64]", + "vmovdqu32 zmm27, [{iptr} + 11 * 64]", + "vmovdqu32 zmm28, [{iptr} + 12 * 64]", + "vmovdqu32 zmm29, [{iptr} + 13 * 64]", + "vmovdqu32 zmm30, [{iptr} + 14 * 64]", + "vmovdqu32 zmm31, [{iptr} + 15 * 64]", + // aes-192 round 12 decrypt + "vpxord zmm16, zmm16, zmm12", + "vpxord zmm17, zmm17, zmm12", + "vpxord zmm18, zmm18, zmm12", + "vpxord zmm19, zmm19, zmm12", + "vpxord zmm20, zmm20, zmm12", + "vpxord zmm21, zmm21, zmm12", + "vpxord zmm22, zmm22, zmm12", + "vpxord zmm23, zmm23, zmm12", + "vpxord zmm24, zmm24, zmm12", + "vpxord zmm25, zmm25, zmm12", + "vpxord zmm26, zmm26, zmm12", + "vpxord zmm27, zmm27, zmm12", + "vpxord zmm28, zmm28, zmm12", + "vpxord zmm29, zmm29, zmm12", + "vpxord zmm30, zmm30, zmm12", + "vpxord zmm31, zmm31, zmm12", + // aes-192 round 11 decrypt + "vaesdec zmm16, zmm16, zmm11", + "vaesdec zmm17, zmm17, zmm11", + "vaesdec zmm18, zmm18, zmm11", + "vaesdec zmm19, zmm19, zmm11", + "vaesdec zmm20, zmm20, zmm11", + "vaesdec zmm21, zmm21, zmm11", + "vaesdec zmm22, zmm22, zmm11", + "vaesdec zmm23, zmm23, zmm11", + "vaesdec zmm24, zmm24, zmm11", + "vaesdec zmm25, zmm25, zmm11", + "vaesdec zmm26, zmm26, zmm11", + "vaesdec zmm27, zmm27, zmm11", + "vaesdec zmm28, zmm28, zmm11", + "vaesdec zmm29, zmm29, zmm11", + "vaesdec zmm30, zmm30, zmm11", + "vaesdec zmm31, zmm31, zmm11", + // aes-192 round 10 decrypt + "vaesdec zmm16, zmm16, zmm10", + "vaesdec zmm17, zmm17, zmm10", + "vaesdec zmm18, zmm18, zmm10", + "vaesdec zmm19, zmm19, zmm10", + "vaesdec zmm20, zmm20, zmm10", + "vaesdec zmm21, zmm21, zmm10", + "vaesdec zmm22, zmm22, zmm10", + "vaesdec zmm23, zmm23, zmm10", + "vaesdec zmm24, zmm24, zmm10", + "vaesdec zmm25, zmm25, zmm10", + "vaesdec zmm26, zmm26, zmm10", + "vaesdec zmm27, zmm27, zmm10", + "vaesdec zmm28, zmm28, zmm10", + "vaesdec zmm29, zmm29, zmm10", + "vaesdec zmm30, zmm30, zmm10", + "vaesdec zmm31, zmm31, zmm10", + // aes-192 round 9 decrypt + "vaesdec zmm16, zmm16, zmm9", + "vaesdec zmm17, zmm17, zmm9", + "vaesdec zmm18, zmm18, zmm9", + "vaesdec zmm19, zmm19, zmm9", + "vaesdec zmm20, zmm20, zmm9", + "vaesdec zmm21, zmm21, zmm9", + "vaesdec zmm22, zmm22, zmm9", + "vaesdec zmm23, zmm23, zmm9", + "vaesdec zmm24, zmm24, zmm9", + "vaesdec zmm25, zmm25, zmm9", + "vaesdec zmm26, zmm26, zmm9", + "vaesdec zmm27, zmm27, zmm9", + "vaesdec zmm28, zmm28, zmm9", + "vaesdec zmm29, zmm29, zmm9", + "vaesdec zmm30, zmm30, zmm9", + "vaesdec zmm31, zmm31, zmm9", + // aes-192 round 8 decrypt + "vaesdec zmm16, zmm16, zmm8", + "vaesdec zmm17, zmm17, zmm8", + "vaesdec zmm18, zmm18, zmm8", + "vaesdec zmm19, 
zmm19, zmm8", + "vaesdec zmm20, zmm20, zmm8", + "vaesdec zmm21, zmm21, zmm8", + "vaesdec zmm22, zmm22, zmm8", + "vaesdec zmm23, zmm23, zmm8", + "vaesdec zmm24, zmm24, zmm8", + "vaesdec zmm25, zmm25, zmm8", + "vaesdec zmm26, zmm26, zmm8", + "vaesdec zmm27, zmm27, zmm8", + "vaesdec zmm28, zmm28, zmm8", + "vaesdec zmm29, zmm29, zmm8", + "vaesdec zmm30, zmm30, zmm8", + "vaesdec zmm31, zmm31, zmm8", + // aes-192 round 7 decrypt + "vaesdec zmm16, zmm16, zmm7", + "vaesdec zmm17, zmm17, zmm7", + "vaesdec zmm18, zmm18, zmm7", + "vaesdec zmm19, zmm19, zmm7", + "vaesdec zmm20, zmm20, zmm7", + "vaesdec zmm21, zmm21, zmm7", + "vaesdec zmm22, zmm22, zmm7", + "vaesdec zmm23, zmm23, zmm7", + "vaesdec zmm24, zmm24, zmm7", + "vaesdec zmm25, zmm25, zmm7", + "vaesdec zmm26, zmm26, zmm7", + "vaesdec zmm27, zmm27, zmm7", + "vaesdec zmm28, zmm28, zmm7", + "vaesdec zmm29, zmm29, zmm7", + "vaesdec zmm30, zmm30, zmm7", + "vaesdec zmm31, zmm31, zmm7", + // aes-192 round 6 decrypt + "vaesdec zmm16, zmm16, zmm6", + "vaesdec zmm17, zmm17, zmm6", + "vaesdec zmm18, zmm18, zmm6", + "vaesdec zmm19, zmm19, zmm6", + "vaesdec zmm20, zmm20, zmm6", + "vaesdec zmm21, zmm21, zmm6", + "vaesdec zmm22, zmm22, zmm6", + "vaesdec zmm23, zmm23, zmm6", + "vaesdec zmm24, zmm24, zmm6", + "vaesdec zmm25, zmm25, zmm6", + "vaesdec zmm26, zmm26, zmm6", + "vaesdec zmm27, zmm27, zmm6", + "vaesdec zmm28, zmm28, zmm6", + "vaesdec zmm29, zmm29, zmm6", + "vaesdec zmm30, zmm30, zmm6", + "vaesdec zmm31, zmm31, zmm6", + // aes-192 round 5 decrypt + "vaesdec zmm16, zmm16, zmm5", + "vaesdec zmm17, zmm17, zmm5", + "vaesdec zmm18, zmm18, zmm5", + "vaesdec zmm19, zmm19, zmm5", + "vaesdec zmm20, zmm20, zmm5", + "vaesdec zmm21, zmm21, zmm5", + "vaesdec zmm22, zmm22, zmm5", + "vaesdec zmm23, zmm23, zmm5", + "vaesdec zmm24, zmm24, zmm5", + "vaesdec zmm25, zmm25, zmm5", + "vaesdec zmm26, zmm26, zmm5", + "vaesdec zmm27, zmm27, zmm5", + "vaesdec zmm28, zmm28, zmm5", + "vaesdec zmm29, zmm29, zmm5", + "vaesdec zmm30, zmm30, zmm5", + "vaesdec zmm31, zmm31, zmm5", + // aes-192 round 4 decrypt + "vaesdec zmm16, zmm16, zmm4", + "vaesdec zmm17, zmm17, zmm4", + "vaesdec zmm18, zmm18, zmm4", + "vaesdec zmm19, zmm19, zmm4", + "vaesdec zmm20, zmm20, zmm4", + "vaesdec zmm21, zmm21, zmm4", + "vaesdec zmm22, zmm22, zmm4", + "vaesdec zmm23, zmm23, zmm4", + "vaesdec zmm24, zmm24, zmm4", + "vaesdec zmm25, zmm25, zmm4", + "vaesdec zmm26, zmm26, zmm4", + "vaesdec zmm27, zmm27, zmm4", + "vaesdec zmm28, zmm28, zmm4", + "vaesdec zmm29, zmm29, zmm4", + "vaesdec zmm30, zmm30, zmm4", + "vaesdec zmm31, zmm31, zmm4", + // aes-192 round 3 decrypt + "vaesdec zmm16, zmm16, zmm3", + "vaesdec zmm17, zmm17, zmm3", + "vaesdec zmm18, zmm18, zmm3", + "vaesdec zmm19, zmm19, zmm3", + "vaesdec zmm20, zmm20, zmm3", + "vaesdec zmm21, zmm21, zmm3", + "vaesdec zmm22, zmm22, zmm3", + "vaesdec zmm23, zmm23, zmm3", + "vaesdec zmm24, zmm24, zmm3", + "vaesdec zmm25, zmm25, zmm3", + "vaesdec zmm26, zmm26, zmm3", + "vaesdec zmm27, zmm27, zmm3", + "vaesdec zmm28, zmm28, zmm3", + "vaesdec zmm29, zmm29, zmm3", + "vaesdec zmm30, zmm30, zmm3", + "vaesdec zmm31, zmm31, zmm3", + // aes-192 round 2 decrypt + "vaesdec zmm16, zmm16, zmm2", + "vaesdec zmm17, zmm17, zmm2", + "vaesdec zmm18, zmm18, zmm2", + "vaesdec zmm19, zmm19, zmm2", + "vaesdec zmm20, zmm20, zmm2", + "vaesdec zmm21, zmm21, zmm2", + "vaesdec zmm22, zmm22, zmm2", + "vaesdec zmm23, zmm23, zmm2", + "vaesdec zmm24, zmm24, zmm2", + "vaesdec zmm25, zmm25, zmm2", + "vaesdec zmm26, zmm26, zmm2", + "vaesdec zmm27, zmm27, zmm2", + "vaesdec zmm28, zmm28, zmm2", + 
"vaesdec zmm29, zmm29, zmm2", + "vaesdec zmm30, zmm30, zmm2", + "vaesdec zmm31, zmm31, zmm2", + // aes-192 round 1 decrypt + "vaesdec zmm16, zmm16, zmm1", + "vaesdec zmm17, zmm17, zmm1", + "vaesdec zmm18, zmm18, zmm1", + "vaesdec zmm19, zmm19, zmm1", + "vaesdec zmm20, zmm20, zmm1", + "vaesdec zmm21, zmm21, zmm1", + "vaesdec zmm22, zmm22, zmm1", + "vaesdec zmm23, zmm23, zmm1", + "vaesdec zmm24, zmm24, zmm1", + "vaesdec zmm25, zmm25, zmm1", + "vaesdec zmm26, zmm26, zmm1", + "vaesdec zmm27, zmm27, zmm1", + "vaesdec zmm28, zmm28, zmm1", + "vaesdec zmm29, zmm29, zmm1", + "vaesdec zmm30, zmm30, zmm1", + "vaesdec zmm31, zmm31, zmm1", + // aes-192 round 0 decrypt + "vaesdeclast zmm16, zmm16, zmm0", + "vaesdeclast zmm17, zmm17, zmm0", + "vaesdeclast zmm18, zmm18, zmm0", + "vaesdeclast zmm19, zmm19, zmm0", + "vaesdeclast zmm20, zmm20, zmm0", + "vaesdeclast zmm21, zmm21, zmm0", + "vaesdeclast zmm22, zmm22, zmm0", + "vaesdeclast zmm23, zmm23, zmm0", + "vaesdeclast zmm24, zmm24, zmm0", + "vaesdeclast zmm25, zmm25, zmm0", + "vaesdeclast zmm26, zmm26, zmm0", + "vaesdeclast zmm27, zmm27, zmm0", + "vaesdeclast zmm28, zmm28, zmm0", + "vaesdeclast zmm29, zmm29, zmm0", + "vaesdeclast zmm30, zmm30, zmm0", + "vaesdeclast zmm31, zmm31, zmm0", + // save plain-data + "vmovdqu32 [{optr} + 0 * 64], zmm16", + "vmovdqu32 [{optr} + 1 * 64], zmm17", + "vmovdqu32 [{optr} + 2 * 64], zmm18", + "vmovdqu32 [{optr} + 3 * 64], zmm19", + "vmovdqu32 [{optr} + 4 * 64], zmm20", + "vmovdqu32 [{optr} + 5 * 64], zmm21", + "vmovdqu32 [{optr} + 6 * 64], zmm22", + "vmovdqu32 [{optr} + 7 * 64], zmm23", + "vmovdqu32 [{optr} + 8 * 64], zmm24", + "vmovdqu32 [{optr} + 9 * 64], zmm25", + "vmovdqu32 [{optr} + 10 * 64], zmm26", + "vmovdqu32 [{optr} + 11 * 64], zmm27", + "vmovdqu32 [{optr} + 12 * 64], zmm28", + "vmovdqu32 [{optr} + 13 * 64], zmm29", + "vmovdqu32 [{optr} + 14 * 64], zmm30", + "vmovdqu32 [{optr} + 15 * 64], zmm31", + + keys = in(reg) keys.as_ptr(), + iptr = in(reg) iptr, + optr = in(reg) optr, + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + out("zmm11") _, + out("zmm12") _, + + out("zmm16") _, + out("zmm17") _, + out("zmm18") _, + out("zmm19") _, + out("zmm20") _, + out("zmm21") _, + out("zmm22") _, + out("zmm23") _, + out("zmm24") _, + out("zmm25") _, + out("zmm26") _, + out("zmm27") _, + out("zmm28") _, + out("zmm29") _, + out("zmm30") _, + out("zmm31") _, + + options(nostack, preserves_flags), + }; +} diff --git a/aes/src/x86/vaes512/aes256.rs b/aes/src/x86/vaes512/aes256.rs new file mode 100644 index 00000000..503063d4 --- /dev/null +++ b/aes/src/x86/vaes512/aes256.rs @@ -0,0 +1,766 @@ +use crate::x86::{arch::*, RoundKeys, Simd512RoundKeys}; +use crate::Block64; +use cipher::inout::InOut; +use core::{arch::asm, mem::MaybeUninit}; + +#[inline] +pub(crate) unsafe fn parallelize_keys(keys: &RoundKeys<15>) -> Simd512RoundKeys<15> { + let mut v512: [MaybeUninit<__m512i>; 15] = MaybeUninit::uninit().assume_init(); + asm! 
{ + "vbroadcasti32x4 zmm0 , [{keys} + 0 * 16]", + "vbroadcasti32x4 zmm1 , [{keys} + 1 * 16]", + "vbroadcasti32x4 zmm2 , [{keys} + 2 * 16]", + "vbroadcasti32x4 zmm3 , [{keys} + 3 * 16]", + "vbroadcasti32x4 zmm4 , [{keys} + 4 * 16]", + "vbroadcasti32x4 zmm5 , [{keys} + 5 * 16]", + "vbroadcasti32x4 zmm6 , [{keys} + 6 * 16]", + "vbroadcasti32x4 zmm7 , [{keys} + 7 * 16]", + "vbroadcasti32x4 zmm8 , [{keys} + 8 * 16]", + "vbroadcasti32x4 zmm9 , [{keys} + 9 * 16]", + "vbroadcasti32x4 zmm10, [{keys} + 10 * 16]", + "vbroadcasti32x4 zmm11, [{keys} + 11 * 16]", + "vbroadcasti32x4 zmm12, [{keys} + 12 * 16]", + "vbroadcasti32x4 zmm13, [{keys} + 13 * 16]", + "vbroadcasti32x4 zmm14, [{keys} + 14 * 16]", + + "vmovdqu32 [{optr} + 0 * 64], zmm0", + "vmovdqu32 [{optr} + 1 * 64], zmm1", + "vmovdqu32 [{optr} + 2 * 64], zmm2", + "vmovdqu32 [{optr} + 3 * 64], zmm3", + "vmovdqu32 [{optr} + 4 * 64], zmm4", + "vmovdqu32 [{optr} + 5 * 64], zmm5", + "vmovdqu32 [{optr} + 6 * 64], zmm6", + "vmovdqu32 [{optr} + 7 * 64], zmm7", + "vmovdqu32 [{optr} + 8 * 64], zmm8", + "vmovdqu32 [{optr} + 9 * 64], zmm9", + "vmovdqu32 [{optr} + 10 * 64], zmm10", + "vmovdqu32 [{optr} + 11 * 64], zmm11", + "vmovdqu32 [{optr} + 12 * 64], zmm12", + "vmovdqu32 [{optr} + 13 * 64], zmm13", + "vmovdqu32 [{optr} + 14 * 64], zmm14", + + keys = in(reg) keys.as_ptr(), + optr = in(reg) v512.as_mut_ptr().cast::<__m512i>(), + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + out("zmm11") _, + out("zmm12") _, + out("zmm13") _, + out("zmm14") _, + + options(nostack, preserves_flags), + }; + core::mem::transmute(v512) +} + +#[inline] +pub(crate) unsafe fn encrypt64(keys: &Simd512RoundKeys<15>, blocks: InOut<'_, '_, Block64>) { + let (iptr, optr) = blocks.into_raw(); + asm! 
{ + // load keys + "vmovdqu32 zmm0 , [{keys} + 0 * 64]", + "vmovdqu32 zmm1 , [{keys} + 1 * 64]", + "vmovdqu32 zmm2 , [{keys} + 2 * 64]", + "vmovdqu32 zmm3 , [{keys} + 3 * 64]", + "vmovdqu32 zmm4 , [{keys} + 4 * 64]", + "vmovdqu32 zmm5 , [{keys} + 5 * 64]", + "vmovdqu32 zmm6 , [{keys} + 6 * 64]", + "vmovdqu32 zmm7 , [{keys} + 7 * 64]", + "vmovdqu32 zmm8 , [{keys} + 8 * 64]", + "vmovdqu32 zmm9 , [{keys} + 9 * 64]", + "vmovdqu32 zmm10, [{keys} + 10 * 64]", + "vmovdqu32 zmm11, [{keys} + 11 * 64]", + "vmovdqu32 zmm12, [{keys} + 12 * 64]", + "vmovdqu32 zmm13, [{keys} + 13 * 64]", + "vmovdqu32 zmm14, [{keys} + 14 * 64]", + // load plain-data + "vmovdqu32 zmm16, [{iptr} + 0 * 64]", + "vmovdqu32 zmm17, [{iptr} + 1 * 64]", + "vmovdqu32 zmm18, [{iptr} + 2 * 64]", + "vmovdqu32 zmm19, [{iptr} + 3 * 64]", + "vmovdqu32 zmm20, [{iptr} + 4 * 64]", + "vmovdqu32 zmm21, [{iptr} + 5 * 64]", + "vmovdqu32 zmm22, [{iptr} + 6 * 64]", + "vmovdqu32 zmm23, [{iptr} + 7 * 64]", + "vmovdqu32 zmm24, [{iptr} + 8 * 64]", + "vmovdqu32 zmm25, [{iptr} + 9 * 64]", + "vmovdqu32 zmm26, [{iptr} + 10 * 64]", + "vmovdqu32 zmm27, [{iptr} + 11 * 64]", + "vmovdqu32 zmm28, [{iptr} + 12 * 64]", + "vmovdqu32 zmm29, [{iptr} + 13 * 64]", + "vmovdqu32 zmm30, [{iptr} + 14 * 64]", + "vmovdqu32 zmm31, [{iptr} + 15 * 64]", + // aes-256 round 0 encrypt + "vpxord zmm16, zmm16, zmm0", + "vpxord zmm17, zmm17, zmm0", + "vpxord zmm18, zmm18, zmm0", + "vpxord zmm19, zmm19, zmm0", + "vpxord zmm20, zmm20, zmm0", + "vpxord zmm21, zmm21, zmm0", + "vpxord zmm22, zmm22, zmm0", + "vpxord zmm23, zmm23, zmm0", + "vpxord zmm24, zmm24, zmm0", + "vpxord zmm25, zmm25, zmm0", + "vpxord zmm26, zmm26, zmm0", + "vpxord zmm27, zmm27, zmm0", + "vpxord zmm28, zmm28, zmm0", + "vpxord zmm29, zmm29, zmm0", + "vpxord zmm30, zmm30, zmm0", + "vpxord zmm31, zmm31, zmm0", + // aes-256 round 1 encrypt + "vaesenc zmm16, zmm16, zmm1", + "vaesenc zmm17, zmm17, zmm1", + "vaesenc zmm18, zmm18, zmm1", + "vaesenc zmm19, zmm19, zmm1", + "vaesenc zmm20, zmm20, zmm1", + "vaesenc zmm21, zmm21, zmm1", + "vaesenc zmm22, zmm22, zmm1", + "vaesenc zmm23, zmm23, zmm1", + "vaesenc zmm24, zmm24, zmm1", + "vaesenc zmm25, zmm25, zmm1", + "vaesenc zmm26, zmm26, zmm1", + "vaesenc zmm27, zmm27, zmm1", + "vaesenc zmm28, zmm28, zmm1", + "vaesenc zmm29, zmm29, zmm1", + "vaesenc zmm30, zmm30, zmm1", + "vaesenc zmm31, zmm31, zmm1", + // aes-256 round 2 encrypt + "vaesenc zmm16, zmm16, zmm2", + "vaesenc zmm17, zmm17, zmm2", + "vaesenc zmm18, zmm18, zmm2", + "vaesenc zmm19, zmm19, zmm2", + "vaesenc zmm20, zmm20, zmm2", + "vaesenc zmm21, zmm21, zmm2", + "vaesenc zmm22, zmm22, zmm2", + "vaesenc zmm23, zmm23, zmm2", + "vaesenc zmm24, zmm24, zmm2", + "vaesenc zmm25, zmm25, zmm2", + "vaesenc zmm26, zmm26, zmm2", + "vaesenc zmm27, zmm27, zmm2", + "vaesenc zmm28, zmm28, zmm2", + "vaesenc zmm29, zmm29, zmm2", + "vaesenc zmm30, zmm30, zmm2", + "vaesenc zmm31, zmm31, zmm2", + // aes-256 round 3 encrypt + "vaesenc zmm16, zmm16, zmm3", + "vaesenc zmm17, zmm17, zmm3", + "vaesenc zmm18, zmm18, zmm3", + "vaesenc zmm19, zmm19, zmm3", + "vaesenc zmm20, zmm20, zmm3", + "vaesenc zmm21, zmm21, zmm3", + "vaesenc zmm22, zmm22, zmm3", + "vaesenc zmm23, zmm23, zmm3", + "vaesenc zmm24, zmm24, zmm3", + "vaesenc zmm25, zmm25, zmm3", + "vaesenc zmm26, zmm26, zmm3", + "vaesenc zmm27, zmm27, zmm3", + "vaesenc zmm28, zmm28, zmm3", + "vaesenc zmm29, zmm29, zmm3", + "vaesenc zmm30, zmm30, zmm3", + "vaesenc zmm31, zmm31, zmm3", + // aes-256 round 4 encrypt + "vaesenc zmm16, zmm16, zmm4", + "vaesenc zmm17, zmm17, zmm4", + "vaesenc zmm18, zmm18, 
zmm4", + "vaesenc zmm19, zmm19, zmm4", + "vaesenc zmm20, zmm20, zmm4", + "vaesenc zmm21, zmm21, zmm4", + "vaesenc zmm22, zmm22, zmm4", + "vaesenc zmm23, zmm23, zmm4", + "vaesenc zmm24, zmm24, zmm4", + "vaesenc zmm25, zmm25, zmm4", + "vaesenc zmm26, zmm26, zmm4", + "vaesenc zmm27, zmm27, zmm4", + "vaesenc zmm28, zmm28, zmm4", + "vaesenc zmm29, zmm29, zmm4", + "vaesenc zmm30, zmm30, zmm4", + "vaesenc zmm31, zmm31, zmm4", + // aes-256 round 5 encrypt + "vaesenc zmm16, zmm16, zmm5", + "vaesenc zmm17, zmm17, zmm5", + "vaesenc zmm18, zmm18, zmm5", + "vaesenc zmm19, zmm19, zmm5", + "vaesenc zmm20, zmm20, zmm5", + "vaesenc zmm21, zmm21, zmm5", + "vaesenc zmm22, zmm22, zmm5", + "vaesenc zmm23, zmm23, zmm5", + "vaesenc zmm24, zmm24, zmm5", + "vaesenc zmm25, zmm25, zmm5", + "vaesenc zmm26, zmm26, zmm5", + "vaesenc zmm27, zmm27, zmm5", + "vaesenc zmm28, zmm28, zmm5", + "vaesenc zmm29, zmm29, zmm5", + "vaesenc zmm30, zmm30, zmm5", + "vaesenc zmm31, zmm31, zmm5", + // aes-256 round 6 encrypt + "vaesenc zmm16, zmm16, zmm6", + "vaesenc zmm17, zmm17, zmm6", + "vaesenc zmm18, zmm18, zmm6", + "vaesenc zmm19, zmm19, zmm6", + "vaesenc zmm20, zmm20, zmm6", + "vaesenc zmm21, zmm21, zmm6", + "vaesenc zmm22, zmm22, zmm6", + "vaesenc zmm23, zmm23, zmm6", + "vaesenc zmm24, zmm24, zmm6", + "vaesenc zmm25, zmm25, zmm6", + "vaesenc zmm26, zmm26, zmm6", + "vaesenc zmm27, zmm27, zmm6", + "vaesenc zmm28, zmm28, zmm6", + "vaesenc zmm29, zmm29, zmm6", + "vaesenc zmm30, zmm30, zmm6", + "vaesenc zmm31, zmm31, zmm6", + // aes-256 round 7 encrypt + "vaesenc zmm16, zmm16, zmm7", + "vaesenc zmm17, zmm17, zmm7", + "vaesenc zmm18, zmm18, zmm7", + "vaesenc zmm19, zmm19, zmm7", + "vaesenc zmm20, zmm20, zmm7", + "vaesenc zmm21, zmm21, zmm7", + "vaesenc zmm22, zmm22, zmm7", + "vaesenc zmm23, zmm23, zmm7", + "vaesenc zmm24, zmm24, zmm7", + "vaesenc zmm25, zmm25, zmm7", + "vaesenc zmm26, zmm26, zmm7", + "vaesenc zmm27, zmm27, zmm7", + "vaesenc zmm28, zmm28, zmm7", + "vaesenc zmm29, zmm29, zmm7", + "vaesenc zmm30, zmm30, zmm7", + "vaesenc zmm31, zmm31, zmm7", + // aes-256 round 8 encrypt + "vaesenc zmm16, zmm16, zmm8", + "vaesenc zmm17, zmm17, zmm8", + "vaesenc zmm18, zmm18, zmm8", + "vaesenc zmm19, zmm19, zmm8", + "vaesenc zmm20, zmm20, zmm8", + "vaesenc zmm21, zmm21, zmm8", + "vaesenc zmm22, zmm22, zmm8", + "vaesenc zmm23, zmm23, zmm8", + "vaesenc zmm24, zmm24, zmm8", + "vaesenc zmm25, zmm25, zmm8", + "vaesenc zmm26, zmm26, zmm8", + "vaesenc zmm27, zmm27, zmm8", + "vaesenc zmm28, zmm28, zmm8", + "vaesenc zmm29, zmm29, zmm8", + "vaesenc zmm30, zmm30, zmm8", + "vaesenc zmm31, zmm31, zmm8", + // aes-256 round 9 encrypt + "vaesenc zmm16, zmm16, zmm9", + "vaesenc zmm17, zmm17, zmm9", + "vaesenc zmm18, zmm18, zmm9", + "vaesenc zmm19, zmm19, zmm9", + "vaesenc zmm20, zmm20, zmm9", + "vaesenc zmm21, zmm21, zmm9", + "vaesenc zmm22, zmm22, zmm9", + "vaesenc zmm23, zmm23, zmm9", + "vaesenc zmm24, zmm24, zmm9", + "vaesenc zmm25, zmm25, zmm9", + "vaesenc zmm26, zmm26, zmm9", + "vaesenc zmm27, zmm27, zmm9", + "vaesenc zmm28, zmm28, zmm9", + "vaesenc zmm29, zmm29, zmm9", + "vaesenc zmm30, zmm30, zmm9", + "vaesenc zmm31, zmm31, zmm9", + // aes-256 round 10 encrypt + "vaesenc zmm16, zmm16, zmm10", + "vaesenc zmm17, zmm17, zmm10", + "vaesenc zmm18, zmm18, zmm10", + "vaesenc zmm19, zmm19, zmm10", + "vaesenc zmm20, zmm20, zmm10", + "vaesenc zmm21, zmm21, zmm10", + "vaesenc zmm22, zmm22, zmm10", + "vaesenc zmm23, zmm23, zmm10", + "vaesenc zmm24, zmm24, zmm10", + "vaesenc zmm25, zmm25, zmm10", + "vaesenc zmm26, zmm26, zmm10", + "vaesenc zmm27, zmm27, zmm10", 
+ "vaesenc zmm28, zmm28, zmm10", + "vaesenc zmm29, zmm29, zmm10", + "vaesenc zmm30, zmm30, zmm10", + "vaesenc zmm31, zmm31, zmm10", + // aes-256 round 11 encrypt + "vaesenc zmm16, zmm16, zmm11", + "vaesenc zmm17, zmm17, zmm11", + "vaesenc zmm18, zmm18, zmm11", + "vaesenc zmm19, zmm19, zmm11", + "vaesenc zmm20, zmm20, zmm11", + "vaesenc zmm21, zmm21, zmm11", + "vaesenc zmm22, zmm22, zmm11", + "vaesenc zmm23, zmm23, zmm11", + "vaesenc zmm24, zmm24, zmm11", + "vaesenc zmm25, zmm25, zmm11", + "vaesenc zmm26, zmm26, zmm11", + "vaesenc zmm27, zmm27, zmm11", + "vaesenc zmm28, zmm28, zmm11", + "vaesenc zmm29, zmm29, zmm11", + "vaesenc zmm30, zmm30, zmm11", + "vaesenc zmm31, zmm31, zmm11", + // aes-256 round 12 encrypt + "vaesenc zmm16, zmm16, zmm12", + "vaesenc zmm17, zmm17, zmm12", + "vaesenc zmm18, zmm18, zmm12", + "vaesenc zmm19, zmm19, zmm12", + "vaesenc zmm20, zmm20, zmm12", + "vaesenc zmm21, zmm21, zmm12", + "vaesenc zmm22, zmm22, zmm12", + "vaesenc zmm23, zmm23, zmm12", + "vaesenc zmm24, zmm24, zmm12", + "vaesenc zmm25, zmm25, zmm12", + "vaesenc zmm26, zmm26, zmm12", + "vaesenc zmm27, zmm27, zmm12", + "vaesenc zmm28, zmm28, zmm12", + "vaesenc zmm29, zmm29, zmm12", + "vaesenc zmm30, zmm30, zmm12", + "vaesenc zmm31, zmm31, zmm12", + // aes-256 round 13 encrypt + "vaesenc zmm16, zmm16, zmm13", + "vaesenc zmm17, zmm17, zmm13", + "vaesenc zmm18, zmm18, zmm13", + "vaesenc zmm19, zmm19, zmm13", + "vaesenc zmm20, zmm20, zmm13", + "vaesenc zmm21, zmm21, zmm13", + "vaesenc zmm22, zmm22, zmm13", + "vaesenc zmm23, zmm23, zmm13", + "vaesenc zmm24, zmm24, zmm13", + "vaesenc zmm25, zmm25, zmm13", + "vaesenc zmm26, zmm26, zmm13", + "vaesenc zmm27, zmm27, zmm13", + "vaesenc zmm28, zmm28, zmm13", + "vaesenc zmm29, zmm29, zmm13", + "vaesenc zmm30, zmm30, zmm13", + "vaesenc zmm31, zmm31, zmm13", + // aes-256 round 14 encrypt + "vaesenclast zmm16, zmm16, zmm14", + "vaesenclast zmm17, zmm17, zmm14", + "vaesenclast zmm18, zmm18, zmm14", + "vaesenclast zmm19, zmm19, zmm14", + "vaesenclast zmm20, zmm20, zmm14", + "vaesenclast zmm21, zmm21, zmm14", + "vaesenclast zmm22, zmm22, zmm14", + "vaesenclast zmm23, zmm23, zmm14", + "vaesenclast zmm24, zmm24, zmm14", + "vaesenclast zmm25, zmm25, zmm14", + "vaesenclast zmm26, zmm26, zmm14", + "vaesenclast zmm27, zmm27, zmm14", + "vaesenclast zmm28, zmm28, zmm14", + "vaesenclast zmm29, zmm29, zmm14", + "vaesenclast zmm30, zmm30, zmm14", + "vaesenclast zmm31, zmm31, zmm14", + // save cipher-data + "vmovdqu32 [{optr} + 0 * 64], zmm16", + "vmovdqu32 [{optr} + 1 * 64], zmm17", + "vmovdqu32 [{optr} + 2 * 64], zmm18", + "vmovdqu32 [{optr} + 3 * 64], zmm19", + "vmovdqu32 [{optr} + 4 * 64], zmm20", + "vmovdqu32 [{optr} + 5 * 64], zmm21", + "vmovdqu32 [{optr} + 6 * 64], zmm22", + "vmovdqu32 [{optr} + 7 * 64], zmm23", + "vmovdqu32 [{optr} + 8 * 64], zmm24", + "vmovdqu32 [{optr} + 9 * 64], zmm25", + "vmovdqu32 [{optr} + 10 * 64], zmm26", + "vmovdqu32 [{optr} + 11 * 64], zmm27", + "vmovdqu32 [{optr} + 12 * 64], zmm28", + "vmovdqu32 [{optr} + 13 * 64], zmm29", + "vmovdqu32 [{optr} + 14 * 64], zmm30", + "vmovdqu32 [{optr} + 15 * 64], zmm31", + + keys = in(reg) keys.as_ptr(), + iptr = in(reg) iptr, + optr = in(reg) optr, + + out("zmm0") _, + out("zmm1") _, + out("zmm2") _, + out("zmm3") _, + out("zmm4") _, + out("zmm5") _, + out("zmm6") _, + out("zmm7") _, + out("zmm8") _, + out("zmm9") _, + out("zmm10") _, + out("zmm11") _, + out("zmm12") _, + out("zmm13") _, + out("zmm14") _, + + out("zmm16") _, + out("zmm17") _, + out("zmm18") _, + out("zmm19") _, + out("zmm20") _, + out("zmm21") _, + 
out("zmm22") _,
+ out("zmm23") _,
+ out("zmm24") _,
+ out("zmm25") _,
+ out("zmm26") _,
+ out("zmm27") _,
+ out("zmm28") _,
+ out("zmm29") _,
+ out("zmm30") _,
+ out("zmm31") _,
+
+ options(nostack, preserves_flags),
+ };
+}
+
+#[inline]
+pub(crate) unsafe fn decrypt64(keys: &Simd512RoundKeys<15>, blocks: InOut<'_, '_, Block64>) {
+ let (iptr, optr) = blocks.into_raw();
+ asm! {
+ // load keys
+ "vmovdqu32 zmm14, [{keys} + 14 * 64]",
+ "vmovdqu32 zmm13, [{keys} + 13 * 64]",
+ "vmovdqu32 zmm12, [{keys} + 12 * 64]",
+ "vmovdqu32 zmm11, [{keys} + 11 * 64]",
+ "vmovdqu32 zmm10, [{keys} + 10 * 64]",
+ "vmovdqu32 zmm9 , [{keys} + 9 * 64]",
+ "vmovdqu32 zmm8 , [{keys} + 8 * 64]",
+ "vmovdqu32 zmm7 , [{keys} + 7 * 64]",
+ "vmovdqu32 zmm6 , [{keys} + 6 * 64]",
+ "vmovdqu32 zmm5 , [{keys} + 5 * 64]",
+ "vmovdqu32 zmm4 , [{keys} + 4 * 64]",
+ "vmovdqu32 zmm3 , [{keys} + 3 * 64]",
+ "vmovdqu32 zmm2 , [{keys} + 2 * 64]",
+ "vmovdqu32 zmm1 , [{keys} + 1 * 64]",
+ "vmovdqu32 zmm0 , [{keys} + 0 * 64]",
+ // load cipher-data
+ "vmovdqu32 zmm16, [{iptr} + 0 * 64]",
+ "vmovdqu32 zmm17, [{iptr} + 1 * 64]",
+ "vmovdqu32 zmm18, [{iptr} + 2 * 64]",
+ "vmovdqu32 zmm19, [{iptr} + 3 * 64]",
+ "vmovdqu32 zmm20, [{iptr} + 4 * 64]",
+ "vmovdqu32 zmm21, [{iptr} + 5 * 64]",
+ "vmovdqu32 zmm22, [{iptr} + 6 * 64]",
+ "vmovdqu32 zmm23, [{iptr} + 7 * 64]",
+ "vmovdqu32 zmm24, [{iptr} + 8 * 64]",
+ "vmovdqu32 zmm25, [{iptr} + 9 * 64]",
+ "vmovdqu32 zmm26, [{iptr} + 10 * 64]",
+ "vmovdqu32 zmm27, [{iptr} + 11 * 64]",
+ "vmovdqu32 zmm28, [{iptr} + 12 * 64]",
+ "vmovdqu32 zmm29, [{iptr} + 13 * 64]",
+ "vmovdqu32 zmm30, [{iptr} + 14 * 64]",
+ "vmovdqu32 zmm31, [{iptr} + 15 * 64]",
+ // aes-256 round 14 decrypt
+ "vpxord zmm16, zmm16, zmm14",
+ "vpxord zmm17, zmm17, zmm14",
+ "vpxord zmm18, zmm18, zmm14",
+ "vpxord zmm19, zmm19, zmm14",
+ "vpxord zmm20, zmm20, zmm14",
+ "vpxord zmm21, zmm21, zmm14",
+ "vpxord zmm22, zmm22, zmm14",
+ "vpxord zmm23, zmm23, zmm14",
+ "vpxord zmm24, zmm24, zmm14",
+ "vpxord zmm25, zmm25, zmm14",
+ "vpxord zmm26, zmm26, zmm14",
+ "vpxord zmm27, zmm27, zmm14",
+ "vpxord zmm28, zmm28, zmm14",
+ "vpxord zmm29, zmm29, zmm14",
+ "vpxord zmm30, zmm30, zmm14",
+ "vpxord zmm31, zmm31, zmm14",
+ // aes-256 round 13 decrypt
+ "vaesdec zmm16, zmm16, zmm13",
+ "vaesdec zmm17, zmm17, zmm13",
+ "vaesdec zmm18, zmm18, zmm13",
+ "vaesdec zmm19, zmm19, zmm13",
+ "vaesdec zmm20, zmm20, zmm13",
+ "vaesdec zmm21, zmm21, zmm13",
+ "vaesdec zmm22, zmm22, zmm13",
+ "vaesdec zmm23, zmm23, zmm13",
+ "vaesdec zmm24, zmm24, zmm13",
+ "vaesdec zmm25, zmm25, zmm13",
+ "vaesdec zmm26, zmm26, zmm13",
+ "vaesdec zmm27, zmm27, zmm13",
+ "vaesdec zmm28, zmm28, zmm13",
+ "vaesdec zmm29, zmm29, zmm13",
+ "vaesdec zmm30, zmm30, zmm13",
+ "vaesdec zmm31, zmm31, zmm13",
+ // aes-256 round 12 decrypt
+ "vaesdec zmm16, zmm16, zmm12",
+ "vaesdec zmm17, zmm17, zmm12",
+ "vaesdec zmm18, zmm18, zmm12",
+ "vaesdec zmm19, zmm19, zmm12",
+ "vaesdec zmm20, zmm20, zmm12",
+ "vaesdec zmm21, zmm21, zmm12",
+ "vaesdec zmm22, zmm22, zmm12",
+ "vaesdec zmm23, zmm23, zmm12",
+ "vaesdec zmm24, zmm24, zmm12",
+ "vaesdec zmm25, zmm25, zmm12",
+ "vaesdec zmm26, zmm26, zmm12",
+ "vaesdec zmm27, zmm27, zmm12",
+ "vaesdec zmm28, zmm28, zmm12",
+ "vaesdec zmm29, zmm29, zmm12",
+ "vaesdec zmm30, zmm30, zmm12",
+ "vaesdec zmm31, zmm31, zmm12",
+ // aes-256 round 11 decrypt
+ "vaesdec zmm16, zmm16, zmm11",
+ "vaesdec zmm17, zmm17, zmm11",
+ "vaesdec zmm18, zmm18, zmm11",
+ "vaesdec zmm19, zmm19, zmm11",
+ "vaesdec zmm20, zmm20, zmm11",
+ "vaesdec zmm21, zmm21, zmm11",
+ "vaesdec zmm22, zmm22, zmm11",
+ "vaesdec zmm23, zmm23, zmm11",
+ "vaesdec zmm24, zmm24, zmm11",
+ "vaesdec zmm25, zmm25, zmm11",
+ "vaesdec zmm26, zmm26, zmm11",
+ "vaesdec zmm27, zmm27, zmm11",
+ "vaesdec zmm28, zmm28, zmm11",
+ "vaesdec zmm29, zmm29, zmm11",
+ "vaesdec zmm30, zmm30, zmm11",
+ "vaesdec zmm31, zmm31, zmm11",
+ // aes-256 round 10 decrypt
+ "vaesdec zmm16, zmm16, zmm10",
+ "vaesdec zmm17, zmm17, zmm10",
+ "vaesdec zmm18, zmm18, zmm10",
+ "vaesdec zmm19, zmm19, zmm10",
+ "vaesdec zmm20, zmm20, zmm10",
+ "vaesdec zmm21, zmm21, zmm10",
+ "vaesdec zmm22, zmm22, zmm10",
+ "vaesdec zmm23, zmm23, zmm10",
+ "vaesdec zmm24, zmm24, zmm10",
+ "vaesdec zmm25, zmm25, zmm10",
+ "vaesdec zmm26, zmm26, zmm10",
+ "vaesdec zmm27, zmm27, zmm10",
+ "vaesdec zmm28, zmm28, zmm10",
+ "vaesdec zmm29, zmm29, zmm10",
+ "vaesdec zmm30, zmm30, zmm10",
+ "vaesdec zmm31, zmm31, zmm10",
+ // aes-256 round 9 decrypt
+ "vaesdec zmm16, zmm16, zmm9",
+ "vaesdec zmm17, zmm17, zmm9",
+ "vaesdec zmm18, zmm18, zmm9",
+ "vaesdec zmm19, zmm19, zmm9",
+ "vaesdec zmm20, zmm20, zmm9",
+ "vaesdec zmm21, zmm21, zmm9",
+ "vaesdec zmm22, zmm22, zmm9",
+ "vaesdec zmm23, zmm23, zmm9",
+ "vaesdec zmm24, zmm24, zmm9",
+ "vaesdec zmm25, zmm25, zmm9",
+ "vaesdec zmm26, zmm26, zmm9",
+ "vaesdec zmm27, zmm27, zmm9",
+ "vaesdec zmm28, zmm28, zmm9",
+ "vaesdec zmm29, zmm29, zmm9",
+ "vaesdec zmm30, zmm30, zmm9",
+ "vaesdec zmm31, zmm31, zmm9",
+ // aes-256 round 8 decrypt
+ "vaesdec zmm16, zmm16, zmm8",
+ "vaesdec zmm17, zmm17, zmm8",
+ "vaesdec zmm18, zmm18, zmm8",
+ "vaesdec zmm19, zmm19, zmm8",
+ "vaesdec zmm20, zmm20, zmm8",
+ "vaesdec zmm21, zmm21, zmm8",
+ "vaesdec zmm22, zmm22, zmm8",
+ "vaesdec zmm23, zmm23, zmm8",
+ "vaesdec zmm24, zmm24, zmm8",
+ "vaesdec zmm25, zmm25, zmm8",
+ "vaesdec zmm26, zmm26, zmm8",
+ "vaesdec zmm27, zmm27, zmm8",
+ "vaesdec zmm28, zmm28, zmm8",
+ "vaesdec zmm29, zmm29, zmm8",
+ "vaesdec zmm30, zmm30, zmm8",
+ "vaesdec zmm31, zmm31, zmm8",
+ // aes-256 round 7 decrypt
+ "vaesdec zmm16, zmm16, zmm7",
+ "vaesdec zmm17, zmm17, zmm7",
+ "vaesdec zmm18, zmm18, zmm7",
+ "vaesdec zmm19, zmm19, zmm7",
+ "vaesdec zmm20, zmm20, zmm7",
+ "vaesdec zmm21, zmm21, zmm7",
+ "vaesdec zmm22, zmm22, zmm7",
+ "vaesdec zmm23, zmm23, zmm7",
+ "vaesdec zmm24, zmm24, zmm7",
+ "vaesdec zmm25, zmm25, zmm7",
+ "vaesdec zmm26, zmm26, zmm7",
+ "vaesdec zmm27, zmm27, zmm7",
+ "vaesdec zmm28, zmm28, zmm7",
+ "vaesdec zmm29, zmm29, zmm7",
+ "vaesdec zmm30, zmm30, zmm7",
+ "vaesdec zmm31, zmm31, zmm7",
+ // aes-256 round 6 decrypt
+ "vaesdec zmm16, zmm16, zmm6",
+ "vaesdec zmm17, zmm17, zmm6",
+ "vaesdec zmm18, zmm18, zmm6",
+ "vaesdec zmm19, zmm19, zmm6",
+ "vaesdec zmm20, zmm20, zmm6",
+ "vaesdec zmm21, zmm21, zmm6",
+ "vaesdec zmm22, zmm22, zmm6",
+ "vaesdec zmm23, zmm23, zmm6",
+ "vaesdec zmm24, zmm24, zmm6",
+ "vaesdec zmm25, zmm25, zmm6",
+ "vaesdec zmm26, zmm26, zmm6",
+ "vaesdec zmm27, zmm27, zmm6",
+ "vaesdec zmm28, zmm28, zmm6",
+ "vaesdec zmm29, zmm29, zmm6",
+ "vaesdec zmm30, zmm30, zmm6",
+ "vaesdec zmm31, zmm31, zmm6",
+ // aes-256 round 5 decrypt
+ "vaesdec zmm16, zmm16, zmm5",
+ "vaesdec zmm17, zmm17, zmm5",
+ "vaesdec zmm18, zmm18, zmm5",
+ "vaesdec zmm19, zmm19, zmm5",
+ "vaesdec zmm20, zmm20, zmm5",
+ "vaesdec zmm21, zmm21, zmm5",
+ "vaesdec zmm22, zmm22, zmm5",
+ "vaesdec zmm23, zmm23, zmm5",
+ "vaesdec zmm24, zmm24, zmm5",
+ "vaesdec zmm25, zmm25, zmm5",
+ "vaesdec zmm26, zmm26, zmm5",
+ "vaesdec zmm27, zmm27, zmm5",
+ "vaesdec zmm28, zmm28, zmm5",
+ "vaesdec zmm29, zmm29, zmm5",
+ "vaesdec zmm30, zmm30, zmm5",
+ "vaesdec zmm31, zmm31, zmm5",
+ // aes-256 round 4 decrypt
+ "vaesdec zmm16, zmm16, zmm4",
+ "vaesdec zmm17, zmm17, zmm4",
+ "vaesdec zmm18, zmm18, zmm4",
+ "vaesdec zmm19, zmm19, zmm4",
+ "vaesdec zmm20, zmm20, zmm4",
+ "vaesdec zmm21, zmm21, zmm4",
+ "vaesdec zmm22, zmm22, zmm4",
+ "vaesdec zmm23, zmm23, zmm4",
+ "vaesdec zmm24, zmm24, zmm4",
+ "vaesdec zmm25, zmm25, zmm4",
+ "vaesdec zmm26, zmm26, zmm4",
+ "vaesdec zmm27, zmm27, zmm4",
+ "vaesdec zmm28, zmm28, zmm4",
+ "vaesdec zmm29, zmm29, zmm4",
+ "vaesdec zmm30, zmm30, zmm4",
+ "vaesdec zmm31, zmm31, zmm4",
+ // aes-256 round 3 decrypt
+ "vaesdec zmm16, zmm16, zmm3",
+ "vaesdec zmm17, zmm17, zmm3",
+ "vaesdec zmm18, zmm18, zmm3",
+ "vaesdec zmm19, zmm19, zmm3",
+ "vaesdec zmm20, zmm20, zmm3",
+ "vaesdec zmm21, zmm21, zmm3",
+ "vaesdec zmm22, zmm22, zmm3",
+ "vaesdec zmm23, zmm23, zmm3",
+ "vaesdec zmm24, zmm24, zmm3",
+ "vaesdec zmm25, zmm25, zmm3",
+ "vaesdec zmm26, zmm26, zmm3",
+ "vaesdec zmm27, zmm27, zmm3",
+ "vaesdec zmm28, zmm28, zmm3",
+ "vaesdec zmm29, zmm29, zmm3",
+ "vaesdec zmm30, zmm30, zmm3",
+ "vaesdec zmm31, zmm31, zmm3",
+ // aes-256 round 2 decrypt
+ "vaesdec zmm16, zmm16, zmm2",
+ "vaesdec zmm17, zmm17, zmm2",
+ "vaesdec zmm18, zmm18, zmm2",
+ "vaesdec zmm19, zmm19, zmm2",
+ "vaesdec zmm20, zmm20, zmm2",
+ "vaesdec zmm21, zmm21, zmm2",
+ "vaesdec zmm22, zmm22, zmm2",
+ "vaesdec zmm23, zmm23, zmm2",
+ "vaesdec zmm24, zmm24, zmm2",
+ "vaesdec zmm25, zmm25, zmm2",
+ "vaesdec zmm26, zmm26, zmm2",
+ "vaesdec zmm27, zmm27, zmm2",
+ "vaesdec zmm28, zmm28, zmm2",
+ "vaesdec zmm29, zmm29, zmm2",
+ "vaesdec zmm30, zmm30, zmm2",
+ "vaesdec zmm31, zmm31, zmm2",
+ // aes-256 round 1 decrypt
+ "vaesdec zmm16, zmm16, zmm1",
+ "vaesdec zmm17, zmm17, zmm1",
+ "vaesdec zmm18, zmm18, zmm1",
+ "vaesdec zmm19, zmm19, zmm1",
+ "vaesdec zmm20, zmm20, zmm1",
+ "vaesdec zmm21, zmm21, zmm1",
+ "vaesdec zmm22, zmm22, zmm1",
+ "vaesdec zmm23, zmm23, zmm1",
+ "vaesdec zmm24, zmm24, zmm1",
+ "vaesdec zmm25, zmm25, zmm1",
+ "vaesdec zmm26, zmm26, zmm1",
+ "vaesdec zmm27, zmm27, zmm1",
+ "vaesdec zmm28, zmm28, zmm1",
+ "vaesdec zmm29, zmm29, zmm1",
+ "vaesdec zmm30, zmm30, zmm1",
+ "vaesdec zmm31, zmm31, zmm1",
+ // aes-256 round 0 decrypt
+ "vaesdeclast zmm16, zmm16, zmm0",
+ "vaesdeclast zmm17, zmm17, zmm0",
+ "vaesdeclast zmm18, zmm18, zmm0",
+ "vaesdeclast zmm19, zmm19, zmm0",
+ "vaesdeclast zmm20, zmm20, zmm0",
+ "vaesdeclast zmm21, zmm21, zmm0",
+ "vaesdeclast zmm22, zmm22, zmm0",
+ "vaesdeclast zmm23, zmm23, zmm0",
+ "vaesdeclast zmm24, zmm24, zmm0",
+ "vaesdeclast zmm25, zmm25, zmm0",
+ "vaesdeclast zmm26, zmm26, zmm0",
+ "vaesdeclast zmm27, zmm27, zmm0",
+ "vaesdeclast zmm28, zmm28, zmm0",
+ "vaesdeclast zmm29, zmm29, zmm0",
+ "vaesdeclast zmm30, zmm30, zmm0",
+ "vaesdeclast zmm31, zmm31, zmm0",
+ // save plain-data
+ "vmovdqu32 [{optr} + 0 * 64], zmm16",
+ "vmovdqu32 [{optr} + 1 * 64], zmm17",
+ "vmovdqu32 [{optr} + 2 * 64], zmm18",
+ "vmovdqu32 [{optr} + 3 * 64], zmm19",
+ "vmovdqu32 [{optr} + 4 * 64], zmm20",
+ "vmovdqu32 [{optr} + 5 * 64], zmm21",
+ "vmovdqu32 [{optr} + 6 * 64], zmm22",
+ "vmovdqu32 [{optr} + 7 * 64], zmm23",
+ "vmovdqu32 [{optr} + 8 * 64], zmm24",
+ "vmovdqu32 [{optr} + 9 * 64], zmm25",
+ "vmovdqu32 [{optr} + 10 * 64], zmm26",
+ "vmovdqu32 [{optr} + 11 * 64], zmm27",
+ "vmovdqu32 [{optr} + 12 * 64], zmm28",
+ "vmovdqu32 [{optr} + 13 * 64], zmm29",
+ "vmovdqu32 [{optr} + 14 * 64], zmm30",
+ "vmovdqu32 [{optr} + 15 * 64], zmm31",
+
+ keys = in(reg) keys.as_ptr(),
+ iptr = in(reg) iptr,
+ optr = 
+
+        out("zmm0") _,
+        out("zmm1") _,
+        out("zmm2") _,
+        out("zmm3") _,
+        out("zmm4") _,
+        out("zmm5") _,
+        out("zmm6") _,
+        out("zmm7") _,
+        out("zmm8") _,
+        out("zmm9") _,
+        out("zmm10") _,
+        out("zmm11") _,
+        out("zmm12") _,
+        out("zmm13") _,
+        out("zmm14") _,
+
+        out("zmm16") _,
+        out("zmm17") _,
+        out("zmm18") _,
+        out("zmm19") _,
+        out("zmm20") _,
+        out("zmm21") _,
+        out("zmm22") _,
+        out("zmm23") _,
+        out("zmm24") _,
+        out("zmm25") _,
+        out("zmm26") _,
+        out("zmm27") _,
+        out("zmm28") _,
+        out("zmm29") _,
+        out("zmm30") _,
+        out("zmm31") _,
+
+        options(nostack, preserves_flags),
+    };
+}

From 53e2a4c1e68ebbb7bfa4906d4f63e7d87b00304f Mon Sep 17 00:00:00 2001
From: silvanshade
Date: Tue, 6 Feb 2024 13:25:19 -0700
Subject: [PATCH 3/5] Update .gitignore for intel-sde

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 9111bb1d..66ca85cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
 target/
 **/Cargo.lock
 **/target/
+**/pin.log
+**/pin-log.txt
+**/pin-tool-log.txt

From 1af788bf8915db2f779e858e2bfc2cd4789544e4 Mon Sep 17 00:00:00 2001
From: silvanshade
Date: Tue, 6 Feb 2024 12:41:34 -0700
Subject: [PATCH 4/5] Add VAES jobs to aes CI

---
 .github/workflows/aes.yml | 73 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 71 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/aes.yml b/.github/workflows/aes.yml
index f9edc7cb..7a067c09 100644
--- a/.github/workflows/aes.yml
+++ b/.github/workflows/aes.yml
@@ -16,6 +16,7 @@ defaults:
 env:
   CARGO_INCREMENTAL: 0
   RUSTFLAGS: "-Dwarnings"
+  SDE_FULL_VERSION: "9.33.0-2024-01-07"
 
 jobs:
   # Builds for no_std platforms
@@ -59,7 +60,7 @@ jobs:
   minimal-versions:
     uses: RustCrypto/actions/.github/workflows/minimal-versions.yml@master
     with:
-      working-directory: ${{ github.workflow }}
+      working-directory: ${{ github.workflow }}
 
   # Tests for the AES-NI backend
   aesni:
@@ -96,6 +97,75 @@ jobs:
       - run: cargo test --target ${{ matrix.target }} --features hazmat
       - run: cargo test --target ${{ matrix.target }} --all-features
 
+  # Tests for the VAES AVX backend
+  vaes256:
+    runs-on: ubuntu-latest
+    env:
+      CARGO_INCREMENTAL: 0
+      RUSTFLAGS: "-Dwarnings --cfg disable_avx512"
+    strategy:
+      matrix:
+        include:
+          - target: x86_64-unknown-linux-gnu
+            rust: nightly-2024-02-07
+    steps:
+      - uses: actions/checkout@v4
+      - uses: silvanshade/rustcrypto-actions/intel-sde-install@master
+        with:
+          sde-full-version: ${{ env.SDE_FULL_VERSION }}
+      - uses: RustCrypto/actions/cargo-cache@master
+      - uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ matrix.rust }}
+          targets: ${{ matrix.target }}
+      # NOTE: Write a `.cargo/config.toml` to configure the target for VAES
+      # NOTE: We use intel-sde as the runner since not all GitHub CI hosts support AVX512
+      - name: write .cargo/config.toml
+        shell: bash
+        run: |
+          cd ../aes/..
+          mkdir -p .cargo
+          echo '[target.${{ matrix.target }}]' > .cargo/config.toml
+          echo 'runner = "sde64 -future --"' >> .cargo/config.toml
+      - run: ${{ matrix.deps }}
+      - run: cargo test --target ${{ matrix.target }}
+      - run: cargo test --target ${{ matrix.target }} --features hazmat
+      - run: cargo test --target ${{ matrix.target }} --all-features
+
+  # Tests for the VAES AVX512 backend
+  vaes512:
+    runs-on: ubuntu-latest
+    env:
+      CARGO_INCREMENTAL: 0
+    strategy:
+      matrix:
+        include:
+          - target: x86_64-unknown-linux-gnu
+            rust: nightly-2024-02-07
+    steps:
+      - uses: actions/checkout@v4
+      - uses: silvanshade/rustcrypto-actions/intel-sde-install@master
+        with:
+          sde-full-version: ${{ env.SDE_FULL_VERSION }}
+      - uses: RustCrypto/actions/cargo-cache@master
+      - uses: dtolnay/rust-toolchain@master
+        with:
+          toolchain: ${{ matrix.rust }}
+          targets: ${{ matrix.target }}
+      # NOTE: Write a `.cargo/config.toml` to configure the target for VAES
+      # NOTE: We use intel-sde as the runner since not all GitHub CI hosts support AVX512
+      - name: write .cargo/config.toml
+        shell: bash
+        run: |
+          cd ../aes/..
+          mkdir -p .cargo
+          echo '[target.${{ matrix.target }}]' > .cargo/config.toml
+          echo 'runner = "sde64 -future --"' >> .cargo/config.toml
+      - run: ${{ matrix.deps }}
+      - run: cargo test --target ${{ matrix.target }}
+      - run: cargo test --target ${{ matrix.target }} --features hazmat
+      - run: cargo test --target ${{ matrix.target }} --all-features
+
   # Tests for CPU feature autodetection with fallback to portable software implementation
   autodetect:
     runs-on: ubuntu-latest
@@ -159,7 +229,6 @@ jobs:
       - run: cargo test --target ${{ matrix.target }}
       - run: cargo test --target ${{ matrix.target }} --all-features
 
-
   # Cross-compiled tests
   cross:
     strategy:

From e52144ae6b014ea08b911fe01590354b5d11a947 Mon Sep 17 00:00:00 2001
From: silvanshade
Date: Fri, 31 May 2024 09:20:45 -0600
Subject: [PATCH 5/5] Silence warning about cfg features

---
 aes/src/lib.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/aes/src/lib.rs b/aes/src/lib.rs
index c0abd046..46140e0b 100644
--- a/aes/src/lib.rs
+++ b/aes/src/lib.rs
@@ -126,6 +126,7 @@
 )]
 #![cfg_attr(docsrs, feature(doc_cfg))]
 #![warn(missing_docs, rust_2018_idioms)]
+#![allow(unexpected_cfgs)] // Silence warning about use of non-cargo features for cfg
 
 #[cfg(feature = "hazmat")]
 #[cfg_attr(docsrs, doc(cfg(feature = "hazmat")))]
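
Note on the cfg silenced by [PATCH 5/5]: `disable_avx512` is not a Cargo feature but a bare cfg flag passed through `RUSTFLAGS="-Dwarnings --cfg disable_avx512"` in the vaes256 CI job (presumably consumed by the x86 backend selection to force the 256-bit VAES path), which is why recent compilers raise the `unexpected_cfgs` lint that the patch allows crate-wide. A minimal, self-contained sketch of how such a flag behaves; the names below are illustrative only and are not the aes crate's actual internals:

    // Illustrative only: a bare cfg toggled via RUSTFLAGS, not a Cargo feature.
    #[cfg(not(disable_avx512))]
    const WIDE_BACKEND: &str = "vaes512";

    #[cfg(disable_avx512)]
    const WIDE_BACKEND: &str = "vaes256";

    fn main() {
        // Built normally this prints "vaes512"; built with
        // RUSTFLAGS="--cfg disable_avx512" it prints "vaes256".
        println!("selected wide backend: {WIDE_BACKEND}");
    }

Because such cfgs are unknown to Cargo, toolchains that check cfg names warn about them unless they are declared, hence the blanket `#![allow(unexpected_cfgs)]` chosen here.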
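To reproduce the VAES jobs locally, the `write .cargo/config.toml` step above generates, for the default matrix target, a config equivalent to the following (this assumes the `sde64` binary from the Intel SDE release pinned by `SDE_FULL_VERSION` is on `PATH`):

    [target.x86_64-unknown-linux-gnu]
    runner = "sde64 -future --"

With that in place, `cargo test --target x86_64-unknown-linux-gnu` runs each test binary under the SDE emulator, so the VAES and AVX-512 code paths can be exercised on hosts whose CPUs lack those extensions.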