diff --git a/mldsa/src/fips202/keccakf1600.c b/mldsa/src/fips202/keccakf1600.c index e59120c07..5bd6f3ea7 100644 --- a/mldsa/src/fips202/keccakf1600.c +++ b/mldsa/src/fips202/keccakf1600.c @@ -155,7 +155,7 @@ static void mld_keccakf1600_permute_c(uint64_t *state) uint64_t Ema, Eme, Emi, Emo, Emu; uint64_t Esa, Ese, Esi, Eso, Esu; - /* copyFromState(A, state) */ + /* MLD_COPY_FROM_STATE(A, state) */ Aba = state[0]; Abe = state[1]; Abi = state[2]; @@ -185,14 +185,14 @@ static void mld_keccakf1600_permute_c(uint64_t *state) for (round = 0; round < MLD_KECCAK_NROUNDS; round += 2) __loop__(invariant(round <= MLD_KECCAK_NROUNDS && round % 2 == 0)) { - /* prepareTheta */ + /* MLD_prepareTheta */ BCa = Aba ^ Aga ^ Aka ^ Ama ^ Asa; BCe = Abe ^ Age ^ Ake ^ Ame ^ Ase; BCi = Abi ^ Agi ^ Aki ^ Ami ^ Asi; BCo = Abo ^ Ago ^ Ako ^ Amo ^ Aso; BCu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - /* thetaRhoPiChiIotaPrepareTheta(round, A, E) */ + /* MLD_thetaRhoPiChiIotaPrepareTheta(round, A, E) */ Da = BCu ^ MLD_KECCAK_ROL(BCe, 1); De = BCa ^ MLD_KECCAK_ROL(BCi, 1); Di = BCe ^ MLD_KECCAK_ROL(BCo, 1); @@ -280,14 +280,14 @@ static void mld_keccakf1600_permute_c(uint64_t *state) Eso = BCo ^ ((~BCu) & BCa); Esu = BCu ^ ((~BCa) & BCe); - /* prepareTheta */ + /* MLD_prepareTheta */ BCa = Eba ^ Ega ^ Eka ^ Ema ^ Esa; BCe = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; BCi = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; BCo = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; BCu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - /* thetaRhoPiChiIotaPrepareTheta(round+1, E, A) */ + /* MLD_thetaRhoPiChiIotaPrepareTheta(round+1, E, A) */ Da = BCu ^ MLD_KECCAK_ROL(BCe, 1); De = BCa ^ MLD_KECCAK_ROL(BCi, 1); Di = BCe ^ MLD_KECCAK_ROL(BCo, 1); @@ -376,7 +376,7 @@ static void mld_keccakf1600_permute_c(uint64_t *state) Asu = BCu ^ ((~BCa) & BCe); } - /* copyToState(state, A) */ + /* MLD_COPY_TO_STATE(state, A) */ state[0] = Aba; state[1] = Abe; state[2] = Abi; diff --git a/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c b/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c index 4eaad2fd6..5df15eb55 100644 --- a/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c +++ b/mldsa/src/fips202/native/x86_64/src/KeccakP_1600_times4_SIMD256.c @@ -21,7 +21,7 @@ and related or neighboring rights to the source code in this file. /* * Changes for mlkem-native/mldsa-native: - * - copyFromState and copyToState operate on uninterleaved + * - MLD_COPY_FROM_STATE and MLD_COPY_TO_STATE operate on uninterleaved * Keccak states in memory. */ @@ -38,24 +38,24 @@ and related or neighboring rights to the source code in this file. #error Expecting a little-endian platform #endif -#define ANDnu256(a, b) _mm256_andnot_si256(a, b) -#define CONST256(a) _mm256_load_si256((const __m256i *)&(a)) -#define CONST256_64(a) (__m256i) _mm256_broadcast_sd((const double *)(&a)) -#define ROL64in256(d, a, o) \ +#define MLD_ANDNU256(a, b) _mm256_andnot_si256(a, b) +#define MLD_CONST256(a) _mm256_load_si256((const __m256i *)&(a)) +#define MLD_CONST256_64(a) (__m256i) _mm256_broadcast_sd((const double *)(&a)) +#define MLD_ROL64IN256(d, a, o) \ d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64 - (o))) -#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8)) -#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56)) +#define MLD_ROL64IN256_8(d, a) d = _mm256_shuffle_epi8(a, MLD_CONST256(rho8)) +#define MLD_ROL64IN256_56(d, a) d = _mm256_shuffle_epi8(a, MLD_CONST256(rho56)) static const uint64_t rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F}; static const uint64_t rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19}; -#define STORE256(a, b) _mm256_store_si256((__m256i *)&(a), b) -#define XOR256(a, b) _mm256_xor_si256(a, b) -#define XOReq256(a, b) a = _mm256_xor_si256(a, b) +#define MLD_STORE256(a, b) _mm256_store_si256((__m256i *)&(a), b) +#define MLD_XOR256(a, b) _mm256_xor_si256(a, b) +#define MLD_XOREQ256(a, b) a = _mm256_xor_si256(a, b) -#define SnP_laneLengthInBytes 8 +#define MLD_SNP_LANELENGTHINBYTES 8 -#define declareABCDE \ +#define MLD_DECLARE_ABCDE \ __m256i Aba, Abe, Abi, Abo, Abu; \ __m256i Aga, Age, Agi, Ago, Agu; \ __m256i Aka, Ake, Aki, Ako, Aku; \ @@ -75,232 +75,236 @@ static const uint64_t rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, __m256i Ema, Eme, Emi, Emo, Emu; \ __m256i Esa, Ese, Esi, Eso, Esu; -#define prepareTheta \ - Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \ - Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \ - Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \ - Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \ - Cu = XOR256(Abu, XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); +#define MLD_prepareTheta \ + Ca = \ + MLD_XOR256(Aba, MLD_XOR256(Aga, MLD_XOR256(Aka, MLD_XOR256(Ama, Asa)))); \ + Ce = \ + MLD_XOR256(Abe, MLD_XOR256(Age, MLD_XOR256(Ake, MLD_XOR256(Ame, Ase)))); \ + Ci = \ + MLD_XOR256(Abi, MLD_XOR256(Agi, MLD_XOR256(Aki, MLD_XOR256(Ami, Asi)))); \ + Co = \ + MLD_XOR256(Abo, MLD_XOR256(Ago, MLD_XOR256(Ako, MLD_XOR256(Amo, Aso)))); \ + Cu = MLD_XOR256(Abu, MLD_XOR256(Agu, MLD_XOR256(Aku, MLD_XOR256(Amu, Asu)))); /* * --- Theta Rho Pi Chi Iota Prepare-theta * --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - ROL64in256(Ce1, Ce, 1); \ - Da = XOR256(Cu, Ce1); \ - ROL64in256(Ci1, Ci, 1); \ - De = XOR256(Ca, Ci1); \ - ROL64in256(Co1, Co, 1); \ - Di = XOR256(Ce, Co1); \ - ROL64in256(Cu1, Cu, 1); \ - Do = XOR256(Ci, Cu1); \ - ROL64in256(Ca1, Ca, 1); \ - Du = XOR256(Co, Ca1); \ - \ - XOReq256(A##ba, Da); \ - Bba = A##ba; \ - XOReq256(A##ge, De); \ - ROL64in256(Bbe, A##ge, 44); \ - XOReq256(A##ki, Di); \ - ROL64in256(Bbi, A##ki, 43); \ - E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ - XOReq256(E##ba, CONST256_64(keccakf1600RoundConstants[i])); \ - Ca = E##ba; \ - XOReq256(A##mo, Do); \ - ROL64in256(Bbo, A##mo, 21); \ - E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ - Ce = E##be; \ - XOReq256(A##su, Du); \ - ROL64in256(Bbu, A##su, 14); \ - E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ - Ci = E##bi; \ - E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ - Co = E##bo; \ - E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ - Cu = E##bu; \ - \ - XOReq256(A##bo, Do); \ - ROL64in256(Bga, A##bo, 28); \ - XOReq256(A##gu, Du); \ - ROL64in256(Bge, A##gu, 20); \ - XOReq256(A##ka, Da); \ - ROL64in256(Bgi, A##ka, 3); \ - E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \ - XOReq256(Ca, E##ga); \ - XOReq256(A##me, De); \ - ROL64in256(Bgo, A##me, 45); \ - E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ - XOReq256(Ce, E##ge); \ - XOReq256(A##si, Di); \ - ROL64in256(Bgu, A##si, 61); \ - E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ - XOReq256(Ci, E##gi); \ - E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ - XOReq256(Co, E##go); \ - E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ - XOReq256(Cu, E##gu); \ - \ - XOReq256(A##be, De); \ - ROL64in256(Bka, A##be, 1); \ - XOReq256(A##gi, Di); \ - ROL64in256(Bke, A##gi, 6); \ - XOReq256(A##ko, Do); \ - ROL64in256(Bki, A##ko, 25); \ - E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ - XOReq256(Ca, E##ka); \ - XOReq256(A##mu, Du); \ - ROL64in256_8(Bko, A##mu); \ - E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ - XOReq256(Ce, E##ke); \ - XOReq256(A##sa, Da); \ - ROL64in256(Bku, A##sa, 18); \ - E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ - XOReq256(Ci, E##ki); \ - E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ - XOReq256(Co, E##ko); \ - E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ - XOReq256(Cu, E##ku); \ - \ - XOReq256(A##bu, Du); \ - ROL64in256(Bma, A##bu, 27); \ - XOReq256(A##ga, Da); \ - ROL64in256(Bme, A##ga, 36); \ - XOReq256(A##ke, De); \ - ROL64in256(Bmi, A##ke, 10); \ - E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ - XOReq256(Ca, E##ma); \ - XOReq256(A##mi, Di); \ - ROL64in256(Bmo, A##mi, 15); \ - E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ - XOReq256(Ce, E##me); \ - XOReq256(A##so, Do); \ - ROL64in256_56(Bmu, A##so); \ - E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ - XOReq256(Ci, E##mi); \ - E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ - XOReq256(Co, E##mo); \ - E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ - XOReq256(Cu, E##mu); \ - \ - XOReq256(A##bi, Di); \ - ROL64in256(Bsa, A##bi, 62); \ - XOReq256(A##go, Do); \ - ROL64in256(Bse, A##go, 55); \ - XOReq256(A##ku, Du); \ - ROL64in256(Bsi, A##ku, 39); \ - E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ - XOReq256(Ca, E##sa); \ - XOReq256(A##ma, Da); \ - ROL64in256(Bso, A##ma, 41); \ - E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ - XOReq256(Ce, E##se); \ - XOReq256(A##se, De); \ - ROL64in256(Bsu, A##se, 2); \ - E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ - XOReq256(Ci, E##si); \ - E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ - XOReq256(Co, E##so); \ - E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ - XOReq256(Cu, E##su); +#define MLD_thetaRhoPiChiIotaPrepareTheta(i, A, E) \ + MLD_ROL64IN256(Ce1, Ce, 1); \ + Da = MLD_XOR256(Cu, Ce1); \ + MLD_ROL64IN256(Ci1, Ci, 1); \ + De = MLD_XOR256(Ca, Ci1); \ + MLD_ROL64IN256(Co1, Co, 1); \ + Di = MLD_XOR256(Ce, Co1); \ + MLD_ROL64IN256(Cu1, Cu, 1); \ + Do = MLD_XOR256(Ci, Cu1); \ + MLD_ROL64IN256(Ca1, Ca, 1); \ + Du = MLD_XOR256(Co, Ca1); \ + \ + MLD_XOREQ256(A##ba, Da); \ + Bba = A##ba; \ + MLD_XOREQ256(A##ge, De); \ + MLD_ROL64IN256(Bbe, A##ge, 44); \ + MLD_XOREQ256(A##ki, Di); \ + MLD_ROL64IN256(Bbi, A##ki, 43); \ + E##ba = MLD_XOR256(Bba, MLD_ANDNU256(Bbe, Bbi)); \ + MLD_XOREQ256(E##ba, MLD_CONST256_64(keccakf1600RoundConstants[i])); \ + Ca = E##ba; \ + MLD_XOREQ256(A##mo, Do); \ + MLD_ROL64IN256(Bbo, A##mo, 21); \ + E##be = MLD_XOR256(Bbe, MLD_ANDNU256(Bbi, Bbo)); \ + Ce = E##be; \ + MLD_XOREQ256(A##su, Du); \ + MLD_ROL64IN256(Bbu, A##su, 14); \ + E##bi = MLD_XOR256(Bbi, MLD_ANDNU256(Bbo, Bbu)); \ + Ci = E##bi; \ + E##bo = MLD_XOR256(Bbo, MLD_ANDNU256(Bbu, Bba)); \ + Co = E##bo; \ + E##bu = MLD_XOR256(Bbu, MLD_ANDNU256(Bba, Bbe)); \ + Cu = E##bu; \ + \ + MLD_XOREQ256(A##bo, Do); \ + MLD_ROL64IN256(Bga, A##bo, 28); \ + MLD_XOREQ256(A##gu, Du); \ + MLD_ROL64IN256(Bge, A##gu, 20); \ + MLD_XOREQ256(A##ka, Da); \ + MLD_ROL64IN256(Bgi, A##ka, 3); \ + E##ga = MLD_XOR256(Bga, MLD_ANDNU256(Bge, Bgi)); \ + MLD_XOREQ256(Ca, E##ga); \ + MLD_XOREQ256(A##me, De); \ + MLD_ROL64IN256(Bgo, A##me, 45); \ + E##ge = MLD_XOR256(Bge, MLD_ANDNU256(Bgi, Bgo)); \ + MLD_XOREQ256(Ce, E##ge); \ + MLD_XOREQ256(A##si, Di); \ + MLD_ROL64IN256(Bgu, A##si, 61); \ + E##gi = MLD_XOR256(Bgi, MLD_ANDNU256(Bgo, Bgu)); \ + MLD_XOREQ256(Ci, E##gi); \ + E##go = MLD_XOR256(Bgo, MLD_ANDNU256(Bgu, Bga)); \ + MLD_XOREQ256(Co, E##go); \ + E##gu = MLD_XOR256(Bgu, MLD_ANDNU256(Bga, Bge)); \ + MLD_XOREQ256(Cu, E##gu); \ + \ + MLD_XOREQ256(A##be, De); \ + MLD_ROL64IN256(Bka, A##be, 1); \ + MLD_XOREQ256(A##gi, Di); \ + MLD_ROL64IN256(Bke, A##gi, 6); \ + MLD_XOREQ256(A##ko, Do); \ + MLD_ROL64IN256(Bki, A##ko, 25); \ + E##ka = MLD_XOR256(Bka, MLD_ANDNU256(Bke, Bki)); \ + MLD_XOREQ256(Ca, E##ka); \ + MLD_XOREQ256(A##mu, Du); \ + MLD_ROL64IN256_8(Bko, A##mu); \ + E##ke = MLD_XOR256(Bke, MLD_ANDNU256(Bki, Bko)); \ + MLD_XOREQ256(Ce, E##ke); \ + MLD_XOREQ256(A##sa, Da); \ + MLD_ROL64IN256(Bku, A##sa, 18); \ + E##ki = MLD_XOR256(Bki, MLD_ANDNU256(Bko, Bku)); \ + MLD_XOREQ256(Ci, E##ki); \ + E##ko = MLD_XOR256(Bko, MLD_ANDNU256(Bku, Bka)); \ + MLD_XOREQ256(Co, E##ko); \ + E##ku = MLD_XOR256(Bku, MLD_ANDNU256(Bka, Bke)); \ + MLD_XOREQ256(Cu, E##ku); \ + \ + MLD_XOREQ256(A##bu, Du); \ + MLD_ROL64IN256(Bma, A##bu, 27); \ + MLD_XOREQ256(A##ga, Da); \ + MLD_ROL64IN256(Bme, A##ga, 36); \ + MLD_XOREQ256(A##ke, De); \ + MLD_ROL64IN256(Bmi, A##ke, 10); \ + E##ma = MLD_XOR256(Bma, MLD_ANDNU256(Bme, Bmi)); \ + MLD_XOREQ256(Ca, E##ma); \ + MLD_XOREQ256(A##mi, Di); \ + MLD_ROL64IN256(Bmo, A##mi, 15); \ + E##me = MLD_XOR256(Bme, MLD_ANDNU256(Bmi, Bmo)); \ + MLD_XOREQ256(Ce, E##me); \ + MLD_XOREQ256(A##so, Do); \ + MLD_ROL64IN256_56(Bmu, A##so); \ + E##mi = MLD_XOR256(Bmi, MLD_ANDNU256(Bmo, Bmu)); \ + MLD_XOREQ256(Ci, E##mi); \ + E##mo = MLD_XOR256(Bmo, MLD_ANDNU256(Bmu, Bma)); \ + MLD_XOREQ256(Co, E##mo); \ + E##mu = MLD_XOR256(Bmu, MLD_ANDNU256(Bma, Bme)); \ + MLD_XOREQ256(Cu, E##mu); \ + \ + MLD_XOREQ256(A##bi, Di); \ + MLD_ROL64IN256(Bsa, A##bi, 62); \ + MLD_XOREQ256(A##go, Do); \ + MLD_ROL64IN256(Bse, A##go, 55); \ + MLD_XOREQ256(A##ku, Du); \ + MLD_ROL64IN256(Bsi, A##ku, 39); \ + E##sa = MLD_XOR256(Bsa, MLD_ANDNU256(Bse, Bsi)); \ + MLD_XOREQ256(Ca, E##sa); \ + MLD_XOREQ256(A##ma, Da); \ + MLD_ROL64IN256(Bso, A##ma, 41); \ + E##se = MLD_XOR256(Bse, MLD_ANDNU256(Bsi, Bso)); \ + MLD_XOREQ256(Ce, E##se); \ + MLD_XOREQ256(A##se, De); \ + MLD_ROL64IN256(Bsu, A##se, 2); \ + E##si = MLD_XOR256(Bsi, MLD_ANDNU256(Bso, Bsu)); \ + MLD_XOREQ256(Ci, E##si); \ + E##so = MLD_XOR256(Bso, MLD_ANDNU256(Bsu, Bsa)); \ + MLD_XOREQ256(Co, E##so); \ + E##su = MLD_XOR256(Bsu, MLD_ANDNU256(Bsa, Bse)); \ + MLD_XOREQ256(Cu, E##su); /* * --- Theta Rho Pi Chi Iota * --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIota(i, A, E) \ - ROL64in256(Ce1, Ce, 1); \ - Da = XOR256(Cu, Ce1); \ - ROL64in256(Ci1, Ci, 1); \ - De = XOR256(Ca, Ci1); \ - ROL64in256(Co1, Co, 1); \ - Di = XOR256(Ce, Co1); \ - ROL64in256(Cu1, Cu, 1); \ - Do = XOR256(Ci, Cu1); \ - ROL64in256(Ca1, Ca, 1); \ - Du = XOR256(Co, Ca1); \ - \ - XOReq256(A##ba, Da); \ - Bba = A##ba; \ - XOReq256(A##ge, De); \ - ROL64in256(Bbe, A##ge, 44); \ - XOReq256(A##ki, Di); \ - ROL64in256(Bbi, A##ki, 43); \ - E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ - XOReq256(E##ba, CONST256_64(keccakf1600RoundConstants[i])); \ - XOReq256(A##mo, Do); \ - ROL64in256(Bbo, A##mo, 21); \ - E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ - XOReq256(A##su, Du); \ - ROL64in256(Bbu, A##su, 14); \ - E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ - E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ - E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ - \ - XOReq256(A##bo, Do); \ - ROL64in256(Bga, A##bo, 28); \ - XOReq256(A##gu, Du); \ - ROL64in256(Bge, A##gu, 20); \ - XOReq256(A##ka, Da); \ - ROL64in256(Bgi, A##ka, 3); \ - E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \ - XOReq256(A##me, De); \ - ROL64in256(Bgo, A##me, 45); \ - E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ - XOReq256(A##si, Di); \ - ROL64in256(Bgu, A##si, 61); \ - E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ - E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ - E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ - \ - XOReq256(A##be, De); \ - ROL64in256(Bka, A##be, 1); \ - XOReq256(A##gi, Di); \ - ROL64in256(Bke, A##gi, 6); \ - XOReq256(A##ko, Do); \ - ROL64in256(Bki, A##ko, 25); \ - E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ - XOReq256(A##mu, Du); \ - ROL64in256_8(Bko, A##mu); \ - E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ - XOReq256(A##sa, Da); \ - ROL64in256(Bku, A##sa, 18); \ - E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ - E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ - E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ - \ - XOReq256(A##bu, Du); \ - ROL64in256(Bma, A##bu, 27); \ - XOReq256(A##ga, Da); \ - ROL64in256(Bme, A##ga, 36); \ - XOReq256(A##ke, De); \ - ROL64in256(Bmi, A##ke, 10); \ - E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ - XOReq256(A##mi, Di); \ - ROL64in256(Bmo, A##mi, 15); \ - E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ - XOReq256(A##so, Do); \ - ROL64in256_56(Bmu, A##so); \ - E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ - E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ - E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ - \ - XOReq256(A##bi, Di); \ - ROL64in256(Bsa, A##bi, 62); \ - XOReq256(A##go, Do); \ - ROL64in256(Bse, A##go, 55); \ - XOReq256(A##ku, Du); \ - ROL64in256(Bsi, A##ku, 39); \ - E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ - XOReq256(A##ma, Da); \ - ROL64in256(Bso, A##ma, 41); \ - E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ - XOReq256(A##se, De); \ - ROL64in256(Bsu, A##se, 2); \ - E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ - E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ - E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); +#define MLD_thetaRhoPiChiIota(i, A, E) \ + MLD_ROL64IN256(Ce1, Ce, 1); \ + Da = MLD_XOR256(Cu, Ce1); \ + MLD_ROL64IN256(Ci1, Ci, 1); \ + De = MLD_XOR256(Ca, Ci1); \ + MLD_ROL64IN256(Co1, Co, 1); \ + Di = MLD_XOR256(Ce, Co1); \ + MLD_ROL64IN256(Cu1, Cu, 1); \ + Do = MLD_XOR256(Ci, Cu1); \ + MLD_ROL64IN256(Ca1, Ca, 1); \ + Du = MLD_XOR256(Co, Ca1); \ + \ + MLD_XOREQ256(A##ba, Da); \ + Bba = A##ba; \ + MLD_XOREQ256(A##ge, De); \ + MLD_ROL64IN256(Bbe, A##ge, 44); \ + MLD_XOREQ256(A##ki, Di); \ + MLD_ROL64IN256(Bbi, A##ki, 43); \ + E##ba = MLD_XOR256(Bba, MLD_ANDNU256(Bbe, Bbi)); \ + MLD_XOREQ256(E##ba, MLD_CONST256_64(keccakf1600RoundConstants[i])); \ + MLD_XOREQ256(A##mo, Do); \ + MLD_ROL64IN256(Bbo, A##mo, 21); \ + E##be = MLD_XOR256(Bbe, MLD_ANDNU256(Bbi, Bbo)); \ + MLD_XOREQ256(A##su, Du); \ + MLD_ROL64IN256(Bbu, A##su, 14); \ + E##bi = MLD_XOR256(Bbi, MLD_ANDNU256(Bbo, Bbu)); \ + E##bo = MLD_XOR256(Bbo, MLD_ANDNU256(Bbu, Bba)); \ + E##bu = MLD_XOR256(Bbu, MLD_ANDNU256(Bba, Bbe)); \ + \ + MLD_XOREQ256(A##bo, Do); \ + MLD_ROL64IN256(Bga, A##bo, 28); \ + MLD_XOREQ256(A##gu, Du); \ + MLD_ROL64IN256(Bge, A##gu, 20); \ + MLD_XOREQ256(A##ka, Da); \ + MLD_ROL64IN256(Bgi, A##ka, 3); \ + E##ga = MLD_XOR256(Bga, MLD_ANDNU256(Bge, Bgi)); \ + MLD_XOREQ256(A##me, De); \ + MLD_ROL64IN256(Bgo, A##me, 45); \ + E##ge = MLD_XOR256(Bge, MLD_ANDNU256(Bgi, Bgo)); \ + MLD_XOREQ256(A##si, Di); \ + MLD_ROL64IN256(Bgu, A##si, 61); \ + E##gi = MLD_XOR256(Bgi, MLD_ANDNU256(Bgo, Bgu)); \ + E##go = MLD_XOR256(Bgo, MLD_ANDNU256(Bgu, Bga)); \ + E##gu = MLD_XOR256(Bgu, MLD_ANDNU256(Bga, Bge)); \ + \ + MLD_XOREQ256(A##be, De); \ + MLD_ROL64IN256(Bka, A##be, 1); \ + MLD_XOREQ256(A##gi, Di); \ + MLD_ROL64IN256(Bke, A##gi, 6); \ + MLD_XOREQ256(A##ko, Do); \ + MLD_ROL64IN256(Bki, A##ko, 25); \ + E##ka = MLD_XOR256(Bka, MLD_ANDNU256(Bke, Bki)); \ + MLD_XOREQ256(A##mu, Du); \ + MLD_ROL64IN256_8(Bko, A##mu); \ + E##ke = MLD_XOR256(Bke, MLD_ANDNU256(Bki, Bko)); \ + MLD_XOREQ256(A##sa, Da); \ + MLD_ROL64IN256(Bku, A##sa, 18); \ + E##ki = MLD_XOR256(Bki, MLD_ANDNU256(Bko, Bku)); \ + E##ko = MLD_XOR256(Bko, MLD_ANDNU256(Bku, Bka)); \ + E##ku = MLD_XOR256(Bku, MLD_ANDNU256(Bka, Bke)); \ + \ + MLD_XOREQ256(A##bu, Du); \ + MLD_ROL64IN256(Bma, A##bu, 27); \ + MLD_XOREQ256(A##ga, Da); \ + MLD_ROL64IN256(Bme, A##ga, 36); \ + MLD_XOREQ256(A##ke, De); \ + MLD_ROL64IN256(Bmi, A##ke, 10); \ + E##ma = MLD_XOR256(Bma, MLD_ANDNU256(Bme, Bmi)); \ + MLD_XOREQ256(A##mi, Di); \ + MLD_ROL64IN256(Bmo, A##mi, 15); \ + E##me = MLD_XOR256(Bme, MLD_ANDNU256(Bmi, Bmo)); \ + MLD_XOREQ256(A##so, Do); \ + MLD_ROL64IN256_56(Bmu, A##so); \ + E##mi = MLD_XOR256(Bmi, MLD_ANDNU256(Bmo, Bmu)); \ + E##mo = MLD_XOR256(Bmo, MLD_ANDNU256(Bmu, Bma)); \ + E##mu = MLD_XOR256(Bmu, MLD_ANDNU256(Bma, Bme)); \ + \ + MLD_XOREQ256(A##bi, Di); \ + MLD_ROL64IN256(Bsa, A##bi, 62); \ + MLD_XOREQ256(A##go, Do); \ + MLD_ROL64IN256(Bse, A##go, 55); \ + MLD_XOREQ256(A##ku, Du); \ + MLD_ROL64IN256(Bsi, A##ku, 39); \ + E##sa = MLD_XOR256(Bsa, MLD_ANDNU256(Bse, Bsi)); \ + MLD_XOREQ256(A##ma, Da); \ + MLD_ROL64IN256(Bso, A##ma, 41); \ + E##se = MLD_XOR256(Bse, MLD_ANDNU256(Bsi, Bso)); \ + MLD_XOREQ256(A##se, De); \ + MLD_ROL64IN256(Bsu, A##se, 2); \ + E##si = MLD_XOR256(Bsi, MLD_ANDNU256(Bso, Bsu)); \ + E##so = MLD_XOR256(Bso, MLD_ANDNU256(Bsu, Bsa)); \ + E##su = MLD_XOR256(Bsu, MLD_ANDNU256(Bsa, Bse)); static MLD_ALIGN const uint64_t keccakf1600RoundConstants[24] = { @@ -319,7 +323,7 @@ static MLD_ALIGN const uint64_t keccakf1600RoundConstants[24] = { #include -#define copyFromState(X, state) \ +#define MLD_COPY_FROM_STATE(X, state) \ do \ { \ const uint64_t *state64 = (const uint64_t *)(state); \ @@ -353,7 +357,7 @@ static MLD_ALIGN const uint64_t keccakf1600RoundConstants[24] = { X##su = _mm256_i64gather_epi64((long long *)(24 * 8), _idx, 1); \ } while (0); -#define SCATTER_STORE256(state, idx, v) \ +#define MLD_SCATTER_STORE256(state, idx, v) \ do \ { \ const uint64_t *state64 = (const uint64_t *)(state); \ @@ -365,94 +369,94 @@ static MLD_ALIGN const uint64_t keccakf1600RoundConstants[24] = { _mm_storeh_pd((double *)&state64[75 + (idx)], t); \ } while (0) -#define copyToState(state, X) \ - SCATTER_STORE256(state, 0, X##ba); \ - SCATTER_STORE256(state, 1, X##be); \ - SCATTER_STORE256(state, 2, X##bi); \ - SCATTER_STORE256(state, 3, X##bo); \ - SCATTER_STORE256(state, 4, X##bu); \ - SCATTER_STORE256(state, 5, X##ga); \ - SCATTER_STORE256(state, 6, X##ge); \ - SCATTER_STORE256(state, 7, X##gi); \ - SCATTER_STORE256(state, 8, X##go); \ - SCATTER_STORE256(state, 9, X##gu); \ - SCATTER_STORE256(state, 10, X##ka); \ - SCATTER_STORE256(state, 11, X##ke); \ - SCATTER_STORE256(state, 12, X##ki); \ - SCATTER_STORE256(state, 13, X##ko); \ - SCATTER_STORE256(state, 14, X##ku); \ - SCATTER_STORE256(state, 15, X##ma); \ - SCATTER_STORE256(state, 16, X##me); \ - SCATTER_STORE256(state, 17, X##mi); \ - SCATTER_STORE256(state, 18, X##mo); \ - SCATTER_STORE256(state, 19, X##mu); \ - SCATTER_STORE256(state, 20, X##sa); \ - SCATTER_STORE256(state, 21, X##se); \ - SCATTER_STORE256(state, 22, X##si); \ - SCATTER_STORE256(state, 23, X##so); \ - SCATTER_STORE256(state, 24, X##su); +#define MLD_COPY_TO_STATE(state, X) \ + MLD_SCATTER_STORE256(state, 0, X##ba); \ + MLD_SCATTER_STORE256(state, 1, X##be); \ + MLD_SCATTER_STORE256(state, 2, X##bi); \ + MLD_SCATTER_STORE256(state, 3, X##bo); \ + MLD_SCATTER_STORE256(state, 4, X##bu); \ + MLD_SCATTER_STORE256(state, 5, X##ga); \ + MLD_SCATTER_STORE256(state, 6, X##ge); \ + MLD_SCATTER_STORE256(state, 7, X##gi); \ + MLD_SCATTER_STORE256(state, 8, X##go); \ + MLD_SCATTER_STORE256(state, 9, X##gu); \ + MLD_SCATTER_STORE256(state, 10, X##ka); \ + MLD_SCATTER_STORE256(state, 11, X##ke); \ + MLD_SCATTER_STORE256(state, 12, X##ki); \ + MLD_SCATTER_STORE256(state, 13, X##ko); \ + MLD_SCATTER_STORE256(state, 14, X##ku); \ + MLD_SCATTER_STORE256(state, 15, X##ma); \ + MLD_SCATTER_STORE256(state, 16, X##me); \ + MLD_SCATTER_STORE256(state, 17, X##mi); \ + MLD_SCATTER_STORE256(state, 18, X##mo); \ + MLD_SCATTER_STORE256(state, 19, X##mu); \ + MLD_SCATTER_STORE256(state, 20, X##sa); \ + MLD_SCATTER_STORE256(state, 21, X##se); \ + MLD_SCATTER_STORE256(state, 22, X##si); \ + MLD_SCATTER_STORE256(state, 23, X##so); \ + MLD_SCATTER_STORE256(state, 24, X##su); -#define copyStateVariables(X, Y) \ - X##ba = Y##ba; \ - X##be = Y##be; \ - X##bi = Y##bi; \ - X##bo = Y##bo; \ - X##bu = Y##bu; \ - X##ga = Y##ga; \ - X##ge = Y##ge; \ - X##gi = Y##gi; \ - X##go = Y##go; \ - X##gu = Y##gu; \ - X##ka = Y##ka; \ - X##ke = Y##ke; \ - X##ki = Y##ki; \ - X##ko = Y##ko; \ - X##ku = Y##ku; \ - X##ma = Y##ma; \ - X##me = Y##me; \ - X##mi = Y##mi; \ - X##mo = Y##mo; \ - X##mu = Y##mu; \ - X##sa = Y##sa; \ - X##se = Y##se; \ - X##si = Y##si; \ - X##so = Y##so; \ +#define MLD_COPY_STATE_VARIABLES(X, Y) \ + X##ba = Y##ba; \ + X##be = Y##be; \ + X##bi = Y##bi; \ + X##bo = Y##bo; \ + X##bu = Y##bu; \ + X##ga = Y##ga; \ + X##ge = Y##ge; \ + X##gi = Y##gi; \ + X##go = Y##go; \ + X##gu = Y##gu; \ + X##ka = Y##ka; \ + X##ke = Y##ke; \ + X##ki = Y##ki; \ + X##ko = Y##ko; \ + X##ku = Y##ku; \ + X##ma = Y##ma; \ + X##me = Y##me; \ + X##mi = Y##mi; \ + X##mo = Y##mo; \ + X##mu = Y##mu; \ + X##sa = Y##sa; \ + X##se = Y##se; \ + X##si = Y##si; \ + X##so = Y##so; \ X##su = Y##su; /* clang-format off */ -#define rounds24 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta( 0, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 1, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 2, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 3, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 4, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 5, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 6, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 7, E, A) \ - thetaRhoPiChiIotaPrepareTheta( 8, A, E) \ - thetaRhoPiChiIotaPrepareTheta( 9, E, A) \ - thetaRhoPiChiIotaPrepareTheta(10, A, E) \ - thetaRhoPiChiIotaPrepareTheta(11, E, A) \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) +#define MLD_ROUNDS24 \ + MLD_prepareTheta \ + MLD_thetaRhoPiChiIotaPrepareTheta( 0, A, E) \ + MLD_thetaRhoPiChiIotaPrepareTheta( 1, E, A) \ + MLD_thetaRhoPiChiIotaPrepareTheta( 2, A, E) \ + MLD_thetaRhoPiChiIotaPrepareTheta( 3, E, A) \ + MLD_thetaRhoPiChiIotaPrepareTheta( 4, A, E) \ + MLD_thetaRhoPiChiIotaPrepareTheta( 5, E, A) \ + MLD_thetaRhoPiChiIotaPrepareTheta( 6, A, E) \ + MLD_thetaRhoPiChiIotaPrepareTheta( 7, E, A) \ + MLD_thetaRhoPiChiIotaPrepareTheta( 8, A, E) \ + MLD_thetaRhoPiChiIotaPrepareTheta( 9, E, A) \ + MLD_thetaRhoPiChiIotaPrepareTheta(10, A, E) \ + MLD_thetaRhoPiChiIotaPrepareTheta(11, E, A) \ + MLD_thetaRhoPiChiIotaPrepareTheta(12, A, E) \ + MLD_thetaRhoPiChiIotaPrepareTheta(13, E, A) \ + MLD_thetaRhoPiChiIotaPrepareTheta(14, A, E) \ + MLD_thetaRhoPiChiIotaPrepareTheta(15, E, A) \ + MLD_thetaRhoPiChiIotaPrepareTheta(16, A, E) \ + MLD_thetaRhoPiChiIotaPrepareTheta(17, E, A) \ + MLD_thetaRhoPiChiIotaPrepareTheta(18, A, E) \ + MLD_thetaRhoPiChiIotaPrepareTheta(19, E, A) \ + MLD_thetaRhoPiChiIotaPrepareTheta(20, A, E) \ + MLD_thetaRhoPiChiIotaPrepareTheta(21, E, A) \ + MLD_thetaRhoPiChiIotaPrepareTheta(22, A, E) \ + MLD_thetaRhoPiChiIota(23, E, A) /* clang-format on */ void mld_keccakf1600x4_permute24(void *states) { __m256i *statesAsLanes = (__m256i *)states; - declareABCDE copyFromState(A, statesAsLanes) - rounds24 copyToState(statesAsLanes, A) + MLD_DECLARE_ABCDE MLD_COPY_FROM_STATE(A, statesAsLanes) + MLD_ROUNDS24 MLD_COPY_TO_STATE(statesAsLanes, A) } #else /* MLD_FIPS202_X86_64_XKCP && !MLD_CONFIG_MULTILEVEL_NO_SHARED */ @@ -463,22 +467,22 @@ MLD_EMPTY_CU(fips202_avx2_keccakx4) /* To facilitate single-compilation-unit (SCU) builds, undefine all macros. * Don't modify by hand -- this is auto-generated by scripts/autogen. */ -#undef ANDnu256 -#undef CONST256 -#undef CONST256_64 -#undef ROL64in256 -#undef ROL64in256_8 -#undef ROL64in256_56 -#undef STORE256 -#undef XOR256 -#undef XOReq256 -#undef SnP_laneLengthInBytes -#undef declareABCDE -#undef prepareTheta -#undef thetaRhoPiChiIotaPrepareTheta -#undef thetaRhoPiChiIota -#undef copyFromState -#undef SCATTER_STORE256 -#undef copyToState -#undef copyStateVariables -#undef rounds24 +#undef MLD_ANDNU256 +#undef MLD_CONST256 +#undef MLD_CONST256_64 +#undef MLD_ROL64IN256 +#undef MLD_ROL64IN256_8 +#undef MLD_ROL64IN256_56 +#undef MLD_STORE256 +#undef MLD_XOR256 +#undef MLD_XOREQ256 +#undef MLD_SNP_LANELENGTHINBYTES +#undef MLD_DECLARE_ABCDE +#undef MLD_prepareTheta +#undef MLD_thetaRhoPiChiIotaPrepareTheta +#undef MLD_thetaRhoPiChiIota +#undef MLD_COPY_FROM_STATE +#undef MLD_SCATTER_STORE256 +#undef MLD_COPY_TO_STATE +#undef MLD_COPY_STATE_VARIABLES +#undef MLD_ROUNDS24