diff --git a/scripts_bench_neon/bench/bench.c b/scripts_bench_neon/bench/bench.c new file mode 100644 index 0000000000..9ba0c6cd7d --- /dev/null +++ b/scripts_bench_neon/bench/bench.c @@ -0,0 +1,136 @@ +// /* +// BLAKE2 reference source code package - benchmark tool + +// Copyright 2012, Samuel Neves . You may use this under the +// terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at +// your option. The terms of these licenses can be found at: + +// - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 +// - OpenSSL license : https://www.openssl.org/source/license.html +// - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + +// More information about the BLAKE2 hash function can be found at +// https://blake2.net. +// */// based on https://github.com/BLAKE2/BLAKE2/tree/master/bench +#include +#include +#include +#include + +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +// int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen ); +#include // Include libsodium header + +// Function to open a perf_event file descriptor +static inline long perf_event_open(struct perf_event_attr *attr, pid_t pid, + int cpu, int group_fd, unsigned long flags) { + return syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags); +} +// Global variable to store the perf event file descriptor +static int perf_fd = -1; + + + +// Replace the crypto_hash function with libsodium's crypto_generichash +int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen) { + // Use the generichash function from libsodium (default 64-byte hash length) + return crypto_generichash_blake2b(out, crypto_generichash_BYTES, in, inlen, NULL, 0); +} + +static int bench_cmp( const void *x, const void *y ) +{ + const int64_t *ix = ( const int64_t * )x; + const int64_t *iy = ( const int64_t * )y; + return *ix - *iy; +} + +// Initialize the performance counter (should be called once at startup) +static void cpucycles_init(void) { + struct perf_event_attr pe; + memset(&pe, 0, sizeof(pe)); + pe.type = PERF_TYPE_HARDWARE; + pe.size = sizeof(pe); + pe.config = PERF_COUNT_HW_CPU_CYCLES; + pe.disabled = 0; // Start immediately + pe.exclude_kernel = 1; // User-space only + pe.exclude_hv = 1; // Exclude hypervisor + + perf_fd = perf_event_open(&pe, 0, -1, -1, 0); + if (perf_fd == -1) { + perror("perf_event_open failed"); + } +} + +// Function to get current cycle count +static unsigned long long cpucycles(void) { + if (perf_fd == -1) { + fprintf(stderr, "cpucycles: perf_fd not initialized!\n"); + return 0; + } + + uint64_t cycles; + ssize_t ret = read(perf_fd, &cycles, sizeof(cycles)); + if (ret == -1) { + perror("cpucycles: read failed"); + return 0; + } + + return cycles; +} +// Cleanup function to close perf event file descriptor +static void cpucycles_cleanup(void) { + if (perf_fd != -1) { + close(perf_fd); + perf_fd = -1; + } +} + +void bench() +{ +#define BENCH_TRIALS 32 +#define BENCH_MAXLEN 1536 + static unsigned char in[4096]; + static unsigned long long median[4096 + 1]; + int i, j; + printf( "#bytes median per byte\n" ); + + cpucycles_init(); + /* 1 ... BENCH_MAXLEN */ + for( j = 0; j <= 4096; ++j ) + { + uint64_t cycles[BENCH_TRIALS + 1]; + + for( i = 0; i <= BENCH_TRIALS; ++i ) + { + cycles[i] = cpucycles(); + crypto_hash( in, in, j ); + } + + for( i = 0; i < BENCH_TRIALS; ++i ) + cycles[i] = cycles[i + 1] - cycles[i]; + + qsort( cycles, BENCH_TRIALS, sizeof( uint64_t ), bench_cmp ); + median[j] = cycles[BENCH_TRIALS / 2]; + } + + cpucycles_cleanup(); // Clean up perf event + + for( j = 0; j <= BENCH_MAXLEN; j += 8 ) + printf( "%5d, %7.2f\n", j, ( double )median[j] / j ); + + printf( "#2048 %6llu %7.2f\n", median[2048], ( double )median[2048] / 2048.0 ); + printf( "#4096 %6llu %7.2f\n", median[4096], ( double )median[4096] / 4096.0 ); + printf( "#long long %7.2f\n", ( double )( median[4096] - median[2048] ) / 2048.0 ); +} + +int main() +{ + bench(); + return 0; +} diff --git a/scripts_bench_neon/bench/makefile b/scripts_bench_neon/bench/makefile new file mode 100644 index 0000000000..7e2e47c086 --- /dev/null +++ b/scripts_bench_neon/bench/makefile @@ -0,0 +1,26 @@ +CC=gcc +# # Use gnu99 to support inline asm +# CFLAGS=-O3 -march=native -mavx2 -Wall -Wextra -DSUPERCOP + +# CFLAGS=-O3 -march=native -mssse3 -Wall -Wextra -DSUPERCOP +CFLAGS=-O3 -march=native -mcpu=neoverse-n1 -Wall -Wextra -DSUPERCOP + +# CFLAGS=-O3 -Wall -Wextra -DSUPERCOP +LIBS=-lsodium +INCLUDE_DIR=$(HOME)/include +LIB_DIR=$(HOME)/lib +FILES=bench.c + +# Target for generating the executable +all: bench + +bench: $(FILES) + $(CC) $(FILES) $(CFLAGS) -I$(INCLUDE_DIR) -L$(LIB_DIR) $(LIBS) -o generichash_bench + +# Make the data files by running the benchmark programs +plot: bench + ./generichash_bench > generichash.data + +# Clean up generated files +clean: + rm -f generichash_bench generichash.data plotcycles.pdf diff --git a/scripts_bench_neon/benchmark_throughput/benchmark.c b/scripts_bench_neon/benchmark_throughput/benchmark.c new file mode 100644 index 0000000000..c236b7a4c7 --- /dev/null +++ b/scripts_bench_neon/benchmark_throughput/benchmark.c @@ -0,0 +1,92 @@ +#include +#include +#include +#include + +// Function to run a single benchmark and save results +void run_benchmark(FILE *fp, const char *description, size_t message_len, size_t hash_len, int iterations) { + unsigned char *message = malloc(message_len); + unsigned char *hash = malloc(hash_len); + + if (!message || !hash) { + printf("Memory allocation failed!\n"); + free(message); + free(hash); + return; + } + + randombytes_buf(message, message_len); + + // Start timing + clock_t start = clock(); + for (int i = 0; i < iterations; i++) { + crypto_generichash(hash, hash_len, message, message_len, NULL, 0); + } + clock_t end = clock(); + + double cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC; + double time_per_hash = (cpu_time_used * 1e6) / iterations; // µs per hash + double total_data_processed = (double)(message_len * iterations) / (1024 * 1024); // Convert to MB + double throughput = total_data_processed / cpu_time_used; // MB per second + + // Print results to console + printf("Benchmark: %s\n", description); + printf(" Message size: %zu bytes\n", message_len); + printf(" Hash length: %zu bytes\n", hash_len); + printf(" Iterations: %d\n", iterations); + printf(" Total time: %.6f seconds\n", cpu_time_used); + printf(" Time per hash: %.6f microseconds\n", time_per_hash); + printf(" Throughput: %.2f MB/s\n\n", throughput); + + // Save data for Gnuplot (Format: message_len time_per_hash throughput) + fprintf(fp, "%zu %.6f %.2f\n", message_len, time_per_hash, throughput); + + free(message); + free(hash); +} + +int main() { + if (sodium_init() < 0) { + printf("Libsodium initialization failed!\n"); + return 1; + } + + FILE *fp = fopen("benchmark_results.data", "w"); + if (!fp) { + printf("Error opening file for writing!\n"); + return 1; + } + + // Define power-of-2 message sizes for benchmarking + size_t sizes[] = { + 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, + 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, + 8388608, 16777216 + }; + + int num_sizes = sizeof(sizes) / sizeof(sizes[0]); + + // Adjust iterations to balance test duration for different message sizes + int iterations[num_sizes]; + for (int i = 0; i < num_sizes; i++) { + if (sizes[i] < 1024) + iterations[i] = 200000; + else if (sizes[i] < 65536) + iterations[i] = 50000; + else if (sizes[i] < 1048576) + iterations[i] = 10000; + else if (sizes[i] < 8388608) + iterations[i] = 1000; + else + iterations[i] = 100; + } + + for (int i = 0; i < num_sizes; i++) { + char desc[100]; + snprintf(desc, sizeof(desc), "Message size: %zu bytes", sizes[i]); + run_benchmark(fp, desc, sizes[i], crypto_generichash_BYTES, iterations[i]); + } + + fclose(fp); + return 0; +} \ No newline at end of file diff --git a/scripts_bench_neon/readme.txt b/scripts_bench_neon/readme.txt new file mode 100644 index 0000000000..93d35ed9be --- /dev/null +++ b/scripts_bench_neon/readme.txt @@ -0,0 +1,17 @@ +for the libsodium + +./autogen.sh -s +automake +mkdir builddir +cd builddir +../configure CFLAGS="-DDEV_MODE" CPPFLAGS="-DDEV_MODE" --prefix=$HOME + +make && make check + +#benchmark_throughput +gcc -o benchmark benchmark.c -I $HOME/include -L $HOME/lib -lsodium +LD_LIBRARY_PATH=$HOME/lib ./benchmark + +#bench +make bench +sudo LD_LIBRARY_PATH=$HOME/lib ./generichash_bench > generichash_bench.data diff --git a/src/libsodium/Makefile.am b/src/libsodium/Makefile.am index c1f26e414e..34d1463223 100644 --- a/src/libsodium/Makefile.am +++ b/src/libsodium/Makefile.am @@ -38,6 +38,7 @@ libsodium_la_SOURCES = \ crypto_generichash/blake2b/ref/blake2b-load-sse2.h \ crypto_generichash/blake2b/ref/blake2b-load-sse41.h \ crypto_generichash/blake2b/ref/blake2b-load-avx2.h \ + crypto_generichash/blake2b/ref/blake2b-load-neon.h \ crypto_generichash/blake2b/ref/blake2b-ref.c \ crypto_generichash/blake2b/ref/generichash_blake2b.c \ crypto_hash/crypto_hash.c \ @@ -232,6 +233,8 @@ libarmcrypto_la_LDFLAGS = $(libsodium_la_LDFLAGS) libarmcrypto_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \ @CFLAGS_ARMCRYPTO@ libarmcrypto_la_SOURCES = \ + crypto_generichash/blake2b/ref/blake2b-compress-neon.c \ + crypto_generichash/blake2b/ref/blake2b-compress-neon.h \ crypto_aead/aegis128l/aegis128l_armcrypto.c \ crypto_aead/aegis128l/aegis128l_armcrypto.h \ crypto_aead/aegis256/aegis256_armcrypto.c \ diff --git a/src/libsodium/crypto_generichash/blake2b/ref/blake2.h b/src/libsodium/crypto_generichash/blake2b/ref/blake2.h index eeccdcc99d..b8970ed864 100644 --- a/src/libsodium/crypto_generichash/blake2b/ref/blake2.h +++ b/src/libsodium/crypto_generichash/blake2b/ref/blake2.h @@ -103,5 +103,7 @@ int blake2b_compress_sse41(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES]); int blake2b_compress_avx2(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES]); +int blake2b_compress_neon(blake2b_state *S, + const uint8_t block[BLAKE2B_BLOCKBYTES]); #endif diff --git a/src/libsodium/crypto_generichash/blake2b/ref/blake2b-compress-neon.c b/src/libsodium/crypto_generichash/blake2b/ref/blake2b-compress-neon.c new file mode 100644 index 0000000000..22536f5a7a --- /dev/null +++ b/src/libsodium/crypto_generichash/blake2b/ref/blake2b-compress-neon.c @@ -0,0 +1,85 @@ +/* + BLAKE2 reference source code package - reference C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ + +#include +#include + +#include "blake2.h" +#include "private/common.h" + +#if defined(__aarch64__) + +# include + +# include "blake2b-compress-neon.h" + +CRYPTO_ALIGN(64) +static const uint64_t blake2b_IV[8] = { + 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL, + 0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, + 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL +}; + +int +blake2b_compress_neon(blake2b_state *S, + const uint8_t block[BLAKE2B_BLOCKBYTES]) +{ + uint64x2_t row1l, row1h; + uint64x2_t row2l, row2h; + uint64x2_t row3l, row3h; + uint64x2_t row4l, row4h; + uint64x2_t b0, b1; + uint64x2_t t0, t1; + + const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(block + 00)); + const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(block + 16)); + const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(block + 32)); + const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(block + 48)); + const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(block + 64)); + const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(block + 80)); + const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(block + 96)); + const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(block + 112)); + + const uint64x2_t h0 = row1l = vld1q_u64(&S->h[0]); + const uint64x2_t h1 = row1h = vld1q_u64(&S->h[2]); + const uint64x2_t h2 = row2l = vld1q_u64(&S->h[4]); + const uint64x2_t h3 = row2h = vld1q_u64(&S->h[6]); + + row3l = vld1q_u64(&blake2b_IV[0]); + row3h = vld1q_u64(&blake2b_IV[2]); + row4l = veorq_u64(vld1q_u64(&blake2b_IV[4]), vld1q_u64(&S->t[0])); + row4h = veorq_u64(vld1q_u64(&blake2b_IV[6]), vld1q_u64(&S->f[0])); + + ROUND(0); + ROUND(1); + ROUND(2); + ROUND(3); + ROUND(4); + ROUND(5); + ROUND(6); + ROUND(7); + ROUND(8); + ROUND(9); + ROUND(10); + ROUND(11); + + vst1q_u64(&S->h[0], veorq_u64(h0, veorq_u64(row1l, row3l))); + vst1q_u64(&S->h[2], veorq_u64(h1, veorq_u64(row1h, row3h))); + vst1q_u64(&S->h[4], veorq_u64(h2, veorq_u64(row2l, row4l))); + vst1q_u64(&S->h[6], veorq_u64(h3, veorq_u64(row2h, row4h))); + return 0; +} + +#endif diff --git a/src/libsodium/crypto_generichash/blake2b/ref/blake2b-compress-neon.h b/src/libsodium/crypto_generichash/blake2b/ref/blake2b-compress-neon.h new file mode 100644 index 0000000000..b076dae733 --- /dev/null +++ b/src/libsodium/crypto_generichash/blake2b/ref/blake2b-compress-neon.h @@ -0,0 +1,123 @@ +/* + BLAKE2 reference source code package - reference C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ + +#ifndef blake2b_compress_neon_H +#define blake2b_compress_neon_H + +#define G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \ + do { \ + row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \ + row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \ + row4l = veorq_u64(row4l, row1l); \ + row4h = veorq_u64(row4h, row1h); \ + row4l = vreinterpretq_u64_u32( \ + vrev64q_u32(vreinterpretq_u32_u64(row4l))); \ + row4h = vreinterpretq_u64_u32( \ + vrev64q_u32(vreinterpretq_u32_u64(row4h))); \ + row3l = vaddq_u64(row3l, row4l); \ + row3h = vaddq_u64(row3h, row4h); \ + row2l = veorq_u64(row2l, row3l); \ + row2h = veorq_u64(row2h, row3h); \ + row2l = vcombine_u64( \ + vreinterpret_u64_u8(vext_u8( \ + vreinterpret_u8_u64(vget_low_u64(row2l)), \ + vreinterpret_u8_u64(vget_low_u64(row2l)), 3)), \ + vreinterpret_u64_u8(vext_u8( \ + vreinterpret_u8_u64(vget_high_u64(row2l)), \ + vreinterpret_u8_u64(vget_high_u64(row2l)), 3))); \ + row2h = vcombine_u64( \ + vreinterpret_u64_u8(vext_u8( \ + vreinterpret_u8_u64(vget_low_u64(row2h)), \ + vreinterpret_u8_u64(vget_low_u64(row2h)), 3)), \ + vreinterpret_u64_u8(vext_u8( \ + vreinterpret_u8_u64(vget_high_u64(row2h)), \ + vreinterpret_u8_u64(vget_high_u64(row2h)), 3))); \ + } while(0) + +#define G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1) \ + do { \ + row1l = vaddq_u64(vaddq_u64(row1l, b0), row2l); \ + row1h = vaddq_u64(vaddq_u64(row1h, b1), row2h); \ + row4l = veorq_u64(row4l, row1l); \ + row4h = veorq_u64(row4h, row1h); \ + row4l = vcombine_u64( \ + vreinterpret_u64_u8(vext_u8( \ + vreinterpret_u8_u64(vget_low_u64(row4l)), \ + vreinterpret_u8_u64(vget_low_u64(row4l)), 2)), \ + vreinterpret_u64_u8(vext_u8( \ + vreinterpret_u8_u64(vget_high_u64(row4l)), \ + vreinterpret_u8_u64(vget_high_u64(row4l)), 2))); \ + row4h = vcombine_u64( \ + vreinterpret_u64_u8(vext_u8( \ + vreinterpret_u8_u64(vget_low_u64(row4h)), \ + vreinterpret_u8_u64(vget_low_u64(row4h)), 2)), \ + vreinterpret_u64_u8(vext_u8( \ + vreinterpret_u8_u64(vget_high_u64(row4h)), \ + vreinterpret_u8_u64(vget_high_u64(row4h)), 2))); \ + row3l = vaddq_u64(row3l, row4l); \ + row3h = vaddq_u64(row3h, row4h); \ + row2l = veorq_u64(row2l, row3l); \ + row2h = veorq_u64(row2h, row3h); \ + row2l = veorq_u64(vaddq_u64(row2l, row2l), vshrq_n_u64(row2l, 63)); \ + row2h = veorq_u64(vaddq_u64(row2h, row2h), vshrq_n_u64(row2h, 63)); \ + } while(0) + +#define DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \ + do { \ + t0 = vextq_u64(row2l, row2h, 1); \ + t1 = vextq_u64(row2h, row2l, 1); \ + row2l = t0; \ + row2h = t1; \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + t0 = vextq_u64(row4h, row4l, 1); \ + t1 = vextq_u64(row4l, row4h, 1); \ + row4l = t0; \ + row4h = t1; \ + } while(0) + +#define UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h) \ + do { \ + t0 = vextq_u64(row2h, row2l, 1); \ + t1 = vextq_u64(row2l, row2h, 1); \ + row2l = t0; \ + row2h = t1; \ + t0 = row3l; \ + row3l = row3h; \ + row3h = t0; \ + t0 = vextq_u64(row4l, row4h, 1); \ + t1 = vextq_u64(row4h, row4l, 1); \ + row4l = t0; \ + row4h = t1; \ + } while(0) + +#include "blake2b-load-neon.h" + +#define ROUND(r) \ + do { \ + LOAD_MSG_ ##r ##_1(b0, b1); \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ + LOAD_MSG_ ##r ##_2(b0, b1); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ + DIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h); \ + LOAD_MSG_ ##r ##_3(b0, b1); \ + G1(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ + LOAD_MSG_ ##r ##_4(b0, b1); \ + G2(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h, b0, b1); \ + UNDIAGONALIZE(row1l, row2l, row3l, row4l, row1h, row2h, row3h, row4h);\ + } while(0) + +#endif diff --git a/src/libsodium/crypto_generichash/blake2b/ref/blake2b-load-neon.h b/src/libsodium/crypto_generichash/blake2b/ref/blake2b-load-neon.h new file mode 100644 index 0000000000..f6ade8159d --- /dev/null +++ b/src/libsodium/crypto_generichash/blake2b/ref/blake2b-load-neon.h @@ -0,0 +1,307 @@ +/* + BLAKE2 reference source code package - reference C implementations + + Copyright 2012, Samuel Neves . You may use this under the + terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at + your option. The terms of these licenses can be found at: + + - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0 + - OpenSSL license : https://www.openssl.org/source/license.html + - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0 + + More information about the BLAKE2 hash function can be found at + https://blake2.net. +*/ + +#ifndef blake2b_load_neon_H +#define blake2b_load_neon_H + +#define LOAD_MSG_0_1(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); \ + b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); \ + } while(0) + +#define LOAD_MSG_0_2(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); \ + b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); \ + } while(0) + +#define LOAD_MSG_0_3(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); \ + b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); \ + } while(0) + +#define LOAD_MSG_0_4(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); \ + b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); \ + } while(0) + +#define LOAD_MSG_1_1(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); \ + b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); \ + } while(0) + +#define LOAD_MSG_1_2(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); \ + b1 = vextq_u64(m7, m3, 1); \ + } while(0) + +#define LOAD_MSG_1_3(b0, b1) \ + do { \ + b0 = vextq_u64(m0, m0, 1); \ + b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); \ + } while(0) + +#define LOAD_MSG_1_4(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); \ + b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); \ + } while(0) + +#define LOAD_MSG_2_1(b0, b1) \ + do { \ + b0 = vextq_u64(m5, m6, 1); \ + b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); \ + } while(0) + +#define LOAD_MSG_2_2(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m0)); \ + b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m6)); \ + } while(0) + +#define LOAD_MSG_2_3(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m5), vget_high_u64(m1)); \ + b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m4)); \ + } while(0) + +#define LOAD_MSG_2_4(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m3)); \ + b1 = vextq_u64(m0, m2, 1); \ + } while(0) + +#define LOAD_MSG_3_1(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); \ + b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m5)); \ + } while(0) + +#define LOAD_MSG_3_2(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m0)); \ + b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); \ + } while(0) + +#define LOAD_MSG_3_3(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m2)); \ + b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); \ + } while(0) + +#define LOAD_MSG_3_4(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); \ + b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); \ + } while(0) + +#define LOAD_MSG_4_1(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m2)); \ + b1 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m5)); \ + } while(0) + +#define LOAD_MSG_4_2(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m0), vget_high_u64(m3)); \ + b1 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m7)); \ + } while(0) + +#define LOAD_MSG_4_3(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m5)); \ + b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m1)); \ + } while(0) + +#define LOAD_MSG_4_4(b0, b1) \ + do { \ + b0 = vextq_u64(m0, m6, 1); \ + b1 = vcombine_u64(vget_low_u64(m4), vget_high_u64(m6)); \ + } while(0) + +#define LOAD_MSG_5_1(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m3)); \ + b1 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m4)); \ + } while(0) + +#define LOAD_MSG_5_2(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m5)); \ + b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m1)); \ + } while(0) + +#define LOAD_MSG_5_3(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m2), vget_high_u64(m3)); \ + b1 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m0)); \ + } while(0) + +#define LOAD_MSG_5_4(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m2)); \ + b1 = vcombine_u64(vget_low_u64(m7), vget_high_u64(m4)); \ + } while(0) + +#define LOAD_MSG_6_1(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m0)); \ + b1 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); \ + } while(0) + +#define LOAD_MSG_6_2(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); \ + b1 = vextq_u64(m6, m5, 1); \ + } while(0) + +#define LOAD_MSG_6_3(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m3)); \ + b1 = vextq_u64(m4, m4, 1); \ + } while(0) + +#define LOAD_MSG_6_4(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); \ + b1 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m5)); \ + } while(0) + +#define LOAD_MSG_7_1(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m3)); \ + b1 = vcombine_u64(vget_low_u64(m6), vget_high_u64(m1)); \ + } while(0) + +#define LOAD_MSG_7_2(b0, b1) \ + do { \ + b0 = vextq_u64(m5, m7, 1); \ + b1 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m4)); \ + } while(0) + +#define LOAD_MSG_7_3(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m7)); \ + b1 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m1)); \ + } while(0) + +#define LOAD_MSG_7_4(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m2)); \ + b1 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m5)); \ + } while(0) + +#define LOAD_MSG_8_1(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m3), vget_low_u64(m7)); \ + b1 = vextq_u64(m5, m0, 1); \ + } while(0) + +#define LOAD_MSG_8_2(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); \ + b1 = vextq_u64(m1, m4, 1); \ + } while(0) + +#define LOAD_MSG_8_3(b0, b1) \ + do { \ + b0 = m6; \ + b1 = vextq_u64(m0, m5, 1); \ + } while(0) + +#define LOAD_MSG_8_4(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m1), vget_high_u64(m3)); \ + b1 = m2; \ + } while(0) + +#define LOAD_MSG_9_1(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); \ + b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m0)); \ + } while(0) + +#define LOAD_MSG_9_2(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m1), vget_low_u64(m2)); \ + b1 = vcombine_u64(vget_low_u64(m3), vget_high_u64(m2)); \ + } while(0) + +#define LOAD_MSG_9_3(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_high_u64(m7), vget_high_u64(m4)); \ + b1 = vcombine_u64(vget_high_u64(m1), vget_high_u64(m6)); \ + } while(0) + +#define LOAD_MSG_9_4(b0, b1) \ + do { \ + b0 = vextq_u64(m5, m7, 1); \ + b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m0)); \ + } while(0) + +#define LOAD_MSG_10_1(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m0), vget_low_u64(m1)); \ + b1 = vcombine_u64(vget_low_u64(m2), vget_low_u64(m3)); \ + } while(0) + +#define LOAD_MSG_10_2(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_high_u64(m0), vget_high_u64(m1)); \ + b1 = vcombine_u64(vget_high_u64(m2), vget_high_u64(m3)); \ + } while(0) + +#define LOAD_MSG_10_3(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m4), vget_low_u64(m5)); \ + b1 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m7)); \ + } while(0) + +#define LOAD_MSG_10_4(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m5)); \ + b1 = vcombine_u64(vget_high_u64(m6), vget_high_u64(m7)); \ + } while(0) + +#define LOAD_MSG_11_1(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m7), vget_low_u64(m2)); \ + b1 = vcombine_u64(vget_high_u64(m4), vget_high_u64(m6)); \ + } while(0) + +#define LOAD_MSG_11_2(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m5), vget_low_u64(m4)); \ + b1 = vextq_u64(m7, m3, 1); \ + } while(0) + +#define LOAD_MSG_11_3(b0, b1) \ + do { \ + b0 = vextq_u64(m0, m0, 1); \ + b1 = vcombine_u64(vget_high_u64(m5), vget_high_u64(m2)); \ + } while(0) + +#define LOAD_MSG_11_4(b0, b1) \ + do { \ + b0 = vcombine_u64(vget_low_u64(m6), vget_low_u64(m1)); \ + b1 = vcombine_u64(vget_high_u64(m3), vget_high_u64(m1)); \ + } while(0) + +#endif diff --git a/src/libsodium/crypto_generichash/blake2b/ref/blake2b-ref.c b/src/libsodium/crypto_generichash/blake2b/ref/blake2b-ref.c index a1beacf3c0..a9a2559210 100644 --- a/src/libsodium/crypto_generichash/blake2b/ref/blake2b-ref.c +++ b/src/libsodium/crypto_generichash/blake2b/ref/blake2b-ref.c @@ -430,6 +430,12 @@ blake2b_pick_best_implementation(void) blake2b_compress = blake2b_compress_ssse3; return 0; } +#endif +#if defined(__aarch64__) + if (sodium_runtime_has_neon()) { + blake2b_compress = blake2b_compress_neon; + return 0; + } #endif blake2b_compress = blake2b_compress_ref;