Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 136 additions & 0 deletions scripts_bench_neon/bench/bench.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
// /*
// BLAKE2 reference source code package - benchmark tool

// Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
// terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
// your option. The terms of these licenses can be found at:

// - CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
// - OpenSSL license : https://www.openssl.org/source/license.html
// - Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0

// More information about the BLAKE2 hash function can be found at
// https://blake2.net.
// */// based on https://github.com/BLAKE2/BLAKE2/tree/master/bench
/* Feature-test macros only take effect when defined before the first
 * system header is included. */
#define _GNU_SOURCE

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

// int crypto_hash( unsigned char *out, const unsigned char *in, unsigned long long inlen );
#include <sodium.h> // Include libsodium header

/*
 * Issue the perf_event_open system call through syscall().
 *
 * All five arguments are forwarded unchanged; the raw syscall() result is
 * returned to the caller (the callers in this file treat -1 as failure).
 */
static inline long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                                   int cpu, int group_fd, unsigned long flags)
{
    const long result =
        syscall(SYS_perf_event_open, attr, pid, cpu, group_fd, flags);

    return result;
}

/* Descriptor of the open perf event; -1 while no event is open. */
static int perf_fd = -1;



// Stand-in for the crypto_hash() entry point used by the benchmark loop.
// Hashes `inlen` bytes at `in` with libsodium's BLAKE2b, unkeyed (key = NULL,
// key length = 0), and writes crypto_generichash_BYTES bytes of digest to `out`.
// Returns the result of crypto_generichash_blake2b() unchanged.
int crypto_hash(unsigned char *out, const unsigned char *in, unsigned long long inlen) {
// NOTE(review): an earlier comment here described a 64-byte digest, but
// libsodium documents crypto_generichash_BYTES as 32 (64 being the _BYTES_MAX
// value) -- confirm which digest length the benchmark is meant to measure.
return crypto_generichash_blake2b(out, crypto_generichash_BYTES, in, inlen, NULL, 0);
}

/*
 * qsort() comparator that orders two 64-bit cycle counts ascending.
 *
 * x, y: pointers to the two elements being compared (read as int64_t).
 * Returns -1, 0 or 1 when *x is less than, equal to, or greater than *y.
 *
 * The result is built from two comparisons rather than a subtraction:
 * narrowing a 64-bit difference to int can flip its sign or collapse it
 * to 0, which would give qsort() an inconsistent ordering.
 */
static int bench_cmp( const void *x, const void *y )
{
    const int64_t lhs = *( const int64_t * )x;
    const int64_t rhs = *( const int64_t * )y;

    return ( lhs > rhs ) - ( lhs < rhs );
}

/*
 * Open the hardware CPU-cycle counter and store its descriptor in perf_fd.
 * Intended to run once before the first cpucycles() call.
 *
 * On failure a diagnostic is printed with perror() and perf_fd keeps the
 * value returned by perf_event_open().
 */
static void cpucycles_init(void)
{
    struct perf_event_attr attr;

    /* Start from an all-zero attribute block, then set what we need. */
    memset(&attr, 0, sizeof(attr));
    attr.size           = sizeof(attr);
    attr.type           = PERF_TYPE_HARDWARE;
    attr.config         = PERF_COUNT_HW_CPU_CYCLES;
    attr.exclude_hv     = 1; /* leave hypervisor cycles out */
    attr.exclude_kernel = 1; /* count user-space cycles only */
    attr.disabled       = 0; /* counter runs as soon as it is opened */

    /* pid = 0, cpu = -1, group_fd = -1, flags = 0 */
    perf_fd = perf_event_open(&attr, 0, -1, -1, 0);
    if (perf_fd != -1) {
        return;
    }
    perror("perf_event_open failed");
}

/*
 * Read the current value of the cycle counter opened by cpucycles_init().
 *
 * Returns the 64-bit counter value, or 0 after printing a diagnostic when
 * the counter is not open, the read fails, or the read delivers fewer
 * bytes than a full counter value.
 */
static unsigned long long cpucycles(void)
{
    uint64_t cycles = 0;
    ssize_t  nread;

    if (perf_fd == -1) {
        fprintf(stderr, "cpucycles: perf_fd not initialized!\n");
        return 0;
    }

    nread = read(perf_fd, &cycles, sizeof(cycles));
    if (nread == -1) {
        perror("cpucycles: read failed");
        return 0;
    }
    /* A partial read would leave `cycles` only partly filled in; errno is
     * not meaningful in that case, so report it without perror(). */
    if ((size_t) nread != sizeof(cycles)) {
        fprintf(stderr, "cpucycles: short read (%zd of %zu bytes)\n",
                nread, sizeof(cycles));
        return 0;
    }

    return cycles;
}
/*
 * Close the perf event descriptor, if one is open, and reset perf_fd to -1
 * so that a second call does nothing.
 */
static void cpucycles_cleanup(void)
{
    if (perf_fd == -1) {
        return; /* nothing open */
    }
    close(perf_fd);
    perf_fd = -1;
}

void bench()
{
#define BENCH_TRIALS 32
#define BENCH_MAXLEN 1536
static unsigned char in[4096];
static unsigned long long median[4096 + 1];
int i, j;
printf( "#bytes median per byte\n" );

cpucycles_init();
/* 1 ... BENCH_MAXLEN */
for( j = 0; j <= 4096; ++j )
{
uint64_t cycles[BENCH_TRIALS + 1];

for( i = 0; i <= BENCH_TRIALS; ++i )
{
cycles[i] = cpucycles();
crypto_hash( in, in, j );
}

for( i = 0; i < BENCH_TRIALS; ++i )
cycles[i] = cycles[i + 1] - cycles[i];

qsort( cycles, BENCH_TRIALS, sizeof( uint64_t ), bench_cmp );
median[j] = cycles[BENCH_TRIALS / 2];
}

cpucycles_cleanup(); // Clean up perf event

for( j = 0; j <= BENCH_MAXLEN; j += 8 )
printf( "%5d, %7.2f\n", j, ( double )median[j] / j );

printf( "#2048 %6llu %7.2f\n", median[2048], ( double )median[2048] / 2048.0 );
printf( "#4096 %6llu %7.2f\n", median[4096], ( double )median[4096] / 4096.0 );
printf( "#long long %7.2f\n", ( double )( median[4096] - median[2048] ) / 2048.0 );
}

/*
 * Program entry point: initialize libsodium, then run the benchmark.
 *
 * libsodium's documentation requires sodium_init() to be called before any
 * other library function. NOTE(review): presumably this is also where the
 * library selects its optimized BLAKE2b backend, so skipping it could make
 * the benchmark time the reference code -- verify against the libsodium
 * sources.
 *
 * Returns 0 on success, 1 if libsodium fails to initialize.
 */
int main(void)
{
    if (sodium_init() < 0) {
        fprintf(stderr, "Libsodium initialization failed!\n");
        return 1;
    }

    bench();
    return 0;
}
26 changes: 26 additions & 0 deletions scripts_bench_neon/bench/makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
CC=gcc
# # Use gnu99 to support inline asm
# CFLAGS=-O3 -march=native -mavx2 -Wall -Wextra -DSUPERCOP

# CFLAGS=-O3 -march=native -mssse3 -Wall -Wextra -DSUPERCOP
CFLAGS=-O3 -march=native -mcpu=neoverse-n1 -Wall -Wextra -DSUPERCOP

# CFLAGS=-O3 -Wall -Wextra -DSUPERCOP
LIBS=-lsodium
INCLUDE_DIR=$(HOME)/include
LIB_DIR=$(HOME)/lib
FILES=bench.c
# Name of the executable that the build produces
TARGET=generichash_bench

# These targets are commands, not files that the rules create.
.PHONY: all bench plot clean

# Target for generating the executable
all: bench

bench: $(TARGET)

# Relink only when the sources are newer than the executable
$(TARGET): $(FILES)
	$(CC) $(FILES) $(CFLAGS) -I$(INCLUDE_DIR) -L$(LIB_DIR) $(LIBS) -o $(TARGET)

# Make the data files by running the benchmark programs
plot: bench
	./$(TARGET) > generichash.data

# Clean up generated files
clean:
	rm -f $(TARGET) generichash.data plotcycles.pdf
92 changes: 92 additions & 0 deletions scripts_bench_neon/benchmark_throughput/benchmark.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#include <sodium.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
 * Time `iterations` calls of crypto_generichash() on one random message and
 * report the result.
 *
 * fp          : open stream that receives one data row,
 *               "<message_len> <microseconds per hash> <throughput>".
 * description : label printed on the console for this run.
 * message_len : size in bytes of the random message that is hashed.
 * hash_len    : digest size in bytes requested from crypto_generichash().
 * iterations  : number of hash calls that are timed.
 *
 * Timing uses clock(), i.e. processor time consumed by the program.
 * Throughput is reported in units of 1024 * 1024 bytes per second.
 * If either buffer cannot be allocated, a message is printed and nothing is
 * measured or written to fp.
 */
void run_benchmark(FILE *fp, const char *description, size_t message_len, size_t hash_len, int iterations) {
    unsigned char *message = malloc(message_len);
    unsigned char *hash = malloc(hash_len);

    if (!message || !hash) {
        printf("Memory allocation failed!\n");
        free(message);
        free(hash);
        return;
    }

    randombytes_buf(message, message_len);

    // Start timing
    clock_t start = clock();
    for (int i = 0; i < iterations; i++) {
        crypto_generichash(hash, hash_len, message, message_len, NULL, 0);
    }
    clock_t end = clock();

    double cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
    double time_per_hash = (cpu_time_used * 1e6) / iterations; // µs per hash
    // Multiply in double: the size_t product can wrap where size_t is 32 bits.
    double total_data_processed = ((double)message_len * (double)iterations) / (1024.0 * 1024.0);
    // A run shorter than the clock() resolution measures 0 s; report 0 rather than dividing by it.
    double throughput = cpu_time_used > 0.0 ? total_data_processed / cpu_time_used : 0.0;

    // Print results to console
    printf("Benchmark: %s\n", description);
    printf(" Message size: %zu bytes\n", message_len);
    printf(" Hash length: %zu bytes\n", hash_len);
    printf(" Iterations: %d\n", iterations);
    printf(" Total time: %.6f seconds\n", cpu_time_used);
    printf(" Time per hash: %.6f microseconds\n", time_per_hash);
    printf(" Throughput: %.2f MB/s\n\n", throughput);

    // Save data for Gnuplot (Format: message_len time_per_hash throughput)
    fprintf(fp, "%zu %.6f %.2f\n", message_len, time_per_hash, throughput);

    free(message);
    free(hash);
}

/*
 * Run the throughput benchmark over power-of-two message sizes from 16 bytes
 * to 16 MiB, writing one row per size to "benchmark_results.data".
 *
 * Returns 0 on success, 1 if libsodium cannot be initialized, the results
 * file cannot be opened, or closing it reports a write error.
 */
int main(void) {
    if (sodium_init() < 0) {
        printf("Libsodium initialization failed!\n");
        return 1;
    }

    FILE *fp = fopen("benchmark_results.data", "w");
    if (!fp) {
        printf("Error opening file for writing!\n");
        return 1;
    }

    // Define power-of-2 message sizes for benchmarking
    static const size_t sizes[] = {
        16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192,
        16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304,
        8388608, 16777216
    };
    const size_t num_sizes = sizeof(sizes) / sizeof(sizes[0]);

    for (size_t i = 0; i < num_sizes; i++) {
        // Fewer iterations for larger messages, to balance test duration.
        // Chosen per size inside the loop, so no variable-length array is needed.
        int iterations;
        if (sizes[i] < 1024) {
            iterations = 200000;
        } else if (sizes[i] < 65536) {
            iterations = 50000;
        } else if (sizes[i] < 1048576) {
            iterations = 10000;
        } else if (sizes[i] < 8388608) {
            iterations = 1000;
        } else {
            iterations = 100;
        }

        char desc[100];
        snprintf(desc, sizeof(desc), "Message size: %zu bytes", sizes[i]);
        run_benchmark(fp, desc, sizes[i], crypto_generichash_BYTES, iterations);
    }

    // fclose() flushes buffered rows; a failure here means results were lost.
    if (fclose(fp) != 0) {
        perror("benchmark_results.data");
        return 1;
    }
    return 0;
}
17 changes: 17 additions & 0 deletions scripts_bench_neon/readme.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
Building libsodium (run from the top of the libsodium source tree):

./autogen.sh -s
automake
mkdir builddir
cd builddir
../configure CFLAGS="-DDEV_MODE" CPPFLAGS="-DDEV_MODE" --prefix=$HOME

make && make check

#benchmark_throughput
gcc -o benchmark benchmark.c -I $HOME/include -L $HOME/lib -lsodium
LD_LIBRARY_PATH=$HOME/lib ./benchmark

#bench
make bench
sudo LD_LIBRARY_PATH=$HOME/lib ./generichash_bench > generichash_bench.data
3 changes: 3 additions & 0 deletions src/libsodium/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ libsodium_la_SOURCES = \
crypto_generichash/blake2b/ref/blake2b-load-sse2.h \
crypto_generichash/blake2b/ref/blake2b-load-sse41.h \
crypto_generichash/blake2b/ref/blake2b-load-avx2.h \
crypto_generichash/blake2b/ref/blake2b-load-neon.h \
crypto_generichash/blake2b/ref/blake2b-ref.c \
crypto_generichash/blake2b/ref/generichash_blake2b.c \
crypto_hash/crypto_hash.c \
Expand Down Expand Up @@ -232,6 +233,8 @@ libarmcrypto_la_LDFLAGS = $(libsodium_la_LDFLAGS)
libarmcrypto_la_CPPFLAGS = $(libsodium_la_CPPFLAGS) \
@CFLAGS_ARMCRYPTO@
libarmcrypto_la_SOURCES = \
crypto_generichash/blake2b/ref/blake2b-compress-neon.c \
crypto_generichash/blake2b/ref/blake2b-compress-neon.h \
crypto_aead/aegis128l/aegis128l_armcrypto.c \
crypto_aead/aegis128l/aegis128l_armcrypto.h \
crypto_aead/aegis256/aegis256_armcrypto.c \
Expand Down
2 changes: 2 additions & 0 deletions src/libsodium/crypto_generichash/blake2b/ref/blake2.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,5 +103,7 @@ int blake2b_compress_sse41(blake2b_state *S,
const uint8_t block[BLAKE2B_BLOCKBYTES]);
int blake2b_compress_avx2(blake2b_state *S,
const uint8_t block[BLAKE2B_BLOCKBYTES]);
int blake2b_compress_neon(blake2b_state *S,
const uint8_t block[BLAKE2B_BLOCKBYTES]);

#endif
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/*
BLAKE2 reference source code package - reference C implementations

Copyright 2012, Samuel Neves <sneves@dei.uc.pt>. You may use this under the
terms of the CC0, the OpenSSL Licence, or the Apache Public License 2.0, at
your option. The terms of these licenses can be found at:

- CC0 1.0 Universal : http://creativecommons.org/publicdomain/zero/1.0
- OpenSSL license : https://www.openssl.org/source/license.html
- Apache 2.0 : http://www.apache.org/licenses/LICENSE-2.0

More information about the BLAKE2 hash function can be found at
https://blake2.net.
*/

#include <stdint.h>
#include <string.h>

#include "blake2.h"
#include "private/common.h"

#if defined(__aarch64__)

# include <arm_neon.h>

# include "blake2b-compress-neon.h"

/*
 * BLAKE2b initialization vector: eight 64-bit words.
 * blake2b_compress_neon() loads words 0-3 into rows 3 of its working state
 * and XORs words 4-7 with the state's t and f fields to form row 4.
 */
CRYPTO_ALIGN(64)
static const uint64_t blake2b_IV[8] = {
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL,
0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};

/*
 * Compress one message block into the hash state using NEON intrinsics.
 *
 * S     : state; S->h is read and overwritten, S->t and S->f are read.
 * block : BLAKE2B_BLOCKBYTES of message data, read as eight 16-byte vectors.
 *
 * Always returns 0.
 *
 * NOTE(review): ROUND is not defined in this file; presumably it comes from
 * blake2b-compress-neon.h and operates on the row*, m*, b* and t* locals
 * declared below -- confirm against that header.
 */
int
blake2b_compress_neon(blake2b_state *S,
const uint8_t block[BLAKE2B_BLOCKBYTES])
{
/* Working state: four rows, each split into a low and a high 128-bit half. */
uint64x2_t row1l, row1h;
uint64x2_t row2l, row2h;
uint64x2_t row3l, row3h;
uint64x2_t row4l, row4h;
/* Not referenced directly below; presumably scratch for ROUND -- TODO confirm. */
uint64x2_t b0, b1;
uint64x2_t t0, t1;

/* Message block, loaded bytewise and viewed as pairs of 64-bit lanes. */
const uint64x2_t m0 = vreinterpretq_u64_u8(vld1q_u8(block + 00));
const uint64x2_t m1 = vreinterpretq_u64_u8(vld1q_u8(block + 16));
const uint64x2_t m2 = vreinterpretq_u64_u8(vld1q_u8(block + 32));
const uint64x2_t m3 = vreinterpretq_u64_u8(vld1q_u8(block + 48));
const uint64x2_t m4 = vreinterpretq_u64_u8(vld1q_u8(block + 64));
const uint64x2_t m5 = vreinterpretq_u64_u8(vld1q_u8(block + 80));
const uint64x2_t m6 = vreinterpretq_u64_u8(vld1q_u8(block + 96));
const uint64x2_t m7 = vreinterpretq_u64_u8(vld1q_u8(block + 112));

/* Rows 1 and 2 start as the current chaining value; h0..h3 keep a copy of
 * it for the final XOR into S->h. */
const uint64x2_t h0 = row1l = vld1q_u64(&S->h[0]);
const uint64x2_t h1 = row1h = vld1q_u64(&S->h[2]);
const uint64x2_t h2 = row2l = vld1q_u64(&S->h[4]);
const uint64x2_t h3 = row2h = vld1q_u64(&S->h[6]);

/* Row 3 is IV[0..3]; row 4 is IV[4..7] XORed with S->t and S->f. */
row3l = vld1q_u64(&blake2b_IV[0]);
row3h = vld1q_u64(&blake2b_IV[2]);
row4l = veorq_u64(vld1q_u64(&blake2b_IV[4]), vld1q_u64(&S->t[0]));
row4h = veorq_u64(vld1q_u64(&blake2b_IV[6]), vld1q_u64(&S->f[0]));

/* Twelve rounds, indexed 0 through 11. */
ROUND(0);
ROUND(1);
ROUND(2);
ROUND(3);
ROUND(4);
ROUND(5);
ROUND(6);
ROUND(7);
ROUND(8);
ROUND(9);
ROUND(10);
ROUND(11);

/* New chaining value: old h XOR (row1 XOR row3) for words 0-3 and
 * old h XOR (row2 XOR row4) for words 4-7. */
vst1q_u64(&S->h[0], veorq_u64(h0, veorq_u64(row1l, row3l)));
vst1q_u64(&S->h[2], veorq_u64(h1, veorq_u64(row1h, row3h)));
vst1q_u64(&S->h[4], veorq_u64(h2, veorq_u64(row2l, row4l)));
vst1q_u64(&S->h[6], veorq_u64(h3, veorq_u64(row2h, row4h)));
return 0;
}

#endif
Loading