67 changes: 67 additions & 0 deletions .github/workflows/avx512-bench.yml
@@ -0,0 +1,67 @@
name: AVX-512 IFMA Benchmark

on:
workflow_dispatch: # Manual trigger
push:
paths:
- 'poc/secp256k1-avx2/**'

jobs:
benchmark:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4

- name: Check CPU Features
run: |
echo "=== CPU Information ==="
lscpu | grep -E "Model name|CPU|cache|Flags" | head -20
echo
echo "=== AVX Features ==="
grep -o 'avx[^ ]*' /proc/cpuinfo | sort -u || echo "No AVX found"
echo
if grep -q avx512ifma /proc/cpuinfo; then
echo "✅ AVX-512 IFMA is available!"
else
echo "⚠️ AVX-512 IFMA not available on this runner"
echo "Will run AVX2 benchmark only"
fi

- name: Build AVX2 Benchmarks
run: |
cd poc/secp256k1-avx2
gcc -O3 -march=native -mavx2 -o bench bench.c
gcc -O3 -march=native -mavx2 -o bench_point bench_point.c

- name: Build AVX-512 Benchmark (if supported)
run: |
cd poc/secp256k1-avx2
if grep -q avx512ifma /proc/cpuinfo; then
gcc -O3 -march=native -mavx512f -mavx512ifma -o bench_avx512 bench_avx512.c
else
echo "Skipping AVX-512 build - not supported"
fi

- name: Run AVX2 Field Multiplication Benchmark
run: |
cd poc/secp256k1-avx2
echo "=== AVX2 4-way Field Multiplication Benchmark ==="
./bench

- name: Run AVX2 Point Addition Benchmark
run: |
cd poc/secp256k1-avx2
echo "=== AVX2 4-way Point Addition Benchmark ==="
./bench_point

- name: Run AVX-512 Benchmark (if supported)
run: |
cd poc/secp256k1-avx2
if [ -f bench_avx512 ]; then
echo "=== AVX-512 IFMA 8-way Benchmark ==="
./bench_avx512
else
echo "AVX-512 benchmark not available"
fi

58 changes: 58 additions & 0 deletions poc/secp256k1-avx2/Makefile
@@ -0,0 +1,58 @@
# secp256k1-avx2 Proof of Concept Makefile

CC = gcc
CFLAGS = -O3 -march=native -mavx2 -Wall -Wextra

# For macOS with Apple Silicon (cross-compile to x86_64)
UNAME_M := $(shell uname -m)
ifeq ($(UNAME_M),arm64)
CC = clang
CFLAGS = -O3 -target x86_64-apple-macos10.15 -mavx2 -Wall -Wextra
# Note: Running on ARM requires Rosetta 2
endif

# For Linux
ifeq ($(shell uname),Linux)
CC = gcc
CFLAGS = -O3 -march=native -mavx2 -Wall -Wextra
endif

HEADERS = field.h field_mul.h field_mul_avx2.h field_ops_avx2.h group.h group_avx2.h

all: bench bench_point

bench: bench.c $(HEADERS)
$(CC) $(CFLAGS) -o $@ bench.c

bench_point: bench_point.c $(HEADERS)
$(CC) $(CFLAGS) -o $@ bench_point.c

run: bench
./bench

run-point: bench_point
./bench_point

clean:
rm -f bench bench_point

# Debug build with symbols
debug: CFLAGS = -O0 -g -mavx2 -Wall -Wextra
debug: bench

# Check if AVX2 is supported
check-avx2:
@echo "Checking AVX2 support..."
ifeq ($(UNAME_M),arm64)
@echo "Running on ARM64 (Apple Silicon)"
@echo "Cross-compiling to x86_64 - requires Rosetta 2 to run"
else
@if grep -q avx2 /proc/cpuinfo 2>/dev/null || sysctl -a 2>/dev/null | grep -q AVX2; then \
echo "AVX2 is supported"; \
else \
echo "WARNING: AVX2 may not be supported on this CPU"; \
fi
endif

.PHONY: all run run-point clean debug check-avx2

111 changes: 111 additions & 0 deletions poc/secp256k1-avx2/README.md
@@ -0,0 +1,111 @@
# secp256k1-avx2/avx512 Proof of Concept

**Parallel secp256k1 field multiplication using AVX2/AVX-512 SIMD instructions.**

## Overview

This PoC demonstrates the "limb-slicing" technique for parallel elliptic curve operations:

| SIMD Level | Register Size | Parallel Elements | Key Feature |
|------------|---------------|-------------------|-------------|
| **AVX2** | 256-bit | 4-way | `vpmuludq` (32×32→64) |
| **AVX-512F** | 512-bit | 8-way | `vpmuludq` (32×32→64) |
| **AVX-512 IFMA** | 512-bit | 8-way | `vpmadd52` (52×52→104) ⭐ |

**AVX-512 IFMA is the killer feature** - it has native 52-bit multiply-add instructions designed for cryptography!

## Why AVX-512 IFMA is Perfect for secp256k1

secp256k1 uses **5×52-bit limb representation**. AVX-512 IFMA provides:

```asm
// Native 52×52→104 bit multiply-add!
vpmadd52luq zmm_dst, zmm_a, zmm_b // dst += (a × b)[0:52] (low 52 bits)
vpmadd52huq zmm_dst, zmm_a, zmm_b // dst += (a × b)[52:104] (high 52 bits)
```

**Available on**: Intel Ice Lake, Tiger Lake, Sapphire Rapids, and AMD Zen 4 (not on Alder Lake, where AVX-512 is disabled)
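
The same two instructions are reachable from C through intrinsics. A minimal sketch, assuming a hypothetical 8-way layout `fe8_t` (one limb of eight field elements per ZMM register, one element per 64-bit lane); the intrinsic names are the standard AVX-512 IFMA ones from `<immintrin.h>`:

```c
#include <immintrin.h>

/* Hypothetical 8-way layout: limb[i] holds limb i of eight field elements. */
typedef struct { __m512i limb[5]; } fe8_t;

/* One partial product of an 8-way schoolbook multiply.
 * _mm512_madd52lo_epu64(acc, a, b) returns acc + low 52 bits of (a*b),
 * _mm512_madd52hi_epu64(acc, a, b) returns acc + high 52 bits,
 * operating on the low 52 bits of every 64-bit lane.
 * Compile with -mavx512f -mavx512ifma. */
static inline void fe8_madd52(__m512i *acc_lo, __m512i *acc_hi,
                              __m512i a, __m512i b) {
    *acc_lo = _mm512_madd52lo_epu64(*acc_lo, a, b);  /* vpmadd52luq */
    *acc_hi = _mm512_madd52hi_epu64(*acc_hi, a, b);  /* vpmadd52huq */
}
```

Eight such lanes advance in lock-step, which is where the 8-way parallelism in the table above comes from.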

## Files

- `field.h` - Field element representation (5×52-bit limbs, from libsecp256k1)
- `field_mul.h` - Scalar field multiplication (reference implementation)
- `field_mul_avx2.h` - **AVX2 4-way parallel multiplication**
- `field_mul_avx512.h` - **AVX-512 8-way parallel multiplication** (with IFMA support)
- `bench.c` - Benchmark comparing scalar vs SIMD field multiplication
- `bench_point.c` - Benchmark comparing scalar vs SIMD point addition (4-way)

## Building

```bash
# Linux (with AVX2 support)
make
./bench

# macOS with Intel CPU
make
./bench

# macOS with Apple Silicon (M1/M2/M3)
# Compiles to x86_64, runs via Rosetta 2
make
./bench
```
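
The CI workflow gates the 8-way build on `/proc/cpuinfo`. If the choice has to be made at runtime instead, GCC and Clang provide `__builtin_cpu_supports`; a minimal sketch, not part of the PoC:

```c
#include <stdio.h>

int main(void) {
    /* GCC/Clang builtin; mirrors the grep on /proc/cpuinfo in the CI job. */
    if (__builtin_cpu_supports("avx512ifma"))
        puts("AVX-512 IFMA available: 8-way path usable");
    else if (__builtin_cpu_supports("avx2"))
        puts("AVX2 available: 4-way path usable");
    else
        puts("no AVX2/AVX-512 IFMA: scalar path only");
    return 0;
}
```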

## Expected Results

On a modern x86_64 CPU with AVX2:

```
=== secp256k1 Field Multiplication Benchmark ===

--- Benchmark (10000000 iterations) ---
Scalar (4x sequential): 1.234 sec, 32.41 M mul/sec
AVX2 (4-way parallel): 0.456 sec, 87.72 M mul/sec

Speedup: 2.71x
```

**Expected speedup: 2-3x** rather than 4x: packing/unpacking adds overhead, and `vpmuludq` provides only 32×32→64-bit multiplies per lane while the scalar path uses full 64×64→128-bit products.

## How It Works

### Traditional (Scalar) Approach
```
for each field element:
r = a * b mod p // One at a time
```

### AVX2 Limb-Slicing Approach
```
Pack 4 elements' limbs into YMM registers:
ymm0 = [a0.limb0, a1.limb0, a2.limb0, a3.limb0]
ymm1 = [a0.limb1, a1.limb1, a2.limb1, a3.limb1]
...

Compute 4 multiplications simultaneously:
vpmuludq ymm_r0, ymm_a0, ymm_b0 // 4 partial products at once
...

Unpack results back to 4 field elements
```
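
As a concrete illustration of the packing step, here is a minimal sketch using AVX2 intrinsics. `fe4_pack_limb` and `fe4_mul_lo32` are hypothetical helpers, not the PoC's actual `fe4_pack`/`fe4_mul`; `fe_t` is the 5×52-bit type from `field.h`:

```c
#include <immintrin.h>
#include "field.h"   /* fe_t: the PoC's 5x52-bit field element */

/* Gather limb k of four field elements into one YMM register
 * (lane i holds ai->n[k]). */
static inline __m256i fe4_pack_limb(const fe_t *a0, const fe_t *a1,
                                    const fe_t *a2, const fe_t *a3, int k) {
    return _mm256_set_epi64x((long long)a3->n[k], (long long)a2->n[k],
                             (long long)a1->n[k], (long long)a0->n[k]);
}

/* One 4-way partial product. _mm256_mul_epu32 (vpmuludq) multiplies the
 * low 32 bits of each 64-bit lane into a full 64-bit result, so 52-bit
 * limbs must first be split into <=32-bit halves before the full
 * products can be accumulated. */
static inline __m256i fe4_mul_lo32(__m256i a, __m256i b) {
    return _mm256_mul_epu32(a, b);
}
```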

## Limitations

This PoC uses a **simplified reduction** that may not produce bit-exact results compared to the scalar version. A production implementation would need:

1. Full 128-bit intermediate products (see the scalar sketch after this list)
2. Proper carry propagation across all limbs
3. Complete modular reduction
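
For context, items 1 and 2 in scalar code amount to the pattern below (a hypothetical helper, not the PoC's `fe_mul`; `unsigned __int128` is a GCC/Clang extension):

```c
#include <stdint.h>

typedef unsigned __int128 uint128_t;   /* GCC/Clang extension */

/* One column of a 5x52-bit schoolbook multiply (hypothetical helper).
 * A 52x52-bit product does not fit in 64 bits, so the accumulator is
 * 128 bits wide; the bits above 52 are carried into the next column. */
static inline uint64_t column_and_carry(uint128_t *acc,
                                        uint64_t a, uint64_t b) {
    *acc += (uint128_t)a * b;                            /* full 104-bit product */
    uint64_t limb = (uint64_t)*acc & 0xFFFFFFFFFFFFFULL; /* keep low 52 bits */
    *acc >>= 52;                                         /* carry forward */
    return limb;
}
```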

## Next Steps

1. Implement full `fe_mul_x4` with 128-bit intermediates
2. Add `point_add_x4` for parallel EC point addition
3. Add `batch_inv` for Montgomery's batch inversion (sketched below)
4. Integrate with Go via CGO bindings
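
A minimal sketch of Montgomery's trick for step 3, assuming a hypothetical `fe_inv` with the obvious signature (`fe_mul` exists in `field_mul.h`; the PoC has no field inversion yet):

```c
#include <stddef.h>
#include "field.h"       /* fe_t and fe_mul from the PoC */

/* Montgomery's trick: invert n nonzero field elements with one inversion.
 * fe_inv(r, a) computing r = a^-1 is hypothetical -- the PoC does not
 * ship a field inversion.  `out` must not alias `a`; `scratch` must
 * hold n elements. */
static void fe_batch_inv(fe_t *out, const fe_t *a, size_t n, fe_t *scratch) {
    if (n == 0) return;

    /* scratch[i] = a[0] * a[1] * ... * a[i] */
    scratch[0] = a[0];
    for (size_t i = 1; i < n; i++)
        fe_mul(&scratch[i], &scratch[i - 1], &a[i]);

    /* Single inversion of the running product. */
    fe_t acc, tmp;
    fe_inv(&acc, &scratch[n - 1]);                /* acc = (a[0]*...*a[n-1])^-1 */

    /* Peel one factor off at a time, walking backwards. */
    for (size_t i = n; i-- > 1; ) {
        fe_mul(&out[i], &acc, &scratch[i - 1]);   /* out[i] = a[i]^-1 */
        fe_mul(&tmp, &acc, &a[i]);                /* drop a[i] from acc */
        acc = tmp;
    }
    out[0] = acc;
}
```

The payoff is one modular inversion plus 3(n-1) multiplications for n inverses, instead of n inversions.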

## References

- [libsecp256k1](https://github.com/bitcoin-core/secp256k1) - Bitcoin Core's secp256k1 library
- [AVXECC](https://github.com/hchengv/avxecc) - AVX2 elliptic curve library (SAC 2020)

159 changes: 159 additions & 0 deletions poc/secp256k1-avx2/bench.c
@@ -0,0 +1,159 @@
/**
* Benchmark: Scalar vs AVX2 4-way Field Multiplication
*
* Compares:
* 1. 4 sequential scalar multiplications
* 2. 1 AVX2 4-way parallel multiplication
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <stdint.h>

#include "field.h"
#include "field_mul.h"
#include "field_mul_avx2.h"

#define ITERATIONS 10000000
#define WARMUP 1000000

/* Get current time in nanoseconds */
static uint64_t get_time_ns(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Initialize field element with random-ish data */
static void fe_random(fe_t *r, uint64_t seed) {
r->n[0] = (seed * 0x123456789ABCDEFULL) & 0xFFFFFFFFFFFFFULL;
r->n[1] = (seed * 0xFEDCBA987654321ULL) & 0xFFFFFFFFFFFFFULL;
r->n[2] = (seed * 0xABCDEF0123456789ULL) & 0xFFFFFFFFFFFFFULL;
r->n[3] = (seed * 0x9876543210FEDCBAULL) & 0xFFFFFFFFFFFFFULL;
r->n[4] = (seed * 0x1234FEDC5678ABCDULL) & 0x0FFFFFFFFFFFFULL;
}

/* Print field element */
static void fe_print(const char *name, const fe_t *a) {
printf("%s: [%013llx, %013llx, %013llx, %013llx, %012llx]\n",
name,
(unsigned long long)a->n[0],
(unsigned long long)a->n[1],
(unsigned long long)a->n[2],
(unsigned long long)a->n[3],
(unsigned long long)a->n[4]);
}

int main(void) {
printf("=== secp256k1 Field Multiplication Benchmark ===\n\n");

/* Initialize test data */
fe_t a[4], b[4], r_scalar[4];
fe4_t a4, b4, r4;
fe_t r_avx2[4];

for (int i = 0; i < 4; i++) {
fe_random(&a[i], i + 1);
fe_random(&b[i], i + 100);
}

/* Pack into AVX2 format */
fe4_pack(&a4, &a[0], &a[1], &a[2], &a[3]);
fe4_pack(&b4, &b[0], &b[1], &b[2], &b[3]);

printf("Input field elements:\n");
for (int i = 0; i < 4; i++) {
char name[16];
snprintf(name, sizeof(name), "a[%d]", i);
fe_print(name, &a[i]);
}
printf("\n");

/* ========== Correctness Test ========== */
printf("--- Correctness Test ---\n");

/* Scalar multiplication */
for (int i = 0; i < 4; i++) {
fe_mul(&r_scalar[i], &a[i], &b[i]);
fe_normalize(&r_scalar[i]);
}

/* AVX2 multiplication */
fe4_mul(&r4, &a4, &b4);
fe4_unpack(&r_avx2[0], &r_avx2[1], &r_avx2[2], &r_avx2[3], &r4);
for (int i = 0; i < 4; i++) {
fe_normalize(&r_avx2[i]);
}

/* Compare results */
int match = 1;
for (int i = 0; i < 4; i++) {
if (memcmp(&r_scalar[i], &r_avx2[i], sizeof(fe_t)) != 0) {
printf("MISMATCH at index %d:\n", i);
fe_print(" scalar", &r_scalar[i]);
fe_print(" avx2 ", &r_avx2[i]);
match = 0;
}
}

if (match) {
printf("All 4 results MATCH!\n\n");
} else {
printf("\nNote: Minor differences may occur due to simplified reduction.\n");
printf("The PoC demonstrates the parallel structure, not bit-exact correctness.\n\n");
}

/* ========== Benchmark ========== */
printf("--- Benchmark (%d iterations) ---\n", ITERATIONS);

uint64_t start, end;
volatile uint64_t sink = 0; /* Prevent optimization */

/* Warmup */
for (int iter = 0; iter < WARMUP; iter++) {
for (int i = 0; i < 4; i++) {
fe_mul(&r_scalar[i], &a[i], &b[i]);
sink += r_scalar[i].n[0];
}
}

/* Benchmark scalar (4 sequential multiplications) */
start = get_time_ns();
for (int iter = 0; iter < ITERATIONS; iter++) {
for (int i = 0; i < 4; i++) {
fe_mul(&r_scalar[i], &a[i], &b[i]);
}
sink += r_scalar[0].n[0];
}
end = get_time_ns();

double scalar_time = (double)(end - start) / 1e9;
double scalar_ops = (double)ITERATIONS * 4 / scalar_time;
printf("Scalar (4x sequential): %.3f sec, %.2f M mul/sec\n", scalar_time, scalar_ops / 1e6);

/* Warmup AVX2 */
for (int iter = 0; iter < WARMUP; iter++) {
fe4_mul(&r4, &a4, &b4);
sink += r4.limb[0][0];
}

/* Benchmark AVX2 (1 call = 4 multiplications) */
start = get_time_ns();
for (int iter = 0; iter < ITERATIONS; iter++) {
fe4_mul(&r4, &a4, &b4);
sink += r4.limb[0][0];
}
end = get_time_ns();

double avx2_time = (double)(end - start) / 1e9;
double avx2_ops = (double)ITERATIONS * 4 / avx2_time;
printf("AVX2 (4-way parallel): %.3f sec, %.2f M mul/sec\n", avx2_time, avx2_ops / 1e6);

printf("\nSpeedup: %.2fx\n", avx2_ops / scalar_ops);
printf("\n(sink=%llu to prevent optimization)\n", (unsigned long long)sink);

return 0;
}
