diff --git a/provekit/common/Cargo.toml b/provekit/common/Cargo.toml index d39db74e..92faae9c 100644 --- a/provekit/common/Cargo.toml +++ b/provekit/common/Cargo.toml @@ -14,7 +14,6 @@ skyscraper.workspace = true # Noir language acir.workspace = true -noir_artifact_cli.workspace = true noirc_abi.workspace = true # Cryptography and proof systems diff --git a/skyscraper/block-multiplier/Cargo.toml b/skyscraper/block-multiplier/Cargo.toml index 469c97d6..ab66b0aa 100644 --- a/skyscraper/block-multiplier/Cargo.toml +++ b/skyscraper/block-multiplier/Cargo.toml @@ -8,10 +8,11 @@ license.workspace = true homepage.workspace = true repository.workspace = true -[dependencies] +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] # Workspace crates fp-rounding.workspace = true +[dependencies] # 3rd party seq-macro.workspace = true diff --git a/skyscraper/block-multiplier/src/block_simd.rs b/skyscraper/block-multiplier/src/block_simd.rs index fc2cb21d..e770f557 100644 --- a/skyscraper/block-multiplier/src/block_simd.rs +++ b/skyscraper/block-multiplier/src/block_simd.rs @@ -1,12 +1,12 @@ use { crate::{ constants::*, - subarray, - utils::{ - addv, addv_simd, carrying_mul_add, make_initial, reduce_ct, reduce_ct_simd, - smult_noinit_simd, transpose_simd_to_u256, transpose_u256_to_simd, - u256_to_u260_shl2_simd, u260_to_u256_simd, + simd_utils::{ + addv_simd, make_initial, reduce_ct_simd, smult_noinit_simd, transpose_simd_to_u256, + transpose_u256_to_simd, u256_to_u260_shl2_simd, u260_to_u256_simd, }, + subarray, + utils::{addv, carrying_mul_add, reduce_ct}, }, core::{ arch::aarch64::vcvtq_f64_u64, diff --git a/skyscraper/block-multiplier/src/lib.rs b/skyscraper/block-multiplier/src/lib.rs index 904616c2..fe54fa53 100644 --- a/skyscraper/block-multiplier/src/lib.rs +++ b/skyscraper/block-multiplier/src/lib.rs @@ -1,14 +1,27 @@ #![feature(portable_simd)] #![feature(bigint_helper_methods)] +//#![no_std] This crate can technically be no_std. However this requires +// replacing StdFloat.mul_add with intrinsics. +#[cfg(target_arch = "aarch64")] mod aarch64; + +// These can be made to work on x86, +// but for now it uses an ARM NEON intrinsic. 
+#[cfg(target_arch = "aarch64")] mod block_simd; -pub mod constants; +#[cfg(target_arch = "aarch64")] mod portable_simd; +#[cfg(target_arch = "aarch64")] +mod simd_utils; + +pub mod constants; mod scalar; mod test_utils; mod utils; +pub use crate::scalar::{scalar_mul, scalar_sqr}; +#[cfg(target_arch = "aarch64")] pub use crate::{ aarch64::{ montgomery_interleaved_3, montgomery_interleaved_4, montgomery_square_interleaved_3, @@ -17,5 +30,4 @@ pub use crate::{ }, block_simd::{block_mul, block_sqr}, portable_simd::{simd_mul, simd_sqr}, - scalar::{scalar_mul, scalar_sqr}, }; diff --git a/skyscraper/block-multiplier/src/portable_simd.rs b/skyscraper/block-multiplier/src/portable_simd.rs index 513eb982..39ca34f2 100644 --- a/skyscraper/block-multiplier/src/portable_simd.rs +++ b/skyscraper/block-multiplier/src/portable_simd.rs @@ -1,16 +1,17 @@ use { crate::{ constants::*, - utils::{ + simd_utils::{ addv_simd, make_initial, reduce_ct_simd, smult_noinit_simd, transpose_simd_to_u256, transpose_u256_to_simd, u256_to_u260_shl2_simd, u260_to_u256_simd, }, }, - std::{ + core::{ arch::aarch64::vcvtq_f64_u64, ops::BitAnd, - simd::{num::SimdFloat, Simd, StdFloat}, + simd::{num::SimdFloat, Simd}, }, + std::simd::StdFloat, }; #[inline] diff --git a/skyscraper/block-multiplier/src/simd_utils.rs b/skyscraper/block-multiplier/src/simd_utils.rs new file mode 100644 index 00000000..9ce3b4f6 --- /dev/null +++ b/skyscraper/block-multiplier/src/simd_utils.rs @@ -0,0 +1,138 @@ +use { + crate::constants::{C1, C2, MASK52, U52_2P}, + core::{ + arch::aarch64::vcvtq_f64_u64, + array, + ops::BitAnd, + simd::{ + cmp::SimdPartialEq, + num::{SimdFloat, SimdInt, SimdUint}, + Simd, + }, + }, + std::simd::StdFloat, +}; + +// -- [SIMD UTILS] +// --------------------------------------------------------------------------------- +#[inline(always)] +pub const fn make_initial(low_count: usize, high_count: usize) -> u64 { + let val = high_count * 0x467 + low_count * 0x433; + -((val as i64 & 0xfff) << 52) as u64 +} + +#[inline(always)] +pub fn transpose_u256_to_simd(limbs: [[u64; 4]; 2]) -> [Simd; 4] { + // This does not issue multiple ldp and zip which might be marginally faster. 
+ [ + Simd::from_array([limbs[0][0], limbs[1][0]]), + Simd::from_array([limbs[0][1], limbs[1][1]]), + Simd::from_array([limbs[0][2], limbs[1][2]]), + Simd::from_array([limbs[0][3], limbs[1][3]]), + ] +} + +#[inline(always)] +pub fn transpose_simd_to_u256(limbs: [Simd; 4]) -> [[u64; 4]; 2] { + let tmp0 = limbs[0].to_array(); + let tmp1 = limbs[1].to_array(); + let tmp2 = limbs[2].to_array(); + let tmp3 = limbs[3].to_array(); + [[tmp0[0], tmp1[0], tmp2[0], tmp3[0]], [ + tmp0[1], tmp1[1], tmp2[1], tmp3[1], + ]] +} + +#[inline(always)] +pub fn u256_to_u260_shl2_simd(limbs: [Simd; 4]) -> [Simd; 5] { + let [l0, l1, l2, l3] = limbs; + [ + (l0 << 2) & Simd::splat(MASK52), + ((l0 >> 50) | (l1 << 14)) & Simd::splat(MASK52), + ((l1 >> 38) | (l2 << 26)) & Simd::splat(MASK52), + ((l2 >> 26) | (l3 << 38)) & Simd::splat(MASK52), + l3 >> 14, + ] +} + +#[inline(always)] +pub fn u260_to_u256_simd(limbs: [Simd; 5]) -> [Simd; 4] { + let [l0, l1, l2, l3, l4] = limbs; + [ + l0 | (l1 << 52), + (l1 >> 12) | (l2 << 40), + (l2 >> 24) | (l3 << 28), + (l3 >> 36) | (l4 << 16), + ] +} + +#[inline(always)] +pub fn smult_noinit_simd(s: Simd, v: [u64; 5]) -> [Simd; 6] { + let mut t = [Simd::splat(0); 6]; + let s: Simd = unsafe { vcvtq_f64_u64(s.into()).into() }; + + let p_hi_0 = s.mul_add(Simd::splat(v[0] as f64), Simd::splat(C1)); + let p_lo_0 = s.mul_add(Simd::splat(v[0] as f64), Simd::splat(C2) - p_hi_0); + t[1] += p_hi_0.to_bits(); + t[0] += p_lo_0.to_bits(); + + let p_hi_1 = s.mul_add(Simd::splat(v[1] as f64), Simd::splat(C1)); + let p_lo_1 = s.mul_add(Simd::splat(v[1] as f64), Simd::splat(C2) - p_hi_1); + t[2] += p_hi_1.to_bits(); + t[1] += p_lo_1.to_bits(); + + let p_hi_2 = s.mul_add(Simd::splat(v[2] as f64), Simd::splat(C1)); + let p_lo_2 = s.mul_add(Simd::splat(v[2] as f64), Simd::splat(C2) - p_hi_2); + t[3] += p_hi_2.to_bits(); + t[2] += p_lo_2.to_bits(); + + let p_hi_3 = s.mul_add(Simd::splat(v[3] as f64), Simd::splat(C1)); + let p_lo_3 = s.mul_add(Simd::splat(v[3] as f64), Simd::splat(C2) - p_hi_3); + t[4] += p_hi_3.to_bits(); + t[3] += p_lo_3.to_bits(); + + let p_hi_4 = s.mul_add(Simd::splat(v[4] as f64), Simd::splat(C1)); + let p_lo_4 = s.mul_add(Simd::splat(v[4] as f64), Simd::splat(C2) - p_hi_4); + t[5] += p_hi_4.to_bits(); + t[4] += p_lo_4.to_bits(); + + t +} + +#[inline(always)] +/// Resolve the carry bits in the upper parts 12b and reduce the result to +/// within < 3p +pub fn reduce_ct_simd(red: [Simd; 6]) -> [Simd; 5] { + // The lowest limb contains carries that still need to be applied. 
+ let mut borrow: Simd = (red[0] >> 52).cast(); + let a = [red[1], red[2], red[3], red[4], red[5]]; + + // To reduce Check whether the most significant bit is set + let mask = (a[4] >> 47).bitand(Simd::splat(1)).simd_eq(Simd::splat(0)); + + // Select values based on the mask: if mask lane is true, use zeros, else use + // U52_2P + let zeros = [Simd::splat(0); 5]; + let twop = U52_2P.map(Simd::splat); + let b: [_; 5] = array::from_fn(|i| mask.select(zeros[i], twop[i])); + + let mut c = [Simd::splat(0); 5]; + for i in 0..c.len() { + let tmp: Simd = a[i].cast::() - b[i].cast() + borrow; + c[i] = tmp.cast().bitand(Simd::splat(MASK52)); + borrow = tmp >> 52 + } + + c +} + +#[inline(always)] +pub fn addv_simd( + mut va: [Simd; N], + vb: [Simd; N], +) -> [Simd; N] { + for i in 0..va.len() { + va[i] += vb[i]; + } + va +} diff --git a/skyscraper/block-multiplier/src/utils.rs b/skyscraper/block-multiplier/src/utils.rs index 774d54bf..b4e92777 100644 --- a/skyscraper/block-multiplier/src/utils.rs +++ b/skyscraper/block-multiplier/src/utils.rs @@ -1,16 +1,4 @@ -use { - crate::constants::{C1, C2, MASK52, U52_2P, U64_2P}, - std::{ - arch::aarch64::vcvtq_f64_u64, - array, - ops::BitAnd, - simd::{ - cmp::SimdPartialEq, - num::{SimdFloat, SimdInt, SimdUint}, - Simd, StdFloat, - }, - }, -}; +use crate::constants::U64_2P; /// Macro to extract a subarray from an array. /// @@ -60,119 +48,6 @@ pub fn addv(mut a: [u64; N], b: [u64; N]) -> [u64; N] { a } -// -- [SIMD UTILS] -// --------------------------------------------------------------------------------- -#[inline(always)] -pub const fn make_initial(low_count: usize, high_count: usize) -> u64 { - let val = high_count * 0x467 + low_count * 0x433; - -((val as i64 & 0xfff) << 52) as u64 -} - -#[inline(always)] -pub fn transpose_u256_to_simd(limbs: [[u64; 4]; 2]) -> [Simd; 4] { - // This does not issue multiple ldp and zip which might be marginally faster. 
- [ - Simd::from_array([limbs[0][0], limbs[1][0]]), - Simd::from_array([limbs[0][1], limbs[1][1]]), - Simd::from_array([limbs[0][2], limbs[1][2]]), - Simd::from_array([limbs[0][3], limbs[1][3]]), - ] -} - -#[inline(always)] -pub fn transpose_simd_to_u256(limbs: [Simd; 4]) -> [[u64; 4]; 2] { - let tmp0 = limbs[0].to_array(); - let tmp1 = limbs[1].to_array(); - let tmp2 = limbs[2].to_array(); - let tmp3 = limbs[3].to_array(); - [[tmp0[0], tmp1[0], tmp2[0], tmp3[0]], [ - tmp0[1], tmp1[1], tmp2[1], tmp3[1], - ]] -} - -#[inline(always)] -pub fn u256_to_u260_shl2_simd(limbs: [Simd; 4]) -> [Simd; 5] { - let [l0, l1, l2, l3] = limbs; - [ - (l0 << 2) & Simd::splat(MASK52), - ((l0 >> 50) | (l1 << 14)) & Simd::splat(MASK52), - ((l1 >> 38) | (l2 << 26)) & Simd::splat(MASK52), - ((l2 >> 26) | (l3 << 38)) & Simd::splat(MASK52), - l3 >> 14, - ] -} - -#[inline(always)] -pub fn u260_to_u256_simd(limbs: [Simd; 5]) -> [Simd; 4] { - let [l0, l1, l2, l3, l4] = limbs; - [ - l0 | (l1 << 52), - (l1 >> 12) | (l2 << 40), - (l2 >> 24) | (l3 << 28), - (l3 >> 36) | (l4 << 16), - ] -} - -#[inline(always)] -pub fn smult_noinit_simd(s: Simd, v: [u64; 5]) -> [Simd; 6] { - let mut t = [Simd::splat(0); 6]; - let s: Simd = unsafe { vcvtq_f64_u64(s.into()).into() }; - - let p_hi_0 = s.mul_add(Simd::splat(v[0] as f64), Simd::splat(C1)); - let p_lo_0 = s.mul_add(Simd::splat(v[0] as f64), Simd::splat(C2) - p_hi_0); - t[1] += p_hi_0.to_bits(); - t[0] += p_lo_0.to_bits(); - - let p_hi_1 = s.mul_add(Simd::splat(v[1] as f64), Simd::splat(C1)); - let p_lo_1 = s.mul_add(Simd::splat(v[1] as f64), Simd::splat(C2) - p_hi_1); - t[2] += p_hi_1.to_bits(); - t[1] += p_lo_1.to_bits(); - - let p_hi_2 = s.mul_add(Simd::splat(v[2] as f64), Simd::splat(C1)); - let p_lo_2 = s.mul_add(Simd::splat(v[2] as f64), Simd::splat(C2) - p_hi_2); - t[3] += p_hi_2.to_bits(); - t[2] += p_lo_2.to_bits(); - - let p_hi_3 = s.mul_add(Simd::splat(v[3] as f64), Simd::splat(C1)); - let p_lo_3 = s.mul_add(Simd::splat(v[3] as f64), Simd::splat(C2) - p_hi_3); - t[4] += p_hi_3.to_bits(); - t[3] += p_lo_3.to_bits(); - - let p_hi_4 = s.mul_add(Simd::splat(v[4] as f64), Simd::splat(C1)); - let p_lo_4 = s.mul_add(Simd::splat(v[4] as f64), Simd::splat(C2) - p_hi_4); - t[5] += p_hi_4.to_bits(); - t[4] += p_lo_4.to_bits(); - - t -} - -#[inline(always)] -/// Resolve the carry bits in the upper parts 12b and reduce the result to -/// within < 3p -pub fn reduce_ct_simd(red: [Simd; 6]) -> [Simd; 5] { - // The lowest limb contains carries that still need to be applied. 
- let mut borrow: Simd = (red[0] >> 52).cast(); - let a = [red[1], red[2], red[3], red[4], red[5]]; - - // To reduce Check whether the most significant bit is set - let mask = (a[4] >> 47).bitand(Simd::splat(1)).simd_eq(Simd::splat(0)); - - // Select values based on the mask: if mask lane is true, use zeros, else use - // U52_2P - let zeros = [Simd::splat(0); 5]; - let twop = U52_2P.map(Simd::splat); - let b: [_; 5] = array::from_fn(|i| mask.select(zeros[i], twop[i])); - - let mut c = [Simd::splat(0); 5]; - for i in 0..c.len() { - let tmp: Simd = a[i].cast::() - b[i].cast() + borrow; - c[i] = tmp.cast().bitand(Simd::splat(MASK52)); - borrow = tmp >> 52 - } - - c -} - #[inline(always)] pub fn reduce_ct(a: [u64; 4]) -> [u64; 4] { let b = [[0_u64; 4], U64_2P]; @@ -192,17 +67,6 @@ pub fn sub(a: [u64; N], b: [u64; N]) -> [u64; N] { c } -#[inline(always)] -pub fn addv_simd( - mut va: [Simd; N], - vb: [Simd; N], -) -> [Simd; N] { - for i in 0..va.len() { - va[i] += vb[i]; - } - va -} - #[inline(always)] pub fn carrying_mul_add(a: u64, b: u64, add: u64, carry: u64) -> (u64, u64) { let c: u128 = a as u128 * b as u128 + carry as u128 + add as u128; diff --git a/skyscraper/core/Cargo.toml b/skyscraper/core/Cargo.toml index 25df09d3..aa14dee4 100644 --- a/skyscraper/core/Cargo.toml +++ b/skyscraper/core/Cargo.toml @@ -11,22 +11,23 @@ repository.workspace = true [dependencies] # Workspace crates block-multiplier.workspace = true -fp-rounding.workspace = true # Cryptography and proof systems ark-bn254.workspace = true ark-ff.workspace = true # 3rd party -proptest.workspace = true rayon.workspace = true seq-macro.workspace = true zerocopy.workspace = true +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] +fp-rounding.workspace = true + [dev-dependencies] divan.workspace = true -primitive-types.workspace = true rand.workspace = true +proptest.workspace = true [lints] workspace = true diff --git a/skyscraper/core/src/lib.rs b/skyscraper/core/src/lib.rs index 939feb73..912fd7a1 100644 --- a/skyscraper/core/src/lib.rs +++ b/skyscraper/core/src/lib.rs @@ -4,8 +4,6 @@ pub mod arithmetic; pub mod bar; -pub mod block3; -pub mod block4; pub mod constants; pub mod generic; pub mod pow; @@ -14,6 +12,11 @@ pub mod reference; pub mod simple; pub mod v1; +#[cfg(target_arch = "aarch64")] +pub mod block3; +#[cfg(target_arch = "aarch64")] +pub mod block4; + /// The least common multiple of the implementation widths. 
/// /// Doing this many compressions in parallel will make optimal use of resources diff --git a/skyscraper/core/src/pow.rs b/skyscraper/core/src/pow.rs index 1a1181f0..e2526b64 100644 --- a/skyscraper/core/src/pow.rs +++ b/skyscraper/core/src/pow.rs @@ -1,5 +1,9 @@ +#[cfg(target_arch = "aarch64")] +use crate::block4::compress_many; +#[cfg(not(target_arch = "aarch64"))] +use crate::simple::compress_many; use { - crate::{arithmetic::less_than, block4::compress_many, generic, simple::compress, WIDTH_LCM}, + crate::{arithmetic::less_than, generic, simple::compress, WIDTH_LCM}, ark_ff::Zero, }; @@ -35,6 +39,7 @@ pub fn solve(challenge: [u64; 4], difficulty: f64) -> u64 { return 0; } let threshold = threshold(difficulty + PROVER_BIAS); + let nonce = generic::solve::<_, { WIDTH_LCM * 10 }>(compress_many, challenge, threshold); debug_assert!(verify(challenge, difficulty, nonce)); nonce diff --git a/skyscraper/fp-rounding/src/lib.rs b/skyscraper/fp-rounding/src/lib.rs index 351eaf94..a9089aec 100644 --- a/skyscraper/fp-rounding/src/lib.rs +++ b/skyscraper/fp-rounding/src/lib.rs @@ -1,4 +1,5 @@ #![allow(unsafe_code)] +#![no_std] //! Round Toward Zero (RTZ) floating-point rounding mode control //! //! Rust/LLVM does not support different float point mode rounding modes and diff --git a/skyscraper/fp-rounding/src/utils.rs b/skyscraper/fp-rounding/src/utils.rs index 69456426..ba482cd2 100644 --- a/skyscraper/fp-rounding/src/utils.rs +++ b/skyscraper/fp-rounding/src/utils.rs @@ -37,6 +37,6 @@ pub fn fence(val: T) -> T { // read_volatile makes a copy, but this is an unintentional side effect. // Since running the destructor/Drop twice is undesirable, the memory is // freed up here. - std::mem::forget(val); + core::mem::forget(val); copy }
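
A few notes on the techniques this change relies on, with small illustrative sketches (assumptions are called out; none of this is crate code). First, the repacking helpers that moved into simd_utils.rs: a scalar analogue of u256_to_u260_shl2_simd / u260_to_u256_simd makes the shift/mask pattern easier to audit, since the SIMD versions apply exactly the same operations lane-wise to two operands at once. The helper names below are stand-ins for this sketch.

```rust
// Scalar analogue of u256_to_u260_shl2_simd / u260_to_u256_simd.

const MASK52: u64 = (1 << 52) - 1;

/// Repack 4 x 64-bit limbs into 5 x 52-bit limbs, multiplying by 4 (<< 2) on
/// the way in; the 258-bit result always fits in 5 x 52 = 260 bits.
fn u256_to_u260_shl2(l: [u64; 4]) -> [u64; 5] {
    [
        (l[0] << 2) & MASK52,
        ((l[0] >> 50) | (l[1] << 14)) & MASK52,
        ((l[1] >> 38) | (l[2] << 26)) & MASK52,
        ((l[2] >> 26) | (l[3] << 38)) & MASK52,
        l[3] >> 14,
    ]
}

/// Repack 5 x 52-bit limbs back into 4 x 64-bit limbs, truncating to 256 bits.
/// Note that this does not undo the << 2; that factor is accounted for by the
/// surrounding Montgomery arithmetic.
fn u260_to_u256(l: [u64; 5]) -> [u64; 4] {
    [
        l[0] | (l[1] << 52),
        (l[1] >> 12) | (l[2] << 40),
        (l[2] >> 24) | (l[3] << 28),
        (l[3] >> 36) | (l[4] << 16),
    ]
}

fn main() {
    // With the top two bits of the input clear (as for 254-bit field elements),
    // truncating back to 256 bits is lossless and the round trip returns the
    // input shifted left by 2.
    let x = [
        0x0123_4567_89ab_cdef_u64,
        0xfedc_ba98_7654_3210,
        0x0f0f_0f0f_0f0f_0f0f,
        0x1fff_ffff_ffff_ffff,
    ];
    let back = u260_to_u256(u256_to_u260_shl2(x));
    let expect = [
        x[0] << 2,
        (x[1] << 2) | (x[0] >> 62),
        (x[2] << 2) | (x[1] >> 62),
        (x[3] << 2) | (x[2] >> 62),
    ];
    assert_eq!(back, expect);
}
```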
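
The magic numbers in make_initial are the biased exponent fields of the doubles that smult_noinit_simd adds during the FMA product split: 0x467 = 1023 + 104 and 0x433 = 1023 + 52, i.e. the exponents of 2^104 and 2^52. This assumes C1 and C2 in constants.rs are the usual 2^104-based magic constants for the double-precision full-product trick; they are not shown in this diff, but the two values in make_initial are consistent with that choice. smult_noinit_simd accumulates raw f64 bit patterns via to_bits(), so each accumulated term carries its exponent field in bits 52..63; make_initial seeds every column with the negated sum of those contributions so that only the 52-bit mantissa parts survive. The split itself relies on round-toward-zero, which is what the fp-rounding guard (made #![no_std] in this diff) provides. A small self-check of the bookkeeping, under the assumptions above:

```rust
// Explanatory check, not crate code: why make_initial uses 0x467 and 0x433.

/// Copy of make_initial from simd_utils.rs.
const fn make_initial(low_count: usize, high_count: usize) -> u64 {
    let val = high_count * 0x467 + low_count * 0x433;
    -((val as i64 & 0xfff) << 52) as u64
}

fn main() {
    // 2^104 and 2^52 are exactly representable as f64; their biased exponent
    // fields are the two constants in make_initial.
    let hi_bits = ((1u128 << 104) as f64).to_bits(); // exponent field 0x467
    let lo_bits = ((1u128 << 52) as f64).to_bits(); // exponent field 0x433
    assert_eq!(hi_bits >> 52, 0x467);
    assert_eq!(lo_bits >> 52, 0x433);

    // Accumulating `high_count` high-half patterns and `low_count` low-half
    // patterns on top of make_initial cancels the exponent fields exactly,
    // leaving bits 52..63 at zero.
    let (low_count, high_count) = (3usize, 2usize);
    let mut acc = make_initial(low_count, high_count);
    for _ in 0..high_count {
        acc = acc.wrapping_add(hi_bits);
    }
    for _ in 0..low_count {
        acc = acc.wrapping_add(lo_bits);
    }
    assert_eq!(acc >> 52, 0);
}
```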
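
reduce_ct_simd resolves the carries still parked in the lowest accumulator word and then conditionally subtracts 2p whenever bit 47 of the top limb is set, keeping the result partially reduced. A scalar transliteration, with U52_2P passed in as a parameter because the constant itself is not part of this diff:

```rust
const MASK52: u64 = (1 << 52) - 1;

/// Scalar analogue of reduce_ct_simd (hypothetical name). `red[0]` holds pending
/// carries aligned at bit 52; `red[1..6]` are the 5 x 52-bit limbs of the value.
/// `two_p` stands in for the crate's U52_2P constant.
fn reduce_ct_scalar(red: [u64; 6], two_p: [u64; 5]) -> [u64; 5] {
    // Carries from the lowest accumulator word still need to be applied.
    let mut borrow = (red[0] >> 52) as i64;
    let a = [red[1], red[2], red[3], red[4], red[5]];

    // Subtract 2p only when bit 47 of the top limb is set.
    let b = if (a[4] >> 47) & 1 == 1 { two_p } else { [0u64; 5] };

    let mut c = [0u64; 5];
    for i in 0..5 {
        let tmp = a[i] as i64 - b[i] as i64 + borrow;
        c[i] = (tmp as u64) & MASK52;
        borrow = tmp >> 52; // arithmetic shift: a negative result borrows from the next limb
    }
    c
}

fn main() {
    // Three pending carries ripple through two saturated low limbs; bit 47 of the
    // top limb is clear, so no subtraction of 2p happens (two_p is unused here).
    let red = [3u64 << 52, MASK52, MASK52, 0, 0, 0];
    assert_eq!(reduce_ct_scalar(red, [0; 5]), [2, 0, 1, 0, 0]);
}
```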
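
Finally, the gating pattern used throughout (lib.rs, pow.rs, and the target-specific dependency tables) is plain cfg dispatch: aarch64 builds get the NEON/SIMD modules, everything else falls back to the portable scalar path, and callers keep using a single name. A generic sketch of the pow.rs-style selection, with hypothetical module names:

```rust
// Sketch of compile-time dispatch: one name resolves to the NEON-backed
// implementation on aarch64 and to a portable fallback everywhere else.

#[cfg(target_arch = "aarch64")]
mod neon_impl {
    pub fn compress_many(blocks: &mut [u64]) {
        // The aarch64-only code path (NEON / portable_simd) would live here.
        for b in blocks.iter_mut() {
            *b = b.wrapping_mul(3).wrapping_add(1);
        }
    }
}

#[cfg(not(target_arch = "aarch64"))]
mod portable_impl {
    pub fn compress_many(blocks: &mut [u64]) {
        // Portable fallback with identical behaviour.
        for b in blocks.iter_mut() {
            *b = b.wrapping_mul(3).wrapping_add(1);
        }
    }
}

#[cfg(target_arch = "aarch64")]
use crate::neon_impl::compress_many;
#[cfg(not(target_arch = "aarch64"))]
use crate::portable_impl::compress_many;

fn main() {
    let mut blocks = [1u64, 2, 3, 4];
    compress_many(&mut blocks);
    println!("{blocks:?}");
}
```

The same idea drives the Cargo.toml changes: fp-rounding moves under [target.'cfg(not(target_arch = "wasm32"))'.dependencies] so wasm32 builds never pull in the rounding-mode guard, while scalar_mul/scalar_sqr stay exported unconditionally so non-ARM consumers keep a working API.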