diff --git a/provekit/common/Cargo.toml b/provekit/common/Cargo.toml index d39db74e..92faae9c 100644 --- a/provekit/common/Cargo.toml +++ b/provekit/common/Cargo.toml @@ -14,7 +14,6 @@ skyscraper.workspace = true # Noir language acir.workspace = true -noir_artifact_cli.workspace = true noirc_abi.workspace = true # Cryptography and proof systems diff --git a/skyscraper/block-multiplier/Cargo.toml b/skyscraper/block-multiplier/Cargo.toml index 469c97d6..ab66b0aa 100644 --- a/skyscraper/block-multiplier/Cargo.toml +++ b/skyscraper/block-multiplier/Cargo.toml @@ -8,10 +8,11 @@ license.workspace = true homepage.workspace = true repository.workspace = true -[dependencies] +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] # Workspace crates fp-rounding.workspace = true +[dependencies] # 3rd party seq-macro.workspace = true diff --git a/skyscraper/block-multiplier/src/block_simd.rs b/skyscraper/block-multiplier/src/block_simd.rs index fc2cb21d..e770f557 100644 --- a/skyscraper/block-multiplier/src/block_simd.rs +++ b/skyscraper/block-multiplier/src/block_simd.rs @@ -1,12 +1,12 @@ use { crate::{ constants::*, - subarray, - utils::{ - addv, addv_simd, carrying_mul_add, make_initial, reduce_ct, reduce_ct_simd, - smult_noinit_simd, transpose_simd_to_u256, transpose_u256_to_simd, - u256_to_u260_shl2_simd, u260_to_u256_simd, + simd_utils::{ + addv_simd, make_initial, reduce_ct_simd, smult_noinit_simd, transpose_simd_to_u256, + transpose_u256_to_simd, u256_to_u260_shl2_simd, u260_to_u256_simd, }, + subarray, + utils::{addv, carrying_mul_add, reduce_ct}, }, core::{ arch::aarch64::vcvtq_f64_u64, diff --git a/skyscraper/block-multiplier/src/lib.rs b/skyscraper/block-multiplier/src/lib.rs index 904616c2..fe54fa53 100644 --- a/skyscraper/block-multiplier/src/lib.rs +++ b/skyscraper/block-multiplier/src/lib.rs @@ -1,14 +1,27 @@ #![feature(portable_simd)] #![feature(bigint_helper_methods)] +//#![no_std] This crate can technically be no_std. However this requires +// replacing StdFloat.mul_add with intrinsics. +#[cfg(target_arch = "aarch64")] mod aarch64; + +// These can be made to work on x86, +// but for now it uses an ARM NEON intrinsic. 
+#[cfg(target_arch = "aarch64")] mod block_simd; -pub mod constants; +#[cfg(target_arch = "aarch64")] mod portable_simd; +#[cfg(target_arch = "aarch64")] +mod simd_utils; + +pub mod constants; mod scalar; mod test_utils; mod utils; +pub use crate::scalar::{scalar_mul, scalar_sqr}; +#[cfg(target_arch = "aarch64")] pub use crate::{ aarch64::{ montgomery_interleaved_3, montgomery_interleaved_4, montgomery_square_interleaved_3, @@ -17,5 +30,4 @@ pub use crate::{ }, block_simd::{block_mul, block_sqr}, portable_simd::{simd_mul, simd_sqr}, - scalar::{scalar_mul, scalar_sqr}, }; diff --git a/skyscraper/block-multiplier/src/portable_simd.rs b/skyscraper/block-multiplier/src/portable_simd.rs index 513eb982..39ca34f2 100644 --- a/skyscraper/block-multiplier/src/portable_simd.rs +++ b/skyscraper/block-multiplier/src/portable_simd.rs @@ -1,16 +1,17 @@ use { crate::{ constants::*, - utils::{ + simd_utils::{ addv_simd, make_initial, reduce_ct_simd, smult_noinit_simd, transpose_simd_to_u256, transpose_u256_to_simd, u256_to_u260_shl2_simd, u260_to_u256_simd, }, }, - std::{ + core::{ arch::aarch64::vcvtq_f64_u64, ops::BitAnd, - simd::{num::SimdFloat, Simd, StdFloat}, + simd::{num::SimdFloat, Simd}, }, + std::simd::StdFloat, }; #[inline] diff --git a/skyscraper/block-multiplier/src/simd_utils.rs b/skyscraper/block-multiplier/src/simd_utils.rs new file mode 100644 index 00000000..9ce3b4f6 --- /dev/null +++ b/skyscraper/block-multiplier/src/simd_utils.rs @@ -0,0 +1,138 @@ +use { + crate::constants::{C1, C2, MASK52, U52_2P}, + core::{ + arch::aarch64::vcvtq_f64_u64, + array, + ops::BitAnd, + simd::{ + cmp::SimdPartialEq, + num::{SimdFloat, SimdInt, SimdUint}, + Simd, + }, + }, + std::simd::StdFloat, +}; + +// -- [SIMD UTILS] +// --------------------------------------------------------------------------------- +#[inline(always)] +pub const fn make_initial(low_count: usize, high_count: usize) -> u64 { + let val = high_count * 0x467 + low_count * 0x433; + -((val as i64 & 0xfff) << 52) as u64 +} + +#[inline(always)] +pub fn transpose_u256_to_simd(limbs: [[u64; 4]; 2]) -> [Simd; 4] { + // This does not issue multiple ldp and zip which might be marginally faster. 
+ [ + Simd::from_array([limbs[0][0], limbs[1][0]]), + Simd::from_array([limbs[0][1], limbs[1][1]]), + Simd::from_array([limbs[0][2], limbs[1][2]]), + Simd::from_array([limbs[0][3], limbs[1][3]]), + ] +} + +#[inline(always)] +pub fn transpose_simd_to_u256(limbs: [Simd; 4]) -> [[u64; 4]; 2] { + let tmp0 = limbs[0].to_array(); + let tmp1 = limbs[1].to_array(); + let tmp2 = limbs[2].to_array(); + let tmp3 = limbs[3].to_array(); + [[tmp0[0], tmp1[0], tmp2[0], tmp3[0]], [ + tmp0[1], tmp1[1], tmp2[1], tmp3[1], + ]] +} + +#[inline(always)] +pub fn u256_to_u260_shl2_simd(limbs: [Simd; 4]) -> [Simd; 5] { + let [l0, l1, l2, l3] = limbs; + [ + (l0 << 2) & Simd::splat(MASK52), + ((l0 >> 50) | (l1 << 14)) & Simd::splat(MASK52), + ((l1 >> 38) | (l2 << 26)) & Simd::splat(MASK52), + ((l2 >> 26) | (l3 << 38)) & Simd::splat(MASK52), + l3 >> 14, + ] +} + +#[inline(always)] +pub fn u260_to_u256_simd(limbs: [Simd; 5]) -> [Simd; 4] { + let [l0, l1, l2, l3, l4] = limbs; + [ + l0 | (l1 << 52), + (l1 >> 12) | (l2 << 40), + (l2 >> 24) | (l3 << 28), + (l3 >> 36) | (l4 << 16), + ] +} + +#[inline(always)] +pub fn smult_noinit_simd(s: Simd, v: [u64; 5]) -> [Simd; 6] { + let mut t = [Simd::splat(0); 6]; + let s: Simd = unsafe { vcvtq_f64_u64(s.into()).into() }; + + let p_hi_0 = s.mul_add(Simd::splat(v[0] as f64), Simd::splat(C1)); + let p_lo_0 = s.mul_add(Simd::splat(v[0] as f64), Simd::splat(C2) - p_hi_0); + t[1] += p_hi_0.to_bits(); + t[0] += p_lo_0.to_bits(); + + let p_hi_1 = s.mul_add(Simd::splat(v[1] as f64), Simd::splat(C1)); + let p_lo_1 = s.mul_add(Simd::splat(v[1] as f64), Simd::splat(C2) - p_hi_1); + t[2] += p_hi_1.to_bits(); + t[1] += p_lo_1.to_bits(); + + let p_hi_2 = s.mul_add(Simd::splat(v[2] as f64), Simd::splat(C1)); + let p_lo_2 = s.mul_add(Simd::splat(v[2] as f64), Simd::splat(C2) - p_hi_2); + t[3] += p_hi_2.to_bits(); + t[2] += p_lo_2.to_bits(); + + let p_hi_3 = s.mul_add(Simd::splat(v[3] as f64), Simd::splat(C1)); + let p_lo_3 = s.mul_add(Simd::splat(v[3] as f64), Simd::splat(C2) - p_hi_3); + t[4] += p_hi_3.to_bits(); + t[3] += p_lo_3.to_bits(); + + let p_hi_4 = s.mul_add(Simd::splat(v[4] as f64), Simd::splat(C1)); + let p_lo_4 = s.mul_add(Simd::splat(v[4] as f64), Simd::splat(C2) - p_hi_4); + t[5] += p_hi_4.to_bits(); + t[4] += p_lo_4.to_bits(); + + t +} + +#[inline(always)] +/// Resolve the carry bits in the upper parts 12b and reduce the result to +/// within < 3p +pub fn reduce_ct_simd(red: [Simd; 6]) -> [Simd; 5] { + // The lowest limb contains carries that still need to be applied. 
+ let mut borrow: Simd = (red[0] >> 52).cast(); + let a = [red[1], red[2], red[3], red[4], red[5]]; + + // To reduce Check whether the most significant bit is set + let mask = (a[4] >> 47).bitand(Simd::splat(1)).simd_eq(Simd::splat(0)); + + // Select values based on the mask: if mask lane is true, use zeros, else use + // U52_2P + let zeros = [Simd::splat(0); 5]; + let twop = U52_2P.map(Simd::splat); + let b: [_; 5] = array::from_fn(|i| mask.select(zeros[i], twop[i])); + + let mut c = [Simd::splat(0); 5]; + for i in 0..c.len() { + let tmp: Simd = a[i].cast::() - b[i].cast() + borrow; + c[i] = tmp.cast().bitand(Simd::splat(MASK52)); + borrow = tmp >> 52 + } + + c +} + +#[inline(always)] +pub fn addv_simd( + mut va: [Simd; N], + vb: [Simd; N], +) -> [Simd; N] { + for i in 0..va.len() { + va[i] += vb[i]; + } + va +} diff --git a/skyscraper/block-multiplier/src/utils.rs b/skyscraper/block-multiplier/src/utils.rs index 774d54bf..b4e92777 100644 --- a/skyscraper/block-multiplier/src/utils.rs +++ b/skyscraper/block-multiplier/src/utils.rs @@ -1,16 +1,4 @@ -use { - crate::constants::{C1, C2, MASK52, U52_2P, U64_2P}, - std::{ - arch::aarch64::vcvtq_f64_u64, - array, - ops::BitAnd, - simd::{ - cmp::SimdPartialEq, - num::{SimdFloat, SimdInt, SimdUint}, - Simd, StdFloat, - }, - }, -}; +use crate::constants::U64_2P; /// Macro to extract a subarray from an array. /// @@ -60,119 +48,6 @@ pub fn addv(mut a: [u64; N], b: [u64; N]) -> [u64; N] { a } -// -- [SIMD UTILS] -// --------------------------------------------------------------------------------- -#[inline(always)] -pub const fn make_initial(low_count: usize, high_count: usize) -> u64 { - let val = high_count * 0x467 + low_count * 0x433; - -((val as i64 & 0xfff) << 52) as u64 -} - -#[inline(always)] -pub fn transpose_u256_to_simd(limbs: [[u64; 4]; 2]) -> [Simd; 4] { - // This does not issue multiple ldp and zip which might be marginally faster. 
- [ - Simd::from_array([limbs[0][0], limbs[1][0]]), - Simd::from_array([limbs[0][1], limbs[1][1]]), - Simd::from_array([limbs[0][2], limbs[1][2]]), - Simd::from_array([limbs[0][3], limbs[1][3]]), - ] -} - -#[inline(always)] -pub fn transpose_simd_to_u256(limbs: [Simd; 4]) -> [[u64; 4]; 2] { - let tmp0 = limbs[0].to_array(); - let tmp1 = limbs[1].to_array(); - let tmp2 = limbs[2].to_array(); - let tmp3 = limbs[3].to_array(); - [[tmp0[0], tmp1[0], tmp2[0], tmp3[0]], [ - tmp0[1], tmp1[1], tmp2[1], tmp3[1], - ]] -} - -#[inline(always)] -pub fn u256_to_u260_shl2_simd(limbs: [Simd; 4]) -> [Simd; 5] { - let [l0, l1, l2, l3] = limbs; - [ - (l0 << 2) & Simd::splat(MASK52), - ((l0 >> 50) | (l1 << 14)) & Simd::splat(MASK52), - ((l1 >> 38) | (l2 << 26)) & Simd::splat(MASK52), - ((l2 >> 26) | (l3 << 38)) & Simd::splat(MASK52), - l3 >> 14, - ] -} - -#[inline(always)] -pub fn u260_to_u256_simd(limbs: [Simd; 5]) -> [Simd; 4] { - let [l0, l1, l2, l3, l4] = limbs; - [ - l0 | (l1 << 52), - (l1 >> 12) | (l2 << 40), - (l2 >> 24) | (l3 << 28), - (l3 >> 36) | (l4 << 16), - ] -} - -#[inline(always)] -pub fn smult_noinit_simd(s: Simd, v: [u64; 5]) -> [Simd; 6] { - let mut t = [Simd::splat(0); 6]; - let s: Simd = unsafe { vcvtq_f64_u64(s.into()).into() }; - - let p_hi_0 = s.mul_add(Simd::splat(v[0] as f64), Simd::splat(C1)); - let p_lo_0 = s.mul_add(Simd::splat(v[0] as f64), Simd::splat(C2) - p_hi_0); - t[1] += p_hi_0.to_bits(); - t[0] += p_lo_0.to_bits(); - - let p_hi_1 = s.mul_add(Simd::splat(v[1] as f64), Simd::splat(C1)); - let p_lo_1 = s.mul_add(Simd::splat(v[1] as f64), Simd::splat(C2) - p_hi_1); - t[2] += p_hi_1.to_bits(); - t[1] += p_lo_1.to_bits(); - - let p_hi_2 = s.mul_add(Simd::splat(v[2] as f64), Simd::splat(C1)); - let p_lo_2 = s.mul_add(Simd::splat(v[2] as f64), Simd::splat(C2) - p_hi_2); - t[3] += p_hi_2.to_bits(); - t[2] += p_lo_2.to_bits(); - - let p_hi_3 = s.mul_add(Simd::splat(v[3] as f64), Simd::splat(C1)); - let p_lo_3 = s.mul_add(Simd::splat(v[3] as f64), Simd::splat(C2) - p_hi_3); - t[4] += p_hi_3.to_bits(); - t[3] += p_lo_3.to_bits(); - - let p_hi_4 = s.mul_add(Simd::splat(v[4] as f64), Simd::splat(C1)); - let p_lo_4 = s.mul_add(Simd::splat(v[4] as f64), Simd::splat(C2) - p_hi_4); - t[5] += p_hi_4.to_bits(); - t[4] += p_lo_4.to_bits(); - - t -} - -#[inline(always)] -/// Resolve the carry bits in the upper parts 12b and reduce the result to -/// within < 3p -pub fn reduce_ct_simd(red: [Simd; 6]) -> [Simd; 5] { - // The lowest limb contains carries that still need to be applied. 
- let mut borrow: Simd = (red[0] >> 52).cast(); - let a = [red[1], red[2], red[3], red[4], red[5]]; - - // To reduce Check whether the most significant bit is set - let mask = (a[4] >> 47).bitand(Simd::splat(1)).simd_eq(Simd::splat(0)); - - // Select values based on the mask: if mask lane is true, use zeros, else use - // U52_2P - let zeros = [Simd::splat(0); 5]; - let twop = U52_2P.map(Simd::splat); - let b: [_; 5] = array::from_fn(|i| mask.select(zeros[i], twop[i])); - - let mut c = [Simd::splat(0); 5]; - for i in 0..c.len() { - let tmp: Simd = a[i].cast::() - b[i].cast() + borrow; - c[i] = tmp.cast().bitand(Simd::splat(MASK52)); - borrow = tmp >> 52 - } - - c -} - #[inline(always)] pub fn reduce_ct(a: [u64; 4]) -> [u64; 4] { let b = [[0_u64; 4], U64_2P]; @@ -192,17 +67,6 @@ pub fn sub(a: [u64; N], b: [u64; N]) -> [u64; N] { c } -#[inline(always)] -pub fn addv_simd( - mut va: [Simd; N], - vb: [Simd; N], -) -> [Simd; N] { - for i in 0..va.len() { - va[i] += vb[i]; - } - va -} - #[inline(always)] pub fn carrying_mul_add(a: u64, b: u64, add: u64, carry: u64) -> (u64, u64) { let c: u128 = a as u128 * b as u128 + carry as u128 + add as u128; diff --git a/skyscraper/core/Cargo.toml b/skyscraper/core/Cargo.toml index 25df09d3..aa14dee4 100644 --- a/skyscraper/core/Cargo.toml +++ b/skyscraper/core/Cargo.toml @@ -11,22 +11,23 @@ repository.workspace = true [dependencies] # Workspace crates block-multiplier.workspace = true -fp-rounding.workspace = true # Cryptography and proof systems ark-bn254.workspace = true ark-ff.workspace = true # 3rd party -proptest.workspace = true rayon.workspace = true seq-macro.workspace = true zerocopy.workspace = true +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] +fp-rounding.workspace = true + [dev-dependencies] divan.workspace = true -primitive-types.workspace = true rand.workspace = true +proptest.workspace = true [lints] workspace = true diff --git a/skyscraper/core/src/lib.rs b/skyscraper/core/src/lib.rs index 939feb73..912fd7a1 100644 --- a/skyscraper/core/src/lib.rs +++ b/skyscraper/core/src/lib.rs @@ -4,8 +4,6 @@ pub mod arithmetic; pub mod bar; -pub mod block3; -pub mod block4; pub mod constants; pub mod generic; pub mod pow; @@ -14,6 +12,11 @@ pub mod reference; pub mod simple; pub mod v1; +#[cfg(target_arch = "aarch64")] +pub mod block3; +#[cfg(target_arch = "aarch64")] +pub mod block4; + /// The least common multiple of the implementation widths. 
/// /// Doing this many compressions in parallel will make optimal use of resources diff --git a/skyscraper/core/src/pow.rs b/skyscraper/core/src/pow.rs index 1a1181f0..e2526b64 100644 --- a/skyscraper/core/src/pow.rs +++ b/skyscraper/core/src/pow.rs @@ -1,5 +1,9 @@ +#[cfg(target_arch = "aarch64")] +use crate::block4::compress_many; +#[cfg(not(target_arch = "aarch64"))] +use crate::simple::compress_many; use { - crate::{arithmetic::less_than, block4::compress_many, generic, simple::compress, WIDTH_LCM}, + crate::{arithmetic::less_than, generic, simple::compress, WIDTH_LCM}, ark_ff::Zero, }; @@ -35,6 +39,7 @@ pub fn solve(challenge: [u64; 4], difficulty: f64) -> u64 { return 0; } let threshold = threshold(difficulty + PROVER_BIAS); + let nonce = generic::solve::<_, { WIDTH_LCM * 10 }>(compress_many, challenge, threshold); debug_assert!(verify(challenge, difficulty, nonce)); nonce diff --git a/skyscraper/fp-rounding/src/lib.rs b/skyscraper/fp-rounding/src/lib.rs index 351eaf94..a9089aec 100644 --- a/skyscraper/fp-rounding/src/lib.rs +++ b/skyscraper/fp-rounding/src/lib.rs @@ -1,4 +1,5 @@ #![allow(unsafe_code)] +#![no_std] //! Round Toward Zero (RTZ) floating-point rounding mode control //! //! Rust/LLVM does not support different float point mode rounding modes and diff --git a/skyscraper/fp-rounding/src/utils.rs b/skyscraper/fp-rounding/src/utils.rs index 69456426..ba482cd2 100644 --- a/skyscraper/fp-rounding/src/utils.rs +++ b/skyscraper/fp-rounding/src/utils.rs @@ -37,6 +37,6 @@ pub fn fence(val: T) -> T { // read_volatile makes a copy, but this is an unintentional side effect. // Since running the destructor/Drop twice is undesirable, the memory is // freed up here. - std::mem::forget(val); + core::mem::forget(val); copy }
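
A few notes on the techniques this change relies on, with small illustrative sketches (assumptions are called out; none of this is crate code). First, the repacking helpers that moved into simd_utils.rs: a scalar analogue of u256_to_u260_shl2_simd / u260_to_u256_simd makes the shift/mask pattern easier to audit, since the SIMD versions apply exactly the same operations lane-wise to two operands at once. The helper names below are stand-ins for this sketch.

```rust
// Scalar analogue of u256_to_u260_shl2_simd / u260_to_u256_simd.

const MASK52: u64 = (1 << 52) - 1;

/// Repack 4 x 64-bit limbs into 5 x 52-bit limbs, multiplying by 4 (<< 2) on
/// the way in; the 258-bit result always fits in 5 x 52 = 260 bits.
fn u256_to_u260_shl2(l: [u64; 4]) -> [u64; 5] {
    [
        (l[0] << 2) & MASK52,
        ((l[0] >> 50) | (l[1] << 14)) & MASK52,
        ((l[1] >> 38) | (l[2] << 26)) & MASK52,
        ((l[2] >> 26) | (l[3] << 38)) & MASK52,
        l[3] >> 14,
    ]
}

/// Repack 5 x 52-bit limbs back into 4 x 64-bit limbs, truncating to 256 bits.
/// Note that this does not undo the << 2; that factor is accounted for by the
/// surrounding Montgomery arithmetic.
fn u260_to_u256(l: [u64; 5]) -> [u64; 4] {
    [
        l[0] | (l[1] << 52),
        (l[1] >> 12) | (l[2] << 40),
        (l[2] >> 24) | (l[3] << 28),
        (l[3] >> 36) | (l[4] << 16),
    ]
}

fn main() {
    // With the top two bits of the input clear (as for 254-bit field elements),
    // truncating back to 256 bits is lossless and the round trip returns the
    // input shifted left by 2.
    let x = [
        0x0123_4567_89ab_cdef_u64,
        0xfedc_ba98_7654_3210,
        0x0f0f_0f0f_0f0f_0f0f,
        0x1fff_ffff_ffff_ffff,
    ];
    let back = u260_to_u256(u256_to_u260_shl2(x));
    let expect = [
        x[0] << 2,
        (x[1] << 2) | (x[0] >> 62),
        (x[2] << 2) | (x[1] >> 62),
        (x[3] << 2) | (x[2] >> 62),
    ];
    assert_eq!(back, expect);
}
```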
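
The magic numbers in make_initial are the biased exponent fields of the doubles that smult_noinit_simd adds during the FMA product split: 0x467 = 1023 + 104 and 0x433 = 1023 + 52, i.e. the exponents of 2^104 and 2^52. This assumes C1 and C2 in constants.rs are the usual 2^104-based magic constants for the double-precision full-product trick; they are not shown in this diff, but the two values in make_initial are consistent with that choice. smult_noinit_simd accumulates raw f64 bit patterns via to_bits(), so each accumulated term carries its exponent field in bits 52..63; make_initial seeds every column with the negated sum of those contributions so that only the 52-bit mantissa parts survive. The split itself relies on round-toward-zero, which is what the fp-rounding guard (made #![no_std] in this diff) provides. A small self-check of the bookkeeping, under the assumptions above:

```rust
// Explanatory check, not crate code: why make_initial uses 0x467 and 0x433.

/// Copy of make_initial from simd_utils.rs.
const fn make_initial(low_count: usize, high_count: usize) -> u64 {
    let val = high_count * 0x467 + low_count * 0x433;
    -((val as i64 & 0xfff) << 52) as u64
}

fn main() {
    // 2^104 and 2^52 are exactly representable as f64; their biased exponent
    // fields are the two constants in make_initial.
    let hi_bits = ((1u128 << 104) as f64).to_bits(); // exponent field 0x467
    let lo_bits = ((1u128 << 52) as f64).to_bits(); // exponent field 0x433
    assert_eq!(hi_bits >> 52, 0x467);
    assert_eq!(lo_bits >> 52, 0x433);

    // Accumulating `high_count` high-half patterns and `low_count` low-half
    // patterns on top of make_initial cancels the exponent fields exactly,
    // leaving bits 52..63 at zero.
    let (low_count, high_count) = (3usize, 2usize);
    let mut acc = make_initial(low_count, high_count);
    for _ in 0..high_count {
        acc = acc.wrapping_add(hi_bits);
    }
    for _ in 0..low_count {
        acc = acc.wrapping_add(lo_bits);
    }
    assert_eq!(acc >> 52, 0);
}
```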
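
reduce_ct_simd resolves the carries still parked in the lowest accumulator word and then conditionally subtracts 2p whenever bit 47 of the top limb is set, keeping the result partially reduced. A scalar transliteration, with U52_2P passed in as a parameter because the constant itself is not part of this diff:

```rust
const MASK52: u64 = (1 << 52) - 1;

/// Scalar analogue of reduce_ct_simd (hypothetical name). `red[0]` holds pending
/// carries aligned at bit 52; `red[1..6]` are the 5 x 52-bit limbs of the value.
/// `two_p` stands in for the crate's U52_2P constant.
fn reduce_ct_scalar(red: [u64; 6], two_p: [u64; 5]) -> [u64; 5] {
    // Carries from the lowest accumulator word still need to be applied.
    let mut borrow = (red[0] >> 52) as i64;
    let a = [red[1], red[2], red[3], red[4], red[5]];

    // Subtract 2p only when bit 47 of the top limb is set.
    let b = if (a[4] >> 47) & 1 == 1 { two_p } else { [0u64; 5] };

    let mut c = [0u64; 5];
    for i in 0..5 {
        let tmp = a[i] as i64 - b[i] as i64 + borrow;
        c[i] = (tmp as u64) & MASK52;
        borrow = tmp >> 52; // arithmetic shift: a negative result borrows from the next limb
    }
    c
}

fn main() {
    // Three pending carries ripple through two saturated low limbs; bit 47 of the
    // top limb is clear, so no subtraction of 2p happens (two_p is unused here).
    let red = [3u64 << 52, MASK52, MASK52, 0, 0, 0];
    assert_eq!(reduce_ct_scalar(red, [0; 5]), [2, 0, 1, 0, 0]);
}
```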
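
Finally, the gating pattern used throughout (lib.rs, pow.rs, and the target-specific dependency tables) is plain cfg dispatch: aarch64 builds get the NEON/SIMD modules, everything else falls back to the portable scalar path, and callers keep using a single name. A generic sketch of the pow.rs-style selection, with hypothetical module names:

```rust
// Sketch of compile-time dispatch: one name resolves to the NEON-backed
// implementation on aarch64 and to a portable fallback everywhere else.

#[cfg(target_arch = "aarch64")]
mod neon_impl {
    pub fn compress_many(blocks: &mut [u64]) {
        // The aarch64-only code path (NEON / portable_simd) would live here.
        for b in blocks.iter_mut() {
            *b = b.wrapping_mul(3).wrapping_add(1);
        }
    }
}

#[cfg(not(target_arch = "aarch64"))]
mod portable_impl {
    pub fn compress_many(blocks: &mut [u64]) {
        // Portable fallback with identical behaviour.
        for b in blocks.iter_mut() {
            *b = b.wrapping_mul(3).wrapping_add(1);
        }
    }
}

#[cfg(target_arch = "aarch64")]
use crate::neon_impl::compress_many;
#[cfg(not(target_arch = "aarch64"))]
use crate::portable_impl::compress_many;

fn main() {
    let mut blocks = [1u64, 2, 3, 4];
    compress_many(&mut blocks);
    println!("{blocks:?}");
}
```

The same idea drives the Cargo.toml changes: fp-rounding moves under [target.'cfg(not(target_arch = "wasm32"))'.dependencies] so wasm32 builds never pull in the rounding-mode guard, while scalar_mul/scalar_sqr stay exported unconditionally so non-ARM consumers keep a working API.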