From ba510ab85c6024ac7e9507bff482076f8e1df9ba Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 12:10:51 +0100
Subject: [PATCH 01/15] avoid heap allocations in `poseidon_sponge`

In the 2^32 benchmark during key generation this avoids about 500k
temporary heap allocations when running for about 30s.

These allocations likely have only a small performance cost, but we
can avoid them without making the code much more complicated.
---
 src/symmetric/tweak_hash/poseidon.rs | 45 ++++++++++++++++++----------
 1 file changed, 29 insertions(+), 16 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 7ab2d7b..775e1e8 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -201,34 +201,47 @@ where
     );
     let rate = WIDTH - capacity_value.len();
 
-    let extra_elements = (rate - (input.len() % rate)) % rate;
-    let mut input_vector = input.to_vec();
-    // We pad the input with zeros to make its length a multiple of the rate.
-    //
-    // This is safe because the input's original length is effectively encoded
-    // in the `capacity_value`, which serves as a domain separator.
-    input_vector.resize(input.len() + extra_elements, A::ZERO);
-
     // initialize
     let mut state = [A::ZERO; WIDTH];
     state[rate..].copy_from_slice(capacity_value);
 
-    // absorb
-    for chunk in input_vector.chunks(rate) {
+    let extra_elements = (rate - (input.len() % rate)) % rate;
+    // Instead of converting the input to a vector, resizing and feeding the data into the
+    // sponge, we instead fill in the vector from all chunks until we are left with a non
+    // full chunk. We only add to the state, so padded data does not mutate `state` at all.
+
+    // 1. fill in all full chunks and permute
+    let mut it = input.chunks_exact(rate);
+    for chunk in &mut it {
+        //input.chunks_exact(rate) {
+        // iterate the chunks
         for i in 0..chunk.len() {
             state[i] += chunk[i];
         }
         perm.permute_mut(&mut state);
     }
+    // 2. absorb the trailing partial chunk, if there is one
+    // (`extra_elements == 0` means `input.len()` is a multiple of `rate`: nothing is left)
+    if extra_elements > 0 {
+        for (i, x) in it.remainder().iter().enumerate() {
+            state[i] += *x;
+        }
+        // Zero padding is implicit: absorbing only *adds* to `state`, so just permute.
+        perm.permute_mut(&mut state);
+    }
 
     // squeeze
-    let mut out = vec![];
-    while out.len() < OUT_LEN {
-        out.extend_from_slice(&state[..rate]);
-        perm.permute_mut(&mut state);
+    let mut out = [A::ZERO; OUT_LEN];
+    let mut out_idx = 0;
+    while out_idx < OUT_LEN {
+        let chunk_size = (OUT_LEN - out_idx).min(rate);
+        out[out_idx..out_idx + chunk_size].copy_from_slice(&state[..chunk_size]);
+        out_idx += chunk_size;
+        if out_idx < OUT_LEN {
+            perm.permute_mut(&mut state);
+        }
     }
-    let slice = &out[0..OUT_LEN];
-    slice.try_into().expect("Length mismatch")
+    out
 }
 
 /// A tweakable hash function implemented using Poseidon2

From e3e5ceb234da7063803e380ba32d3b521efa54d3 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 12:12:02 +0100
Subject: [PATCH 02/15] avoid heap allocations in `compute_tree_leaves` by
 using `for_each_init`

Each rayon worker job had to allocate the full `packed_leaf_input`. We
now use `for_each_init` to preallocate a vector for every Rayon worker
instead. We overwrite the entire vector in every job, so there is not
even a need to `fill(0)` it in each job.

This drops another ~100k allocations when running the 2^32 bench over
30s.

Brings us down to only 3k temporary allocations total in that time frame.
---
 src/symmetric/tweak_hash/poseidon.rs | 235 ++++++++++++++-------------
 1 file changed, 123 insertions(+), 112 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 775e1e8..50005a1 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -505,6 +505,11 @@ impl<
         let capacity_val: [PackedF; CAPACITY] =
             poseidon_safe_domain_separator::<CAPACITY>(&sponge_perm, &lengths).map(PackedF::from);
 
+        // Compute sponge input length. Required to init packed input vector for each rayon worker
+        let sponge_tweak_offset = PARAMETER_LEN;
+        let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN;
+        let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN;
+
         // PARALLEL SIMD PROCESSING
         //
         // Process epochs in batches of size `width`.
@@ -513,126 +518,132 @@ impl<
         epochs
             .par_chunks_exact(width)
             .zip(leaves.par_chunks_exact_mut(width))
-            .for_each(|(epoch_chunk, leaves_chunk)| {
-                // STEP 1: GENERATE AND PACK CHAIN STARTING POINTS
-                //
-                // For each chain, generate starting points for all epochs in the chunk.
-                // Use vertical packing: transpose from [lane][element] to [element][lane].
-                //
-                // This layout enables efficient SIMD operations across epochs.
-
-                let mut packed_chains: [[PackedF; HASH_LEN]; NUM_CHUNKS] =
-                    array::from_fn(|c_idx| {
-                        // Generate starting points for this chain across all epochs.
-                        let starts: [_; PackedF::WIDTH] = array::from_fn(|lane| {
-                            PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64).into()
+            .for_each_init(
+                || vec![PackedF::ZERO; sponge_input_len],
+                |packed_leaf_input, (epoch_chunk, leaves_chunk)| {
+                    // STEP 1: GENERATE AND PACK CHAIN STARTING POINTS
+                    //
+                    // For each chain, generate starting points for all epochs in the chunk.
+                    // Use vertical packing: transpose from [lane][element] to [element][lane].
+                    //
+                    // This layout enables efficient SIMD operations across epochs.
+
+                    let mut packed_chains: [[PackedF; HASH_LEN]; NUM_CHUNKS] =
+                        array::from_fn(|c_idx| {
+                            // Generate starting points for this chain across all epochs.
+                            let starts: [_; PackedF::WIDTH] = array::from_fn(|lane| {
+                                PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64)
+                                    .into()
+                            });
+
+                            // Transpose to vertical packing for SIMD efficiency.
+                            pack_array(&starts)
                         });
 
-                        // Transpose to vertical packing for SIMD efficiency.
-                        pack_array(&starts)
-                    });
-
-                // STEP 2: WALK CHAINS IN PARALLEL USING SIMD
-                //
-                // For each chain, walk all epochs simultaneously using SIMD.
-                // The chains start at their initial values and are walked step-by-step
-                // until they reach their endpoints.
-                //
-                // Cache strategy: process one chain at a time to maximize locality.
-                // All epochs for that chain stay in registers across iterations.
-
-                // Offsets for chain compression: [parameter | tweak | current_value]
-                let chain_tweak_offset = PARAMETER_LEN;
-                let chain_value_offset = PARAMETER_LEN + TWEAK_LEN;
-
-                for (chain_index, packed_chain) in
-                    packed_chains.iter_mut().enumerate().take(num_chains)
-                {
-                    // Walk this chain for `chain_length - 1` steps.
-                    // The starting point is step 0, so we need `chain_length - 1` iterations.
-                    for step in 0..chain_length - 1 {
-                        // Current position in the chain.
-                        let pos = (step + 1) as u8;
-
-                        // Assemble the packed input for the hash function.
-                        // Layout: [parameter | tweak | current_value]
-                        let mut packed_input = [PackedF::ZERO; CHAIN_COMPRESSION_WIDTH];
-
-                        // Copy pre-packed parameter
-                        packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
-
-                        // Pack tweaks directly into destination
-                        pack_fn_into::<TWEAK_LEN>(
-                            &mut packed_input,
-                            chain_tweak_offset,
-                            |t_idx, lane| {
-                                Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos)
-                                    .to_field_elements::<TWEAK_LEN>()[t_idx]
-                            },
-                        );
-
-                        // Copy current chain value (already packed)
-                        packed_input[chain_value_offset..chain_value_offset + HASH_LEN]
-                            .copy_from_slice(packed_chain);
-
-                        // Apply the hash function to advance the chain.
-                        // This single call processes all epochs in parallel.
-                        *packed_chain =
-                            poseidon_compress::<PackedF, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
-                                &chain_perm,
-                                &packed_input,
+                    // STEP 2: WALK CHAINS IN PARALLEL USING SIMD
+                    //
+                    // For each chain, walk all epochs simultaneously using SIMD.
+                    // The chains start at their initial values and are walked step-by-step
+                    // until they reach their endpoints.
+                    //
+                    // Cache strategy: process one chain at a time to maximize locality.
+                    // All epochs for that chain stay in registers across iterations.
+
+                    // Offsets for chain compression: [parameter | tweak | current_value]
+                    let chain_tweak_offset = PARAMETER_LEN;
+                    let chain_value_offset = PARAMETER_LEN + TWEAK_LEN;
+
+                    for (chain_index, packed_chain) in
+                        packed_chains.iter_mut().enumerate().take(num_chains)
+                    {
+                        // Walk this chain for `chain_length - 1` steps.
+                        // The starting point is step 0, so we need `chain_length - 1` iterations.
+                        for step in 0..chain_length - 1 {
+                            // Current position in the chain.
+                            let pos = (step + 1) as u8;
+
+                            // Assemble the packed input for the hash function.
+                            // Layout: [parameter | tweak | current_value]
+                            let mut packed_input = [PackedF::ZERO; CHAIN_COMPRESSION_WIDTH];
+
+                            // Copy pre-packed parameter
+                            packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
+
+                            // Pack tweaks directly into destination
+                            pack_fn_into::<TWEAK_LEN>(
+                                &mut packed_input,
+                                chain_tweak_offset,
+                                |t_idx, lane| {
+                                    Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos)
+                                        .to_field_elements::<TWEAK_LEN>()[t_idx]
+                                },
                             );
-                    }
-                }
-
-                // STEP 3: HASH CHAIN ENDS TO PRODUCE TREE LEAVES
-                //
-                // All chains have been walked to their endpoints.
-                // Now hash all chain ends together to form the tree leaf.
-                //
-                // This uses the sponge construction for variable-length input.
 
-                // Assemble the sponge input.
-                // Layout: [parameter | tree_tweak | all_chain_ends]
-                let sponge_tweak_offset = PARAMETER_LEN;
-                let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN;
-                let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN;
+                            // Copy current chain value (already packed)
+                            packed_input[chain_value_offset..chain_value_offset + HASH_LEN]
+                                .copy_from_slice(packed_chain);
+
+                            // Apply the hash function to advance the chain.
+                            // This single call processes all epochs in parallel.
+                            *packed_chain =
+                                poseidon_compress::<PackedF, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
+                                    &chain_perm,
+                                    &packed_input,
+                                );
+                        }
+                    }
 
-                let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len];
+                    // STEP 3: HASH CHAIN ENDS TO PRODUCE TREE LEAVES
+                    //
+                    // All chains have been walked to their endpoints.
+                    // Now hash all chain ends together to form the tree leaf.
+                    //
+                    // This uses the sponge construction for variable-length input.
+
+                    // Assemble the sponge input.
+                    // Layout: [parameter | tree_tweak | all_chain_ends]
+                    // NOTE: `packed_leaf_input` is preallocated per worker. We overwrite the entire
+                    // vector in each iteration, so no need to `fill(0)`!
+                    //let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len];
+
+                    // Copy pre-packed parameter
+                    packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
+
+                    // Pack tree tweaks directly (level 0 for bottom-layer leaves)
+                    pack_fn_into::<TWEAK_LEN>(
+                        packed_leaf_input,
+                        sponge_tweak_offset,
+                        |t_idx, lane| {
+                            Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::<TWEAK_LEN>()
+                                [t_idx]
+                        },
+                    );
 
-                // Copy pre-packed parameter
-                packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
-
-                // Pack tree tweaks directly (level 0 for bottom-layer leaves)
-                pack_fn_into::<TWEAK_LEN>(
-                    &mut packed_leaf_input,
-                    sponge_tweak_offset,
-                    |t_idx, lane| {
-                        Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::<TWEAK_LEN>()
-                            [t_idx]
-                    },
-                );
+                    // Copy all chain ends (already packed)
+                    let dst = &mut packed_leaf_input[sponge_chains_offset
+                        ..sponge_chains_offset + packed_chains.len() * HASH_LEN];
+                    for (dst_chunk, src_chain) in
+                        dst.chunks_exact_mut(HASH_LEN).zip(packed_chains.iter())
+                    {
+                        dst_chunk.copy_from_slice(src_chain);
+                    }
 
-                // Copy all chain ends (already packed)
-                let dst = &mut packed_leaf_input[sponge_chains_offset .. sponge_chains_offset + packed_chains.len() * HASH_LEN];
-                for (dst_chunk, src_chain) in dst.chunks_exact_mut(HASH_LEN).zip(packed_chains.iter()) {
-                    dst_chunk.copy_from_slice(src_chain);
-                }
-
-                // Apply the sponge hash to produce the leaf.
-                // This absorbs all chain ends and squeezes out the final hash.
-                let packed_leaves = poseidon_sponge::<PackedF, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
-                    &sponge_perm,
-                    &capacity_val,
-                    &packed_leaf_input,
-                );
+                    // Apply the sponge hash to produce the leaf.
+                    // This absorbs all chain ends and squeezes out the final hash.
+                    let packed_leaves =
+                        poseidon_sponge::<PackedF, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
+                            &sponge_perm,
+                            &capacity_val,
+                            &packed_leaf_input,
+                        );
 
-                // STEP 4: UNPACK RESULTS TO SCALAR REPRESENTATION
-                //
-                // Convert from vertical packing back to scalar layout.
-                // Each lane becomes one leaf in the output slice.
-                unpack_array(&packed_leaves, leaves_chunk);
-            });
+                    // STEP 4: UNPACK RESULTS TO SCALAR REPRESENTATION
+                    //
+                    // Convert from vertical packing back to scalar layout.
+                    // Each lane becomes one leaf in the output slice.
+                    unpack_array(&packed_leaves, leaves_chunk);
+                },
+            );
 
         // HANDLE REMAINDER EPOCHS
         //

From ed2f132e26d73fd280a20693d83dd86e549b3239 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 12:52:47 +0100
Subject: [PATCH 03/15] alternative implementation using thread local storage

This way we essentially avoid all allocations, i.e. we get a single
allocation per thread.

`for_each_init` is known to allocate multiple times due to the rayon
work stealing / splitting approach. See:

https://github.com/rayon-rs/rayon/issues/742
---
 Cargo.toml                           |   2 +
 src/symmetric/tweak_hash/poseidon.rs | 164 ++++++++++++++-------------
 2 files changed, 88 insertions(+), 78 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 5962f18..ae304e8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -46,6 +46,8 @@ p3-baby-bear = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312"
 p3-koala-bear = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312" }
 p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312" }
 
+thread_local = "1.1.9"
+
 [dev-dependencies]
 criterion = "0.7"
 proptest = "1.7"
diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 50005a1..4d36f31 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -17,6 +17,8 @@ use crate::{F, PackedF};
 use super::TweakableHash;
 
 use p3_koala_bear::Poseidon2KoalaBear;
+use std::cell::RefCell;
+use thread_local::ThreadLocal;
 
 const DOMAIN_PARAMETERS_LENGTH: usize = 4;
 /// The state width for compressing a single hash in a chain.
@@ -510,6 +512,8 @@ impl<
         let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN;
         let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN;
 
+        let tls: ThreadLocal<RefCell<Vec<PackedF>>> = ThreadLocal::new();
+
         // PARALLEL SIMD PROCESSING
         //
         // Process epochs in batches of size `width`.
@@ -518,42 +522,46 @@ impl<
         epochs
             .par_chunks_exact(width)
             .zip(leaves.par_chunks_exact_mut(width))
-            .for_each_init(
-                || vec![PackedF::ZERO; sponge_input_len],
-                |packed_leaf_input, (epoch_chunk, leaves_chunk)| {
-                    // STEP 1: GENERATE AND PACK CHAIN STARTING POINTS
-                    //
-                    // For each chain, generate starting points for all epochs in the chunk.
-                    // Use vertical packing: transpose from [lane][element] to [element][lane].
-                    //
-                    // This layout enables efficient SIMD operations across epochs.
-
-                    let mut packed_chains: [[PackedF; HASH_LEN]; NUM_CHUNKS] =
-                        array::from_fn(|c_idx| {
-                            // Generate starting points for this chain across all epochs.
-                            let starts: [_; PackedF::WIDTH] = array::from_fn(|lane| {
-                                PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64)
-                                    .into()
-                            });
-
-                            // Transpose to vertical packing for SIMD efficiency.
-                            pack_array(&starts)
+            .for_each(|(epoch_chunk, leaves_chunk)| {
+                // STEP 1: GENERATE AND PACK CHAIN STARTING POINTS
+                //
+                // For each chain, generate starting points for all epochs in the chunk.
+                // Use vertical packing: transpose from [lane][element] to [element][lane].
+                //
+                // This layout enables efficient SIMD operations across epochs.
+
+                let cell = tls.get_or(|| {
+                    RefCell::new(vec![PackedF::ZERO; sponge_input_len])
+                });
+                let mut packed_leaf_input = cell.borrow_mut();
+                // reset not needed
+
+                let mut packed_chains: [[PackedF; HASH_LEN]; NUM_CHUNKS] =
+                    array::from_fn(|c_idx| {
+                        // Generate starting points for this chain across all epochs.
+                        let starts: [_; PackedF::WIDTH] = array::from_fn(|lane| {
+                            PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64)
+                                .into()
                         });
 
-                    // STEP 2: WALK CHAINS IN PARALLEL USING SIMD
-                    //
-                    // For each chain, walk all epochs simultaneously using SIMD.
-                    // The chains start at their initial values and are walked step-by-step
-                    // until they reach their endpoints.
-                    //
-                    // Cache strategy: process one chain at a time to maximize locality.
-                    // All epochs for that chain stay in registers across iterations.
+                        // Transpose to vertical packing for SIMD efficiency.
+                        pack_array(&starts)
+                    });
 
-                    // Offsets for chain compression: [parameter | tweak | current_value]
-                    let chain_tweak_offset = PARAMETER_LEN;
-                    let chain_value_offset = PARAMETER_LEN + TWEAK_LEN;
+                // STEP 2: WALK CHAINS IN PARALLEL USING SIMD
+                //
+                // For each chain, walk all epochs simultaneously using SIMD.
+                // The chains start at their initial values and are walked step-by-step
+                // until they reach their endpoints.
+                //
+                // Cache strategy: process one chain at a time to maximize locality.
+                // All epochs for that chain stay in registers across iterations.
 
-                    for (chain_index, packed_chain) in
+                // Offsets for chain compression: [parameter | tweak | current_value]
+                let chain_tweak_offset = PARAMETER_LEN;
+                let chain_value_offset = PARAMETER_LEN + TWEAK_LEN;
+
+                for (chain_index, packed_chain) in
                         packed_chains.iter_mut().enumerate().take(num_chains)
                     {
                         // Walk this chain for `chain_length - 1` steps.
@@ -593,56 +601,56 @@ impl<
                         }
                     }
 
-                    // STEP 3: HASH CHAIN ENDS TO PRODUCE TREE LEAVES
-                    //
-                    // All chains have been walked to their endpoints.
-                    // Now hash all chain ends together to form the tree leaf.
-                    //
-                    // This uses the sponge construction for variable-length input.
-
-                    // Assemble the sponge input.
-                    // Layout: [parameter | tree_tweak | all_chain_ends]
-                    // NOTE: `packed_leaf_input` is preallocated per worker. We overwrite the entire
-                    // vector in each iteration, so no need to `fill(0)`!
-                    //let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len];
-
-                    // Copy pre-packed parameter
-                    packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
-
-                    // Pack tree tweaks directly (level 0 for bottom-layer leaves)
-                    pack_fn_into::<TWEAK_LEN>(
-                        packed_leaf_input,
-                        sponge_tweak_offset,
-                        |t_idx, lane| {
-                            Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::<TWEAK_LEN>()
+                // STEP 3: HASH CHAIN ENDS TO PRODUCE TREE LEAVES
+                //
+                // All chains have been walked to their endpoints.
+                // Now hash all chain ends together to form the tree leaf.
+                //
+                // This uses the sponge construction for variable-length input.
+
+                // Assemble the sponge input.
+                // Layout: [parameter | tree_tweak | all_chain_ends]
+                // NOTE: `packed_leaf_input` is preallocated per worker. We overwrite the entire
+                // vector in each iteration, so no need to `fill(0)`!
+                //let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len];
+
+                // Copy pre-packed parameter
+                packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
+
+                // Pack tree tweaks directly (level 0 for bottom-layer leaves)
+                pack_fn_into::<TWEAK_LEN>(
+                    &mut packed_leaf_input,
+                    sponge_tweak_offset,
+                    |t_idx, lane| {
+                        Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::<TWEAK_LEN>()
                                 [t_idx]
-                        },
-                    );
+                    },
+                );
 
-                    // Copy all chain ends (already packed)
-                    let dst = &mut packed_leaf_input[sponge_chains_offset
+                // Copy all chain ends (already packed)
+                let dst = &mut packed_leaf_input[sponge_chains_offset
                         ..sponge_chains_offset + packed_chains.len() * HASH_LEN];
-                    for (dst_chunk, src_chain) in
+                for (dst_chunk, src_chain) in
                         dst.chunks_exact_mut(HASH_LEN).zip(packed_chains.iter())
                     {
                         dst_chunk.copy_from_slice(src_chain);
                     }
 
-                    // Apply the sponge hash to produce the leaf.
-                    // This absorbs all chain ends and squeezes out the final hash.
-                    let packed_leaves =
-                        poseidon_sponge::<PackedF, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
-                            &sponge_perm,
-                            &capacity_val,
-                            &packed_leaf_input,
-                        );
-
-                    // STEP 4: UNPACK RESULTS TO SCALAR REPRESENTATION
-                    //
-                    // Convert from vertical packing back to scalar layout.
-                    // Each lane becomes one leaf in the output slice.
-                    unpack_array(&packed_leaves, leaves_chunk);
-                },
+                // Apply the sponge hash to produce the leaf.
+                // This absorbs all chain ends and squeezes out the final hash.
+                let packed_leaves =
+                    poseidon_sponge::<PackedF, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
+                        &sponge_perm,
+                        &capacity_val,
+                        &packed_leaf_input,
+                    );
+
+                // STEP 4: UNPACK RESULTS TO SCALAR REPRESENTATION
+                //
+                // Convert from vertical packing back to scalar layout.
+                // Each lane becomes one leaf in the output slice.
+                unpack_array(&packed_leaves, leaves_chunk);
+            },
             );
 
         // HANDLE REMAINDER EPOCHS
@@ -1679,13 +1687,13 @@ mod tests {
 
             let parameter = PoseidonTweak44::rand_parameter(&mut rng);
             let children: Vec<_> = (0..num_pairs * 2)
-                .map(|_| PoseidonTweak44::rand_domain(&mut rng))
-                .collect();
+            .map(|_| PoseidonTweak44::rand_domain(&mut rng))
+            .collect();
 
             let simd_result =
-                PoseidonTweak44::compute_tree_layer(&parameter, level, parent_start, &children);
+            PoseidonTweak44::compute_tree_layer(&parameter, level, parent_start, &children);
             let scalar_result =
-                compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, level, parent_start, &children);
+            compute_tree_layer_scalar::<PoseidonTweak44>(&parameter, level, parent_start, &children);
 
             prop_assert_eq!(simd_result.len(), num_pairs);
             prop_assert_eq!(simd_result, scalar_result);

From 595dbe088ca12e7f00bbc6c430b19d6e2b67f4d9 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 16:38:29 +0100
Subject: [PATCH 04/15] avoid heap allocations in 2/3 branches of `apply`

No need for a `Vec` in these two branches as we know at compile time
how much data is required for each input.

Only relevant if `apply` is part of a hot code path, which normally is
unlikely to be the case. Still, the code is not significantly longer,
only uglier :(

It gets rid of a large number of allocations when running the 2^8
benchmark case.
---
 src/symmetric/tweak_hash/poseidon.rs | 54 +++++++++++++++-------------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 4d36f31..19f4dfe 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -312,36 +312,40 @@ impl<
             [single] => {
                 // we compress parameter, tweak, message
                 let perm = poseidon2_16();
-                let combined_input: Vec<F> = parameter
-                    .iter()
-                    .chain(tweak_fe.iter())
-                    .chain(single.iter())
-                    .copied()
-                    .collect();
-                FieldArray(
-                    poseidon_compress::<F, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
-                        &perm,
-                        &combined_input,
-                    ),
-                )
+
+                // Build input on stack: [parameter | tweak | message]
+                let mut combined_input = [F::ZERO; CHAIN_COMPRESSION_WIDTH];
+                combined_input[..PARAMETER_LEN].copy_from_slice(&parameter.0);
+                combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN]
+                    .copy_from_slice(&tweak_fe);
+                combined_input[PARAMETER_LEN + TWEAK_LEN..PARAMETER_LEN + TWEAK_LEN + HASH_LEN]
+                    .copy_from_slice(&single.0);
+
+                FieldArray(poseidon_compress::<F, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
+                    &perm,
+                    &combined_input,
+                ))
             }
 
             [left, right] => {
                 // we compress parameter, tweak, message (now containing two parts)
                 let perm = poseidon2_24();
-                let combined_input: Vec<F> = parameter
-                    .iter()
-                    .chain(tweak_fe.iter())
-                    .chain(left.iter())
-                    .chain(right.iter())
-                    .copied()
-                    .collect();
-                FieldArray(
-                    poseidon_compress::<F, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
-                        &perm,
-                        &combined_input,
-                    ),
-                )
+
+                // Build input on stack: [parameter | tweak | left | right]
+                let mut combined_input = [F::ZERO; MERGE_COMPRESSION_WIDTH];
+                combined_input[..PARAMETER_LEN].copy_from_slice(&parameter.0);
+                combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN]
+                    .copy_from_slice(&tweak_fe);
+                combined_input[PARAMETER_LEN + TWEAK_LEN..PARAMETER_LEN + TWEAK_LEN + HASH_LEN]
+                    .copy_from_slice(&left.0);
+                combined_input
+                    [PARAMETER_LEN + TWEAK_LEN + HASH_LEN..PARAMETER_LEN + TWEAK_LEN + 2 * HASH_LEN]
+                    .copy_from_slice(&right.0);
+
+                FieldArray(poseidon_compress::<F, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
+                    &perm,
+                    &combined_input,
+                ))
             }
 
             _ if message.len() > 2 => {

From 41d240eeb7aa6875c864341c71a7522c7ec280a0 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 16:50:21 +0100
Subject: [PATCH 05/15] add profiling Cargo profile

Can't hurt to have this in here.
---
 Cargo.toml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Cargo.toml b/Cargo.toml
index ae304e8..60002f8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -62,3 +62,7 @@ with-gen-benches-poseidon-top-level = []
 [[bench]]
 name = "benchmark"
 harness = false
+
+[profile.profiling]
+inherits = "release"
+debug = true
\ No newline at end of file

From 816fbbef94711cf81cbac52de2f43df246cc20e2 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 17:27:21 +0100
Subject: [PATCH 06/15] cargo fmt fixes

---
 src/symmetric/tweak_hash/poseidon.rs | 30 +++++++++++++++-------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 19f4dfe..9aaff45 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -316,15 +316,16 @@ impl<
                 // Build input on stack: [parameter | tweak | message]
                 let mut combined_input = [F::ZERO; CHAIN_COMPRESSION_WIDTH];
                 combined_input[..PARAMETER_LEN].copy_from_slice(&parameter.0);
-                combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN]
-                    .copy_from_slice(&tweak_fe);
+                combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN].copy_from_slice(&tweak_fe);
                 combined_input[PARAMETER_LEN + TWEAK_LEN..PARAMETER_LEN + TWEAK_LEN + HASH_LEN]
                     .copy_from_slice(&single.0);
 
-                FieldArray(poseidon_compress::<F, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
-                    &perm,
-                    &combined_input,
-                ))
+                FieldArray(
+                    poseidon_compress::<F, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
+                        &perm,
+                        &combined_input,
+                    ),
+                )
             }
 
             [left, right] => {
@@ -334,18 +335,19 @@ impl<
                 // Build input on stack: [parameter | tweak | left | right]
                 let mut combined_input = [F::ZERO; MERGE_COMPRESSION_WIDTH];
                 combined_input[..PARAMETER_LEN].copy_from_slice(&parameter.0);
-                combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN]
-                    .copy_from_slice(&tweak_fe);
+                combined_input[PARAMETER_LEN..PARAMETER_LEN + TWEAK_LEN].copy_from_slice(&tweak_fe);
                 combined_input[PARAMETER_LEN + TWEAK_LEN..PARAMETER_LEN + TWEAK_LEN + HASH_LEN]
                     .copy_from_slice(&left.0);
-                combined_input
-                    [PARAMETER_LEN + TWEAK_LEN + HASH_LEN..PARAMETER_LEN + TWEAK_LEN + 2 * HASH_LEN]
+                combined_input[PARAMETER_LEN + TWEAK_LEN + HASH_LEN
+                    ..PARAMETER_LEN + TWEAK_LEN + 2 * HASH_LEN]
                     .copy_from_slice(&right.0);
 
-                FieldArray(poseidon_compress::<F, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
-                    &perm,
-                    &combined_input,
-                ))
+                FieldArray(
+                    poseidon_compress::<F, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
+                        &perm,
+                        &combined_input,
+                    ),
+                )
             }
 
             _ if message.len() > 2 => {

From 945320812aa862a7e4f87272e57afc91a21df5a5 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 17:28:49 +0100
Subject: [PATCH 07/15] remove dead line & update comment

---
 src/symmetric/tweak_hash/poseidon.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 9aaff45..8d079f2 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -518,6 +518,8 @@ impl<
         let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN;
         let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN;
 
+        // We use a thread local storage to guarantee the `packed_leaf_input` vector is only allocated
+        // once per thread
         let tls: ThreadLocal<RefCell<Vec<PackedF>>> = ThreadLocal::new();
 
         // PARALLEL SIMD PROCESSING
@@ -616,9 +618,8 @@ impl<
 
                 // Assemble the sponge input.
                 // Layout: [parameter | tree_tweak | all_chain_ends]
-                // NOTE: `packed_leaf_input` is preallocated per worker. We overwrite the entire
+                // NOTE: `packed_leaf_input` is preallocated per thread. We overwrite the entire
                 // vector in each iteration, so no need to `fill(0)`!
-                //let mut packed_leaf_input = vec![PackedF::ZERO; sponge_input_len];
 
                 // Copy pre-packed parameter
                 packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);

From a1abd1e49774c91f0395bd2a65e12132c95134ec Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 17:30:01 +0100
Subject: [PATCH 08/15] fix indentation of inner for loop

Somehow this is a case where `cargo fmt` has no opinion. Earlier, when
using `for_each_init`, the indentation was changed, but this part
didn't want to "come back" to what it was before...
---
 src/symmetric/tweak_hash/poseidon.rs | 66 ++++++++++++++--------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 8d079f2..c702d2e 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -570,44 +570,44 @@ impl<
                 let chain_value_offset = PARAMETER_LEN + TWEAK_LEN;
 
                 for (chain_index, packed_chain) in
-                        packed_chains.iter_mut().enumerate().take(num_chains)
-                    {
-                        // Walk this chain for `chain_length - 1` steps.
-                        // The starting point is step 0, so we need `chain_length - 1` iterations.
-                        for step in 0..chain_length - 1 {
-                            // Current position in the chain.
-                            let pos = (step + 1) as u8;
-
-                            // Assemble the packed input for the hash function.
-                            // Layout: [parameter | tweak | current_value]
-                            let mut packed_input = [PackedF::ZERO; CHAIN_COMPRESSION_WIDTH];
-
-                            // Copy pre-packed parameter
-                            packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
-
-                            // Pack tweaks directly into destination
-                            pack_fn_into::<TWEAK_LEN>(
-                                &mut packed_input,
-                                chain_tweak_offset,
-                                |t_idx, lane| {
-                                    Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos)
-                                        .to_field_elements::<TWEAK_LEN>()[t_idx]
-                                },
-                            );
-
-                            // Copy current chain value (already packed)
-                            packed_input[chain_value_offset..chain_value_offset + HASH_LEN]
-                                .copy_from_slice(packed_chain);
-
-                            // Apply the hash function to advance the chain.
-                            // This single call processes all epochs in parallel.
-                            *packed_chain =
+                    packed_chains.iter_mut().enumerate().take(num_chains)
+                {
+                    // Walk this chain for `chain_length - 1` steps.
+                    // The starting point is step 0, so we need `chain_length - 1` iterations.
+                    for step in 0..chain_length - 1 {
+                        // Current position in the chain.
+                        let pos = (step + 1) as u8;
+
+                        // Assemble the packed input for the hash function.
+                        // Layout: [parameter | tweak | current_value]
+                        let mut packed_input = [PackedF::ZERO; CHAIN_COMPRESSION_WIDTH];
+
+                        // Copy pre-packed parameter
+                        packed_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
+
+                        // Pack tweaks directly into destination
+                        pack_fn_into::<TWEAK_LEN>(
+                            &mut packed_input,
+                            chain_tweak_offset,
+                            |t_idx, lane| {
+                                Self::chain_tweak(epoch_chunk[lane], chain_index as u8, pos)
+                                    .to_field_elements::<TWEAK_LEN>()[t_idx]
+                            },
+                        );
+
+                        // Copy current chain value (already packed)
+                        packed_input[chain_value_offset..chain_value_offset + HASH_LEN]
+                            .copy_from_slice(packed_chain);
+
+                        // Apply the hash function to advance the chain.
+                        // This single call processes all epochs in parallel.
+                        *packed_chain =
                                 poseidon_compress::<PackedF, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
                                     &chain_perm,
                                     &packed_input,
                                 );
-                        }
                     }
+                }
 
                 // STEP 3: HASH CHAIN ENDS TO PRODUCE TREE LEAVES
                 //

From 7d7d0aad55a3e9aa4e15672792729370d11c4be2 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 18 Dec 2025 17:35:44 +0100
Subject: [PATCH 09/15] [examples] add two examples for key gen for 2^8 and
 2^32 elements

These follow the benchmarks for the smallest and largest cases.
---
 examples/single_keygen.rs      | 24 ++++++++++++++++++++++++
 examples/single_keygen_2_32.rs | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)
 create mode 100644 examples/single_keygen.rs
 create mode 100644 examples/single_keygen_2_32.rs

diff --git a/examples/single_keygen.rs b/examples/single_keygen.rs
new file mode 100644
index 0000000..449c5c2
--- /dev/null
+++ b/examples/single_keygen.rs
@@ -0,0 +1,24 @@
+use std::hint::black_box;
+
+use leansig::signature::{
+    SignatureScheme,
+    generalized_xmss::instantiations_poseidon_top_level::lifetime_2_to_the_8::SIGTopLevelTargetSumLifetime8Dim64Base8,
+};
+
+fn main() {
+    let mut rng = rand::rng();
+
+    // 2^8 lifetime, full activation
+    let activation_duration = SIGTopLevelTargetSumLifetime8Dim64Base8::LIFETIME as usize;
+
+    eprintln!("Running single key_gen for 2^8 lifetime...");
+    let (pk, sk) = black_box(SIGTopLevelTargetSumLifetime8Dim64Base8::key_gen(
+        &mut rng,
+        0,
+        activation_duration,
+    ));
+    eprintln!("Done. pk size: {} bytes", std::mem::size_of_val(&pk));
+
+    // Prevent optimization from removing the key_gen call
+    black_box((pk, sk));
+}
diff --git a/examples/single_keygen_2_32.rs b/examples/single_keygen_2_32.rs
new file mode 100644
index 0000000..4bc0b39
--- /dev/null
+++ b/examples/single_keygen_2_32.rs
@@ -0,0 +1,33 @@
+use std::hint::black_box;
+
+use leansig::signature::{
+    SignatureScheme,
+    generalized_xmss::instantiations_poseidon_top_level::lifetime_2_to_the_32::size_optimized::SIGTopLevelTargetSumLifetime32Dim32Base26,
+};
+
+/// Cap activation duration to 2^18 to keep runtime reasonable (same as benchmark)
+const MAX_LOG_ACTIVATION_DURATION: usize = 18;
+
+fn main() {
+    let mut rng = rand::rng();
+
+    // 2^32 lifetime, activation capped at 2^18
+    let activation_duration = std::cmp::min(
+        1 << MAX_LOG_ACTIVATION_DURATION,
+        SIGTopLevelTargetSumLifetime32Dim32Base26::LIFETIME as usize,
+    );
+
+    eprintln!(
+        "Running single key_gen for 2^32 lifetime (activation 2^{})...",
+        MAX_LOG_ACTIVATION_DURATION
+    );
+    let (pk, sk) = black_box(SIGTopLevelTargetSumLifetime32Dim32Base26::key_gen(
+        &mut rng,
+        0,
+        activation_duration,
+    ));
+    eprintln!("Done. pk size: {} bytes", std::mem::size_of_val(&pk));
+
+    // Prevent optimization from removing the key_gen call
+    black_box((pk, sk));
+}

From d01fa2c708abb89b363fc522fa497e9741449ed2 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Mon, 22 Dec 2025 16:23:03 +0100
Subject: [PATCH 10/15] use iterator approach when adding chunks to state

---
 src/symmetric/tweak_hash/poseidon.rs | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index c702d2e..5035c10 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -215,10 +215,9 @@ where
     // 1. fill in all full chunks and permute
     let mut it = input.chunks_exact(rate);
     for chunk in &mut it {
-        //input.chunks_exact(rate) {
         // iterate the chunks
-        for i in 0..chunk.len() {
-            state[i] += chunk[i];
+        for (s, &x) in state.iter_mut().take(rate).zip(chunk) {
+            *s += x;
         }
         perm.permute_mut(&mut state);
     }

From ceab87d5e3cb397538db610cc3c63ae28713e76a Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Mon, 22 Dec 2025 16:24:41 +0100
Subject: [PATCH 11/15] delete keygen examples / profiling helpers

---
 examples/single_keygen.rs      | 24 ------------------------
 examples/single_keygen_2_32.rs | 33 ---------------------------------
 2 files changed, 57 deletions(-)
 delete mode 100644 examples/single_keygen.rs
 delete mode 100644 examples/single_keygen_2_32.rs

diff --git a/examples/single_keygen.rs b/examples/single_keygen.rs
deleted file mode 100644
index 449c5c2..0000000
--- a/examples/single_keygen.rs
+++ /dev/null
@@ -1,24 +0,0 @@
-use std::hint::black_box;
-
-use leansig::signature::{
-    SignatureScheme,
-    generalized_xmss::instantiations_poseidon_top_level::lifetime_2_to_the_8::SIGTopLevelTargetSumLifetime8Dim64Base8,
-};
-
-fn main() {
-    let mut rng = rand::rng();
-
-    // 2^8 lifetime, full activation
-    let activation_duration = SIGTopLevelTargetSumLifetime8Dim64Base8::LIFETIME as usize;
-
-    eprintln!("Running single key_gen for 2^8 lifetime...");
-    let (pk, sk) = black_box(SIGTopLevelTargetSumLifetime8Dim64Base8::key_gen(
-        &mut rng,
-        0,
-        activation_duration,
-    ));
-    eprintln!("Done. pk size: {} bytes", std::mem::size_of_val(&pk));
-
-    // Prevent optimization from removing the key_gen call
-    black_box((pk, sk));
-}
diff --git a/examples/single_keygen_2_32.rs b/examples/single_keygen_2_32.rs
deleted file mode 100644
index 4bc0b39..0000000
--- a/examples/single_keygen_2_32.rs
+++ /dev/null
@@ -1,33 +0,0 @@
-use std::hint::black_box;
-
-use leansig::signature::{
-    SignatureScheme,
-    generalized_xmss::instantiations_poseidon_top_level::lifetime_2_to_the_32::size_optimized::SIGTopLevelTargetSumLifetime32Dim32Base26,
-};
-
-/// Cap activation duration to 2^18 to keep runtime reasonable (same as benchmark)
-const MAX_LOG_ACTIVATION_DURATION: usize = 18;
-
-fn main() {
-    let mut rng = rand::rng();
-
-    // 2^32 lifetime, activation capped at 2^18
-    let activation_duration = std::cmp::min(
-        1 << MAX_LOG_ACTIVATION_DURATION,
-        SIGTopLevelTargetSumLifetime32Dim32Base26::LIFETIME as usize,
-    );
-
-    eprintln!(
-        "Running single key_gen for 2^32 lifetime (activation 2^{})...",
-        MAX_LOG_ACTIVATION_DURATION
-    );
-    let (pk, sk) = black_box(SIGTopLevelTargetSumLifetime32Dim32Base26::key_gen(
-        &mut rng,
-        0,
-        activation_duration,
-    ));
-    eprintln!("Done. pk size: {} bytes", std::mem::size_of_val(&pk));
-
-    // Prevent optimization from removing the key_gen call
-    black_box((pk, sk));
-}

From 1305e33560fb375f2beced732527a934837bc613 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Mon, 22 Dec 2025 18:19:26 +0100
Subject: [PATCH 12/15] use stdlib `thread_local!` macro instead of
 thread_local crate

---
 Cargo.toml                           |  2 -
 src/symmetric/tweak_hash/poseidon.rs | 70 ++++++++++++++--------------
 2 files changed, 34 insertions(+), 38 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 60002f8..41637e8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -46,8 +46,6 @@ p3-baby-bear = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312"
 p3-koala-bear = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312" }
 p3-symmetric = { git = "https://github.com/Plonky3/Plonky3.git", rev = "a33a312" }
 
-thread_local = "1.1.9"
-
 [dev-dependencies]
 criterion = "0.7"
 proptest = "1.7"
diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 5035c10..e3e5395 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -18,7 +18,6 @@ use super::TweakableHash;
 
 use p3_koala_bear::Poseidon2KoalaBear;
 use std::cell::RefCell;
-use thread_local::ThreadLocal;
 
 const DOMAIN_PARAMETERS_LENGTH: usize = 4;
 /// The state width for compressing a single hash in a chain.
@@ -517,9 +516,11 @@ impl<
         let sponge_chains_offset = PARAMETER_LEN + TWEAK_LEN;
         let sponge_input_len = PARAMETER_LEN + TWEAK_LEN + NUM_CHUNKS * HASH_LEN;
 
-        // We use a thread local storage to guarantee the `packed_leaf_input` vector is only allocated
+        // We use thread-local storage to guarantee the `packed_leaf_input` vector is only allocated
         // once per thread
-        let tls: ThreadLocal<RefCell<Vec<PackedF>>> = ThreadLocal::new();
+        thread_local! {
+            static PACKED_LEAF_INPUT: RefCell<Vec<PackedF>> = const { RefCell::new(Vec::new()) };
+        }
 
         // PARALLEL SIMD PROCESSING
         //
@@ -537,18 +538,11 @@ impl<
                 //
                 // This layout enables efficient SIMD operations across epochs.
 
-                let cell = tls.get_or(|| {
-                    RefCell::new(vec![PackedF::ZERO; sponge_input_len])
-                });
-                let mut packed_leaf_input = cell.borrow_mut();
-                // reset not needed
-
                 let mut packed_chains: [[PackedF; HASH_LEN]; NUM_CHUNKS] =
                     array::from_fn(|c_idx| {
                         // Generate starting points for this chain across all epochs.
                         let starts: [_; PackedF::WIDTH] = array::from_fn(|lane| {
-                            PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64)
-                                .into()
+                            PRF::get_domain_element(prf_key, epoch_chunk[lane], c_idx as u64).into()
                         });
 
                         // Transpose to vertical packing for SIMD efficiency.
@@ -601,10 +595,10 @@ impl<
                         // Apply the hash function to advance the chain.
                         // This single call processes all epochs in parallel.
                         *packed_chain =
-                                poseidon_compress::<PackedF, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
-                                    &chain_perm,
-                                    &packed_input,
-                                );
+                            poseidon_compress::<PackedF, _, CHAIN_COMPRESSION_WIDTH, HASH_LEN>(
+                                &chain_perm,
+                                &packed_input,
+                            );
                     }
                 }
 
@@ -619,45 +613,49 @@ impl<
                 // Layout: [parameter | tree_tweak | all_chain_ends]
                 // NOTE: `packed_leaf_input` is preallocated per thread. We overwrite the entire
                 // vector in each iteration, so no need to `fill(0)`!
+                let packed_leaves = PACKED_LEAF_INPUT.with_borrow_mut(|packed_leaf_input| {
+                    // Resize on first use for this thread
+                    if packed_leaf_input.len() != sponge_input_len {
+                        packed_leaf_input.resize(sponge_input_len, PackedF::ZERO);
+                    }
 
-                // Copy pre-packed parameter
-                packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
-
-                // Pack tree tweaks directly (level 0 for bottom-layer leaves)
-                pack_fn_into::<TWEAK_LEN>(
-                    &mut packed_leaf_input,
-                    sponge_tweak_offset,
-                    |t_idx, lane| {
-                        Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::<TWEAK_LEN>()
+                    // Copy pre-packed parameter
+                    packed_leaf_input[..PARAMETER_LEN].copy_from_slice(&packed_parameter);
+
+                    // Pack tree tweaks directly (level 0 for bottom-layer leaves)
+                    pack_fn_into::<TWEAK_LEN>(
+                        packed_leaf_input,
+                        sponge_tweak_offset,
+                        |t_idx, lane| {
+                            Self::tree_tweak(0, epoch_chunk[lane]).to_field_elements::<TWEAK_LEN>()
                                 [t_idx]
-                    },
-                );
+                        },
+                    );
 
-                // Copy all chain ends (already packed)
-                let dst = &mut packed_leaf_input[sponge_chains_offset
+                    // Copy all chain ends (already packed)
+                    let dst = &mut packed_leaf_input[sponge_chains_offset
                         ..sponge_chains_offset + packed_chains.len() * HASH_LEN];
-                for (dst_chunk, src_chain) in
+                    for (dst_chunk, src_chain) in
                         dst.chunks_exact_mut(HASH_LEN).zip(packed_chains.iter())
                     {
                         dst_chunk.copy_from_slice(src_chain);
                     }
 
-                // Apply the sponge hash to produce the leaf.
-                // This absorbs all chain ends and squeezes out the final hash.
-                let packed_leaves =
+                    // Apply the sponge hash to produce the leaf.
+                    // This absorbs all chain ends and squeezes out the final hash.
                     poseidon_sponge::<PackedF, _, MERGE_COMPRESSION_WIDTH, HASH_LEN>(
                         &sponge_perm,
                         &capacity_val,
-                        &packed_leaf_input,
-                    );
+                        packed_leaf_input,
+                    )
+                });
 
                 // STEP 4: UNPACK RESULTS TO SCALAR REPRESENTATION
                 //
                 // Convert from vertical packing back to scalar layout.
                 // Each lane becomes one leaf in the output slice.
                 unpack_array(&packed_leaves, leaves_chunk);
-            },
-            );
+            });
 
         // HANDLE REMAINDER EPOCHS
         //

From e3c931655fcb4fb51af2742d460903d7e9cb5e0a Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Wed, 7 Jan 2026 15:03:01 +0100
Subject: [PATCH 13/15] add comment about why permutation is unnecessary

---
 src/symmetric/tweak_hash/poseidon.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index e3e5395..7ea9819 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -230,7 +230,7 @@ where
         perm.permute_mut(&mut state);
     }
 
-    // squeeze
+    // 3. squeeze
     let mut out = [A::ZERO; OUT_LEN];
     let mut out_idx = 0;
     while out_idx < OUT_LEN {
@@ -238,6 +238,7 @@ where
         out[out_idx..out_idx + chunk_size].copy_from_slice(&state[..chunk_size]);
         out_idx += chunk_size;
         if out_idx < OUT_LEN {
+            // no need to permute in last iteration, `state` is local variable
             perm.permute_mut(&mut state);
         }
     }

From 64c173c447c778ed99e3386934e34902f70a6bd1 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Wed, 7 Jan 2026 15:03:34 +0100
Subject: [PATCH 14/15] fix actual bug of remainder permutation

When I wrote the code I read `extra_elements` as the number of remainder
elements and not the number of elements to be padded. Oops. In the
previous commit `remainder` would be nonzero (equal to `rate`) when the
input length was an exact multiple of the rate.

Cleaner to just use the remainder iterator directly here too and get
rid of the variable.
---
 src/symmetric/tweak_hash/poseidon.rs | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index 7ea9819..e1397ef 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -206,7 +206,6 @@ where
     let mut state = [A::ZERO; WIDTH];
     state[rate..].copy_from_slice(capacity_value);
 
-    let extra_elements = (rate - (input.len() % rate)) % rate;
     // Instead of converting the input to a vector, resizing and feeding the data into the
     // sponge, we instead fill in the vector from all chunks until we are left with a non
     // full chunk. We only add to the state, so padded data does not mutate `state` at all.
@@ -221,8 +220,7 @@ where
         perm.permute_mut(&mut state);
     }
     // 2. fill the remainder and extend with zeros
-    let remainder = rate - extra_elements;
-    if remainder > 0 {
+    if !it.remainder().is_empty() {
         for (i, x) in it.remainder().iter().enumerate() {
             state[i] += *x;
         }

From 9738a19eceb7eeda31b4dcfae0d71fa0040498f0 Mon Sep 17 00:00:00 2001
From: Vindaar <basti90@gmail.com>
Date: Thu, 8 Jan 2026 09:28:23 +0100
Subject: [PATCH 15/15] improve doc comment & rename out_idx -> out_index

---
 src/symmetric/tweak_hash/poseidon.rs | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/symmetric/tweak_hash/poseidon.rs b/src/symmetric/tweak_hash/poseidon.rs
index e1397ef..ff20eb2 100644
--- a/src/symmetric/tweak_hash/poseidon.rs
+++ b/src/symmetric/tweak_hash/poseidon.rs
@@ -213,7 +213,7 @@ where
     // 1. fill in all full chunks and permute
     let mut it = input.chunks_exact(rate);
     for chunk in &mut it {
-        // iterate the chunks
+        // add chunk elements into the first `rate` many elements of the `state`
         for (s, &x) in state.iter_mut().take(rate).zip(chunk) {
             *s += x;
         }
@@ -230,12 +230,12 @@ where
 
     // 3. squeeze
     let mut out = [A::ZERO; OUT_LEN];
-    let mut out_idx = 0;
-    while out_idx < OUT_LEN {
-        let chunk_size = (OUT_LEN - out_idx).min(rate);
-        out[out_idx..out_idx + chunk_size].copy_from_slice(&state[..chunk_size]);
-        out_idx += chunk_size;
-        if out_idx < OUT_LEN {
+    let mut out_index = 0;
+    while out_index < OUT_LEN {
+        let chunk_size = (OUT_LEN - out_index).min(rate);
+        out[out_index..out_index + chunk_size].copy_from_slice(&state[..chunk_size]);
+        out_index += chunk_size;
+        if out_index < OUT_LEN {
             // no need to permute in last iteration, `state` is local variable
             perm.permute_mut(&mut state);
         }