From 3d463b3d7948520cca82afe67df7d67c4163b897 Mon Sep 17 00:00:00 2001
From: laurilaatu
Date: Mon, 26 Jan 2026 20:37:28 +0000
Subject: [PATCH 1/6] weights for dense

---
 hls4ml/backends/oneapi/passes/core_templates.py | 15 +++++++++++----
 hls4ml/templates/oneapi/firmware/myproject.cpp  |  5 ++++-
 hls4ml/templates/oneapi/firmware/myproject.h    |  3 +++
 .../oneapi/firmware/nnet_utils/nnet_dense.h     |  7 +++----
 hls4ml/writer/oneapi_writer.py                  |  8 ++++++++
 5 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
index 9602b2d0fc..64a4c7097a 100644
--- a/hls4ml/backends/oneapi/passes/core_templates.py
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -6,6 +6,7 @@
 # Dense templates
 
 dense_config_template = """struct config{index} : nnet::dense_config {{
+    static constexpr unsigned n_in = {n_in};
     static constexpr unsigned n_out = {n_out};
 
     static constexpr unsigned io_type = nnet::{iotype};
@@ -30,13 +31,16 @@
     typedef {weight_t.name} weight_t;
     typedef {index_t.name} index_t;
 
+    static constexpr weight_t weights = {weights};
+    static constexpr bias_t biases = {biases};
+
     template <class x_T, class y_T>
     using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""
 
-dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
+dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output});'
 dense_task_sequence_template = 'task_sequence> {name};'
-dense_stream_function_template = '{name}.async({w}, {b});'
+dense_stream_function_template = '{name}.async();'
 dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_stream.h']
@@ -53,6 +57,9 @@ def format(self, node):
             node.get_input_variable().type.precision, node.get_weights('weight').type.precision
         )
 
+        params['weights'] = node.get_weights('weight').name
+        params['biases'] = node.get_weights('bias').name
+
         return self.template.format(**params)
@@ -63,8 +70,8 @@ def __init__(self):
 
     def format(self, node):
         params = self._default_function_params(node)
-        params['w'] = node.get_weights('weight').name
-        params['b'] = node.get_weights('bias').name
+        #params['w'] = node.get_weights('weight').name
+        #params['b'] = node.get_weights('bias').name
 
         return self.template.format(**params)

diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp
index 06e7d3fe37..da9439f74a 100644
--- a/hls4ml/templates/oneapi/firmware/myproject.cpp
+++ b/hls4ml/templates/oneapi/firmware/myproject.cpp
@@ -1,9 +1,12 @@
 #include "myproject.h"
-#include "parameters.h"
 #include 
 
 // hls-fpga-machine-learning insert weights
+
+#include "parameters.h"
+
+
 // The inter-task pipes need to be declared in the global scope
 // hls-fpga-machine-learning insert inter-task pipes

diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h
index 082ae5dc8c..8f313ea30f 100644
--- a/hls4ml/templates/oneapi/firmware/myproject.h
+++ b/hls4ml/templates/oneapi/firmware/myproject.h
@@ -3,6 +3,9 @@
 
 #include "defines.h"
 
+// hls-fpga-machine-learning insert weights
+
+
 // This file defines the interface to the kernel
 // currently this is fixed

diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
index dc76189083..2b65eef42b 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
@@ -152,12 +152,11 @@ void dense_rf_lt(const data_T &data, res_T &res, const typename CONFIG_T::weight
     }
 }
 template <class data_T, class res_T, typename CONFIG_T>
-void dense_resource(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
-                    const typename CONFIG_T::bias_t &biases) {
+void dense_resource(const data_T &data, res_T &res) {
     if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) {
-        dense_rf_lt<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+        dense_rf_lt<data_T, res_T, CONFIG_T>(data, res, CONFIG_T::weights, CONFIG_T::biases);
     } else {
-        dense_rf_gt<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+        dense_rf_gt<data_T, res_T, CONFIG_T>(data, res, CONFIG_T::weights, CONFIG_T::biases);
     }
 }
 } // namespace nnet

diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index 3c0a778c50..b42ff2990f 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -242,6 +242,14 @@ def write_project_header(self, model):
                 for out in model_outputs:
                     newline += out.declare_cpp()
 
+            # Insert weights
+            elif '// hls-fpga-machine-learning insert weights' in line:
+                newline = line
+                for layer in model.get_layers():
+                    for w in layer.get_weights():
+                        #if w not in model_brams:
+                        newline += f'#include "weights/{w.name}.h"\n'
+
             # Simply copy line, if no inserts are required
             else:
                 newline = line

From d67857369385d066b7cdaad49077069b3bf9473c Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Tue, 27 Jan 2026 18:58:42 +0000
Subject: [PATCH 2/6] hgq2 homogeneous quant fix

---
 hls4ml/converters/keras_v3/hgq2/_base.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/hls4ml/converters/keras_v3/hgq2/_base.py b/hls4ml/converters/keras_v3/hgq2/_base.py
index 4a6d0a22c2..f7b4c9ddd3 100644
--- a/hls4ml/converters/keras_v3/hgq2/_base.py
+++ b/hls4ml/converters/keras_v3/hgq2/_base.py
@@ -30,15 +30,19 @@ def extract_fixed_quantizer_config(q, tensor: 'KerasTensor', is_input: bool) ->
     k, B, I = ops.convert_to_numpy(k), ops.convert_to_numpy(B), ops.convert_to_numpy(I)  # noqa: E741
     I = np.where(B > 0, I, 0)  # noqa: E741 # type: ignore
 
-    k = np.broadcast_to(k.astype(np.int16), (1,) + shape)  # type: ignore
-    B = np.broadcast_to(B.astype(np.int16), (1,) + shape)  # type: ignore
-    I = np.broadcast_to(I.astype(np.int16), (1,) + shape)  # noqa: E741
+    if np.size(k) != 1:
+        k = np.broadcast_to(k.astype(np.int16), (1,) + shape)  # type: ignore
+        B = np.broadcast_to(B.astype(np.int16), (1,) + shape)  # type: ignore
+        I = np.broadcast_to(I.astype(np.int16), (1,) + shape)  # noqa: E741
+    else:
+        k = np.ravel(k).astype(np.int16)
+        B = np.ravel(B).astype(np.int16)
+        I = np.ravel(I).astype(np.int16)  # noqa: E741
 
     overflow_mode: str = internal_q.overflow_mode
     round_mode: str = internal_q.round_mode
     if round_mode.startswith('S_'):
         round_mode = round_mode[2:]
-    fusible = np.unique(k).size == 1 and np.unique(B).size == 1 and np.unique(I).size == 1
 
     input_keras_tensor_names = tensor.name if is_input else f'{tensor.name}_q'
     output_keras_tensor_names = f'{tensor.name}_q' if is_input else tensor.name
@@ -48,7 +52,7 @@ def extract_fixed_quantizer_config(q, tensor: 'KerasTensor', is_input: bool) ->
         'mask_kbi': (k, B, I),
         'SAT': overflow_mode,
         'RND': round_mode,
-        'fusible': fusible,
+        'fusible': None,
         'input_keras_tensor_names': [input_keras_tensor_names],
         'output_keras_tensor_names': [output_keras_tensor_names],
         'overrides': {},
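Aside (illustration only, not part of the patch series): the fix above stops force-broadcasting
size-1 quantizer parameters. When k/B/I are scalars, HGQ2 describes one layer-wide fixed-point
format, and keeping them as a single triple is what lets downstream passes treat the quantizer
as homogeneous; broadcasting to the tensor shape would hide that. A minimal numpy sketch of the
same branch (the helper name normalize_kbi is invented for this note):

    import numpy as np

    def normalize_kbi(k, B, I, shape):
        I = np.where(B > 0, I, 0)  # zero-width channels carry no integer bits
        if np.size(k) != 1:
            # Heterogeneous quantizer: one (k, B, I) triple per element
            k = np.broadcast_to(k.astype(np.int16), (1,) + shape)
            B = np.broadcast_to(B.astype(np.int16), (1,) + shape)
            I = np.broadcast_to(I.astype(np.int16), (1,) + shape)
        else:
            # Homogeneous quantizer: keep a single triple
            k, B, I = (np.ravel(x).astype(np.int16) for x in (k, B, I))
        return k, B, I

    k, B, I = normalize_kbi(np.array(1.0), np.array(8.0), np.array(0.0), (4, 4))
    print(k.shape, B.shape, I.shape)  # (1,) (1,) (1,) rather than (1, 4, 4)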
From 59bd96f0c5e9c8e95538a9e96e0233c2d70695ba Mon Sep 17 00:00:00 2001
From: laurilaatu
Date: Mon, 9 Feb 2026 16:31:00 +0000
Subject: [PATCH 3/6] Changes required for oneAPI MHA

---
 hls4ml/backends/oneapi/oneapi_backend.py       |   8 -
 .../backends/oneapi/passes/core_templates.py   |  88 ++++++++++-
 .../keras_v3/hgq2/multi_head_attention.py      |   4 +-
 .../firmware/nnet_utils/nnet_activation.h      |  82 +++++++---
 .../oneapi/firmware/nnet_utils/nnet_dense.h    |   7 +-
 hls4ml/writer/oneapi_writer.py                 | 149 ++++++++++--------
 6 files changed, 233 insertions(+), 105 deletions(-)

diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py
index 0c11c16d09..94f26c9f1c 100644
--- a/hls4ml/backends/oneapi/oneapi_backend.py
+++ b/hls4ml/backends/oneapi/oneapi_backend.py
@@ -19,7 +19,6 @@
     Embedding,
     Layer,
     SimpleRNN,
-    Softmax,
 )
 from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType
@@ -257,13 +256,6 @@ def init_activation(self, layer):
         if layer.get_attr('recurrent_activation') == 'tanh':
             layer.set_attr('recurrent_activation', 'dense_tanh')
 
-    @layer_optimizer(Softmax)
-    def init_softmax(self, layer):
-        if layer.model.config.get_config_value('IOType') == 'io_parallel':
-            assert len(layer.get_input_variable().shape) == 1, (
-                'Softmax with io_parallel strategy cannot be used on multidimensional tensors.'
-            )
-
     @layer_optimizer(Embedding)
     def init_embed(self, layer):
         if layer.attributes['n_in'] is None:

diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
index 64a4c7097a..5a2d765e8f 100644
--- a/hls4ml/backends/oneapi/passes/core_templates.py
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -38,7 +38,7 @@
     using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""
 
-dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output});'
+dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
 dense_task_sequence_template = 'task_sequence> {name};'
 dense_stream_function_template = '{name}.async();'
 dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_stream.h']
@@ -70,8 +70,8 @@ def __init__(self):
 
     def format(self, node):
         params = self._default_function_params(node)
-        #params['w'] = node.get_weights('weight').name
-        #params['b'] = node.get_weights('bias').name
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
 
         return self.template.format(**params)
@@ -199,7 +199,7 @@ def format(self, node):
     static constexpr unsigned reuse_factor = {reuse};
 }};\n"""
 
-softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
+softmax_config_template_qkeras = """struct {type}_config{index} : nnet::activ_config {{
     static constexpr unsigned n_in = {n_in};
     static constexpr unsigned table_size = {table_size};
     static constexpr unsigned io_type = nnet::{iotype};
@@ -209,6 +209,26 @@
     typedef {exp_table_t.name} exp_table_t;
     typedef {inv_table_t.name} inv_table_t;
 }};\n"""
 
+softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
+    static const unsigned n_in = {n_in};
+    static const unsigned n_slice = {n_slice};
+    static const unsigned n_outer = {n_outer};
+    static const unsigned n_inner = {n_inner};
+    static const unsigned parallelization_factor = {parallelization_factor};
+    static const unsigned exp_table_size = {exp_table_size};
+    static const unsigned inv_table_size = {inv_table_size};
+    static const unsigned io_type = nnet::{iotype};
+    static const unsigned reuse_factor = {reuse};
+    static const unsigned axis = {axis};
+    static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
+    static constexpr float exp_scale = {exp_scale};
+    typedef {exp_table_t.name} exp_table_t;
+    typedef {inv_table_t.name} inv_table_t;
+    typedef {accum_t.name} accum_t;
+    typedef {inv_inp_t.name} inv_inp_t;
+    typedef {inp_norm_t_str} inp_norm_t;
+}};\n"""
+
 activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'
 param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});'
@@ -260,10 +280,68 @@ def __init__(self):
         super(ActivationConfigTemplate, self).__init__(Softmax)  # Skip ActivationConfigTemplate's __init__
         self.template = softmax_config_template
 
+    def format(self, node):
+        from math import ceil, log2
+
+        params = self._default_config_params(node)
+        params['type'] = node.get_attr('activation')
+        params.setdefault('exp_table_size', params['table_size'])
+        params.setdefault('inv_table_size', params['table_size'])
+        params.setdefault('n_inner', 1)
+        params.setdefault('n_outer', 1)
+        params.setdefault('exp_scale', 1.0)
+        params.setdefault('parallelization_factor', -1)
+
+        n_slice = params['n_in'] // params['n_inner'] // params['n_outer']  # type: ignore
+        params['n_slice'] = n_slice
+
+        if params['accum_t'].name == 'model_default_t':  # type: ignore
+            scale = ceil(log2(n_slice))
+            exp_table_t = node.attributes['exp_table_t'].precision
+            signed, width, integers = exp_table_t.signed, exp_table_t.width, exp_table_t.integer
+            params['accum_t_str'] = f'ac_{"" if signed else "u"}fixed<{width + scale}, {integers + scale}>'
+        else:
+            params['accum_t_str'] = params['accum_t'].name  # type: ignore
+        if params['inv_inp_t'].name == 'model_default_t':  # type: ignore
+            params['inv_inp_t'] = params['exp_table_t']
+
+        if params['implementation'] == 'stable':
+            if 'inp_norm_t' not in params:
+                # Only used in stable (max-normalized) implementation
+                input_t = node.get_input_variable().type.precision
+                width, iwidth, signed = input_t.width, input_t.integer, input_t.signed  # noqa: F841
+                width, iwidth = width - signed, iwidth - signed
+                if signed:
+                    # Fix table size if too large
+                    exp_table_size = params['inv_table_size']
+                    params['exp_table_size'] = str(min(int(exp_table_size), 2**width))
+                params['inp_norm_t_str'] = f'ac_ufixed<{width}, {iwidth}>'
+            else:
+                params['inp_norm_t_str'] = params['inp_norm_t'].name  # type: ignore
+        else:
+            params['inp_norm_t_str'] = 'ac_fixed<1,0>'
+
+        return self.template.format(**params)
+
+
+class SoftmaxFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(Softmax, include_header=activ_include_list)
+        self.template = activ_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        use_multidim = node.get_attr('n_inner', 1) > 1 or node.get_attr('n_outer', 1) > 1
+        use_multidim = use_multidim and node.model.config.get_config_value('IOType') == 'io_parallel'
+        params['activation'] = 'softmax' if not use_multidim else 'softmax_multidim'
+        params['config'] = f'softmax_config{node.index}'
+
+        return self.template.format(**params)
+
 
 class ActivationFunctionTemplate(FunctionCallTemplate):
     def __init__(self):
-        super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list)
+        super().__init__((Activation, HardActivation), include_header=activ_include_list)
         self.template = activ_function_template
 
     def format(self, node):

diff --git a/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py b/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
index 24bd87d3e9..d5c1eda7b9 100644
--- a/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
+++ b/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
@@ -15,7 +15,7 @@
 
 @register
 class QMultiHeadAttentionHandler(QLayerHandler):
-    handles = ('hgq.layers.multi_head_attention.QMultiHeadAttention',)
+    handles = ('hgq.layers.attn.mha.QMultiHeadAttention',)
 
     def handle(
         self,
@@ -129,7 +129,7 @@ def _handle(self, layer, tensor_q, tensor_O, node_index, tensor_k, tensor_v):
 
 @register
 class QLinformerAttentionHandler(QMultiHeadAttentionHandler):
-    handles = ('hgq.layers.linformer_attention.QLinformerAttention',)
+    handles = ('hgq.layers.attn.linformer.QLinformerAttention',)
 
     def handle(
         self,

diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
index f118ecb05c..c2353c34a8 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
@@ -100,15 +100,8 @@ template void sigmoid(const data_
 enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 };
 
 template inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
-    // Number of address bits for table
-    static constexpr int N = ceillog2<CONFIG_T::table_size>::val;
-
-    // Slice the top N bits of the input
-    [[intel::fpga_register]] ac_int<N, false> y = x.template slc<N>(x.width - N - 1);
-    // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness
-    if (x != 0 && y == 0)
-        y[0] = 1;
-    return y.to_uint();
+    // Extract the lower 'width' bits of x
+    return x.template slc(0).to_uint();
 }
 
 template inline unsigned softmax_latency_idx_from_real_val(const data_T x) {
@@ -121,7 +114,6 @@ template inline unsigned softmax_latency_idx_f
 }
 
 template <class data_T, class res_T, typename CONFIG_T> void softmax_stable(const data_T &data, res_T &res) {
-    // Look-up tables
 #include "activation_tables/exp_table.tb"
 #include "activation_tables/invert_table.tb"
@@ -130,29 +122,34 @@ template void softmax_stable(cons
     // Find the max and compute all delta(x_i, x_max)
     Op_max op_max;
     [[intel::fpga_register]] auto x_max = reduce>(data.data(), op_max);
 
-    // For the diffs, use the same type as the input but force rounding and saturation
-    [[intel::fpga_register]] ac_fixed d_xi_xmax[CONFIG_T::n_in];
+    // Normalize inputs: d = x_max - x
+    [[intel::fpga_register]] typename CONFIG_T::inp_norm_t d_xi_xmax[CONFIG_T::n_in];
 #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        d_xi_xmax[i] = data[i] - x_max;
+        // HGQ stable: d = x_max - data
+        d_xi_xmax[i] = x_max - data[i];
     }
 
-    // Calculate all the e^x's
+    // Exponentials
     [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
 #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        exp_res[i] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[i])];
+        unsigned idx = softmax_stable_idx_from_real_val(d_xi_xmax[i]);
+        exp_res[i] = exp_table[idx];
     }
 
-    // Explicitly sum previously calculated exponentials with an adder tree
-    Op_add op_add;
-    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
-        reduce>(exp_res, op_add);
+    // Sum of Exponentials
+    Op_add op_add;
+    [[intel::fpga_register]] typename CONFIG_T::accum_t exp_sum =
+        reduce>(exp_res, op_add);
 
-    // Multiply previously calculated exponetials with the reciprocal of the sum
-    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
-        invert_table[softmax_stable_idx_from_real_val(exp_sum)];
+    // Reciprocal of Sum
+    typename CONFIG_T::inv_inp_t exp_sum_cast = exp_sum;
+    unsigned inv_idx = softmax_stable_idx_from_real_val(exp_sum_cast);
+
+    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[inv_idx];
+
+    // Final Multiplication
 #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
         res[i] = exp_res[i] * inv_exp_sum;
     }
 }
@@ -265,6 +262,45 @@ template inline void softmax(cons
     }
 }
 
+// *************************************************
+// Multidimensional Softmax
+// *************************************************
+
+// Helper to remap the config for the core softmax function
+template <typename CONFIG_T> struct softmax_multidim_slice_config : CONFIG_T {
+    static constexpr unsigned n_in = CONFIG_T::n_slice;
+};
+
+template <class data_T, class res_T, typename CONFIG_T> inline void softmax_multidim(const data_T &data, res_T &res) {
+    using buffer_data_t = std::array<typename data_T::value_type, CONFIG_T::n_slice>;
+    using buffer_res_t = std::array<typename res_T::value_type, CONFIG_T::n_slice>;
+    using slice_config = softmax_multidim_slice_config<CONFIG_T>;
+
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_outer; i++) {
+        #pragma unroll
+        for (unsigned k = 0; k < CONFIG_T::n_inner; k++) {
+
+            [[intel::fpga_register]] buffer_data_t buffer_in;
+            [[intel::fpga_register]] buffer_res_t buffer_out;
+
+            // Gather Phase
+            #pragma unroll
+            for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
+                unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
+                buffer_in[j] = data[idx];
+            }
+
+            nnet::softmax<buffer_data_t, buffer_res_t, slice_config>(buffer_in, buffer_out);
+
+            #pragma unroll
+            for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
+                unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
+                res[idx] = buffer_out[j];
+            }
+        }
+    }
+}
+
 // *************************************************
 // TanH Activation
 // *************************************************
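Aside (illustration only, not part of the patches): the index arithmetic in softmax_multidim
above is the flattened layout of an (n_outer, n_slice, n_inner) tensor, so each gathered buffer
is one run along the middle axis. A small numpy check of that mapping (the reference function
is invented for this note):

    import numpy as np

    def softmax_multidim_ref(data, n_outer, n_slice, n_inner):
        out = np.empty_like(data)
        for i in range(n_outer):
            for k in range(n_inner):
                # Same address expression as the HLS gather/scatter loops
                idx = [i * n_slice * n_inner + j * n_inner + k for j in range(n_slice)]
                e = np.exp(data[idx] - data[idx].max())  # stable softmax on one slice
                out[idx] = e / e.sum()
        return out

    x = np.random.rand(2 * 3 * 4)
    y = x.reshape(2, 3, 4)
    e = np.exp(y - y.max(axis=1, keepdims=True))
    expected = (e / e.sum(axis=1, keepdims=True)).ravel()  # softmax over the slice axis
    assert np.allclose(softmax_multidim_ref(x, 2, 3, 4), expected)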
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
index 2b65eef42b..dc76189083 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
@@ -152,11 +152,12 @@ void dense_rf_lt(const data_T &data, res_T &res, const typename CONFIG_T::weight
     }
 }
 template <class data_T, class res_T, typename CONFIG_T>
-void dense_resource(const data_T &data, res_T &res) {
+void dense_resource(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                    const typename CONFIG_T::bias_t &biases) {
     if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) {
-        dense_rf_lt<data_T, res_T, CONFIG_T>(data, res, CONFIG_T::weights, CONFIG_T::biases);
+        dense_rf_lt<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     } else {
-        dense_rf_gt<data_T, res_T, CONFIG_T>(data, res, CONFIG_T::weights, CONFIG_T::biases);
+        dense_rf_gt<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
 }
 } // namespace nnet

diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index b42ff2990f..007b645cb0 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -242,13 +242,13 @@ def write_project_header(self, model):
                 for out in model_outputs:
                     newline += out.declare_cpp()
 
-            # Insert weights
+            # Insert weights
             elif '// hls-fpga-machine-learning insert weights' in line:
                 newline = line
                 for layer in model.get_layers():
                     for w in layer.get_weights():
-                        #if w not in model_brams:
-                        newline += f'#include "weights/{w.name}.h"\n'
+                        # if w not in model_brams:
+                        newline += f'#include "weights/{w.name}.h"\n'
 
             # Simply copy line, if no inserts are required
             else:
                 newline = line
@@ -557,16 +557,16 @@ def write_nnet_utils(self, model):
             dstpath = f'{model.config.get_output_dir()}/src/firmware/{dst}'
             copyfile(srcpath, dstpath)
 
-    def __get_table_size(self, model, activation):
+    def __get_table_size(self, model, activation, table_name='table_size'):
         for layer in model.get_layers():
             if (
                 layer.get_attr('activation') == activation or layer.get_attr('recurrent_activation') == activation
-            ) and layer.get_attr('table_size') is not None:
-                return int(layer.get_attr('table_size'))
+            ) and layer.get_attr(table_name) is not None:
+                return int(layer.get_attr(table_name))
         return 1024
 
-    def __get_table_header(self, table_name, table_size):
-        table_header = f'static const typename CONFIG_T::table_t {table_name}[{table_size}] = {{'
+    def __get_table_header(self, table_name, table_size, table_type='table_t'):
+        table_header = f'static const typename CONFIG_T::{table_type} {table_name}[{table_size}] = {{'
         return table_header
 
     def __write_elu_table(self, model, path):
@@ -695,46 +695,58 @@ def __write_selu_table(self, model, path):
         h_file.write('};\n')
         h_file.close()
 
+    def __get_table_precision(self, model, activation, table_name='table_precision'):
+        for layer in model.get_layers():
+            if layer.get_attr('activation') == activation and layer.get_attr(table_name) is not None:
+                precision = layer.get_attr(table_name)
+                return precision.precision
+
+        return None  # fp_bits, fp_integer, fp_signed
+
     def __write_exp_table(self, model, path):
         table_name = 'exp_table'
-        table_size = self.__get_table_size(model, 'softmax')
+        table_size = self.__get_table_size(model, 'softmax', table_name='exp_table_size')
 
         h_file = open(f'{path}/{table_name}.tb', 'w')
-        h_file.write(self.__get_table_header(table_name, table_size))
+        h_file.write(self.__get_table_header(table_name, table_size, table_type='exp_table_t'))
 
         # Default fixed point precision
         # 6 bits for integer part, 10 bits for decimal - total, 16
-        fp_bits = 16
-        fp_integer = 6
-        fp_signed = True
-
-        # Exp table should use the same precision as exp_table, as seen in Vivado code
-        # init_exp_table(exp_table);
-        for layer in model.get_layers():
-            if layer.name == 'softmax':
-                ac_type = layer.get_input_variable().type
-                if ac_type is not None:
-                    try:
-                        fp_bits = ac_type.precision.integer + ac_type.precision.fractional
-                        fp_integer = ac_type.precision.integer
-                        fp_signed = ac_type.precision.signed
-                    except Exception:
-                        # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
-                        pass
-        if fp_signed is False:
-            raise Exception('Softmax types need to be signed')
+        precision = self.__get_table_precision(model, 'softmax', table_name='inp_norm_t')
+
+        if precision is None:
+            fp_bits = 16
+            fp_integer = 6
+            fp_signed = True
+
+            for layer in model.get_layers():
+                if layer.name == 'softmax':
+                    ac_type = layer.get_input_variable().type
+                    if ac_type is not None:
+                        try:
+                            fp_bits = ac_type.precision.integer + ac_type.precision.fractional
+                            fp_integer = ac_type.precision.integer
+                            fp_signed = ac_type.precision.signed
+                        except Exception:
+                            # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
+                            pass
+            if fp_signed is False:
+                raise Exception('Softmax types need to be signed')
+
+        else:
+            fp_bits = precision.width
+            fp_integer = precision.integer
+            fp_signed = precision.signed
+
+        f_bits = fp_bits - fp_integer
 
         sep = ''
-        N = ceil_log2(table_size)
         for i in range(table_size):
-            f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
-            b = uint_to_binary(i, N)
-            if i == 0:
-                b.insert(0, 0)
-            else:
-                b.insert(0, 1)
-            f.set_msb_bits(b)
-            real_val = f.exp_float()
+            # Index represents the raw bit pattern of the input
+            real_val_in = i * (2.0 ** (-f_bits))
+
+            # Calculate exp(-x) for the stable implementation
+            real_val = np.exp(-real_val_in)
+
             h_file.write(sep + str(real_val))
             sep = ', '
 
     def __write_invert_table(self, model, path):
         table_name = 'invert_table'
-        table_size = self.__get_table_size(model, 'softmax')
+        table_size = self.__get_table_size(model, 'softmax', table_name='inv_table_size')
 
         h_file = open(f'{path}/{table_name}.tb', 'w')
-        h_file.write(self.__get_table_header(table_name, table_size))
+        h_file.write(self.__get_table_header(table_name, table_size, table_type='inv_table_t'))
 
         # Default fixed point precision, in case values from layer attributes cannot be extracted
         # 8 bits for integer part, 10 bits for decimal - total, 18
-        fp_bits = 18
-        fp_integer = 8
-        fp_signed = True
-
-        # Invert table should use the same precision as exp_table, as seen in Vivado code
-        # init_invert_table(invert_table);
-        for layer in model.get_layers():
-            if layer.name == 'softmax':
-                ac_type = layer.get_attr('exp_table_t')
-                if ac_type is not None:
-                    try:
-                        fp_bits = ac_type.precision.integer + ac_type.precision.fractional
-                        fp_integer = ac_type.precision.integer
-                        fp_signed = ac_type.precision.signed
-                    except Exception:
-                        # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
-                        pass
-        if fp_signed is False:
-            raise Exception('Softmax types need to be signed')
+        precision = self.__get_table_precision(model, 'softmax', table_name='inv_inp_t')
+
+        if precision is None:
+            fp_bits = 18
+            fp_integer = 8
+            fp_signed = True
+
+            for layer in model.get_layers():
+                if layer.name == 'softmax':
+                    ac_type = layer.get_attr('exp_table_t')
+                    if ac_type is not None:
+                        try:
+                            fp_bits = ac_type.precision.integer + ac_type.precision.fractional
+                            fp_integer = ac_type.precision.integer
+                            fp_signed = ac_type.precision.signed
+                        except Exception:
+                            # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
+                            pass
+            if fp_signed is False:
+                raise Exception('Softmax types need to be signed')
+
+        else:
+            fp_bits = precision.width
+            fp_integer = precision.integer
+            fp_signed = precision.signed
+
+        f_bits = fp_bits - fp_integer
 
         sep = ''
-        N = ceil_log2(table_size)
         for i in range(table_size):
-            f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
-            b = uint_to_binary(i, N)
-            b.insert(0, 0)
-            f.set_msb_bits(b)
-            real_val = f.inv_float()
+            # Index represents the raw bit pattern of the input
+            real_val_in = i * (2.0 ** (-f_bits))
+
+            if real_val_in == 0:
+                real_val = 999.0
+            else:
+                real_val = 1.0 / real_val_in
+
             h_file.write(sep + str(real_val))
             sep = ', '
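Aside (illustration only, not part of the patches): with the writer changes above, table index i
is the raw bit pattern of an unsigned fixed-point input with f_bits fractional bits, so entry i
encodes x = i * 2**-f_bits. The exp table stores exp(-x), because softmax_stable looks up
d = x_max - data, which is non-negative; the invert table stores 1/x with the sentinel 999.0 at
x = 0, exactly as in the diff. A sketch of the emitted values:

    import numpy as np

    def table_values(table_size, fp_bits, fp_integer):
        f_bits = fp_bits - fp_integer
        x = np.arange(table_size) * 2.0 ** -f_bits  # value encoded by bit pattern i
        exp_table = np.exp(-x)
        # Guard the x == 0 entry before dividing, as the writer does
        inv_table = np.where(x == 0, 999.0, 1.0 / np.maximum(x, 2.0 ** -f_bits))
        return exp_table, inv_table

    exp_t, inv_t = table_values(1024, 16, 6)
    print(exp_t[0], inv_t[0], inv_t[1])  # 1.0 999.0 1024.0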
From dbb207b7a5c1f343d8100bba9645340a2098730c Mon Sep 17 00:00:00 2001
From: laurilaatu
Date: Mon, 9 Feb 2026 16:33:38 +0000
Subject: [PATCH 4/6] Original weight implementation

---
 .../backends/oneapi/passes/core_templates.py | 91 +------------------
 1 file changed, 3 insertions(+), 88 deletions(-)

diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
index 5a2d765e8f..9602b2d0fc 100644
--- a/hls4ml/backends/oneapi/passes/core_templates.py
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -6,7 +6,6 @@
 # Dense templates
 
 dense_config_template = """struct config{index} : nnet::dense_config {{
-    static constexpr unsigned n_in = {n_in};
     static constexpr unsigned n_out = {n_out};
 
     static constexpr unsigned io_type = nnet::{iotype};
@@ -31,16 +30,13 @@
     typedef {weight_t.name} weight_t;
     typedef {index_t.name} index_t;
 
-    static constexpr weight_t weights = {weights};
-    static constexpr bias_t biases = {biases};
-
     template <class x_T, class y_T>
     using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""
 
 dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
 dense_task_sequence_template = 'task_sequence> {name};'
-dense_stream_function_template = '{name}.async();'
+dense_stream_function_template = '{name}.async({w}, {b});'
 dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_stream.h']
@@ -57,9 +53,6 @@ def format(self, node):
             node.get_input_variable().type.precision, node.get_weights('weight').type.precision
         )
 
-        params['weights'] = node.get_weights('weight').name
-        params['biases'] = node.get_weights('bias').name
-
        return self.template.format(**params)
@@ -199,7 +192,7 @@ def format(self, node):
     static constexpr unsigned reuse_factor = {reuse};
 }};\n"""
 
-softmax_config_template_qkeras = """struct {type}_config{index} : nnet::activ_config {{
+softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
     static constexpr unsigned n_in = {n_in};
     static constexpr unsigned table_size = {table_size};
     static constexpr unsigned io_type = nnet::{iotype};
@@ -209,26 +202,6 @@
     typedef {exp_table_t.name} exp_table_t;
     typedef {inv_table_t.name} inv_table_t;
 }};\n"""
 
-softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
-    static const unsigned n_in = {n_in};
-    static const unsigned n_slice = {n_slice};
-    static const unsigned n_outer = {n_outer};
-    static const unsigned n_inner = {n_inner};
-    static const unsigned parallelization_factor = {parallelization_factor};
-    static const unsigned exp_table_size = {exp_table_size};
-    static const unsigned inv_table_size = {inv_table_size};
-    static const unsigned io_type = nnet::{iotype};
-    static const unsigned reuse_factor = {reuse};
-    static const unsigned axis = {axis};
-    static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
-    static constexpr float exp_scale = {exp_scale};
-    typedef {exp_table_t.name} exp_table_t;
-    typedef {inv_table_t.name} inv_table_t;
-    typedef {accum_t.name} accum_t;
-    typedef {inv_inp_t.name} inv_inp_t;
-    typedef {inp_norm_t_str} inp_norm_t;
-}};\n"""
-
 activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'
 param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});'
@@ -280,68 +253,10 @@ def __init__(self):
         super(ActivationConfigTemplate, self).__init__(Softmax)  # Skip ActivationConfigTemplate's __init__
         self.template = softmax_config_template
 
-    def format(self, node):
-        from math import ceil, log2
-
-        params = self._default_config_params(node)
-        params['type'] = node.get_attr('activation')
-        params.setdefault('exp_table_size', params['table_size'])
-        params.setdefault('inv_table_size', params['table_size'])
-        params.setdefault('n_inner', 1)
-        params.setdefault('n_outer', 1)
-        params.setdefault('exp_scale', 1.0)
-        params.setdefault('parallelization_factor', -1)
-
-        n_slice = params['n_in'] // params['n_inner'] // params['n_outer']  # type: ignore
-        params['n_slice'] = n_slice
-
-        if params['accum_t'].name == 'model_default_t':  # type: ignore
-            scale = ceil(log2(n_slice))
-            exp_table_t = node.attributes['exp_table_t'].precision
-            signed, width, integers = exp_table_t.signed, exp_table_t.width, exp_table_t.integer
-            params['accum_t_str'] = f'ac_{"" if signed else "u"}fixed<{width + scale}, {integers + scale}>'
-        else:
-            params['accum_t_str'] = params['accum_t'].name  # type: ignore
-        if params['inv_inp_t'].name == 'model_default_t':  # type: ignore
-            params['inv_inp_t'] = params['exp_table_t']
-
-        if params['implementation'] == 'stable':
-            if 'inp_norm_t' not in params:
-                # Only used in stable (max-normalized) implementation
-                input_t = node.get_input_variable().type.precision
-                width, iwidth, signed = input_t.width, input_t.integer, input_t.signed  # noqa: F841
-                width, iwidth = width - signed, iwidth - signed
-                if signed:
-                    # Fix table size if too large
-                    exp_table_size = params['inv_table_size']
-                    params['exp_table_size'] = str(min(int(exp_table_size), 2**width))
-                params['inp_norm_t_str'] = f'ac_ufixed<{width}, {iwidth}>'
-            else:
-                params['inp_norm_t_str'] = params['inp_norm_t'].name  # type: ignore
-        else:
-            params['inp_norm_t_str'] = 'ac_fixed<1,0>'
-
-        return self.template.format(**params)
-
-
-class SoftmaxFunctionTemplate(FunctionCallTemplate):
-    def __init__(self):
-        super().__init__(Softmax, include_header=activ_include_list)
-        self.template = activ_function_template
-
-    def format(self, node):
-        params = self._default_function_params(node)
-        use_multidim = node.get_attr('n_inner', 1) > 1 or node.get_attr('n_outer', 1) > 1
-        use_multidim = use_multidim and node.model.config.get_config_value('IOType') == 'io_parallel'
-        params['activation'] = 'softmax' if not use_multidim else 'softmax_multidim'
-        params['config'] = f'softmax_config{node.index}'
-
-        return self.template.format(**params)
-
 
 class ActivationFunctionTemplate(FunctionCallTemplate):
     def __init__(self):
-        super().__init__((Activation, HardActivation), include_header=activ_include_list)
+        super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list)
         self.template = activ_function_template
 
     def format(self, node):

From 51efff0c34744ab2fa70d7e3a52fdbf196ffcf0a Mon Sep 17 00:00:00 2001
From: laurilaatu
Date: Mon, 9 Feb 2026 16:51:19 +0000
Subject: [PATCH 5/6] Restore oneAPI weight placement

---
 hls4ml/templates/oneapi/firmware/myproject.cpp | 5 +----
 hls4ml/templates/oneapi/firmware/myproject.h   | 3 ---
 hls4ml/writer/oneapi_writer.py                 | 7 -------
 3 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp
index da9439f74a..06e7d3fe37 100644
--- a/hls4ml/templates/oneapi/firmware/myproject.cpp
+++ b/hls4ml/templates/oneapi/firmware/myproject.cpp
@@ -1,12 +1,9 @@
 #include "myproject.h"
+#include "parameters.h"
 #include 
 
 // hls-fpga-machine-learning insert weights
-
-#include "parameters.h"
-
-
 // The inter-task pipes need to be declared in the global scope
 // hls-fpga-machine-learning insert inter-task pipes

diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h
index 8f313ea30f..082ae5dc8c 100644
--- a/hls4ml/templates/oneapi/firmware/myproject.h
+++ b/hls4ml/templates/oneapi/firmware/myproject.h
@@ -3,9 +3,6 @@
 
 #include "defines.h"
 
-// hls-fpga-machine-learning insert weights
-
-
 // This file defines the interface to the kernel
 // currently this is fixed

diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index 007b645cb0..8ef2b0b0a1 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -242,13 +242,6 @@ def write_project_header(self, model):
                 for out in model_outputs:
                     newline += out.declare_cpp()
 
-            # Insert weights
-            elif '// hls-fpga-machine-learning insert weights' in line:
-                newline = line
-                for layer in model.get_layers():
-                    for w in layer.get_weights():
-                        # if w not in model_brams:
-                        newline += f'#include "weights/{w.name}.h"\n'
 
             # Simply copy line, if no inserts are required
             else:
                 newline = line
From 6067bea99e35fd0bb3b2d89323e721e3916b0960 Mon Sep 17 00:00:00 2001
From: laurilaatu
Date: Mon, 9 Feb 2026 16:52:42 +0000
Subject: [PATCH 6/6] pre-commit

---
 hls4ml/writer/oneapi_writer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index 8ef2b0b0a1..b945f3faf9 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -242,7 +242,6 @@ def write_project_header(self, model):
             for out in model_outputs:
                 newline += out.declare_cpp()
 
-
             # Simply copy line, if no inserts are required
             else:
                 newline = line
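Aside (illustration only, not part of the patches): after patches 4-6 the oneAPI dense templates
are back to the original explicit-weight form. Rendering the restored template with made-up
placeholder values shows what the generated call looks like:

    dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
    print(dense_function_template.format(
        strategy='resource', input_t='input_t', output_t='result_t', config='config2',
        input='fc1_input', output='layer2_out', w='w2', b='b2',
    ))
    # nnet::dense_resource<input_t, result_t, config2>(fc1_input, layer2_out, w2, b2);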