diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py
index 0c11c16d0..94f26c9f1 100644
--- a/hls4ml/backends/oneapi/oneapi_backend.py
+++ b/hls4ml/backends/oneapi/oneapi_backend.py
@@ -19,7 +19,6 @@
     Embedding,
     Layer,
     SimpleRNN,
-    Softmax,
 )
 from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType
@@ -257,13 +256,6 @@ def init_activation(self, layer):
         if layer.get_attr('recurrent_activation') == 'tanh':
             layer.set_attr('recurrent_activation', 'dense_tanh')
 
-    @layer_optimizer(Softmax)
-    def init_softmax(self, layer):
-        if layer.model.config.get_config_value('IOType') == 'io_parallel':
-            assert len(layer.get_input_variable().shape) == 1, (
-                'Softmax with io_parallel strategy cannot be used on multidimensional tensors.'
-            )
-
     @layer_optimizer(Embedding)
     def init_embed(self, layer):
         if layer.attributes['n_in'] is None:
diff --git a/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py b/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
index 7154d0c9c..09723f533 100644
--- a/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
+++ b/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
@@ -14,7 +14,7 @@
 
 
 class QMultiHeadAttentionHandler(QLayerHandler):
-    handles = ('hgq.layers.multi_head_attention.QMultiHeadAttention',)
+    handles = ('hgq.layers.attn.mha.QMultiHeadAttention',)
 
     def handle(
         self,
@@ -127,7 +127,7 @@ def _handle(self, layer, tensor_q, tensor_O, node_index, tensor_k, tensor_v):
 
 
 class QLinformerAttentionHandler(QMultiHeadAttentionHandler):
-    handles = ('hgq.layers.linformer_attention.QLinformerAttention',)
+    handles = ('hgq.layers.attn.linformer.QLinformerAttention',)
 
     def handle(
         self,
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
index f118ecb05..c2353c34a 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
@@ -100,15 +100,8 @@ template <class data_T, class res_T, typename CONFIG_T> void sigmoid(const data_
 enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 };
 
 template <class data_T, typename CONFIG_T> inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
-    // Number of address bits for table
-    static constexpr int N = ceillog2<CONFIG_T::table_size>::val;
-
-    // Slice the top N bits of the input
-    [[intel::fpga_register]] ac_int<N, false> y = x.template slc<N>(x.width - N - 1);
-    // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness
-    if (x != 0 && y == 0)
-        y[0] = 1;
-    return y.to_uint();
+    // Extract the lower 'width' bits of x
+    return x.template slc<data_T::width>(0).to_uint();
 }
 
 template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_from_real_val(const data_T x) {
@@ -121,7 +114,6 @@ template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_f
 }
 
 template <class data_T, class res_T, typename CONFIG_T> void softmax_stable(const data_T &data, res_T &res) {
-// Look-up tables
 #include "activation_tables/exp_table.tb"
 #include "activation_tables/invert_table.tb"
 
@@ -130,29 +122,34 @@
     [[intel::fpga_register]] auto x_max =
         reduce<typename data_T::value_type, CONFIG_T::n_in, Op_max<typename data_T::value_type>>(data.data(), op_max);
 
-    // For the diffs, use the same type as the input but force rounding and saturation
-    [[intel::fpga_register]] ac_fixed<data_T::value_type::width, data_T::value_type::i_width, true, AC_RND, AC_SAT>
-        d_xi_xmax[CONFIG_T::n_in];
+    // Normalize inputs: d = x_max - x
+    [[intel::fpga_register]] typename CONFIG_T::inp_norm_t d_xi_xmax[CONFIG_T::n_in];
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        d_xi_xmax[i] = data[i] - x_max;
+        // HGQ stable: d = x_max - data
+        d_xi_xmax[i] = x_max - data[i];
     }
 
-    // Calculate all the e^x's
+    // Exponentials
     [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        exp_res[i] = exp_table[softmax_stable_idx_from_real_val<typename data_T::value_type, CONFIG_T>(d_xi_xmax[i])];
+        unsigned idx = softmax_stable_idx_from_real_val<typename CONFIG_T::inp_norm_t, CONFIG_T>(d_xi_xmax[i]);
+        exp_res[i] = exp_table[idx];
     }
 
-    // Explicitly sum previously calculated exponentials with an adder tree
-    Op_add<typename CONFIG_T::exp_table_t> op_add;
-    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
-        reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
+    // Sum of Exponentials
+    Op_add<typename CONFIG_T::accum_t> op_add;
+    [[intel::fpga_register]] typename CONFIG_T::accum_t exp_sum =
+        reduce<typename CONFIG_T::accum_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);
 
-    // Multiply previously calculated exponetials with the reciprocal of the sum
-    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
-        invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
+    // Reciprocal of Sum
+    typename CONFIG_T::inv_inp_t exp_sum_cast = exp_sum;
+    unsigned inv_idx = softmax_stable_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T>(exp_sum_cast);
+
+    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[inv_idx];
+
+    // Final Multiplication
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
         res[i] = exp_res[i] * inv_exp_sum;
     }
 }
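Note (illustration, not part of the patch): with the HGQ-style tables, the normalized difference d = x_max - x is non-negative, and softmax_stable_idx_from_real_val now just reinterprets its low 'width' bits as the table address, while exp_table[i] holds exp(-x) for the value whose raw bit pattern is i. A minimal Python sketch of that indexing scheme, assuming a purely fractional, unsigned inp_norm_t with f_bits fractional bits and a power-of-two table_size = 2**f_bits (the helper below is hypothetical, names are reused from the patch):

import numpy as np

def stable_idx_from_real_val(d, f_bits, table_size):
    # Raw bit pattern of an unsigned fixed-point value, i.e. slc<width>(0)
    raw = int(np.floor(d * (1 << f_bits)))
    return raw & (table_size - 1)

f_bits = 10
table_size = 1 << f_bits
exp_table = np.exp(-(np.arange(table_size) * 2.0**-f_bits))  # entry i = exp(-i * 2**-f_bits)

d = 0.713  # some x_max - x
print(exp_table[stable_idx_from_real_val(d, f_bits, table_size)], np.exp(-d))  # both ~0.490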
@@ -265,6 +262,45 @@ template <class data_T, class res_T, typename CONFIG_T> inline void softmax(cons
     }
 }
 
+// *************************************************
+// Multidimensional Softmax
+// *************************************************
+
+// Helper to remap the config for the core softmax function
+template <typename CONFIG_T> struct softmax_multidim_slice_config : CONFIG_T {
+    static constexpr unsigned n_in = CONFIG_T::n_slice;
+};
+
+template <class data_T, class res_T, typename CONFIG_T> inline void softmax_multidim(const data_T &data, res_T &res) {
+    using buffer_data_t = std::array<typename data_T::value_type, CONFIG_T::n_slice>;
+    using buffer_res_t = std::array<typename res_T::value_type, CONFIG_T::n_slice>;
+    using slice_config = softmax_multidim_slice_config<CONFIG_T>;
+
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_outer; i++) {
+        #pragma unroll
+        for (unsigned k = 0; k < CONFIG_T::n_inner; k++) {
+
+            [[intel::fpga_register]] buffer_data_t buffer_in;
+            [[intel::fpga_register]] buffer_res_t buffer_out;
+
+            // Gather Phase
+            #pragma unroll
+            for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
+                unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
+                buffer_in[j] = data[idx];
+            }
+
+            nnet::softmax<buffer_data_t, buffer_res_t, slice_config>(buffer_in, buffer_out);
+
+            #pragma unroll
+            for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
+                unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
+                res[idx] = buffer_out[j];
+            }
+        }
+    }
+}
 // *************************************************
 // TanH Activation
 // *************************************************
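Note (illustration, not part of the patch): softmax_multidim treats the flattened input as a tensor of shape (n_outer, n_slice, n_inner) stored row-major and normalizes along the middle axis; the gather/scatter index i * n_slice * n_inner + j * n_inner + k is simply the offset of element (i, j, k). A NumPy sketch of the same loop structure, with the table-based softmax swapped for a floating-point reference:

import numpy as np

def softmax_ref(v):
    e = np.exp(v - v.max())
    return e / e.sum()

def softmax_multidim_ref(data, n_outer, n_slice, n_inner):
    res = np.empty_like(data)
    for i in range(n_outer):
        for k in range(n_inner):
            # Gather one slice along the middle axis, normalize it, scatter it back
            idx = i * n_slice * n_inner + np.arange(n_slice) * n_inner + k
            res[idx] = softmax_ref(data[idx])
    return res

x = np.random.randn(2 * 5 * 3)
flat = softmax_multidim_ref(x, 2, 5, 3)
ref = np.apply_along_axis(softmax_ref, 1, x.reshape(2, 5, 3)).ravel()
assert np.allclose(flat, ref)  # same result as softmax along the middle axis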
diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index 3c0a778c5..b945f3faf 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -549,16 +549,16 @@ def write_nnet_utils(self, model):
             dstpath = f'{model.config.get_output_dir()}/src/firmware/{dst}'
             copyfile(srcpath, dstpath)
 
-    def __get_table_size(self, model, activation):
+    def __get_table_size(self, model, activation, table_name='table_size'):
         for layer in model.get_layers():
             if (
                 layer.get_attr('activation') == activation or layer.get_attr('recurrent_activation') == activation
-            ) and layer.get_attr('table_size') is not None:
-                return int(layer.get_attr('table_size'))
+            ) and layer.get_attr(table_name) is not None:
+                return int(layer.get_attr(table_name))
         return 1024
 
-    def __get_table_header(self, table_name, table_size):
-        table_header = f'static const typename CONFIG_T::table_t {table_name}[{table_size}] = {{'
+    def __get_table_header(self, table_name, table_size, table_type='table_t'):
+        table_header = f'static const typename CONFIG_T::{table_type} {table_name}[{table_size}] = {{'
         return table_header
 
     def __write_elu_table(self, model, path):
@@ -687,46 +687,58 @@ def __write_selu_table(self, model, path):
         h_file.write('};\n')
         h_file.close()
 
+    def __get_table_precision(self, model, activation, table_name='table_precision'):
+        for layer in model.get_layers():
+            if layer.get_attr('activation') == activation and layer.get_attr(table_name) is not None:
+                precision = layer.get_attr(table_name)
+                return precision.precision
+
+        return None  # fp_bits, fp_integer, fp_signed
+
     def __write_exp_table(self, model, path):
         table_name = 'exp_table'
-        table_size = self.__get_table_size(model, 'softmax')
+        table_size = self.__get_table_size(model, 'softmax', table_name='exp_table_size')
 
         h_file = open(f'{path}/{table_name}.tb', 'w')
-        h_file.write(self.__get_table_header(table_name, table_size))
+        h_file.write(self.__get_table_header(table_name, table_size, table_type='exp_table_t'))
 
         # Default fixed point precision
         # 6 bits for integer part, 10 bits for decimal - total, 16
-        fp_bits = 16
-        fp_integer = 6
-        fp_signed = True
+        precision = self.__get_table_precision(model, 'softmax', table_name='inp_norm_t')
+
+        if precision is None:
+            fp_bits = 16
+            fp_integer = 6
+            fp_signed = True
+
+            for layer in model.get_layers():
+                if layer.name == 'softmax':
+                    ac_type = layer.get_input_variable().type
+                    if ac_type is not None:
+                        try:
+                            fp_bits = ac_type.precision.integer + ac_type.precision.fractional
+                            fp_integer = ac_type.precision.integer
+                            fp_signed = ac_type.precision.signed
+                        except Exception:
+                            # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
+                            pass
+            if fp_signed is False:
+                raise Exception('Softmax types need to be signed')
-
-        # Exp table should use the same precision as exp_table, as seen in Vivado code
-        # init_exp_table(exp_table);
-        for layer in model.get_layers():
-            if layer.name == 'softmax':
-                ac_type = layer.get_input_variable().type
-                if ac_type is not None:
-                    try:
-                        fp_bits = ac_type.precision.integer + ac_type.precision.fractional
-                        fp_integer = ac_type.precision.integer
-                        fp_signed = ac_type.precision.signed
-                    except Exception:
-                        # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
-                        pass
-        if fp_signed is False:
-            raise Exception('Softmax types need to be signed')
+        else:
+            fp_bits = precision.width
+            fp_integer = precision.integer
+            fp_signed = precision.signed
+
+        f_bits = fp_bits - fp_integer
 
         sep = ''
-        N = ceil_log2(table_size)
         for i in range(table_size):
-            f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
-            b = uint_to_binary(i, N)
-            if i == 0:
-                b.insert(0, 0)
-            else:
-                b.insert(0, 1)
-            f.set_msb_bits(b)
-            real_val = f.exp_float()
+            # Index represents the raw bit pattern of the input
+            real_val_in = i * (2.0 ** (-f_bits))
+
+            # Calculate exp(-x) for the stable implementation
+            real_val = np.exp(-real_val_in)
+
             h_file.write(sep + str(real_val))
             sep = ', '
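Note (illustration, not part of the patch): the writer no longer emulates the old top-bit slicing with FixedPointEmulator; exp table entry i is simply exp(-x) evaluated at the value whose raw bit pattern is i, with f_bits = fp_bits - fp_integer fractional bits. A standalone sketch of that loop (the 1024/16/6 defaults are taken from the code above):

import numpy as np

def exp_table_values(table_size=1024, fp_bits=16, fp_integer=6):
    f_bits = fp_bits - fp_integer
    table = []
    for i in range(table_size):
        real_val_in = i * (2.0 ** (-f_bits))  # value whose raw bit pattern is i
        table.append(np.exp(-real_val_in))    # stable softmax looks up exp(-(x_max - x))
    return table

table = exp_table_values()
print(table[0], table[1])  # 1.0 and exp(-2**-10)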
@@ -735,41 +747,50 @@
     def __write_invert_table(self, model, path):
         table_name = 'invert_table'
-        table_size = self.__get_table_size(model, 'softmax')
+        table_size = self.__get_table_size(model, 'softmax', table_name='inv_table_size')
 
         h_file = open(f'{path}/{table_name}.tb', 'w')
-        h_file.write(self.__get_table_header(table_name, table_size))
-
+        h_file.write(self.__get_table_header(table_name, table_size, table_type='inv_table_t'))
         # Default fixed point precision, in case values from layer attributes cannot be extracted
         # 8 bits for integer part, 10 bits for decimal - total, 18
-        fp_bits = 18
-        fp_integer = 8
-        fp_signed = True
-
-        # Invert table should use the same precision as exp_table, as seen in Vivado code
-        # init_invert_table(invert_table);
-        for layer in model.get_layers():
-            if layer.name == 'softmax':
-                ac_type = layer.get_attr('exp_table_t')
-                if ac_type is not None:
-                    try:
-                        fp_bits = ac_type.precision.integer + ac_type.precision.fractional
-                        fp_integer = ac_type.precision.integer
-                        fp_signed = ac_type.precision.signed
-                    except Exception:
-                        # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
-                        pass
-        if fp_signed is False:
-            raise Exception('Softmax types need to be signed')
+        precision = self.__get_table_precision(model, 'softmax', table_name='inv_inp_t')
+
+        if precision is None:
+            fp_bits = 18
+            fp_integer = 8
+            fp_signed = True
+
+            for layer in model.get_layers():
+                if layer.name == 'softmax':
+                    ac_type = layer.get_attr('exp_table_t')
+                    if ac_type is not None:
+                        try:
+                            fp_bits = ac_type.precision.integer + ac_type.precision.fractional
+                            fp_integer = ac_type.precision.integer
+                            fp_signed = ac_type.precision.signed
+                        except Exception:
+                            # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
+                            pass
+            if fp_signed is False:
+                raise Exception('Softmax types need to be signed')
+
+        else:
+            fp_bits = precision.width
+            fp_integer = precision.integer
+            fp_signed = precision.signed
+
+        f_bits = fp_bits - fp_integer
 
         sep = ''
-        N = ceil_log2(table_size)
         for i in range(table_size):
-            f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
-            b = uint_to_binary(i, N)
-            b.insert(0, 0)
-            f.set_msb_bits(b)
-            real_val = f.inv_float()
+            # Index represents the raw bit pattern of the input
+            real_val_in = i * (2.0 ** (-f_bits))
+
+            if real_val_in == 0:
+                real_val = 999.0
+            else:
+                real_val = 1.0 / real_val_in
+
             h_file.write(sep + str(real_val))
             sep = ', '
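Note (illustration, not part of the patch): the invert table follows the same raw-bit-pattern convention, storing 1/x with 999.0 guarding index 0, so the stable path reduces to exp_table[x_max - x] scaled by invert_table[sum of exponentials]. A small NumPy emulation under assumed table widths (7 and 6 fractional bits, chosen here only so the example values stay in range):

import numpy as np

def make_tables(table_size, f_bits_exp, f_bits_inv):
    grid_exp = np.arange(table_size) * 2.0**-f_bits_exp
    grid_inv = np.arange(table_size) * 2.0**-f_bits_inv
    exp_table = np.exp(-grid_exp)
    # 999.0 stands in for the undefined reciprocal at index 0
    invert_table = np.where(grid_inv == 0, 999.0, 1.0 / np.maximum(grid_inv, 2.0**-f_bits_inv))
    return exp_table, invert_table

def softmax_stable_emu(x, table_size=1024, f_bits_exp=7, f_bits_inv=6):
    exp_table, invert_table = make_tables(table_size, f_bits_exp, f_bits_inv)
    d = x.max() - x  # d = x_max - x >= 0
    exp_res = exp_table[np.floor(d * 2.0**f_bits_exp).astype(int) & (table_size - 1)]
    inv = invert_table[int(np.floor(exp_res.sum() * 2.0**f_bits_inv)) & (table_size - 1)]
    return exp_res * inv

x = np.array([0.5, 1.0, -0.25, 2.0])
print(softmax_stable_emu(x))                            # table-based approximation
print(np.exp(x - x.max()) / np.exp(x - x.max()).sum())  # floating-point reference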