From 3d463b3d7948520cca82afe67df7d67c4163b897 Mon Sep 17 00:00:00 2001
From: laurilaatu
Date: Mon, 26 Jan 2026 20:37:28 +0000
Subject: [PATCH 1/6] weights for dense

---
 hls4ml/backends/oneapi/passes/core_templates.py | 15 +++++++++++----
 hls4ml/templates/oneapi/firmware/myproject.cpp  |  5 ++++-
 hls4ml/templates/oneapi/firmware/myproject.h    |  3 +++
 .../oneapi/firmware/nnet_utils/nnet_dense.h     |  7 +++----
 hls4ml/writer/oneapi_writer.py                  |  8 ++++++++
 5 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
index 9602b2d0fc..64a4c7097a 100644
--- a/hls4ml/backends/oneapi/passes/core_templates.py
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -6,6 +6,7 @@
 # Dense templates
 
 dense_config_template = """struct config{index} : nnet::dense_config {{
+    static constexpr unsigned n_in = {n_in};
     static constexpr unsigned n_out = {n_out};
 
     static constexpr unsigned io_type = nnet::{iotype};
@@ -30,13 +31,16 @@
     typedef {weight_t.name} weight_t;
     typedef {index_t.name} index_t;
 
+    static constexpr weight_t weights = {weights};
+    static constexpr bias_t biases = {biases};
+
     template <class x_T, class y_T>
     using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""
 
-dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
+dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output});'
 dense_task_sequence_template = 'task_sequence> {name};'
-dense_stream_function_template = '{name}.async({w}, {b});'
+dense_stream_function_template = '{name}.async();'
 dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_stream.h']
@@ -53,6 +57,9 @@ def format(self, node):
             node.get_input_variable().type.precision, node.get_weights('weight').type.precision
         )
 
+        params['weights'] = node.get_weights('weight').name
+        params['biases'] = node.get_weights('bias').name
+
         return self.template.format(**params)
@@ -63,8 +70,8 @@ def __init__(self):
 
     def format(self, node):
         params = self._default_function_params(node)
-        params['w'] = node.get_weights('weight').name
-        params['b'] = node.get_weights('bias').name
+        #params['w'] = node.get_weights('weight').name
+        #params['b'] = node.get_weights('bias').name
 
         return self.template.format(**params)

diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp
index 06e7d3fe37..da9439f74a 100644
--- a/hls4ml/templates/oneapi/firmware/myproject.cpp
+++ b/hls4ml/templates/oneapi/firmware/myproject.cpp
@@ -1,9 +1,12 @@
 #include "myproject.h"
-#include "parameters.h"
 #include 
 
 // hls-fpga-machine-learning insert weights
+
+#include "parameters.h"
+
+
 // The inter-task pipes need to be declared in the global scope
 // hls-fpga-machine-learning insert inter-task pipes

diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h
index 082ae5dc8c..8f313ea30f 100644
--- a/hls4ml/templates/oneapi/firmware/myproject.h
+++ b/hls4ml/templates/oneapi/firmware/myproject.h
@@ -3,6 +3,9 @@
 
 #include "defines.h"
 
+// hls-fpga-machine-learning insert weights
+
+
 // This file defines the interface to the kernel
 // currently this is fixed

diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
index dc76189083..2b65eef42b 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
@@ -152,12 +152,11 @@ void dense_rf_lt(const data_T &data, res_T &res, const typename CONFIG_T::weight
     }
 }
 template <class data_T, class res_T, typename CONFIG_T>
-void dense_resource(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
-                    const typename CONFIG_T::bias_t &biases) {
+void dense_resource(const data_T &data, res_T &res) {
     if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) {
-        dense_rf_lt<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+        dense_rf_lt<data_T, res_T, CONFIG_T>(data, res, CONFIG_T::weights, CONFIG_T::biases);
     } else {
-        dense_rf_gt<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+        dense_rf_gt<data_T, res_T, CONFIG_T>(data, res, CONFIG_T::weights, CONFIG_T::biases);
     }
 }
 } // namespace nnet

diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index 3c0a778c50..b42ff2990f 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -242,6 +242,14 @@ def write_project_header(self, model):
                 for out in model_outputs:
                     newline += out.declare_cpp()
 
+            # Insert weights
+            elif '// hls-fpga-machine-learning insert weights' in line:
+                newline = line
+                for layer in model.get_layers():
+                    for w in layer.get_weights():
+                        #if w not in model_brams:
+                        newline += f'#include "weights/{w.name}.h"\n'
+
             # Simply copy line, if no inserts are required
             else:
                 newline = line

From d67857369385d066b7cdaad49077069b3bf9473c Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Tue, 27 Jan 2026 18:58:42 +0000
Subject: [PATCH 2/6] hgq2 homogeneous quant fix

---
 hls4ml/converters/keras_v3/hgq2/_base.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/hls4ml/converters/keras_v3/hgq2/_base.py b/hls4ml/converters/keras_v3/hgq2/_base.py
index 4a6d0a22c2..f7b4c9ddd3 100644
--- a/hls4ml/converters/keras_v3/hgq2/_base.py
+++ b/hls4ml/converters/keras_v3/hgq2/_base.py
@@ -30,15 +30,19 @@ def extract_fixed_quantizer_config(q, tensor: 'KerasTensor', is_input: bool) ->
     k, B, I = ops.convert_to_numpy(k), ops.convert_to_numpy(B), ops.convert_to_numpy(I)  # noqa: E741
     I = np.where(B > 0, I, 0)  # noqa: E741 # type: ignore
 
-    k = np.broadcast_to(k.astype(np.int16), (1,) + shape)  # type: ignore
-    B = np.broadcast_to(B.astype(np.int16), (1,) + shape)  # type: ignore
-    I = np.broadcast_to(I.astype(np.int16), (1,) + shape)  # noqa: E741
+    if np.size(k) != 1:
+        k = np.broadcast_to(k.astype(np.int16), (1,) + shape)  # type: ignore
+        B = np.broadcast_to(B.astype(np.int16), (1,) + shape)  # type: ignore
+        I = np.broadcast_to(I.astype(np.int16), (1,) + shape)  # noqa: E741
+    else:
+        k = np.ravel(k).astype(np.int16)
+        B = np.ravel(B).astype(np.int16)
+        I = np.ravel(I).astype(np.int16)  # noqa: E741
 
     overflow_mode: str = internal_q.overflow_mode
     round_mode: str = internal_q.round_mode
     if round_mode.startswith('S_'):
         round_mode = round_mode[2:]
-    fusible = np.unique(k).size == 1 and np.unique(B).size == 1 and np.unique(I).size == 1
 
     input_keras_tensor_names = tensor.name if is_input else f'{tensor.name}_q'
     output_keras_tensor_names = f'{tensor.name}_q' if is_input else tensor.name
@@ -48,7 +52,7 @@ def extract_fixed_quantizer_config(q, tensor: 'KerasTensor', is_input: bool) ->
         'mask_kbi': (k, B, I),
         'SAT': overflow_mode,
         'RND': round_mode,
-        'fusible': fusible,
+        'fusible': None,
         'input_keras_tensor_names': [input_keras_tensor_names],
         'output_keras_tensor_names': [output_keras_tensor_names],
         'overrides': {},
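Aside (illustration only, not part of the patch series): the fix above stops force-broadcasting
size-1 quantizer parameters. When k/B/I are scalars, HGQ2 describes one layer-wide fixed-point
format, and keeping them as a single triple is what lets downstream passes treat the quantizer
as homogeneous; broadcasting to the tensor shape would hide that. A minimal numpy sketch of the
same branch (the helper name normalize_kbi is invented for this note):

    import numpy as np

    def normalize_kbi(k, B, I, shape):
        I = np.where(B > 0, I, 0)  # zero-width channels carry no integer bits
        if np.size(k) != 1:
            # Heterogeneous quantizer: one (k, B, I) triple per element
            k = np.broadcast_to(k.astype(np.int16), (1,) + shape)
            B = np.broadcast_to(B.astype(np.int16), (1,) + shape)
            I = np.broadcast_to(I.astype(np.int16), (1,) + shape)
        else:
            # Homogeneous quantizer: keep a single triple
            k, B, I = (np.ravel(x).astype(np.int16) for x in (k, B, I))
        return k, B, I

    k, B, I = normalize_kbi(np.array(1.0), np.array(8.0), np.array(0.0), (4, 4))
    print(k.shape, B.shape, I.shape)  # (1,) (1,) (1,) rather than (1, 4, 4)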
From 59bd96f0c5e9c8e95538a9e96e0233c2d70695ba Mon Sep 17 00:00:00 2001
From: laurilaatu
Date: Mon, 9 Feb 2026 16:31:00 +0000
Subject: [PATCH 3/6] Changes required for oneAPI MHA

---
 hls4ml/backends/oneapi/oneapi_backend.py       |   8 -
 .../backends/oneapi/passes/core_templates.py   |  88 ++++++++++-
 .../keras_v3/hgq2/multi_head_attention.py      |   4 +-
 .../firmware/nnet_utils/nnet_activation.h      |  82 +++++++---
 .../oneapi/firmware/nnet_utils/nnet_dense.h    |   7 +-
 hls4ml/writer/oneapi_writer.py                 | 149 ++++++++++--------
 6 files changed, 233 insertions(+), 105 deletions(-)

diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py
index 0c11c16d09..94f26c9f1c 100644
--- a/hls4ml/backends/oneapi/oneapi_backend.py
+++ b/hls4ml/backends/oneapi/oneapi_backend.py
@@ -19,7 +19,6 @@
     Embedding,
     Layer,
     SimpleRNN,
-    Softmax,
 )
 from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType
@@ -257,13 +256,6 @@ def init_activation(self, layer):
         if layer.get_attr('recurrent_activation') == 'tanh':
             layer.set_attr('recurrent_activation', 'dense_tanh')
 
-    @layer_optimizer(Softmax)
-    def init_softmax(self, layer):
-        if layer.model.config.get_config_value('IOType') == 'io_parallel':
-            assert len(layer.get_input_variable().shape) == 1, (
-                'Softmax with io_parallel strategy cannot be used on multidimensional tensors.'
-            )
-
     @layer_optimizer(Embedding)
     def init_embed(self, layer):
         if layer.attributes['n_in'] is None:

diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
index 64a4c7097a..5a2d765e8f 100644
--- a/hls4ml/backends/oneapi/passes/core_templates.py
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -38,7 +38,7 @@
     using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""
 
-dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output});'
+dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
 dense_task_sequence_template = 'task_sequence> {name};'
 dense_stream_function_template = '{name}.async();'
 dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_stream.h']
@@ -70,8 +70,8 @@ def __init__(self):
 
     def format(self, node):
         params = self._default_function_params(node)
-        #params['w'] = node.get_weights('weight').name
-        #params['b'] = node.get_weights('bias').name
+        params['w'] = node.get_weights('weight').name
+        params['b'] = node.get_weights('bias').name
 
         return self.template.format(**params)
@@ -199,7 +199,7 @@ def format(self, node):
     static constexpr unsigned reuse_factor = {reuse};
 }};\n"""
 
-softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
+softmax_config_template_qkeras = """struct {type}_config{index} : nnet::activ_config {{
     static constexpr unsigned n_in = {n_in};
     static constexpr unsigned table_size = {table_size};
     static constexpr unsigned io_type = nnet::{iotype};
@@ -209,6 +209,26 @@
     typedef {exp_table_t.name} exp_table_t;
     typedef {inv_table_t.name} inv_table_t;
 }};\n"""
 
+softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
+    static const unsigned n_in = {n_in};
+    static const unsigned n_slice = {n_slice};
+    static const unsigned n_outer = {n_outer};
+    static const unsigned n_inner = {n_inner};
+    static const unsigned parallelization_factor = {parallelization_factor};
+    static const unsigned exp_table_size = {exp_table_size};
+    static const unsigned inv_table_size = {inv_table_size};
+    static const unsigned io_type = nnet::{iotype};
+    static const unsigned reuse_factor = {reuse};
+    static const unsigned axis = {axis};
+    static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
+    static constexpr float exp_scale = {exp_scale};
+    typedef {exp_table_t.name} exp_table_t;
+    typedef {inv_table_t.name} inv_table_t;
+    typedef {accum_t.name} accum_t;
+    typedef {inv_inp_t.name} inv_inp_t;
+    typedef {inp_norm_t_str} inp_norm_t;
+}};\n"""
+
 activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'
 param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});'
@@ -260,10 +280,68 @@ def __init__(self):
         super(ActivationConfigTemplate, self).__init__(Softmax)  # Skip ActivationConfigTemplate's __init__
         self.template = softmax_config_template
 
+    def format(self, node):
+        from math import ceil, log2
+
+        params = self._default_config_params(node)
+        params['type'] = node.get_attr('activation')
+        params.setdefault('exp_table_size', params['table_size'])
+        params.setdefault('inv_table_size', params['table_size'])
+        params.setdefault('n_inner', 1)
+        params.setdefault('n_outer', 1)
+        params.setdefault('exp_scale', 1.0)
+        params.setdefault('parallelization_factor', -1)
+
+        n_slice = params['n_in'] // params['n_inner'] // params['n_outer']  # type: ignore
+        params['n_slice'] = n_slice
+
+        if params['accum_t'].name == 'model_default_t':  # type: ignore
+            scale = ceil(log2(n_slice))
+            exp_table_t = node.attributes['exp_table_t'].precision
+            signed, width, integers = exp_table_t.signed, exp_table_t.width, exp_table_t.integer
+            params['accum_t_str'] = f'ac_{"" if signed else "u"}fixed<{width + scale}, {integers + scale}>'
+        else:
+            params['accum_t_str'] = params['accum_t'].name  # type: ignore
+        if params['inv_inp_t'].name == 'model_default_t':  # type: ignore
+            params['inv_inp_t'] = params['exp_table_t']
+
+        if params['implementation'] == 'stable':
+            if 'inp_norm_t' not in params:
+                # Only used in stable (max-normalized) implementation
+                input_t = node.get_input_variable().type.precision
+                width, iwidth, signed = input_t.width, input_t.integer, input_t.signed  # noqa: F841
+                width, iwidth = width - signed, iwidth - signed
+                if signed:
+                    # Fix table size if too large
+                    exp_table_size = params['inv_table_size']
+                    params['exp_table_size'] = str(min(int(exp_table_size), 2**width))
+                params['inp_norm_t_str'] = f'ac_ufixed<{width}, {iwidth}>'
+            else:
+                params['inp_norm_t_str'] = params['inp_norm_t'].name  # type: ignore
+        else:
+            params['inp_norm_t_str'] = 'ac_fixed<1,0>'
+
+        return self.template.format(**params)
+
+
+class SoftmaxFunctionTemplate(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(Softmax, include_header=activ_include_list)
+        self.template = activ_function_template
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        use_multidim = node.get_attr('n_inner', 1) > 1 or node.get_attr('n_outer', 1) > 1
+        use_multidim = use_multidim and node.model.config.get_config_value('IOType') == 'io_parallel'
+        params['activation'] = 'softmax' if not use_multidim else 'softmax_multidim'
+        params['config'] = f'softmax_config{node.index}'
+
+        return self.template.format(**params)
+
 
 class ActivationFunctionTemplate(FunctionCallTemplate):
     def __init__(self):
-        super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list)
+        super().__init__((Activation, HardActivation), include_header=activ_include_list)
         self.template = activ_function_template
 
     def format(self, node):

diff --git a/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py b/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
index 24bd87d3e9..d5c1eda7b9 100644
--- a/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
+++ b/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
@@ -15,7 +15,7 @@
 
 @register
 class QMultiHeadAttentionHandler(QLayerHandler):
-    handles = ('hgq.layers.multi_head_attention.QMultiHeadAttention',)
+    handles = ('hgq.layers.attn.mha.QMultiHeadAttention',)
 
     def handle(
         self,
@@ -129,7 +129,7 @@ def _handle(self, layer, tensor_q, tensor_O, node_index, tensor_k, tensor_v):
 
 @register
 class QLinformerAttentionHandler(QMultiHeadAttentionHandler):
-    handles = ('hgq.layers.linformer_attention.QLinformerAttention',)
+    handles = ('hgq.layers.attn.linformer.QLinformerAttention',)
 
     def handle(
         self,

diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
index f118ecb05c..c2353c34a8 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
@@ -100,15 +100,8 @@ template void sigmoid(const data_
 enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 };
 
 template inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
-    // Number of address bits for table
-    static constexpr int N = ceillog2<CONFIG_T::table_size>::val;
-
-    // Slice the top N bits of the input
-    [[intel::fpga_register]] ac_int<N, false> y = x.template slc<N>(x.width - N - 1);
-    // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness
-    if (x != 0 && y == 0)
-        y[0] = 1;
-    return y.to_uint();
+    // Extract the lower 'width' bits of x
+    return x.template slc(0).to_uint();
 }
 
 template inline unsigned softmax_latency_idx_from_real_val(const data_T x) {
@@ -121,7 +114,6 @@ template inline unsigned softmax_latency_idx_f
 }
 
 template <class data_T, class res_T, typename CONFIG_T> void softmax_stable(const data_T &data, res_T &res) {
-    // Look-up tables
 #include "activation_tables/exp_table.tb"
 #include "activation_tables/invert_table.tb"
@@ -130,29 +122,34 @@ template void softmax_stable(cons
     // Find the max and compute all delta(x_i, x_max)
     Op_max op_max;
     [[intel::fpga_register]] auto x_max = reduce>(data.data(), op_max);
 
-    // For the diffs, use the same type as the input but force rounding and saturation
-    [[intel::fpga_register]] ac_fixed d_xi_xmax[CONFIG_T::n_in];
+    // Normalize inputs: d = x_max - x
+    [[intel::fpga_register]] typename CONFIG_T::inp_norm_t d_xi_xmax[CONFIG_T::n_in];
 #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        d_xi_xmax[i] = data[i] - x_max;
+        // HGQ stable: d = x_max - data
+        d_xi_xmax[i] = x_max - data[i];
     }
 
-    // Calculate all the e^x's
+    // Exponentials
     [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
 #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        exp_res[i] = exp_table[softmax_stable_idx_from_real_val(d_xi_xmax[i])];
+        unsigned idx = softmax_stable_idx_from_real_val(d_xi_xmax[i]);
+        exp_res[i] = exp_table[idx];
     }
 
-    // Explicitly sum previously calculated exponentials with an adder tree
-    Op_add op_add;
-    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
-        reduce>(exp_res, op_add);
+    // Sum of Exponentials
+    Op_add op_add;
+    [[intel::fpga_register]] typename CONFIG_T::accum_t exp_sum =
+        reduce>(exp_res, op_add);
 
-    // Multiply previously calculated exponetials with the reciprocal of the sum
-    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
-        invert_table[softmax_stable_idx_from_real_val(exp_sum)];
+    // Reciprocal of Sum
+    typename CONFIG_T::inv_inp_t exp_sum_cast = exp_sum;
+    unsigned inv_idx = softmax_stable_idx_from_real_val(exp_sum_cast);
+
+    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[inv_idx];
+
+    // Final Multiplication
 #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
         res[i] = exp_res[i] * inv_exp_sum;
     }
 }
@@ -265,6 +262,45 @@ template inline void softmax(cons
     }
 }
 
+// *************************************************
+// Multidimensional Softmax
+// *************************************************
+
+// Helper to remap the config for the core softmax function
+template <typename CONFIG_T> struct softmax_multidim_slice_config : CONFIG_T {
+    static constexpr unsigned n_in = CONFIG_T::n_slice;
+};
+
+template <class data_T, class res_T, typename CONFIG_T> inline void softmax_multidim(const data_T &data, res_T &res) {
+    using buffer_data_t = std::array<typename data_T::value_type, CONFIG_T::n_slice>;
+    using buffer_res_t = std::array<typename res_T::value_type, CONFIG_T::n_slice>;
+    using slice_config = softmax_multidim_slice_config<CONFIG_T>;
+
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_outer; i++) {
+        #pragma unroll
+        for (unsigned k = 0; k < CONFIG_T::n_inner; k++) {
+
+            [[intel::fpga_register]] buffer_data_t buffer_in;
+            [[intel::fpga_register]] buffer_res_t buffer_out;
+
+            // Gather Phase
+            #pragma unroll
+            for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
+                unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
+                buffer_in[j] = data[idx];
+            }
+
+            nnet::softmax<buffer_data_t, buffer_res_t, slice_config>(buffer_in, buffer_out);
+
+            #pragma unroll
+            for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
+                unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
+                res[idx] = buffer_out[j];
+            }
+        }
+    }
+}
+
 // *************************************************
 // TanH Activation
 // *************************************************
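Aside (illustration only, not part of the patches): the index arithmetic in softmax_multidim
above is the flattened layout of an (n_outer, n_slice, n_inner) tensor, so each gathered buffer
is one run along the middle axis. A small numpy check of that mapping (the reference function
is invented for this note):

    import numpy as np

    def softmax_multidim_ref(data, n_outer, n_slice, n_inner):
        out = np.empty_like(data)
        for i in range(n_outer):
            for k in range(n_inner):
                # Same address expression as the HLS gather/scatter loops
                idx = [i * n_slice * n_inner + j * n_inner + k for j in range(n_slice)]
                e = np.exp(data[idx] - data[idx].max())  # stable softmax on one slice
                out[idx] = e / e.sum()
        return out

    x = np.random.rand(2 * 3 * 4)
    y = x.reshape(2, 3, 4)
    e = np.exp(y - y.max(axis=1, keepdims=True))
    expected = (e / e.sum(axis=1, keepdims=True)).ravel()  # softmax over the slice axis
    assert np.allclose(softmax_multidim_ref(x, 2, 3, 4), expected)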
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
index 2b65eef42b..dc76189083 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense.h
@@ -152,11 +152,12 @@ void dense_rf_lt(const data_T &data, res_T &res, const typename CONFIG_T::weight
     }
 }
 template <class data_T, class res_T, typename CONFIG_T>
-void dense_resource(const data_T &data, res_T &res) {
+void dense_resource(const data_T &data, res_T &res, const typename CONFIG_T::weight_t &weights,
+                    const typename CONFIG_T::bias_t &biases) {
     if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) {
-        dense_rf_lt<data_T, res_T, CONFIG_T>(data, res, CONFIG_T::weights, CONFIG_T::biases);
+        dense_rf_lt<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     } else {
-        dense_rf_gt<data_T, res_T, CONFIG_T>(data, res, CONFIG_T::weights, CONFIG_T::biases);
+        dense_rf_gt<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
 }
 } // namespace nnet

diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index b42ff2990f..007b645cb0 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -242,13 +242,13 @@ def write_project_header(self, model):
                 for out in model_outputs:
                     newline += out.declare_cpp()
 
-            # Insert weights
+            # Insert weights
             elif '// hls-fpga-machine-learning insert weights' in line:
                 newline = line
                 for layer in model.get_layers():
                     for w in layer.get_weights():
-                        #if w not in model_brams:
-                        newline += f'#include "weights/{w.name}.h"\n'
+                        # if w not in model_brams:
+                        newline += f'#include "weights/{w.name}.h"\n'
 
             # Simply copy line, if no inserts are required
             else:
                 newline = line
@@ -557,16 +557,16 @@ def write_nnet_utils(self, model):
             dstpath = f'{model.config.get_output_dir()}/src/firmware/{dst}'
             copyfile(srcpath, dstpath)
 
-    def __get_table_size(self, model, activation):
+    def __get_table_size(self, model, activation, table_name='table_size'):
         for layer in model.get_layers():
             if (
                 layer.get_attr('activation') == activation or layer.get_attr('recurrent_activation') == activation
-            ) and layer.get_attr('table_size') is not None:
-                return int(layer.get_attr('table_size'))
+            ) and layer.get_attr(table_name) is not None:
+                return int(layer.get_attr(table_name))
         return 1024
 
-    def __get_table_header(self, table_name, table_size):
-        table_header = f'static const typename CONFIG_T::table_t {table_name}[{table_size}] = {{'
+    def __get_table_header(self, table_name, table_size, table_type='table_t'):
+        table_header = f'static const typename CONFIG_T::{table_type} {table_name}[{table_size}] = {{'
         return table_header
 
     def __write_elu_table(self, model, path):
@@ -695,46 +695,58 @@ def __write_selu_table(self, model, path):
         h_file.write('};\n')
         h_file.close()
 
+    def __get_table_precision(self, model, activation, table_name='table_precision'):
+        for layer in model.get_layers():
+            if layer.get_attr('activation') == activation and layer.get_attr(table_name) is not None:
+                precision = layer.get_attr(table_name)
+                return precision.precision
+
+        return None  # fp_bits, fp_integer, fp_signed
+
     def __write_exp_table(self, model, path):
         table_name = 'exp_table'
-        table_size = self.__get_table_size(model, 'softmax')
+        table_size = self.__get_table_size(model, 'softmax', table_name='exp_table_size')
 
         h_file = open(f'{path}/{table_name}.tb', 'w')
-        h_file.write(self.__get_table_header(table_name, table_size))
+        h_file.write(self.__get_table_header(table_name, table_size, table_type='exp_table_t'))
 
         # Default fixed point precision
         # 6 bits for integer part, 10 bits for decimal - total, 16
-        fp_bits = 16
-        fp_integer = 6
-        fp_signed = True
-
-        # Exp table should use the same precision as exp_table, as seen in Vivado code
-        # init_exp_table(exp_table);
-        for layer in model.get_layers():
-            if layer.name == 'softmax':
-                ac_type = layer.get_input_variable().type
-                if ac_type is not None:
-                    try:
-                        fp_bits = ac_type.precision.integer + ac_type.precision.fractional
-                        fp_integer = ac_type.precision.integer
-                        fp_signed = ac_type.precision.signed
-                    except Exception:
-                        # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
-                        pass
-        if fp_signed is False:
-            raise Exception('Softmax types need to be signed')
+        precision = self.__get_table_precision(model, 'softmax', table_name='inp_norm_t')
+
+        if precision is None:
+            fp_bits = 16
+            fp_integer = 6
+            fp_signed = True
+
+            for layer in model.get_layers():
+                if layer.name == 'softmax':
+                    ac_type = layer.get_input_variable().type
+                    if ac_type is not None:
+                        try:
+                            fp_bits = ac_type.precision.integer + ac_type.precision.fractional
+                            fp_integer = ac_type.precision.integer
+                            fp_signed = ac_type.precision.signed
+                        except Exception:
+                            # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
+                            pass
+            if fp_signed is False:
+                raise Exception('Softmax types need to be signed')
+
+        else:
+            fp_bits = precision.width
+            fp_integer = precision.integer
+            fp_signed = precision.signed
+
+        f_bits = fp_bits - fp_integer
 
         sep = ''
-        N = ceil_log2(table_size)
         for i in range(table_size):
-            f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
-            b = uint_to_binary(i, N)
-            if i == 0:
-                b.insert(0, 0)
-            else:
-                b.insert(0, 1)
-            f.set_msb_bits(b)
-            real_val = f.exp_float()
+            # Index represents the raw bit pattern of the input
+            real_val_in = i * (2.0 ** (-f_bits))
+
+            # Calculate exp(-x) for the stable implementation
+            real_val = np.exp(-real_val_in)
+
             h_file.write(sep + str(real_val))
             sep = ', '
 
     def __write_invert_table(self, model, path):
         table_name = 'invert_table'
-        table_size = self.__get_table_size(model, 'softmax')
+        table_size = self.__get_table_size(model, 'softmax', table_name='inv_table_size')
 
         h_file = open(f'{path}/{table_name}.tb', 'w')
-        h_file.write(self.__get_table_header(table_name, table_size))
+        h_file.write(self.__get_table_header(table_name, table_size, table_type='inv_table_t'))
 
         # Default fixed point precision, in case values from layer attributes cannot be extracted
         # 8 bits for integer part, 10 bits for decimal - total, 18
-        fp_bits = 18
-        fp_integer = 8
-        fp_signed = True
-
-        # Invert table should use the same precision as exp_table, as seen in Vivado code
-        # init_invert_table(invert_table);
-        for layer in model.get_layers():
-            if layer.name == 'softmax':
-                ac_type = layer.get_attr('exp_table_t')
-                if ac_type is not None:
-                    try:
-                        fp_bits = ac_type.precision.integer + ac_type.precision.fractional
-                        fp_integer = ac_type.precision.integer
-                        fp_signed = ac_type.precision.signed
-                    except Exception:
-                        # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
-                        pass
-        if fp_signed is False:
-            raise Exception('Softmax types need to be signed')
+        precision = self.__get_table_precision(model, 'softmax', table_name='inv_inp_t')
+
+        if precision is None:
+            fp_bits = 18
+            fp_integer = 8
+            fp_signed = True
+
+            for layer in model.get_layers():
+                if layer.name == 'softmax':
+                    ac_type = layer.get_attr('exp_table_t')
+                    if ac_type is not None:
+                        try:
+                            fp_bits = ac_type.precision.integer + ac_type.precision.fractional
+                            fp_integer = ac_type.precision.integer
+                            fp_signed = ac_type.precision.signed
+                        except Exception:
+                            # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
+                            pass
+            if fp_signed is False:
+                raise Exception('Softmax types need to be signed')
+
+        else:
+            fp_bits = precision.width
+            fp_integer = precision.integer
+            fp_signed = precision.signed
+
+        f_bits = fp_bits - fp_integer
 
         sep = ''
-        N = ceil_log2(table_size)
         for i in range(table_size):
-            f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
-            b = uint_to_binary(i, N)
-            b.insert(0, 0)
-            f.set_msb_bits(b)
-            real_val = f.inv_float()
+            # Index represents the raw bit pattern of the input
+            real_val_in = i * (2.0 ** (-f_bits))
+
+            if real_val_in == 0:
+                real_val = 999.0
+            else:
+                real_val = 1.0 / real_val_in
+
             h_file.write(sep + str(real_val))
             sep = ', '
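Aside (illustration only, not part of the patches): with the writer changes above, table index i
is the raw bit pattern of an unsigned fixed-point input with f_bits fractional bits, so entry i
encodes x = i * 2**-f_bits. The exp table stores exp(-x), because softmax_stable looks up
d = x_max - data, which is non-negative; the invert table stores 1/x with the sentinel 999.0 at
x = 0, exactly as in the diff. A sketch of the emitted values:

    import numpy as np

    def table_values(table_size, fp_bits, fp_integer):
        f_bits = fp_bits - fp_integer
        x = np.arange(table_size) * 2.0 ** -f_bits  # value encoded by bit pattern i
        exp_table = np.exp(-x)
        # Guard the x == 0 entry before dividing, as the writer does
        inv_table = np.where(x == 0, 999.0, 1.0 / np.maximum(x, 2.0 ** -f_bits))
        return exp_table, inv_table

    exp_t, inv_t = table_values(1024, 16, 6)
    print(exp_t[0], inv_t[0], inv_t[1])  # 1.0 999.0 1024.0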
From dbb207b7a5c1f343d8100bba9645340a2098730c Mon Sep 17 00:00:00 2001
From: laurilaatu
Date: Mon, 9 Feb 2026 16:33:38 +0000
Subject: [PATCH 4/6] Original weight implementation

---
 .../backends/oneapi/passes/core_templates.py | 91 +------------------
 1 file changed, 3 insertions(+), 88 deletions(-)

diff --git a/hls4ml/backends/oneapi/passes/core_templates.py b/hls4ml/backends/oneapi/passes/core_templates.py
index 5a2d765e8f..9602b2d0fc 100644
--- a/hls4ml/backends/oneapi/passes/core_templates.py
+++ b/hls4ml/backends/oneapi/passes/core_templates.py
@@ -6,7 +6,6 @@
 # Dense templates
 
 dense_config_template = """struct config{index} : nnet::dense_config {{
-    static constexpr unsigned n_in = {n_in};
     static constexpr unsigned n_out = {n_out};
 
     static constexpr unsigned io_type = nnet::{iotype};
@@ -31,16 +30,13 @@
     typedef {weight_t.name} weight_t;
     typedef {index_t.name} index_t;
 
-    static constexpr weight_t weights = {weights};
-    static constexpr bias_t biases = {biases};
-
     template <class x_T, class y_T>
     using product = nnet::product::{product_type}<x_T, y_T>;
 }};\n"""
 
 dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
 dense_task_sequence_template = 'task_sequence> {name};'
-dense_stream_function_template = '{name}.async();'
+dense_stream_function_template = '{name}.async({w}, {b});'
 dense_include_list = ['nnet_utils/nnet_dense.h', 'nnet_utils/nnet_dense_stream.h']
@@ -57,9 +53,6 @@ def format(self, node):
             node.get_input_variable().type.precision, node.get_weights('weight').type.precision
         )
 
-        params['weights'] = node.get_weights('weight').name
-        params['biases'] = node.get_weights('bias').name
-
        return self.template.format(**params)
@@ -199,7 +192,7 @@ def format(self, node):
     static constexpr unsigned reuse_factor = {reuse};
 }};\n"""
 
-softmax_config_template_qkeras = """struct {type}_config{index} : nnet::activ_config {{
+softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
     static constexpr unsigned n_in = {n_in};
     static constexpr unsigned table_size = {table_size};
     static constexpr unsigned io_type = nnet::{iotype};
@@ -209,26 +202,6 @@
     typedef {exp_table_t.name} exp_table_t;
     typedef {inv_table_t.name} inv_table_t;
 }};\n"""
 
-softmax_config_template = """struct {type}_config{index} : nnet::activ_config {{
-    static const unsigned n_in = {n_in};
-    static const unsigned n_slice = {n_slice};
-    static const unsigned n_outer = {n_outer};
-    static const unsigned n_inner = {n_inner};
-    static const unsigned parallelization_factor = {parallelization_factor};
-    static const unsigned exp_table_size = {exp_table_size};
-    static const unsigned inv_table_size = {inv_table_size};
-    static const unsigned io_type = nnet::{iotype};
-    static const unsigned reuse_factor = {reuse};
-    static const unsigned axis = {axis};
-    static const nnet::softmax_implementation implementation = nnet::softmax_implementation::{implementation};
-    static constexpr float exp_scale = {exp_scale};
-    typedef {exp_table_t.name} exp_table_t;
-    typedef {inv_table_t.name} inv_table_t;
-    typedef {accum_t.name} accum_t;
-    typedef {inv_inp_t.name} inv_inp_t;
-    typedef {inp_norm_t_str} inp_norm_t;
-}};\n"""
-
 activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {output});'
 param_activ_function_template = 'nnet::{activation}<{input_t}, {output_t}, {config}>({input}, {param}, {output});'
@@ -280,68 +253,10 @@ def __init__(self):
         super(ActivationConfigTemplate, self).__init__(Softmax)  # Skip ActivationConfigTemplate's __init__
         self.template = softmax_config_template
 
-    def format(self, node):
-        from math import ceil, log2
-
-        params = self._default_config_params(node)
-        params['type'] = node.get_attr('activation')
-        params.setdefault('exp_table_size', params['table_size'])
-        params.setdefault('inv_table_size', params['table_size'])
-        params.setdefault('n_inner', 1)
-        params.setdefault('n_outer', 1)
-        params.setdefault('exp_scale', 1.0)
-        params.setdefault('parallelization_factor', -1)
-
-        n_slice = params['n_in'] // params['n_inner'] // params['n_outer']  # type: ignore
-        params['n_slice'] = n_slice
-
-        if params['accum_t'].name == 'model_default_t':  # type: ignore
-            scale = ceil(log2(n_slice))
-            exp_table_t = node.attributes['exp_table_t'].precision
-            signed, width, integers = exp_table_t.signed, exp_table_t.width, exp_table_t.integer
-            params['accum_t_str'] = f'ac_{"" if signed else "u"}fixed<{width + scale}, {integers + scale}>'
-        else:
-            params['accum_t_str'] = params['accum_t'].name  # type: ignore
-        if params['inv_inp_t'].name == 'model_default_t':  # type: ignore
-            params['inv_inp_t'] = params['exp_table_t']
-
-        if params['implementation'] == 'stable':
-            if 'inp_norm_t' not in params:
-                # Only used in stable (max-normalized) implementation
-                input_t = node.get_input_variable().type.precision
-                width, iwidth, signed = input_t.width, input_t.integer, input_t.signed  # noqa: F841
-                width, iwidth = width - signed, iwidth - signed
-                if signed:
-                    # Fix table size if too large
-                    exp_table_size = params['inv_table_size']
-                    params['exp_table_size'] = str(min(int(exp_table_size), 2**width))
-                params['inp_norm_t_str'] = f'ac_ufixed<{width}, {iwidth}>'
-            else:
-                params['inp_norm_t_str'] = params['inp_norm_t'].name  # type: ignore
-        else:
-            params['inp_norm_t_str'] = 'ac_fixed<1,0>'
-
-        return self.template.format(**params)
-
-
-class SoftmaxFunctionTemplate(FunctionCallTemplate):
-    def __init__(self):
-        super().__init__(Softmax, include_header=activ_include_list)
-        self.template = activ_function_template
-
-    def format(self, node):
-        params = self._default_function_params(node)
-        use_multidim = node.get_attr('n_inner', 1) > 1 or node.get_attr('n_outer', 1) > 1
-        use_multidim = use_multidim and node.model.config.get_config_value('IOType') == 'io_parallel'
-        params['activation'] = 'softmax' if not use_multidim else 'softmax_multidim'
-        params['config'] = f'softmax_config{node.index}'
-
-        return self.template.format(**params)
-
 
 class ActivationFunctionTemplate(FunctionCallTemplate):
     def __init__(self):
-        super().__init__((Activation, HardActivation), include_header=activ_include_list)
+        super().__init__((Activation, HardActivation, Softmax), include_header=activ_include_list)
         self.template = activ_function_template
 
     def format(self, node):

From 51efff0c34744ab2fa70d7e3a52fdbf196ffcf0a Mon Sep 17 00:00:00 2001
From: laurilaatu
Date: Mon, 9 Feb 2026 16:51:19 +0000
Subject: [PATCH 5/6] Restore oneAPI weight placement

---
 hls4ml/templates/oneapi/firmware/myproject.cpp | 5 +----
 hls4ml/templates/oneapi/firmware/myproject.h   | 3 ---
 hls4ml/writer/oneapi_writer.py                 | 7 -------
 3 files changed, 1 insertion(+), 14 deletions(-)

diff --git a/hls4ml/templates/oneapi/firmware/myproject.cpp b/hls4ml/templates/oneapi/firmware/myproject.cpp
index da9439f74a..06e7d3fe37 100644
--- a/hls4ml/templates/oneapi/firmware/myproject.cpp
+++ b/hls4ml/templates/oneapi/firmware/myproject.cpp
@@ -1,12 +1,9 @@
 #include "myproject.h"
+#include "parameters.h"
 #include 
 
 // hls-fpga-machine-learning insert weights
-
-#include "parameters.h"
-
-
 // The inter-task pipes need to be declared in the global scope
 // hls-fpga-machine-learning insert inter-task pipes

diff --git a/hls4ml/templates/oneapi/firmware/myproject.h b/hls4ml/templates/oneapi/firmware/myproject.h
index 8f313ea30f..082ae5dc8c 100644
--- a/hls4ml/templates/oneapi/firmware/myproject.h
+++ b/hls4ml/templates/oneapi/firmware/myproject.h
@@ -3,9 +3,6 @@
 
 #include "defines.h"
 
-// hls-fpga-machine-learning insert weights
-
-
 // This file defines the interface to the kernel
 // currently this is fixed

diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index 007b645cb0..8ef2b0b0a1 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -242,13 +242,6 @@ def write_project_header(self, model):
                 for out in model_outputs:
                     newline += out.declare_cpp()
 
-            # Insert weights
-            elif '// hls-fpga-machine-learning insert weights' in line:
-                newline = line
-                for layer in model.get_layers():
-                    for w in layer.get_weights():
-                        # if w not in model_brams:
-                        newline += f'#include "weights/{w.name}.h"\n'
 
             # Simply copy line, if no inserts are required
             else:
                 newline = line
From 6067bea99e35fd0bb3b2d89323e721e3916b0960 Mon Sep 17 00:00:00 2001
From: laurilaatu
Date: Mon, 9 Feb 2026 16:52:42 +0000
Subject: [PATCH 6/6] pre-commit

---
 hls4ml/writer/oneapi_writer.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index 8ef2b0b0a1..b945f3faf9 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -242,7 +242,6 @@ def write_project_header(self, model):
             for out in model_outputs:
                 newline += out.declare_cpp()
 
-
             # Simply copy line, if no inserts are required
             else:
                 newline = line
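Aside (illustration only, not part of the patches): after patches 4-6 the oneAPI dense templates
are back to the original explicit-weight form. Rendering the restored template with made-up
placeholder values shows what the generated call looks like:

    dense_function_template = 'nnet::dense_{strategy}<{input_t}, {output_t}, {config}>({input}, {output}, {w}, {b});'
    print(dense_function_template.format(
        strategy='resource', input_t='input_t', output_t='result_t', config='config2',
        input='fc1_input', output='layer2_out', w='w2', b='b2',
    ))
    # nnet::dense_resource<input_t, result_t, config2>(fc1_input, layer2_out, w2, b2);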