diff --git a/hls4ml/backends/oneapi/oneapi_backend.py b/hls4ml/backends/oneapi/oneapi_backend.py
index 0c11c16d0..94f26c9f1 100644
--- a/hls4ml/backends/oneapi/oneapi_backend.py
+++ b/hls4ml/backends/oneapi/oneapi_backend.py
@@ -19,7 +19,6 @@
     Embedding,
     Layer,
     SimpleRNN,
-    Softmax,
 )
 from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
 from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType
@@ -257,13 +256,6 @@ def init_activation(self, layer):
         if layer.get_attr('recurrent_activation') == 'tanh':
             layer.set_attr('recurrent_activation', 'dense_tanh')
 
-    @layer_optimizer(Softmax)
-    def init_softmax(self, layer):
-        if layer.model.config.get_config_value('IOType') == 'io_parallel':
-            assert len(layer.get_input_variable().shape) == 1, (
-                'Softmax with io_parallel strategy cannot be used on multidimensional tensors.'
-            )
-
     @layer_optimizer(Embedding)
     def init_embed(self, layer):
         if layer.attributes['n_in'] is None:
diff --git a/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py b/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
index 7154d0c9c..09723f533 100644
--- a/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
+++ b/hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
@@ -14,7 +14,7 @@
 
 
 class QMultiHeadAttentionHandler(QLayerHandler):
-    handles = ('hgq.layers.multi_head_attention.QMultiHeadAttention',)
+    handles = ('hgq.layers.attn.mha.QMultiHeadAttention',)
 
     def handle(
         self,
@@ -127,7 +127,7 @@ def _handle(self, layer, tensor_q, tensor_O, node_index, tensor_k, tensor_v):
 
 
 class QLinformerAttentionHandler(QMultiHeadAttentionHandler):
-    handles = ('hgq.layers.linformer_attention.QLinformerAttention',)
+    handles = ('hgq.layers.attn.linformer.QLinformerAttention',)
 
     def handle(
         self,
diff --git a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
index f118ecb05..c2353c34a 100644
--- a/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
+++ b/hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
@@ -100,15 +100,8 @@ template <class data_T, class res_T, typename CONFIG_T> void sigmoid(const data_
 enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 };
 
 template <class data_T, typename CONFIG_T> inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
-    // Number of address bits for table
-    static constexpr int N = ceillog2<CONFIG_T::table_size>::val;
-
-    // Slice the top N bits of the input
-    [[intel::fpga_register]] ac_int<N, false> y = x.template slc<N>(x.width - N - 1);
-    // If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness
-    if (x != 0 && y == 0)
-        y[0] = 1;
-    return y.to_uint();
+    // Extract the lower 'width' bits of x
+    return x.template slc<data_T::width>(0).to_uint();
 }
 
 template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_from_real_val(const data_T x) {
@@ -121,7 +114,6 @@ template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_f
 }
 
 template <class data_T, class res_T, typename CONFIG_T> void softmax_stable(const data_T &data, res_T &res) {
-// Look-up tables
 #include "activation_tables/exp_table.tb"
 #include "activation_tables/invert_table.tb"
 
@@ -130,29 +122,34 @@
     [[intel::fpga_register]] auto x_max =
         reduce<typename data_T::value_type, CONFIG_T::n_in, Op_max<typename data_T::value_type>>(data.data(), op_max);
 
-    // For the diffs, use the same type as the input but force rounding and saturation
-    [[intel::fpga_register]] ac_fixed<data_T::value_type::width, data_T::value_type::i_width, true, AC_RND, AC_SAT>
-        d_xi_xmax[CONFIG_T::n_in];
+    // Normalize inputs: d = x_max - x
+    [[intel::fpga_register]] typename CONFIG_T::inp_norm_t d_xi_xmax[CONFIG_T::n_in];
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        d_xi_xmax[i] = data[i] - x_max;
+        // HGQ stable: d = x_max - data
+        d_xi_xmax[i] = x_max - data[i];
     }
 
-    // Calculate all the e^x's
+    // Exponentials
     [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
-        exp_res[i] = exp_table[softmax_stable_idx_from_real_val<typename data_T::value_type, CONFIG_T>(d_xi_xmax[i])];
+        unsigned idx = softmax_stable_idx_from_real_val<typename CONFIG_T::inp_norm_t, CONFIG_T>(d_xi_xmax[i]);
+        exp_res[i] = exp_table[idx];
     }
 
-    // Explicitly sum previously calculated exponentials with an adder tree
-    Op_add<typename CONFIG_T::exp_table_t> op_add;
-    [[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
-        reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
+    // Sum of Exponentials
+    Op_add<typename CONFIG_T::accum_t> op_add;
+    [[intel::fpga_register]] typename CONFIG_T::accum_t exp_sum =
+        reduce<typename CONFIG_T::accum_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);
 
-    // Multiply previously calculated exponetials with the reciprocal of the sum
-    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
-        invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
+    // Reciprocal of Sum
+    typename CONFIG_T::inv_inp_t exp_sum_cast = exp_sum;
+    unsigned inv_idx = softmax_stable_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T>(exp_sum_cast);
+
+    [[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[inv_idx];
+
+    // Final Multiplication
     #pragma unroll
     for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
         res[i] = exp_res[i] * inv_exp_sum;
     }
 }
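Note (illustration, not part of the patch): with the HGQ-style tables, the normalized difference d = x_max - x is non-negative, and softmax_stable_idx_from_real_val now just reinterprets its low 'width' bits as the table address, while exp_table[i] holds exp(-x) for the value whose raw bit pattern is i. A minimal Python sketch of that indexing scheme, assuming a purely fractional, unsigned inp_norm_t with f_bits fractional bits and a power-of-two table_size = 2**f_bits (the helper below is hypothetical, names are reused from the patch):

import numpy as np

def stable_idx_from_real_val(d, f_bits, table_size):
    # Raw bit pattern of an unsigned fixed-point value, i.e. slc<width>(0)
    raw = int(np.floor(d * (1 << f_bits)))
    return raw & (table_size - 1)

f_bits = 10
table_size = 1 << f_bits
exp_table = np.exp(-(np.arange(table_size) * 2.0**-f_bits))  # entry i = exp(-i * 2**-f_bits)

d = 0.713  # some x_max - x
print(exp_table[stable_idx_from_real_val(d, f_bits, table_size)], np.exp(-d))  # both ~0.490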
@@ -265,6 +262,45 @@ template <class data_T, class res_T, typename CONFIG_T> inline void softmax(cons
     }
 }
 
+// *************************************************
+// Multidimensional Softmax
+// *************************************************
+
+// Helper to remap the config for the core softmax function
+template <typename CONFIG_T> struct softmax_multidim_slice_config : CONFIG_T {
+    static constexpr unsigned n_in = CONFIG_T::n_slice;
+};
+
+template <class data_T, class res_T, typename CONFIG_T> inline void softmax_multidim(const data_T &data, res_T &res) {
+    using buffer_data_t = std::array<typename data_T::value_type, CONFIG_T::n_slice>;
+    using buffer_res_t = std::array<typename res_T::value_type, CONFIG_T::n_slice>;
+    using slice_config = softmax_multidim_slice_config<CONFIG_T>;
+
+    #pragma unroll
+    for (unsigned i = 0; i < CONFIG_T::n_outer; i++) {
+        #pragma unroll
+        for (unsigned k = 0; k < CONFIG_T::n_inner; k++) {
+
+            [[intel::fpga_register]] buffer_data_t buffer_in;
+            [[intel::fpga_register]] buffer_res_t buffer_out;
+
+            // Gather Phase
+            #pragma unroll
+            for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
+                unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
+                buffer_in[j] = data[idx];
+            }
+
+            nnet::softmax<buffer_data_t, buffer_res_t, slice_config>(buffer_in, buffer_out);
+
+            #pragma unroll
+            for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
+                unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
+                res[idx] = buffer_out[j];
+            }
+        }
+    }
+}
 // *************************************************
 // TanH Activation
 // *************************************************
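Note (illustration, not part of the patch): softmax_multidim treats the flattened input as a tensor of shape (n_outer, n_slice, n_inner) stored row-major and normalizes along the middle axis; the gather/scatter index i * n_slice * n_inner + j * n_inner + k is simply the offset of element (i, j, k). A NumPy sketch of the same loop structure, with the table-based softmax swapped for a floating-point reference:

import numpy as np

def softmax_ref(v):
    e = np.exp(v - v.max())
    return e / e.sum()

def softmax_multidim_ref(data, n_outer, n_slice, n_inner):
    res = np.empty_like(data)
    for i in range(n_outer):
        for k in range(n_inner):
            # Gather one slice along the middle axis, normalize it, scatter it back
            idx = i * n_slice * n_inner + np.arange(n_slice) * n_inner + k
            res[idx] = softmax_ref(data[idx])
    return res

x = np.random.randn(2 * 5 * 3)
flat = softmax_multidim_ref(x, 2, 5, 3)
ref = np.apply_along_axis(softmax_ref, 1, x.reshape(2, 5, 3)).ravel()
assert np.allclose(flat, ref)  # same result as softmax along the middle axis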
diff --git a/hls4ml/writer/oneapi_writer.py b/hls4ml/writer/oneapi_writer.py
index 3c0a778c5..b945f3faf 100644
--- a/hls4ml/writer/oneapi_writer.py
+++ b/hls4ml/writer/oneapi_writer.py
@@ -549,16 +549,16 @@ def write_nnet_utils(self, model):
             dstpath = f'{model.config.get_output_dir()}/src/firmware/{dst}'
             copyfile(srcpath, dstpath)
 
-    def __get_table_size(self, model, activation):
+    def __get_table_size(self, model, activation, table_name='table_size'):
         for layer in model.get_layers():
             if (
                 layer.get_attr('activation') == activation or layer.get_attr('recurrent_activation') == activation
-            ) and layer.get_attr('table_size') is not None:
-                return int(layer.get_attr('table_size'))
+            ) and layer.get_attr(table_name) is not None:
+                return int(layer.get_attr(table_name))
         return 1024
 
-    def __get_table_header(self, table_name, table_size):
-        table_header = f'static const typename CONFIG_T::table_t {table_name}[{table_size}] = {{'
+    def __get_table_header(self, table_name, table_size, table_type='table_t'):
+        table_header = f'static const typename CONFIG_T::{table_type} {table_name}[{table_size}] = {{'
         return table_header
 
     def __write_elu_table(self, model, path):
@@ -687,46 +687,58 @@ def __write_selu_table(self, model, path):
         h_file.write('};\n')
         h_file.close()
 
+    def __get_table_precision(self, model, activation, table_name='table_precision'):
+        for layer in model.get_layers():
+            if layer.get_attr('activation') == activation and layer.get_attr(table_name) is not None:
+                precision = layer.get_attr(table_name)
+                return precision.precision
+
+        return None  # fp_bits, fp_integer, fp_signed
+
     def __write_exp_table(self, model, path):
         table_name = 'exp_table'
-        table_size = self.__get_table_size(model, 'softmax')
+        table_size = self.__get_table_size(model, 'softmax', table_name='exp_table_size')
 
         h_file = open(f'{path}/{table_name}.tb', 'w')
-        h_file.write(self.__get_table_header(table_name, table_size))
+        h_file.write(self.__get_table_header(table_name, table_size, table_type='exp_table_t'))
 
         # Default fixed point precision
         # 6 bits for integer part, 10 bits for decimal - total, 16
-        fp_bits = 16
-        fp_integer = 6
-        fp_signed = True
+        precision = self.__get_table_precision(model, 'softmax', table_name='inp_norm_t')
+
+        if precision is None:
+            fp_bits = 16
+            fp_integer = 6
+            fp_signed = True
+
+            for layer in model.get_layers():
+                if layer.name == 'softmax':
+                    ac_type = layer.get_input_variable().type
+                    if ac_type is not None:
+                        try:
+                            fp_bits = ac_type.precision.integer + ac_type.precision.fractional
+                            fp_integer = ac_type.precision.integer
+                            fp_signed = ac_type.precision.signed
+                        except Exception:
+                            # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
+                            pass
+            if fp_signed is False:
+                raise Exception('Softmax types need to be signed')
-
-        # Exp table should use the same precision as exp_table, as seen in Vivado code
-        # init_exp_table(exp_table);
-        for layer in model.get_layers():
-            if layer.name == 'softmax':
-                ac_type = layer.get_input_variable().type
-                if ac_type is not None:
-                    try:
-                        fp_bits = ac_type.precision.integer + ac_type.precision.fractional
-                        fp_integer = ac_type.precision.integer
-                        fp_signed = ac_type.precision.signed
-                    except Exception:
-                        # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
-                        pass
-        if fp_signed is False:
-            raise Exception('Softmax types need to be signed')
+        else:
+            fp_bits = precision.width
+            fp_integer = precision.integer
+            fp_signed = precision.signed
+
+        f_bits = fp_bits - fp_integer
 
         sep = ''
-        N = ceil_log2(table_size)
         for i in range(table_size):
-            f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
-            b = uint_to_binary(i, N)
-            if i == 0:
-                b.insert(0, 0)
-            else:
-                b.insert(0, 1)
-            f.set_msb_bits(b)
-            real_val = f.exp_float()
+            # Index represents the raw bit pattern of the input
+            real_val_in = i * (2.0 ** (-f_bits))
+
+            # Calculate exp(-x) for the stable implementation
+            real_val = np.exp(-real_val_in)
+
             h_file.write(sep + str(real_val))
             sep = ', '
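Note (illustration, not part of the patch): the writer no longer emulates the old top-bit slicing with FixedPointEmulator; exp table entry i is simply exp(-x) evaluated at the value whose raw bit pattern is i, with f_bits = fp_bits - fp_integer fractional bits. A standalone sketch of that loop (the 1024/16/6 defaults are taken from the code above):

import numpy as np

def exp_table_values(table_size=1024, fp_bits=16, fp_integer=6):
    f_bits = fp_bits - fp_integer
    table = []
    for i in range(table_size):
        real_val_in = i * (2.0 ** (-f_bits))  # value whose raw bit pattern is i
        table.append(np.exp(-real_val_in))    # stable softmax looks up exp(-(x_max - x))
    return table

table = exp_table_values()
print(table[0], table[1])  # 1.0 and exp(-2**-10)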
@@ -735,41 +747,50 @@
     def __write_invert_table(self, model, path):
         table_name = 'invert_table'
-        table_size = self.__get_table_size(model, 'softmax')
+        table_size = self.__get_table_size(model, 'softmax', table_name='inv_table_size')
 
         h_file = open(f'{path}/{table_name}.tb', 'w')
-        h_file.write(self.__get_table_header(table_name, table_size))
-
+        h_file.write(self.__get_table_header(table_name, table_size, table_type='inv_table_t'))
         # Default fixed point precision, in case values from layer attributes cannot be extracted
         # 8 bits for integer part, 10 bits for decimal - total, 18
-        fp_bits = 18
-        fp_integer = 8
-        fp_signed = True
-
-        # Invert table should use the same precision as exp_table, as seen in Vivado code
-        # init_invert_table(invert_table);
-        for layer in model.get_layers():
-            if layer.name == 'softmax':
-                ac_type = layer.get_attr('exp_table_t')
-                if ac_type is not None:
-                    try:
-                        fp_bits = ac_type.precision.integer + ac_type.precision.fractional
-                        fp_integer = ac_type.precision.integer
-                        fp_signed = ac_type.precision.signed
-                    except Exception:
-                        # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
-                        pass
-        if fp_signed is False:
-            raise Exception('Softmax types need to be signed')
+        precision = self.__get_table_precision(model, 'softmax', table_name='inv_inp_t')
+
+        if precision is None:
+            fp_bits = 18
+            fp_integer = 8
+            fp_signed = True
+
+            for layer in model.get_layers():
+                if layer.name == 'softmax':
+                    ac_type = layer.get_attr('exp_table_t')
+                    if ac_type is not None:
+                        try:
+                            fp_bits = ac_type.precision.integer + ac_type.precision.fractional
+                            fp_integer = ac_type.precision.integer
+                            fp_signed = ac_type.precision.signed
+                        except Exception:
+                            # FixedPrecisionType wasn't correctly stored in layer attributes, use default values
+                            pass
+            if fp_signed is False:
+                raise Exception('Softmax types need to be signed')
+
+        else:
+            fp_bits = precision.width
+            fp_integer = precision.integer
+            fp_signed = precision.signed
+
+        f_bits = fp_bits - fp_integer
 
         sep = ''
-        N = ceil_log2(table_size)
         for i in range(table_size):
-            f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
-            b = uint_to_binary(i, N)
-            b.insert(0, 0)
-            f.set_msb_bits(b)
-            real_val = f.inv_float()
+            # Index represents the raw bit pattern of the input
+            real_val_in = i * (2.0 ** (-f_bits))
+
+            if real_val_in == 0:
+                real_val = 999.0
+            else:
+                real_val = 1.0 / real_val_in
+
             h_file.write(sep + str(real_val))
             sep = ', '
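Note (illustration, not part of the patch): the invert table follows the same raw-bit-pattern convention, storing 1/x with 999.0 guarding index 0, so the stable path reduces to exp_table[x_max - x] scaled by invert_table[sum of exponentials]. A small NumPy emulation under assumed table widths (7 and 6 fractional bits, chosen here only so the example values stay in range):

import numpy as np

def make_tables(table_size, f_bits_exp, f_bits_inv):
    grid_exp = np.arange(table_size) * 2.0**-f_bits_exp
    grid_inv = np.arange(table_size) * 2.0**-f_bits_inv
    exp_table = np.exp(-grid_exp)
    # 999.0 stands in for the undefined reciprocal at index 0
    invert_table = np.where(grid_inv == 0, 999.0, 1.0 / np.maximum(grid_inv, 2.0**-f_bits_inv))
    return exp_table, invert_table

def softmax_stable_emu(x, table_size=1024, f_bits_exp=7, f_bits_inv=6):
    exp_table, invert_table = make_tables(table_size, f_bits_exp, f_bits_inv)
    d = x.max() - x  # d = x_max - x >= 0
    exp_res = exp_table[np.floor(d * 2.0**f_bits_exp).astype(int) & (table_size - 1)]
    inv = invert_table[int(np.floor(exp_res.sum() * 2.0**f_bits_inv)) & (table_size - 1)]
    return exp_res * inv

x = np.array([0.5, 1.0, -0.25, 2.0])
print(softmax_stable_emu(x))                            # table-based approximation
print(np.exp(x - x.max()) / np.exp(x - x.max()).sum())  # floating-point reference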