8 changes: 0 additions & 8 deletions hls4ml/backends/oneapi/oneapi_backend.py
@@ -19,7 +19,6 @@
Embedding,
Layer,
SimpleRNN,
Softmax,
)
from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType
@@ -257,13 +256,6 @@ def init_activation(self, layer):
if layer.get_attr('recurrent_activation') == 'tanh':
layer.set_attr('recurrent_activation', 'dense_tanh')

@layer_optimizer(Softmax)
def init_softmax(self, layer):
if layer.model.config.get_config_value('IOType') == 'io_parallel':
assert len(layer.get_input_variable().shape) == 1, (
'Softmax with io_parallel strategy cannot be used on multidimensional tensors.'
)

@layer_optimizer(Embedding)
def init_embed(self, layer):
if layer.attributes['n_in'] is None:
4 changes: 2 additions & 2 deletions hls4ml/converters/keras_v3/hgq2/multi_head_attention.py
@@ -14,7 +14,7 @@


class QMultiHeadAttentionHandler(QLayerHandler):
handles = ('hgq.layers.multi_head_attention.QMultiHeadAttention',)
handles = ('hgq.layers.attn.mha.QMultiHeadAttention',)

def handle(
self,
@@ -127,7 +127,7 @@ def _handle(self, layer, tensor_q, tensor_O, node_index, tensor_k, tensor_v):


class QLinformerAttentionHandler(QMultiHeadAttentionHandler):
handles = ('hgq.layers.linformer_attention.QLinformerAttention',)
handles = ('hgq.layers.attn.linformer.QLinformerAttention',)

def handle(
self,
82 changes: 59 additions & 23 deletions hls4ml/templates/oneapi/firmware/nnet_utils/nnet_activation.h
@@ -100,15 +100,8 @@ template <class data_T, class res_T, typename CONFIG_T> void sigmoid(const data_
enum class softmax_implementation { latency = 0, legacy = 1, stable = 2, argmax = 3 };

template <class data_T, typename CONFIG_T> inline unsigned softmax_stable_idx_from_real_val(const data_T x) {
// Number of address bits for table
static constexpr int N = ceillog2<CONFIG_T::table_size>::val;

// Slice the top N bits of the input
[[intel::fpga_register]] ac_int<N, false> y = x.template slc<N>(x.width - N - 1);
// If x is the most negative value, the slice will be 0, so we need to set the 0-th bit to ensure correctness
if (x != 0 && y == 0)
y[0] = 1;
return y.to_uint();
// Extract the lower 'width' bits of x
return x.template slc<data_T::width>(0).to_uint();
Review comment (Contributor): This is a logic change. Before it took the upper bits. Does it now assume that the table size is always the same as the width of the value? If so, I guess it must be reinforced somewhere. Are we sure we want this?

Review comment (Contributor): I see the logic is unchanged for Vivado and libero, so I am hesitant to have this change.

}
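
As a rough illustration of the indexing change the reviewers question above, the following Python sketch emulates both schemes at the bit level. It mirrors the reviewer's reading that the new scheme expects table_size == 2**width; the function names and the toy widths are hypothetical and not part of hls4ml.

```python
# Hypothetical bit-level emulation of the old and new table-index schemes.
# Assumes the new scheme requires table_size == 2**width; names and widths are illustrative.

def idx_old(raw_bits: int, width: int, table_bits: int) -> int:
    """Old scheme: slice the top `table_bits` bits just below the sign bit."""
    y = (raw_bits >> (width - table_bits - 1)) & ((1 << table_bits) - 1)
    if raw_bits != 0 and y == 0:
        y |= 1  # non-zero inputs must not map to table entry 0
    return y

def idx_new(raw_bits: int, width: int) -> int:
    """New scheme: use the full raw bit pattern as the index."""
    return raw_bits & ((1 << width) - 1)

width, table_bits = 10, 8   # e.g. a 10-bit input and a 256-entry table for the old scheme
for x in (1, 3):
    print(x, idx_old(x, width, table_bits), idx_new(x, width))
# 1 -> old: 1 (forced non-zero), new: 1
# 3 -> old: 1,                   new: 3 (needs a 2**10-entry table)
```
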

template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_from_real_val(const data_T x) {
@@ -121,7 +114,6 @@ template <class data_T, typename CONFIG_T> inline unsigned softmax_latency_idx_f
}

template <class data_T, class res_T, typename CONFIG_T> void softmax_stable(const data_T &data, res_T &res) {
// Look-up tables
#include "activation_tables/exp_table.tb"
#include "activation_tables/invert_table.tb"

@@ -130,29 +122,34 @@ template <class data_T, class res_T, typename CONFIG_T> void softmax_stable(cons
[[intel::fpga_register]] auto x_max =
reduce<typename data_T::value_type, CONFIG_T::n_in, Op_max<typename data_T::value_type>>(data.data(), op_max);

// For the diffs, use the same type as the input but force rounding and saturation
[[intel::fpga_register]] ac_fixed<data_T::value_type::width, data_T::value_type::i_width, true, AC_RND, AC_SAT>
d_xi_xmax[CONFIG_T::n_in];
// Normalize inputs: d = x_max - x
[[intel::fpga_register]] typename CONFIG_T::inp_norm_t d_xi_xmax[CONFIG_T::n_in];
#pragma unroll
for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
d_xi_xmax[i] = data[i] - x_max;
// HGQ stable: d = x_max - data
d_xi_xmax[i] = x_max - data[i];
}

// Calculate all the e^x's
// Exponentials
[[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_res[CONFIG_T::n_in];
#pragma unroll
for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
exp_res[i] = exp_table[softmax_stable_idx_from_real_val<typename data_T::value_type, CONFIG_T>(d_xi_xmax[i])];
unsigned idx = softmax_stable_idx_from_real_val<typename CONFIG_T::inp_norm_t, CONFIG_T>(d_xi_xmax[i]);
exp_res[i] = exp_table[idx];
}

// Explicitly sum previously calculated exponentials with an adder tree
Op_add<typename CONFIG_T::exp_table_t> op_add;
[[intel::fpga_register]] typename CONFIG_T::exp_table_t exp_sum =
reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::exp_table_t>>(exp_res, op_add);
// Sum of Exponentials
Op_add<typename CONFIG_T::accum_t> op_add;
[[intel::fpga_register]] typename CONFIG_T::accum_t exp_sum =
reduce<typename CONFIG_T::exp_table_t, CONFIG_T::n_in, Op_add<typename CONFIG_T::accum_t>>(exp_res, op_add);

// Multiply previously calculated exponetials with the reciprocal of the sum
[[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum =
invert_table[softmax_stable_idx_from_real_val<typename CONFIG_T::exp_table_t, CONFIG_T>(exp_sum)];
// Reciprocal of Sum
typename CONFIG_T::inv_inp_t exp_sum_cast = exp_sum;
unsigned inv_idx = softmax_stable_idx_from_real_val<typename CONFIG_T::inv_inp_t, CONFIG_T>(exp_sum_cast);

[[intel::fpga_register]] typename CONFIG_T::inv_table_t inv_exp_sum = invert_table[inv_idx];

// Final Multiplication
#pragma unroll
for (unsigned i = 0; i < CONFIG_T::n_in; i++) {
res[i] = exp_res[i] * inv_exp_sum;
@@ -265,6 +262,45 @@ template <class data_T, class res_T, typename CONFIG_T> inline void softmax(cons
}
}
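
Before the new multidimensional wrapper, a quick sanity check on the reworked softmax_stable data path shown above (normalize with d = x_max - x, look up exp(-d), accumulate, then scale by the reciprocal of the sum). This is a hedged floating-point emulation that ignores the fixed-point table quantization; it only shows that the algebra reproduces an ordinary softmax.

```python
# Hypothetical floating-point emulation of the reworked softmax_stable data path.
# Table quantization and fixed-point effects are ignored; this only checks the algebra.
import numpy as np

def softmax_stable_ref(x):
    d = x.max() - x            # inp_norm_t values: always >= 0
    exp_res = np.exp(-d)       # what exp_table[idx(d)] approximates
    inv = 1.0 / exp_res.sum()  # what invert_table[idx(sum)] approximates
    return exp_res * inv

x = np.array([0.5, -1.25, 2.0])
print(np.allclose(softmax_stable_ref(x), np.exp(x) / np.exp(x).sum()))  # True
```
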

// *************************************************
// Multidimensional Softmax
// *************************************************

// Helper to remap the config for the core softmax function
template <class CONFIG_T> struct softmax_multidim_slice_config : CONFIG_T {
static constexpr unsigned n_in = CONFIG_T::n_slice;
};

template <class data_T, class res_T, typename CONFIG_T> inline void softmax_multidim(const data_T &data, res_T &res) {
using buffer_data_t = std::array<typename data_T::value_type, CONFIG_T::n_slice>;
using buffer_res_t = std::array<typename res_T::value_type, CONFIG_T::n_slice>;
using slice_config = softmax_multidim_slice_config<CONFIG_T>;

#pragma unroll
for (unsigned i = 0; i < CONFIG_T::n_outer; i++) {
#pragma unroll
for (unsigned k = 0; k < CONFIG_T::n_inner; k++) {

[[intel::fpga_register]] buffer_data_t buffer_in;
[[intel::fpga_register]] buffer_res_t buffer_out;

// Gather Phase
#pragma unroll
for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
buffer_in[j] = data[idx];
}

nnet::softmax<buffer_data_t, buffer_res_t, slice_config>(buffer_in, buffer_out);

#pragma unroll
for (unsigned j = 0; j < CONFIG_T::n_slice; j++) {
unsigned idx = (i * CONFIG_T::n_slice * CONFIG_T::n_inner) + (j * CONFIG_T::n_inner) + k;
res[idx] = buffer_out[j];
}
}
}
}
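
To make the gather/compute/scatter index arithmetic in softmax_multidim above easier to verify, here is a hedged NumPy reference model. It assumes the flat buffer is laid out row-major as (n_outer, n_slice, n_inner) with the softmax taken over the slice axis; the helper name is made up for illustration.

```python
# Hypothetical NumPy reference for the softmax_multidim index arithmetic above.
# Assumes a row-major (n_outer, n_slice, n_inner) layout of the flat buffer.
import numpy as np

def softmax_multidim_ref(data, n_outer, n_slice, n_inner):
    out = np.empty_like(data, dtype=float)
    for i in range(n_outer):
        for k in range(n_inner):
            # Same flat indices the HLS code gathers into buffer_in
            idx = [(i * n_slice * n_inner) + (j * n_inner) + k for j in range(n_slice)]
            e = np.exp(data[idx] - data[idx].max())  # stable softmax over the slice
            out[idx] = e / e.sum()
    return out

t = np.random.rand(2, 4, 3)
ref = np.exp(t - t.max(1, keepdims=True))
ref /= ref.sum(1, keepdims=True)
print(np.allclose(softmax_multidim_ref(t.ravel(), 2, 4, 3).reshape(2, 4, 3), ref))  # True
```
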
// *************************************************
// TanH Activation
// *************************************************
143 changes: 82 additions & 61 deletions hls4ml/writer/oneapi_writer.py
@@ -549,16 +549,16 @@ def write_nnet_utils(self, model):
dstpath = f'{model.config.get_output_dir()}/src/firmware/{dst}'
copyfile(srcpath, dstpath)

def __get_table_size(self, model, activation):
def __get_table_size(self, model, activation, table_name='table_size'):
Review comment (Contributor): table_name and table_size have very different meanings. Maybe table_name_size or something like that would read better?

for layer in model.get_layers():
if (
layer.get_attr('activation') == activation or layer.get_attr('recurrent_activation') == activation
) and layer.get_attr('table_size') is not None:
return int(layer.get_attr('table_size'))
) and layer.get_attr(table_name) is not None:
return int(layer.get_attr(table_name))
return 1024

def __get_table_header(self, table_name, table_size):
table_header = f'static const typename CONFIG_T::table_t {table_name}[{table_size}] = {{'
def __get_table_header(self, table_name, table_size, table_type='table_t'):
table_header = f'static const typename CONFIG_T::{table_type} {table_name}[{table_size}] = {{'
return table_header

def __write_elu_table(self, model, path):
@@ -687,46 +687,58 @@ def __write_selu_table(self, model, path):
h_file.write('};\n')
h_file.close()

def __get_table_precision(self, model, activation, table_name='table_precision'):
for layer in model.get_layers():
if layer.get_attr('activation') == activation and layer.get_attr(table_name) is not None:
precision = layer.get_attr(table_name)
return precision.precision

return None # fp_bits, fp_integer, fp_signed

def __write_exp_table(self, model, path):
table_name = 'exp_table'
table_size = self.__get_table_size(model, 'softmax')
table_size = self.__get_table_size(model, 'softmax', table_name='exp_table_size')

h_file = open(f'{path}/{table_name}.tb', 'w')
h_file.write(self.__get_table_header(table_name, table_size))
h_file.write(self.__get_table_header(table_name, table_size, table_type='exp_table_t'))

# Default fixed point precision
# 6 bits for integer part, 10 bits for decimal - total, 16
fp_bits = 16
fp_integer = 6
fp_signed = True
precision = self.__get_table_precision(model, 'softmax', table_name='inp_norm_t')

if precision is None:
fp_bits = 16
fp_integer = 6
fp_signed = True

for layer in model.get_layers():
if layer.name == 'softmax':
ac_type = layer.get_input_variable().type
if ac_type is not None:
try:
fp_bits = ac_type.precision.integer + ac_type.precision.fractional
fp_integer = ac_type.precision.integer
fp_signed = ac_type.precision.signed
except Exception:
# FixedPrecisionType wasn't correctly stored in layer attributes, use default values
pass
if fp_signed is False:
raise Exception('Softmax types need to be signed')

# Exp table should use the same precision as exp_table, as seen in Vivado code
# init_exp_table<data_T, CONFIG_T>(exp_table);
for layer in model.get_layers():
if layer.name == 'softmax':
ac_type = layer.get_input_variable().type
if ac_type is not None:
try:
fp_bits = ac_type.precision.integer + ac_type.precision.fractional
fp_integer = ac_type.precision.integer
fp_signed = ac_type.precision.signed
except Exception:
# FixedPrecisionType wasn't correctly stored in layer attributes, use default values
pass
if fp_signed is False:
raise Exception('Softmax types need to be signed')
else:
fp_bits = precision.width
fp_integer = precision.integer
fp_signed = precision.signed

f_bits = fp_bits - fp_integer
sep = ''
N = ceil_log2(table_size)
for i in range(table_size):
f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
b = uint_to_binary(i, N)
if i == 0:
b.insert(0, 0)
else:
b.insert(0, 1)
f.set_msb_bits(b)
real_val = f.exp_float()
# Index represents the raw bit pattern of the input
real_val_in = i * (2.0 ** (-f_bits))

# Calculate exp(-x) for the stable implementation
real_val = np.exp(-real_val_in)

h_file.write(sep + str(real_val))
sep = ', '

@@ -735,41 +747,50 @@

def __write_invert_table(self, model, path):
table_name = 'invert_table'
table_size = self.__get_table_size(model, 'softmax')
table_size = self.__get_table_size(model, 'softmax', table_name='inv_table_size')

h_file = open(f'{path}/{table_name}.tb', 'w')
h_file.write(self.__get_table_header(table_name, table_size))

h_file.write(self.__get_table_header(table_name, table_size, table_type='inv_table_t'))
# Default fixed point precision, in case values from layer attributes cannot be extracted
# 8 bits for integer part, 10 bits for decimal - total, 18
fp_bits = 18
fp_integer = 8
fp_signed = True

# Invert table should use the same precision as exp_table, as seen in Vivado code
# init_invert_table<typename CONFIG_T::exp_table_t, CONFIG_T>(invert_table);
for layer in model.get_layers():
if layer.name == 'softmax':
ac_type = layer.get_attr('exp_table_t')
if ac_type is not None:
try:
fp_bits = ac_type.precision.integer + ac_type.precision.fractional
fp_integer = ac_type.precision.integer
fp_signed = ac_type.precision.signed
except Exception:
# FixedPrecisionType wasn't correctly stored in layer attributes, use default values
pass
if fp_signed is False:
raise Exception('Softmax types need to be signed')
precision = self.__get_table_precision(model, 'softmax', table_name='inv_inp_t')

if precision is None:
fp_bits = 18
fp_integer = 8
fp_signed = True

for layer in model.get_layers():
if layer.name == 'softmax':
ac_type = layer.get_attr('exp_table_t')
if ac_type is not None:
try:
fp_bits = ac_type.precision.integer + ac_type.precision.fractional
fp_integer = ac_type.precision.integer
fp_signed = ac_type.precision.signed
except Exception:
# FixedPrecisionType wasn't correctly stored in layer attributes, use default values
pass
if fp_signed is False:
raise Exception('Softmax types need to be signed')

else:
fp_bits = precision.width
fp_integer = precision.integer
fp_signed = precision.signed

f_bits = fp_bits - fp_integer
sep = ''
N = ceil_log2(table_size)
for i in range(table_size):
f = FixedPointEmulator(fp_bits, fp_integer, signed=fp_signed)
b = uint_to_binary(i, N)
b.insert(0, 0)
f.set_msb_bits(b)
real_val = f.inv_float()
# Index represents the raw bit pattern of the input
real_val_in = i * (2.0 ** (-f_bits))

if real_val_in == 0:
real_val = 999.0
else:
real_val = 1.0 / real_val_in

h_file.write(sep + str(real_val))
sep = ', '

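
Finally, a hedged stand-alone reproduction of the values the rewritten __write_exp_table and __write_invert_table loops emit, using the fallback widths shown above (f_bits = 10 in both cases) and the 1024-entry default table size; the helper names are made up for illustration.

```python
# Hypothetical reproduction of the table values emitted by the rewritten writers.
# Uses the fallback widths above (f_bits = 10) and the default 1024-entry table size.
import numpy as np

def exp_table_values(table_size=1024, f_bits=10):
    # entry i is exp(-x) for x = i * 2**-f_bits (the index read as a raw unsigned bit pattern)
    return [float(np.exp(-(i * 2.0**-f_bits))) for i in range(table_size)]

def invert_table_values(table_size=1024, f_bits=10):
    # entry 0 gets the 999.0 sentinel; otherwise entry i is 1 / (i * 2**-f_bits)
    return [999.0 if i == 0 else 1.0 / (i * 2.0**-f_bits) for i in range(table_size)]

print(exp_table_values()[1])       # exp(-1/1024) ~ 0.99902
print(invert_table_values()[512])  # 1 / 0.5 = 2.0
```
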