diff --git a/tmva/sofie/CMakeLists.txt b/tmva/sofie/CMakeLists.txt index c807d1b7b8c27..f56d2350ecadd 100644 --- a/tmva/sofie/CMakeLists.txt +++ b/tmva/sofie/CMakeLists.txt @@ -22,6 +22,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ROOTTMVASofie TMVA/OperatorList.hxx TMVA/RModel_Base.hxx TMVA/RModel.hxx + TMVA/RModelProfiler.hxx TMVA/ROperator.hxx TMVA/ROperator_BasicUnary.hxx TMVA/ROperator_BasicBinary.hxx @@ -77,6 +78,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ROOTTMVASofie SOURCES src/RModel_Base.cxx src/RModel.cxx + src/RModelProfiler.cxx src/RModel_GNN.cxx src/RModel_GraphIndependent.cxx src/RFunction.cxx diff --git a/tmva/sofie/inc/TMVA/RFunction.hxx b/tmva/sofie/inc/TMVA/RFunction.hxx index 1cca39aa7ff3e..9247bd4180d26 100644 --- a/tmva/sofie/inc/TMVA/RFunction.hxx +++ b/tmva/sofie/inc/TMVA/RFunction.hxx @@ -32,7 +32,7 @@ public: class RFunction_Update: public RFunction { protected: - std::shared_ptr function_block; + std::shared_ptr fFunction_block; FunctionTarget fTarget; GraphType fGraphType; std::vector fInputTensors; @@ -50,9 +50,9 @@ public: void AddInputTensors(const std::vector>& inputShapes); void AddInputTensors(const std::vector>& inputShapes); std::shared_ptr GetFunctionBlock() { - return function_block; + return fFunction_block; } - std::string GenerateModel(const std::string& filename, long read_pos = 0, long block_size = -1); + std::string GenerateModel(const std::string& filename, long read_pos = 0, long block_size = -1, bool verbose = false); std::string Generate(const std::vector& inputPtrs); FunctionTarget GetFunctionTarget() { return fTarget; diff --git a/tmva/sofie/inc/TMVA/RModel.hxx b/tmva/sofie/inc/TMVA/RModel.hxx index 996c51020270f..a82c58c75b2e2 100644 --- a/tmva/sofie/inc/TMVA/RModel.hxx +++ b/tmva/sofie/inc/TMVA/RModel.hxx @@ -11,16 +11,23 @@ namespace SOFIE { class RModel final : public RModel_Base { + friend class RModelProfiler; + private: bool fIsInitialized = false; bool fIsSubGraph = false; + bool fProfile = false; + int fVerbose = 0; int fBatchSize = -1; long fReadPos = 0; // reading file position + size_t fConstantTensorSize = 0; // size (in Bytes) of the allocated constant tensors size_t fWeightsTensorSize = 0; // size (in Bytes) of the allocated weight tensors size_t fOtherTensorSize = 0; // size (in Bytes) of intermediate tensors which are not managed by the memory pool + std::string fProfilerGC = ""; + OptimizationLevel fOptimizationLevel = OptimizationLevel::kExtended; std::unordered_map fInputTensorInfos; // input tensors where shape may not fully defined or other graph inputs? @@ -30,6 +37,7 @@ private: std::unordered_map fDynamicTensorInfos; std::unordered_map, bool>> fShapeTensors; // constant tensors describing a shape std::unordered_map fShapeParams; // parameters defining the dynamic shape (e.g. 
batch size), store also its default value + std::unordered_map fAliasTensors; // list of alias tensors std::vector fDimShapeNames; // parameter names used to define the shapes std::vector fOutputTensorNames; std::vector fInputTensorNames; // input tensor names using ONNX order @@ -82,6 +90,8 @@ public: void AddConstantTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data); + void AddAliasTensor(const std::string & tensor_name, const std::string & orig_tensor_name); + template void AddConstantTensor(const std::string & name, const std::vector & shape, const T * data) { @@ -130,6 +140,8 @@ public: bool IsReadyInputTensor(const std::string &name) const; /// check if a tensor is a shape tensor bool IsShapeTensor(const std::string & name) const; + /// check if a tensor is a alias tensor + bool IsAliasTensor(const std::string & name) const; // Add intermediate tensor void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector dim_shape); @@ -152,7 +164,7 @@ public: void Initialize(int batchSize = -1, bool verbose = false); void Initialize(const std::map & inputParams, bool verbose = false); - void Generate(std::underlying_type_t options, int batchSize = -1, long pos = 0, bool verbose = false); + void Generate(std::underlying_type_t options, int batchSize = -1, long pos = 0, bool verbose = false); void Generate(Options options = Options::kDefault, int batchSize = -1, int pos = 0, bool verbose = false) { Generate(static_cast>(options), batchSize, pos, verbose); @@ -205,8 +217,8 @@ public: void ReadInitializedTensorsFromFile(long); long WriteInitializedTensorsToFile(std::string filename = ""); - void PrintIntermediateTensors(); - void PrintOutputTensors(); + void PrintIntermediateTensors() const; + void PrintOutputTensors() const; void OutputGenerated(std::string filename = "", bool append = false); std::vector GetOutputTensorNames() { return fOutputTensorNames; } void SetFilename(std::string filename) { fName = filename; } @@ -224,9 +236,9 @@ public: } */ - void PrintRequiredInputTensors(); - void PrintInitializedTensors(); - void PrintDynamicTensors(); + void PrintRequiredInputTensors() const; + void PrintInitializedTensors() const; + void PrintDynamicTensors() const; void HeadInitializedTensors(std::string name, int n_print = 50); bool UseSession() const { return fUseSession; } diff --git a/tmva/sofie/inc/TMVA/RModelProfiler.hxx b/tmva/sofie/inc/TMVA/RModelProfiler.hxx new file mode 100644 index 0000000000000..fd9c8c7d0267d --- /dev/null +++ b/tmva/sofie/inc/TMVA/RModelProfiler.hxx @@ -0,0 +1,42 @@ +#ifndef TMVA_SOFIE_RMODELPROFILER +#define TMVA_SOFIE_RMODELPROFILER + +#include "TMVA/RModel.hxx" + +namespace TMVA { +namespace Experimental { +namespace SOFIE { + +/// \class RModelProfiler +/// \brief A helper class to generate profiled inference code for an RModel. +/// +/// This class instruments the generated C++ code to measure the execution +/// time of each operator. It is invoked when the RModel::Generate is called +/// with the Options::kProfile flag. +class RModelProfiler { +private: + RModel &fModel; + + void GenerateUtilityFunctions(); + +public: + // The profiler must be constructed with a model to work on. 
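Note (not part of the patch): a minimal usage sketch of the new profiling path. It assumes the existing RModelParser_ONNX API and an illustrative input file "model.onnx", and relies only on the Options::kProfile flag added in RModel_Base.hxx and the RModel::Generate / OutputGenerated signatures visible in this diff; what timing code RModelProfiler::Generate actually emits is not shown here.

#include "TMVA/RModel.hxx"
#include "TMVA/RModelParser_ONNX.hxx"
#include <type_traits>

using namespace TMVA::Experimental::SOFIE;

int main() {
   RModelParser_ONNX parser;
   RModel model = parser.Parse("model.onnx");   // illustrative input file
   // Options is a bit-flag enum; Generate() also accepts the underlying
   // integer type, so kProfile could be combined with other flags if needed.
   model.Generate(static_cast<std::underlying_type_t<Options>>(Options::kProfile));
   model.OutputGenerated("model_profiled.hxx"); // write the instrumented header
   return 0;
}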
+ RModelProfiler() = delete; + RModelProfiler(RModel &model); + ~RModelProfiler() = default; + + // There is no point in copying or moving an RModelProfiler + RModelProfiler(const RModelProfiler &other) = delete; + RModelProfiler(RModelProfiler &&other) = delete; + RModelProfiler &operator=(const RModelProfiler &other) = delete; + RModelProfiler &operator=(RModelProfiler &&other) = delete; + + // Main function to generate the profiled code. + void Generate(); +}; + +} // namespace SOFIE +} // namespace Experimental +} // namespace TMVA + +#endif // TMVA_SOFIE_RMODELPROFILER diff --git a/tmva/sofie/inc/TMVA/RModel_Base.hxx b/tmva/sofie/inc/TMVA/RModel_Base.hxx index 2cbcc6cc8ea41..2ab5dacaac57f 100644 --- a/tmva/sofie/inc/TMVA/RModel_Base.hxx +++ b/tmva/sofie/inc/TMVA/RModel_Base.hxx @@ -26,6 +26,7 @@ enum class Options { kRootBinaryWeightFile = 0x4, kGNN = 0x8, kGNNComponent = 0x10, + kProfile = 0x20, }; // Optimization levels inspired by ONNXRuntime. diff --git a/tmva/sofie/inc/TMVA/ROperator.hxx b/tmva/sofie/inc/TMVA/ROperator.hxx index f0afd9c4374c1..200cd3f2976fe 100644 --- a/tmva/sofie/inc/TMVA/ROperator.hxx +++ b/tmva/sofie/inc/TMVA/ROperator.hxx @@ -37,6 +37,9 @@ public: //virtual void Forward_blas() = 0; virtual ~ROperator(){} + std::string name = "UnnamedOperator"; + const std::string &GetOperatorName() { return name; }; + protected: const std::string SP = " "; ///< space used to correctly indent the generated C++ code diff --git a/tmva/sofie/inc/TMVA/ROperator_BasicBinary.hxx b/tmva/sofie/inc/TMVA/ROperator_BasicBinary.hxx index 1c4f20363ebe2..491b669554118 100644 --- a/tmva/sofie/inc/TMVA/ROperator_BasicBinary.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_BasicBinary.hxx @@ -192,7 +192,7 @@ public: dataY[i] = BinaryOperatorTrait::Func(dataA[i], dataB[i]); } model.AddConstantTensor(fNY, fShapeY, dataY.data()); - // flag tensors to not be written in the weight file + // flag tensors to not be written in the generated code or weight file model.SetNotWritableInitializedTensor(nameA); model.SetNotWritableInitializedTensor(nameB); fIsOutputConstant = true; diff --git a/tmva/sofie/inc/TMVA/ROperator_BasicNary.hxx b/tmva/sofie/inc/TMVA/ROperator_BasicNary.hxx index bcc0e52a40ca3..f73bd34e53386 100644 --- a/tmva/sofie/inc/TMVA/ROperator_BasicNary.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_BasicNary.hxx @@ -23,10 +23,11 @@ struct NaryOperatorTraits { static const std::string Name() {return "Max";} static std::string Op(const std::string& res, std::vector& inputs) { std::stringstream out; - out << "\t" << "\t" << res << " = " << inputs[0] << ";\n"; + out << res << " = std::max({ " << inputs[0]; for (size_t i = 1; i < inputs.size(); i++) { - out << "\t" << "\t" << res << " = std::max(" << res << ", " << inputs[i] << ");\n"; + out << ", " << inputs[i]; } + out << "});\n"; return out.str(); } }; @@ -36,10 +37,11 @@ struct NaryOperatorTraits { static const std::string Name() {return "Min";} static std::string Op(const std::string& res, std::vector& inputs) { std::stringstream out; - out << "\t" << "\t" << res << " = " << inputs[0] << ";\n"; + out << res << " = std::min({ " << inputs[0]; for (size_t i = 1; i < inputs.size(); i++) { - out << "\t" << "\t" << res << " = std::min(" << res << ", " << inputs[i] << ");\n"; + out << ", " << inputs[i]; } + out << "});\n"; return out.str(); } }; @@ -52,7 +54,7 @@ struct NaryOperatorTraits { static const std::string Name() {return "Mean";} static std::string Op(const std::string& res, std::vector& inputs) { std::stringstream out; - out << "\t" << "\t" << res << " 
= (" << inputs[0]; + out << res << " = (" << inputs[0]; for (size_t i = 1; i < inputs.size(); i++) { out << " + " << inputs[i]; } @@ -66,7 +68,7 @@ struct NaryOperatorTraits { static const std::string Name() {return "Sum";} static std::string Op(const std::string& res, std::vector& inputs) { std::stringstream out; - out << "\t" << "\t" << res << " = " << inputs[0]; + out << res << " = " << inputs[0]; for (size_t i = 1; i < inputs.size(); i++) { out << " + " << inputs[i]; } @@ -83,10 +85,11 @@ private: std::vector fNInputs; std::string fNY; - std::vector> fShapeInputs; + std::vector> fShapeInputs; std::vector fNBroadcastedInputs; std::vector fShapeY; + std::vector fDimShapeY; bool fBroadcast = false; @@ -119,64 +122,164 @@ public: } void Initialize(RModel& model) override { + std::vector> inputShapes; for (auto &it : fNInputs) { if (!model.CheckIfTensorAlreadyExist(it)) { throw std::runtime_error("TMVA SOFIE BasicNary Op Input Tensor " + it + " is not found in model"); } - fShapeInputs.push_back(model.GetTensorShape(it)); + fShapeInputs.push_back(model.GetDimTensorShape(it)); + if (fNInputs.size()> 2) { + if (model.IsDimInputTensor(it)) + throw std::runtime_error("TMVA SOFIE BasicNary : supports only 2 inputs for dynamic tensors"); + else + inputShapes.push_back(model.GetTensorShape(it)); + } } // Find the common shape of the input tensors - fShapeY = UTILITY::MultidirectionalBroadcastShape(fShapeInputs); - model.AddIntermediateTensor(fNY, model.GetTensorType(fNInputs[0]), fShapeY); - // Broadcasting - size_t N = fNInputs.size(); - fNBroadcastedInputs.reserve(N); - for (size_t i = 0; i < N; i++) { - if (!UTILITY::AreSameShape(model.GetTensorShape(fNInputs[i]), fShapeY)) { - fBroadcast = true; - std::string name = "Broadcasted" + fNInputs[i]; - model.AddIntermediateTensor(name, model.GetTensorType(fNInputs[0]), fShapeY); - fNBroadcastedInputs.emplace_back("tensor_" + name); - } else { - fNBroadcastedInputs.emplace_back("tensor_" + fNInputs[i]); + if (fShapeInputs.size() > 2 ) { + // support dynamic tensors now for input list of size=2 + auto shapeY = UTILITY::MultidirectionalBroadcastShape(inputShapes); + fDimShapeY = ConvertShapeToDim(shapeY); + } else if (fShapeInputs.size() == 2 ) { + auto ret = UTILITY::MultidirectionalBroadcastShape(fShapeInputs[0], fShapeInputs[1]); + // use same code as in BinaryOperator (need to extend for input sizes > 2) + fBroadcast = ret.first; + fDimShapeY = ret.second; + // case of all parametric shapes and MultiDirectionalBroadcastShape return the max of the 2 + // need to do before we declare the output tensor shape and the broadcasted ones + if (ret.first & 4) { + // check if one of the parameter is an input dimension + // define function to find this + auto IsInputDimParam = [&](const std::string &p) { + auto inputNames = model.GetInputTensorNames(); + for (auto &input : inputNames) { + for (auto &i_s : model.GetDimTensorShape(input)) { + if (i_s.isParam && i_s.param == p) + return true; + } + } + return false; + }; + auto & shapeA = fShapeInputs[0]; + auto & shapeB = fShapeInputs[1]; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + auto &s = fDimShapeY[i]; + if (s.isParam && s.param.find("std::max") != std::string::npos) { + if (IsInputDimParam(shapeA[i].param)) { + // case dim is 1 we indicate that the input parameter is equal to 1 + if (shapeA[i].dim != 1) + s = shapeA[i]; + else + s = shapeB[i]; + } else if (IsInputDimParam(shapeB[i].param)) { + if (shapeB[i].dim != 1) + s = shapeB[i]; + else + s = shapeA[i]; + } + } + } } + } else if 
(fShapeInputs.size() == 1 ) { + fDimShapeY = fShapeInputs[0]; } + if (!fShapeY.empty()) + model.AddIntermediateTensor(fNY, model.GetTensorType(fNInputs[0]), fShapeY); + else + model.AddIntermediateTensor(fNY, model.GetTensorType(fNInputs[0]), fDimShapeY); + + fType = ConvertTypeToString(model.GetTensorType(fNInputs[0])); + + if (model.Verbose()) { + std::cout << NaryOperatorTraits::Name() << " : "; + if (fNInputs.size() == 2) + std::cout << ConvertShapeToString(fShapeInputs[0]) << " , " + << ConvertShapeToString(fShapeInputs[1]); + std::cout << " --> " << ConvertShapeToString(fDimShapeY) << std::endl; + } } std::string Generate(std::string OpName) override { OpName = "op_" + OpName; - if (fShapeY.empty()) { + if (fDimShapeY.empty()) { throw std::runtime_error("TMVA SOFIE BasicNary called to Generate without being initialized first"); } std::stringstream out; - size_t length = ConvertShapeToLength(fShapeY); + auto length = ConvertDimShapeToLength(fDimShapeY); out << SP << "\n//------ BasicNary operator\n"; - if (fBroadcast) { - for (size_t i = 0; i < fNInputs.size(); i++) { - if (fNBroadcastedInputs[i] != fNInputs[i]) { - out << SP << SP << "// Broadcasting " << fNInputs[i] << " to " << ConvertShapeToString(fShapeY) << "\n"; - out << SP << SP << "{\n"; - out << SP << SP << SP << fType << "* data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<" << fType << ">(tensor_" + fNInputs[i] << ", " << ConvertShapeToString(fShapeInputs[i]); - out << ", " << ConvertShapeToString(fShapeY) << ");\n"; - out << SP << SP << SP << "std::copy(data, data + " << length << ", " << fNBroadcastedInputs[i] << ");\n"; - out << SP << SP << SP << "delete[] data;\n"; - out << SP << SP << "}\n"; - } - } - } - if (fNInputs.size() == 1) { + int nInputs = fNInputs.size(); + + if (nInputs == 1) { out << SP << "std::copy(tensor_" << fNInputs[0] << ", tensor_" << fNInputs[0] << " + "; out << length << ", tensor_" << fNY << ");\n"; } else { - std::vector inputs(fNBroadcastedInputs.size()); - for (size_t i = 0; i < fNBroadcastedInputs.size(); i++) { - inputs[i] = fNBroadcastedInputs[i] + "[id]"; + + // implement operator without broadcasting, but using loos on all indices + std::vector> inputStrides(nInputs); + for (int i = 0; i < nInputs; i++) + inputStrides[i] = UTILITY::ComputeStrideFromShape(fShapeInputs[i]); + + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + + // make loop on output indices + std::string compute_idx_Y; + int nloop = 0; + if (fDimShapeY.empty() || + std::all_of(fDimShapeY.begin(), fDimShapeY.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_Y = "0"; + } else { + for (size_t i = 0; i < fDimShapeY.size(); ++i) { + if (fDimShapeY[i].dim != 1 && fDimShapeY[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) out << SP; + out << "for (size_t idx_" << i << " = 0; idx_" << i << " < " << fDimShapeY[i] + << "; ++idx_" << i << "){\n"; + compute_idx_Y += "idx_" + std::to_string(i); + if (stridesY[i].GetVal() != "1") + compute_idx_Y += " * " + stridesY[i].GetVal(); + compute_idx_Y += " + "; + } + } + // remove last 3 characters " + " + for (int j = 0; j < 3; j++) + compute_idx_Y.pop_back(); + } + // find indices for input tensors + std::vector inputs(nInputs); + for (int ipt = 0; ipt < nInputs; ipt++ ) { + std::string compute_idx_X; + auto & shape = fShapeInputs[ipt]; + auto & stride = inputStrides[ipt]; + if (shape.empty() || + std::all_of(shape.begin(), shape.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_X = 
"0"; + } else { + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i].dim == 1 || shape[i].GetVal() == "1") + continue; + compute_idx_X += "idx_" + std::to_string(i + (fDimShapeY.size() - shape.size())); + if (stride[i].GetVal() != "1") + compute_idx_X += " * " + stride[i].GetVal(); + compute_idx_X += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_X.pop_back(); + } + inputs[ipt] = "tensor_" + fNInputs[ipt] + "[" + compute_idx_X + "]"; + } + + // perform the operation + for (int j = 0; j < nloop + 1; j++) out << SP; + std::string output = "tensor_" + fNY + "[" + compute_idx_Y + "]"; + out << NaryOperatorTraits::Op(output, inputs); + + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; } - out << SP << "for (size_t id = 0; id < " << length << "; id++) {\n"; - out << NaryOperatorTraits::Op("tensor_" + fNY + "[id]", inputs); - out << SP << "}\n"; } return out.str(); } diff --git a/tmva/sofie/inc/TMVA/ROperator_BatchNormalization.hxx b/tmva/sofie/inc/TMVA/ROperator_BatchNormalization.hxx index f2d31796bbbcd..c37e7fc4b68de 100644 --- a/tmva/sofie/inc/TMVA/ROperator_BatchNormalization.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_BatchNormalization.hxx @@ -141,8 +141,8 @@ public: } } - std::string Generate(std::string OpName) override { - OpName = "op_" + OpName; + std::string Generate(std::string opName) override { + opName = "op_" + opName; if (fShapeX.empty()){ throw std::runtime_error("TMVA SOFIE Batch Normalization called to Generate without being initialized first"); } @@ -158,7 +158,7 @@ public: spatial_dim = ConvertDimShapeToLength( spatialShape); } - out << "\n\n//---- BatchNorm" << (fActivation == EActivationType::RELU ? " + ReLU" : "") << "\n"; + out << "\n\n//---- BatchNorm" << (fActivation == EActivationType::RELU ? 
" + ReLU " : " ") << opName << "\n"; out << SP << "{\n"; out << SP << " size_t i = 0;\n"; out << SP << " for (size_t n = 0; n < " << batchSize << "; ++n) {\n"; diff --git a/tmva/sofie/inc/TMVA/ROperator_Cast.hxx b/tmva/sofie/inc/TMVA/ROperator_Cast.hxx index f48e27ee4f264..8267bb8a7e4f4 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Cast.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Cast.hxx @@ -46,7 +46,7 @@ public: throw std::runtime_error("TMVA SOFIE Cast Op Input Tensor is not found in model"); } fShape = model.GetDimTensorShape(fNX); - // shoud we add a check if the same type + // should we add a check if the same type auto inputType = model.GetTensorType(fNX); if (model.IsInitializedTensor(fNX)) { fIsOutputConstant = true; @@ -57,29 +57,30 @@ public: } else fIsOutputConstant = false; + } else if (model.IsShapeTensor(fNX) && ConvertStringToType(fAttrType) == ETensorType::INT64) { + auto shapeData = model.GetShapeTensorValues(fNX); + model.AddShapeTensor(fNY, shapeData, fShape.size() == 0); + fIsOutputConstant = true; } if (!fIsOutputConstant) model.AddIntermediateTensor(fNY, ConvertStringToType(fAttrType), fShape); if (model.Verbose()) { - std::cout << "Cast : " << ConvertTypeToString(inputType) << " " << fNX << " -> " << fAttrType << " for " << fNY; + std::cout << "Cast : " << ConvertTypeToString(inputType) << " " << fNX << " -> " << fAttrType << " for " << fNY + << " shape " << ConvertDimShapeToString(fShape); if (fIsOutputConstant) std::cout << " (constant) "; std::cout << std::endl; } } - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; + std::string Generate(std::string opName) override { + + // output shape can be empty if is a scalar - OpName = "op_" + OpName; - if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Cast called to Generate without being initialized first"); - } std::stringstream out; auto length = ConvertDimShapeToLength(fShape); - // out << SP << ETensorType << " " << OpName << "_attr = " << fattr << ";\n"; - out << "\n//------ CAST\n"; + out << "\n//------ CAST " << opName << " ---> " << fNY << " " << ConvertDimShapeToString(fShape) << "\n"; // no generated code for constant outputs if (fIsOutputConstant) return out.str(); diff --git a/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx b/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx index 0d365ae517de5..734434357a149 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx @@ -56,7 +56,6 @@ template class ROperator_Comparision final : public ROperator{ private: - bool fIsModelOutput = false; std::string fNX1; std::string fNX2; std::string fNY; @@ -65,11 +64,10 @@ private: std::vector fDimShapeX1; std::vector fDimShapeX2; std::vector fShapeY; - std::string fNBroadcastedX1; - std::string fNBroadcastedX2; + std::vector fDimShapeY; ETensorType fTensorType1 = ETensorType::UNDEFINED; ETensorType fTensorType2 = ETensorType::UNDEFINED; - bool fBroadcast = false; + int fBroadcastFlag = 0; public: @@ -115,184 +113,260 @@ public: } fTensorType1 = model.GetTensorType(fNX1); fTensorType2 = model.GetTensorType(fNX2); - bool broadcast = !UTILITY::AreSameShape(fShapeX1, fShapeX2); - if (broadcast) { - // Y is the common shape of A and B - fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeX1, fShapeX2); - bool broadcastX1 = !UTILITY::AreSameShape(fShapeX1, fShapeY); - bool broadcastX2 = !UTILITY::AreSameShape(fShapeX2, fShapeY); - // Broadcast A to Y - if (broadcastX1) { - if (model.IsInitializedTensor(fNX1)) { - auto data = 
model.GetInitializedTensorData(fNX1); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX1, fShapeY), - std::default_delete()); - // Update the data and the shape of A - model.UpdateInitializedTensor(fNX1, model.GetTensorType(fNX1), fShapeY, broadcastedData); - fShapeX1 = fShapeY; - } else { - // Add an intermediate tensor for broadcasting A - fNBroadcastedX1 = "Broadcasted" + fNX1; - model.AddIntermediateTensor(fNBroadcastedX1, model.GetTensorType(fNX1), fShapeY); + // case of non dynamic tensors + if (!fShapeX1.empty() && !fShapeX2.empty()) { + bool broadcastX1 = false; + bool broadcastX2 = false; + if (UTILITY::AreSameShape(fShapeX1, fShapeX2)) { + // no broadcast needed + fShapeY = fShapeX1; + } else { + // Y is the common shape of A and B + fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeX1, fShapeX2); + broadcastX1 = !UTILITY::AreSameShape(fShapeX1, fShapeY); + broadcastX2 = !UTILITY::AreSameShape(fShapeX2, fShapeY); + } + + + // analyze case of constant tensors or shape tensors (which have known shapes but data as Dim values + // normal case with non-dynamic tensor is also here + T *data1 = nullptr; + T *data2 = nullptr; + std::unique_ptr broadcastedData1; + std::unique_ptr broadcastedData2; + // data for shape tensors + std::vector shapeData1; + std::vector shapeData2; + size_t length = ConvertShapeToLength(fShapeY); + bool *outData = new bool[length]; + if (model.IsInitializedTensor(fNX1)) { + data1 = static_cast(model.GetInitializedTensorData(fNX1).get()); + if (broadcastX1) { + broadcastedData1 = std::unique_ptr( + UTILITY::UnidirectionalBroadcast(data1, fShapeX1, fShapeY)); + data1 = broadcastedData1.get(); } + + } else if (model.IsShapeTensor(fNX1)) { + shapeData1 = model.GetShapeTensorValues(fNX1); } - // Broadcast B to Y - if (broadcastX2) { - if (model.IsInitializedTensor(fNX2)) { - auto data = model.GetInitializedTensorData(fNX2); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX2, fShapeY), - std::default_delete()); - // Update the data and the shape of B - model.UpdateInitializedTensor(fNX2, model.GetTensorType(fNX2), fShapeY, broadcastedData); - fShapeX2 = fShapeY; - } else { - // Add an intermediate tensor for broadcasting B - fNBroadcastedX2 = "Broadcasted" + fNX2; - model.AddIntermediateTensor(fNBroadcastedX2, model.GetTensorType(fNX2), fShapeY); + if (model.IsInitializedTensor(fNX2)) { + data2 = static_cast(model.GetInitializedTensorData(fNX2).get()); + if (broadcastX2) { + broadcastedData2 = std::unique_ptr( + UTILITY::UnidirectionalBroadcast(data2, fShapeX2, fShapeY)); + data2 = broadcastedData2.get(); } + } else if (model.IsShapeTensor(fNX2)) { + shapeData2 = model.GetShapeTensorValues(fNX2); } - } else { - fShapeY = fShapeX1; - } - // case of constant tensors - T * data1 = nullptr; - T * data2 = nullptr; - std::vector shapeData1; - std::vector shapeData2; - size_t length = ConvertShapeToLength(fShapeY); - bool * outData = new bool[length]; - if (model.IsInitializedTensor(fNX1)) { - data1 = static_cast(model.GetInitializedTensorData(fNX1).get()); - } else if (model.IsShapeTensor(fNX1)) { - shapeData1 = model.GetShapeTensorValues(fNX1); - } - if (model.IsInitializedTensor(fNX2)) { - data2 = static_cast(model.GetInitializedTensorData(fNX2).get()); - } else if (model.IsShapeTensor(fNX2)) { - shapeData2 = model.GetShapeTensorValues(fNX2); - } - if (data1 && data2) { - fIsOutputConstant = true; - for (size_t i = 0; i < length; i++) - outData[i] = 
ComparisionTrait::Result(data1[i], data2[i]); - model.AddConstantTensor(fNY, fShapeY, outData); - if (model.Verbose()) - std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(length,outData) << std::endl; - } else if ((data1 || !shapeData1.empty()) && (data2 || !shapeData2.empty())) { - fIsOutputConstant = true; - if (data1 && !data2) { - // data 1 is constant and data2 is shape - for (size_t i = 0; i < length; i++) { - if (shapeData2[i].isParam) { - if (shapeData2[i].dim == size_t(-1) || data1[i] > 0) { - fIsOutputConstant = false; - break; - } else { - // assume a comparison is done with .dim = 0 - shapeData2[i].dim = 0; + if (data1 && data2) { + fIsOutputConstant = true; + for (size_t i = 0; i < length; i++) + outData[i] = ComparisionTrait::Result(data1[i], data2[i]); + model.AddConstantTensor(fNY, fShapeY, outData); + if (model.Verbose()) + std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << " : " << ConvertValuesToString(length, outData) + << std::endl; + } else if ((data1 || !shapeData1.empty()) && (data2 || !shapeData2.empty())) { + fIsOutputConstant = true; + if (data1 && !data2) { + // data 1 is constant and data2 is shape + for (size_t i = 0; i < length; i++) { + if (shapeData2[i].isParam) { + if (shapeData2[i].dim == size_t(-1) || data1[i] > 0) { + fIsOutputConstant = false; + break; + } else { + // assume a comparison is done with .dim = 0 + shapeData2[i].dim = 0; + } } + outData[i] = ComparisionTrait::Result(data1[i], static_cast(shapeData2[i].dim)); } - outData[i] = ComparisionTrait::Result(data1[i], static_cast(shapeData2[i].dim)); - } - } else if (!data1 && data2) { - // data 1 is shape and dat2 is constant - for (size_t i = 0; i < length; i++) { - if (shapeData1[i].isParam) { - if (shapeData1[i].dim == size_t(-1) || data2[i] > 0) { + } else if (!data1 && data2) { + // data 1 is shape and dat2 is constant + for (size_t i = 0; i < length; i++) { + if (shapeData1[i].isParam) { + if (shapeData1[i].dim == size_t(-1) || data2[i] > 0) { + fIsOutputConstant = false; + break; + } else { + // assume a comparison is done with .dim = 0 + shapeData1[i].dim = 0; + } + } + outData[i] = ComparisionTrait::Result(static_cast(shapeData1[i].dim), data2[i]); + } + } else if (!shapeData1.empty() && !shapeData2.empty()) { + // both data1 and data2 are shape tensors + for (size_t i = 0; i < length; i++) { + if (!shapeData1[i].isParam && !shapeData2[i].isParam) { + outData[i] = ComparisionTrait::Result(shapeData1[i].dim, shapeData2[i].dim); + } else if (shapeData1[i].isParam && shapeData2[i].isParam) { + if (shapeData1[i].param == shapeData2[i].param) + outData[i] = ComparisionTrait::Result(1, 1); // comparison of two equal value + else { + fIsOutputConstant = false; + break; + } + } else { fIsOutputConstant = false; break; - } else { - // assume a comparison is done with .dim = 0 - shapeData1[i].dim = 0; } } - outData[i] = ComparisionTrait::Result(static_cast(shapeData1[i].dim), data2[i]); } - } else if (!shapeData1.empty() && !shapeData2.empty() ) { - // both data1 and data2 are shape tensors - for (size_t i = 0; i < length; i++) { - if (!shapeData1[i].isParam && !shapeData2[i].isParam) { - outData[i] = ComparisionTrait::Result(shapeData1[i].dim, shapeData2[i].dim); - } - else if (shapeData1[i].isParam && shapeData2[i].isParam) { - if (shapeData1[i].param == shapeData2[i].param) - outData[i] = ComparisionTrait::Result(1,1); // comparison of two equal value - else 
{ - fIsOutputConstant = false; - break; + if (fIsOutputConstant) { + model.AddConstantTensor(fNY, fShapeY, outData); + if (model.Verbose()) + std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << " : " << ConvertValuesToString(length, outData) + << " (constant) " << std::endl; + } + } + delete[] outData; + // case of non constant output (no constant or shape tensors) + if (!fIsOutputConstant && !fShapeY.empty()) { + model.AddIntermediateTensor(fNY, ETensorType::BOOL, fShapeY); + fDimShapeY = ConvertShapeToDim(fShapeY); + if (model.Verbose()) + std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << std::endl; + } + } else { + // case of dynamic tensors + // case A or B have dynamic shapes. We need to broadcast if shape are not same + auto ret = UTILITY::MultidirectionalBroadcastShape(fDimShapeX1, fDimShapeX2); + fBroadcastFlag = ret.first; + fDimShapeY = ret.second; + // case of all parametric shapes and MultiDirectionalBroadcastShape return the max of the 2 + // need to do before we declare the output tensor shape and the broadcasted ones + if (ret.first & 4) { + // check if one of the parameter is an input dimension + // define function to find this + auto IsInputDimParam = [&](const std::string &p) { + auto inputNames = model.GetInputTensorNames(); + for (auto &input : inputNames) { + for (auto &i_s : model.GetDimTensorShape(input)) { + if (i_s.isParam && i_s.param == p) + return true; } } - else { - fIsOutputConstant = false; - break; + return false; + }; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + auto &s = fDimShapeY[i]; + if (s.isParam && s.param.find("std::max") != std::string::npos) { + if (IsInputDimParam(fDimShapeX1[i].param)) { + // case dim is 1 we indicate that the input parameter is equal to 1 + if (fDimShapeX1[i].dim != 1) + s = fDimShapeX1[i]; + else + s = fDimShapeX2[i]; + } else if (IsInputDimParam(fDimShapeX2[i].param)) { + if (fDimShapeX2[i].dim != 1) + s = fDimShapeX2[i]; + else + s = fDimShapeX1[i]; + } } } } - if (fIsOutputConstant) { - model.AddConstantTensor(fNY, fShapeY, outData); - if (model.Verbose()) - std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(length,outData) << " (constant) " << std::endl; + model.AddIntermediateTensor(fNY, ETensorType::BOOL, fDimShapeY); + if (model.Verbose()) { + std::cout << ComparisionTrait::Name() << " : " << fNX1 << " " << ConvertShapeToString(fDimShapeX1) << " , " + << fNX2 << " " << ConvertShapeToString(fDimShapeX2) << " --> " + << fNY << " " << ConvertShapeToString(fDimShapeY) << std::endl; + model.PrintIntermediateTensors(); } } - delete [] outData; - if (!fIsOutputConstant) { - model.AddIntermediateTensor(fNY, ETensorType::BOOL , fShapeY); - if (model.Verbose()) - std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << std::endl; - } - - // check if this is not output operators to add a specific line for definining the tensor_xxx variable - const auto & outputTensorNames = model.GetOutputTensorNames(); - fIsModelOutput = false; - if (std::find(outputTensorNames.begin(), outputTensorNames.end(), fNY) != outputTensorNames.end()) - fIsModelOutput = true; } std::string Generate(std::string opName) override { if (fIsOutputConstant) return ""; opName = "op_" + opName; - if (fShapeY.empty()) { + if (fDimShapeY.empty()) { throw std::runtime_error("TMVA SOFIE Comparision Op called to Generate 
without being initialized first"); } std::stringstream out; out << SP << "\n//------ " << ComparisionTrait::Name() << " " << opName << " --> " << ConvertShapeToString(fShapeY) << "\n"; - size_t length = ConvertShapeToLength(fShapeY); - // Broadcast A if it's uninitialized - if (!fNBroadcastedX1.empty()) { - std::string type1 = ConvertTypeToString(fTensorType1); - out << SP << "// Broadcasting uninitialized tensor " << fNX1 << "\n"; - out << SP << "{\n"; - out << SP << SP << type1 << "* data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<" << type1 << ">(tensor_" << fNX1 << ", " << ConvertShapeToString(fShapeX1) << ", " << ConvertShapeToString(fShapeY) << ");\n"; - out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNBroadcastedX1 << ");\n"; - out << SP << SP << "delete[] data;\n"; - out << SP << "}\n"; + + // need to add check if tensors are compatible as in binary operator + + // use same code as Binary operator + auto stridesA = UTILITY::ComputeStrideFromShape(fDimShapeX1); + auto stridesB = UTILITY::ComputeStrideFromShape(fDimShapeX2); + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + + std::string compute_idx_X1, compute_idx_X2, compute_idx_Y; + if (fDimShapeX1.empty() || + std::all_of(fDimShapeX1.begin(), fDimShapeX1.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_X1 = "0"; + } else { + for (size_t i = 0; i < fDimShapeX1.size(); ++i) { + if (fDimShapeX1[i].dim == 1 || fDimShapeX1[i].GetVal() == "1") + continue; + compute_idx_X1 += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeX1.size())); + if (stridesA[i].GetVal() != "1") + compute_idx_X1 += " * " + stridesA[i].GetVal(); + compute_idx_X1 += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_X1.pop_back(); } - // Broadcast B if it's uninitialized - if (!fNBroadcastedX2.empty()) { - std::string type2 = ConvertTypeToString(fTensorType2); - out << SP << "// Broadcasting uninitialized tensor " << fNX2 << "\n"; - out << SP << "{\n"; - out << SP << SP << type2 << "* data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<" << type2 << ">(tensor_" << fNX2 << ", " << ConvertShapeToString(fShapeX2) << ", " << ConvertShapeToString(fShapeY) << ");\n"; - out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNBroadcastedX2 << ");\n"; - out << SP << SP << "delete[] data;\n"; - out << SP << "}\n"; + if (fDimShapeX2.empty() || + std::all_of(fDimShapeX2.begin(), fDimShapeX2.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_X2 = "0"; + } else { + for (size_t i = 0; i < fDimShapeX2.size(); ++i) { + if (fDimShapeX2[i].dim == 1 || fDimShapeX2[i].GetVal() == "1") + continue; + compute_idx_X2 += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeX2.size())); + if (stridesB[i].GetVal() != "1") + compute_idx_X2 += " * " + stridesB[i].GetVal(); + compute_idx_X2 += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_X2.pop_back(); } - const std::string& nameX1 = fNBroadcastedX1.empty()? fNX1 : fNBroadcastedX1; - const std::string& nameX2 = fNBroadcastedX2.empty()? 
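The index expressions generated here are driven by UTILITY::ComputeStrideFromShape. As a reference, the sketch below shows the usual row-major stride computation on a plain size_t shape; it is a simplified stand-in (the SOFIE utility works on Dim values and also handles parametric sizes).

#include <cstdio>
#include <vector>

// simplified stand-in for UTILITY::ComputeStrideFromShape, static shapes only
std::vector<size_t> ComputeStrides(const std::vector<size_t> &shape) {
   std::vector<size_t> strides(shape.size(), 1);
   for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i)
      strides[i] = strides[i + 1] * shape[i + 1];
   return strides;
}

int main() {
   auto s = ComputeStrides({2, 3, 4});
   std::printf("%zu %zu %zu\n", s[0], s[1], s[2]); // 12 4 1
   return 0;
}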
fNX2 : fNBroadcastedX2; - - out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "fTensor_" << fNY << "[id] = " << ComparisionTrait::Op( "tensor_" + nameX1 + "[id]" , "tensor_" + nameX2 + "[id]") << " ;\n"; - out << SP << "}\n"; - // since output is a boolean need to add the tensor_xxx variable since it is not defined as a pointer to a boolean std::vector - if (!fIsModelOutput) - out << SP << "const std::vector & tensor_" << fNY << " = fTensor_" << fNY << ";\n"; + int nloop = 0; + if (fDimShapeY.empty() || + std::all_of(fDimShapeY.begin(), fDimShapeY.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_Y = "0"; + } else { + for (size_t i = 0; i < fDimShapeY.size(); ++i) { + if (fDimShapeY[i].dim != 1 && fDimShapeY[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) out << SP; + out << "for (size_t idx_" << i << " = 0; idx_" << i << " < " << fDimShapeY[i] + << "; ++idx_" << i << "){\n"; + compute_idx_Y += "idx_" + std::to_string(i); + if (stridesY[i].GetVal() != "1") + compute_idx_Y += " * " + stridesY[i].GetVal(); + compute_idx_Y += " + "; + } + } + // remove last 3 characters " + " + for (int j = 0; j < 3; j++) + compute_idx_Y.pop_back(); + } + for (int j = 0; j < nloop + 1; j++) out << SP; + out << "tensor_" << fNY << "[" << compute_idx_Y << "] = " + << ComparisionTrait::Op( "tensor_" + fNX1 + "[" + compute_idx_X1 + "]" , + "tensor_" + fNX2 + "[" + compute_idx_X2 + "]") << " ;\n"; + + + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; + } + return out.str(); } diff --git a/tmva/sofie/inc/TMVA/ROperator_Concat.hxx b/tmva/sofie/inc/TMVA/ROperator_Concat.hxx index ad855341dfc17..d8155195c9f49 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Concat.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Concat.hxx @@ -123,7 +123,7 @@ concat_dim = inputs[i][iaxis]; else if (inputs[i][iaxis].isParam || concat_dim.isParam) { concat_dim = - Dim{ concat_dim.GetVal() + std::string("+ ") + inputs[i][iaxis].GetVal(), + Dim{ concat_dim.GetVal() + std::string(" + ") + inputs[i][iaxis].GetVal(), static_cast(-1)}; } else { concat_dim = Dim { concat_dim.dim + inputs[i][iaxis].dim }; @@ -156,7 +156,7 @@ } // output shape for concatenated axis - ret[fAxis] = Dim{concat_dim}; + ret[fAxis] = concat_dim; } // case of stacking (not supported yet) @@ -205,7 +205,7 @@ size_t inputLength = ConvertShapeToLength(inputShape); std::copy(inputData, inputData + inputLength, outputData.begin() + offset ); offset += inputLength; - // data do not need to be written as a weight + // data do not need to be written in teh generated code model.SetNotWritableInitializedTensor(input); } model.AddConstantTensor(fOutput, outputShape, outputData.data()); @@ -221,15 +221,18 @@ std::vector inputData; auto inputShape = model.GetTensorShape(input); // shape is not dynamic size_t inputLength = ConvertShapeToLength(inputShape); // shape can be a scalar - if (model.IsShapeTensor(input)) + if (model.IsShapeTensor(input)) { inputData = model.GetShapeTensorValues(input); - else if (model.IsConstantTensor(input)) { + } else if (model.IsInitializedTensor(input)) { inputData.resize(inputLength); auto intData = static_cast(model.GetInitializedTensorData(input).get()); for (size_t i = 0; i < inputData.size(); i++) inputData[i] = Dim{ static_cast(intData[i])}; } - std::cout << "concatenating input data " << inputLength << " " << inputData[0] << std::endl; + else { + // this should not happen + throw std::runtime_error("TMVA SOFIE Concat Operator- 
invalid input type for shape output type"); + } std::copy(inputData.begin(), inputData.end(), outputData.begin() + offset ); offset += inputLength; } @@ -251,13 +254,15 @@ } std::string Generate(std::string opName) override { - if (fIsOutputConstant) return ""; opName = "op_" + opName; + std::stringstream out; + out<<"\n//--------- Concat " << opName << " --> " << fOutput << " " << ConvertShapeToString(fOutputShape) << "\n"; + + if (fIsOutputConstant) return out.str(); + if(fOutputShape.empty()){ throw std::runtime_error("TMVA SOFIE Concat called to Generate without being initialized first"); } - std::stringstream out; - out<<"\n//--------- Concat " << opName << " --> " << ConvertShapeToString(fOutputShape) << "\n"; // special case when memory is contiguous bool hasShapeOnes = true; for(int i = 0; i0) - out << SP << SP << SP << "idxOut += " << fInputShapes[j-1][fAxis].GetVal() << ";\n"; + out << SP << SP << SP << "idxOut += " << inStrides[j-1][fAxis-1].GetVal() << ";\n"; out << SP << SP << SP << "int idxIn" << j <<" = "; for (int k = 0; k < fAxis; k++) { if (k > 0) out << " + "; out << inStrides[j][k].GetVal() << "*i" << k; } out << ";\n"; - out << SP << SP << SP << "for (size_t iC = 0; iC < " << fInputShapes[j][fAxis].GetVal() << "; ++iC) {\n"; + out << SP << SP << SP << "for (size_t iC = 0; iC < " << inStrides[j][fAxis-1].GetVal() << "; ++iC) {\n"; out << SP << SP << SP << SP << "tensor_" << fOutput << "[idxOut+iC] = tensor_" << fInputs[j] << "[idxIn" << j << "+iC];\n"; out << SP << SP << SP << "}\n"; // concatenate the axis values diff --git a/tmva/sofie/inc/TMVA/ROperator_Constant.hxx b/tmva/sofie/inc/TMVA/ROperator_Constant.hxx index 1cf5d13f5cd6f..7c824f1abe6e3 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Constant.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Constant.hxx @@ -123,11 +123,16 @@ public: if (model.Verbose()) { std::cout << "adding constant tensor " << fNY << " with shape " << ConvertShapeToString(fShape) << " and values ["; - for (auto v : fValues) std::cout << " " << v; - std::cout << "]" << std::endl; + if (!fIsConstantOfShape) { + for (auto v : fValues) std::cout << " " << v; + std::cout << "]" << std::endl; + } else { // for constant of shape is enough to print one value + std::cout << "... 
" << fValues[0] << " ....]" << std::endl; + } } } else { model.AddIntermediateTensor(fNY, ConvertStringToType(TensorType::Name()), fDimOutputShape); + fOutputTensorNames.emplace_back(fNY); } } @@ -136,9 +141,9 @@ public: std::stringstream out; if (fIsOutputConstant) { if (fNX.empty()) - out << "// ---- Constant (no-op) " << opName << " --> " << ConvertShapeToString(fDimOutputShape) << "\n"; + out << "// ---- Constant (no-op) " << opName << " --> " << fNY << " " << ConvertShapeToString(fDimOutputShape) << "\n"; else - out << "// ---- ConstantOfShape (no-op) " << opName << " --> " << ConvertShapeToString(fDimOutputShape) << "\n"; + out << "// ---- ConstantOfShape (no-op) " << opName << " --> " << fNY << " " << ConvertShapeToString(fDimOutputShape) << "\n"; return out.str(); } // Only ConstantOfShape might require generation code @@ -153,9 +158,7 @@ public: } auto length = ConvertDimShapeToLength(fDimOutputShape); // vector is already allocated- fill with values - out << SP << "if (" << length << " > fTensor_" << fNY << ".size())\n"; - out << SP << SP << "fTensor_" << fNY << ".resize(" << length << ");\n"; - out << SP << "std::fill(fTensor_" << fNY << ".begin(), fTensor_" << fNY << ".end(), " << fValues[0] << ");\n"; + out << SP << "std::fill(tensor_" << fNY << ", tensor_" << fNY << " + " << length << ", " << fValues[0] << ");\n"; return out.str(); } }; diff --git a/tmva/sofie/inc/TMVA/ROperator_Conv.hxx b/tmva/sofie/inc/TMVA/ROperator_Conv.hxx index 95f226ca91d4b..87d1ad0a0bf67 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Conv.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Conv.hxx @@ -20,6 +20,8 @@ template class ROperator_Conv final : public ROperator { private: + bool fBroadcastBias = false; + std::string fAttrAutopad; std::vector fAttrDilations; size_t fAttrGroup; @@ -30,7 +32,6 @@ private: std::string fNX; std::string fNW; std::string fNB; - std::string fNB2; // bias tensor name after broadcasting std::string fNY; std::string convK; @@ -262,6 +263,9 @@ public: std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNB + " is not found in model"); } fShapeB = model.GetTensorShape(fNB); + if (fShapeB.size() != 1) + throw + std::runtime_error("TMVA SOFIE Conv op : invalid shape for Bias tensor (is not 1D)"); std::vector targetShape(fShapeY.begin() + 1, fShapeY.end()); auto shapeDimB = model.GetDimTensorShape(fNB); bool broadcast_needed = !UTILITY::AreSameShape(shapeDimB, targetShape); @@ -278,7 +282,9 @@ public: if (fType != "float") throw std::runtime_error("TMVA SOFIE Conv op: Broadcasting for non-float type tensors is not supported"); // here is the actual broadcasting + fBroadcastBias = true; if (!fUseSession) { + // do here broadcasting std::vector shape(fDim + 1, 1); shape[0] = fShapeB[0]; auto intTargetShape = ConvertShapeToInt(targetShape); @@ -287,26 +293,28 @@ public: std::default_delete()); model.UpdateInitializedTensor(fNB, model.GetTensorType(fNB), intTargetShape, new_data_ptr); fShapeB = model.GetTensorShape(fNB); - fNB2 = fNB; // use same name - } - else { - // In case of session add broadcasting code in Session constructor and in GenerateInitCode - // we need to add a new intermediate tensor for broadcasted bias tensor - fNB2 = fNB + "bcast"; - model.AddIntermediateTensor(fNB2, model.GetTensorType(fNB), targetShape); } } } - // output channel size can be parametric + // output channel size can be parametric and is an expression std::vector outputDims = std::vector(fShapeY.begin()+2, fShapeY.end()); - auto outputChannelSize = ConvertDimShapeToLength(outputDims); // size/channel = D 
* H * W + //check if shape is not parametric + std::vector outputInts = ConvertShapeToInt(outputDims); + Dim channelDim; + if (outputInts.empty()) { + auto outputChannelSize = ConvertDimShapeToLength(outputDims); // size/channel = D * H * W + channelDim = Dim{ outputChannelSize, static_cast(-1)}; + } else { + size_t outputChannelSize = ConvertShapeToLength(outputInts); + channelDim = Dim{ outputChannelSize }; + } size_t kernelSize = fAttrKernelShape[0]; for (size_t i = 1; i < fDim; i++) { kernelSize *= fAttrKernelShape[i]; } std::vector shape1 = {fShapeW[0], fShapeW[1], kernelSize}; - std::vector shape2 = {Dim{fShapeW[1]}, Dim{kernelSize}, Dim{outputChannelSize}}; + std::vector shape2 = {Dim{fShapeW[1]}, Dim{kernelSize}, channelDim }; model.AddIntermediateTensor(fNX +"_f", ConvertStringToType(fType), shape1 ); model.AddIntermediateTensor(fNX +"_xcol", ConvertStringToType(fType), shape2 ); convK = fNX +"_f"; @@ -325,15 +333,25 @@ public: std::string GenerateInitCode() override { std::stringstream out; // Generate initialization code for broadcasting of bias tensor - if (!fNB2.empty()) { + if (fBroadcastBias) { // include a separate scope to avoid defining unique operator temp variables std::vector shape(fDim + 1, 1); + // bias (is a 1D tensor) shape[0] = fShapeB[0]; std::vector targetShape(fShapeY.begin() + 1, fShapeY.end()); - out << SP << "{\n"; + out << "//--- broadcast bias tensor " << fNB << "for Conv op if needed \n"; + // in case of dynamic tensors check needs to be done at run time + bool isOutDynamic = ConvertShapeToInt(targetShape).empty(); + auto length = ConvertDimShapeToLength(targetShape); + if (isOutDynamic) + out << SP << "if (" << length << " > " << ConvertShapeToLength(shape) << ") {\n"; + else + out << SP << "{\n"; out << SP << SP << "float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" << fNB << ", " << ConvertShapeToString(shape) << ", " << ConvertShapeToString(fShapeY) << ");\n"; - out << SP << SP << "std::copy(data, data + " << ConvertDimShapeToLength(targetShape) << ", tensor_" << fNB2 << ");\n"; + out << SP << SP << "fTensor_" << fNB << ".resize(" << length << ");\n"; + out << SP << SP << "std::copy(data, data + " << length << ", fTensor_" << fNB << ".begin());\n"; + out << SP << SP << "tensor_" << fNB << " = fTensor_" << fNB << ".data();\n"; out << SP << SP << "delete[] data;\n"; out << SP << "}\n"; } @@ -553,13 +571,13 @@ public: out << SP << SP << "}\n"; // end of group loop } - if (fNB2 != "") { + if (fNB != "") { out << SP << "int " << OpName << "_size = " << outputBatchStride << ";\n"; out << SP << "float " << OpName << "_gamma = 1.0;\n"; out << SP << "int " << OpName << "_incx = 1;\n"; out << SP << "int " << OpName << "_incy = 1;\n"; - out << SP << "BLAS::saxpy_(&" << OpName << "_size, &" << OpName << "_gamma, tensor_" << fNB2 << ", &" + out << SP << "BLAS::saxpy_(&" << OpName << "_size, &" << OpName << "_gamma, tensor_" << fNB << ", &" << OpName << "_incx, tensor_" << fNY << " + out_offset, &" << OpName << "_incy);\n"; } diff --git a/tmva/sofie/inc/TMVA/ROperator_Gather.hxx b/tmva/sofie/inc/TMVA/ROperator_Gather.hxx index 81411b8ebf71a..0d50c0747c028 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Gather.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Gather.hxx @@ -72,8 +72,6 @@ public: // empty shape Indices is a scalar value for the indices size_t indicesLength = ConvertShapeToLength(model.GetTensorShape(fNIndices)); int64_t* indicesData = static_cast(model.GetInitializedTensorData(fNIndices).get()); - //flag index tensor as not writable 
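The Conv init code above now broadcasts the bias in place instead of writing into a separate broadcast tensor: the owning fTensor_<name> vector is resized, the broadcast data copied in, and the raw tensor_<name> pointer refreshed. A simplified illustration of that pattern with made-up sizes (2 channels, spatial size 3); the real code uses UTILITY::UnidirectionalBroadcast and runs inside the generated Session constructor.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
   std::vector<float> fTensor_bias = {1.f, 2.f};     // per-channel bias (2 channels)
   const float *tensor_bias = fTensor_bias.data();   // raw pointer used by inference code
   const size_t channels = 2, spatial = 3;
   std::vector<float> broadcast(channels * spatial);
   for (size_t c = 0; c < channels; ++c)             // repeat each channel value over space
      for (size_t s = 0; s < spatial; ++s)
         broadcast[c * spatial + s] = fTensor_bias[c];
   fTensor_bias.resize(channels * spatial);          // grow the owning vector ...
   std::copy(broadcast.begin(), broadcast.end(), fTensor_bias.begin());
   tensor_bias = fTensor_bias.data();                // ... and refresh the (possibly moved) pointer
   for (float v : fTensor_bias) std::printf("%g ", v); // 1 1 1 2 2 2
   std::printf("\n");
   return 0;
}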
(not sure this is needed since index tensor might be used in generated code) - model.SetNotWritableInitializedTensor(fNIndices); // update indices data in case of negative dim values for (size_t i = 0; i < indicesLength; i++) { // move this at generation time? @@ -153,13 +151,14 @@ public: } std::string Generate(std::string opName) override { + opName = "op_" + opName; + std::stringstream out; + out << "//--------- Gather " << opName << " --> " << fNY << " " << ConvertShapeToString(fShapeY) << "\n"; if (fIsOutputConstant) { // no code to generate here for constant output. Tensor output is defined in Session constructor - return "//---------------------------------------\n"; + out << "//--------------------(constant)----------\n"; + return out.str(); } - opName = "op_" + opName; - std::stringstream out; - out << "//--------- Gather " << opName << " --> " << ConvertShapeToString(fShapeY) << "\n"; // The shape of the output is q + r - 1 size_t r = fShapeX.size(); // Indices of shape q diff --git a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx index d954720396151..a18914b8892a8 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx @@ -24,6 +24,7 @@ namespace SOFIE{ private: bool fIsDynamic = false; + bool fBroadcastBias = false; float fAttrAlpha = 1.0; float fAttrBeta = 1.0; @@ -33,7 +34,6 @@ namespace SOFIE{ std::string fNA; std::string fNB; std::string fNC = ""; - std::string fNC2; // bias tensor name after broadcasting std::string fNY; std::string fType; EActivationType fActivation; @@ -107,6 +107,7 @@ namespace SOFIE{ if (input[0].size() > 2 && input[1].size() == input[0].size()) { // in case of dim > 2 first dimensions are equal to the input ones not // equal to 1 (e.g. (1,2,3) * (2,3,4) -> (2,2,4)) + // here could probably use the Broadcasting function UTILITY::MultidirectionalBroadcastShape for (size_t i = 0; i < input[0].size()-2; i++) { Dim valueA = input[0][i]; Dim valueB = input[1][i]; @@ -207,13 +208,7 @@ namespace SOFIE{ } fShapeY = DynamicShapeInference({fShapeA, fShapeB}); - std::vector shapeY; - if (!fIsDynamic) { - shapeY = ConvertShapeToInt(fShapeY); - if (shapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Gemm Op " + fNY + " has invalid shape" + ConvertShapeToString(fShapeY)); - } - } + std::vector shapeY = ConvertShapeToInt(fShapeY); // bias is normally not dynamic (not support it for time being) if (fNC != ""){ @@ -222,38 +217,33 @@ namespace SOFIE{ throw std::runtime_error("TMVA SOFIE Gemm Op Input Tensor" + fNC + " is dynamic and is not supported"); } fShapeC = model.GetTensorShape(fNC); - fNC2 = fNC; size_t lengthC = ConvertShapeToLength(fShapeC); size_t lengthY = ConvertShapeToLength(shapeY); - // for dynamic outputs broadcasting is always done - bool broadcast_needed = lengthC != lengthY; + // for dynamic outputs broadcasting is always needed + bool broadcast_needed = false; + if (fIsDynamic && shapeY.empty()) + broadcast_needed = true; + else + broadcast_needed = lengthC != lengthY; if (broadcast_needed) { - if (!model.UseSession()) { - // without session dynamic tensors not supported in Gemm - if (fIsDynamic) { - throw std::runtime_error("TMVA SOFIE Gemm Op: dynamic tensors not supported without a session"); - } - auto original_data = model.GetInitializedTensorData(fNC); - auto targetShape = UTILITY::UnidirectionalBroadcastShape(fShapeC, shapeY); - if (fType == "float") { - std::shared_ptr new_data_ptr(UTILITY::UnidirectionalBroadcast( - static_cast(original_data.get()), fShapeC, 
targetShape), - std::default_delete()); - - model.UpdateInitializedTensor(fNC, model.GetTensorType(fNC), shapeY, new_data_ptr); - fShapeC = shapeY; - } - } else { - // In case of session add broadcasting code in Session constructor and in GenerateInitCode - // we need to add a new intermediate tensor for broadcasted bias tensor - fNC2 = fNC + "bcast"; - if (!fIsDynamic) { - model.AddIntermediateTensor(fNC2, model.GetTensorType(fNC), shapeY); - } - else - model.AddDynamicTensor(fNC2,model.GetTensorType(fNC), fShapeY); + fBroadcastBias = true; + // check if broadcasting is compatible and note that prepend 1 to shapeC + auto shapeDimC = ConvertShapeToDim(fShapeC); + auto r = UTILITY::MultidirectionalBroadcastShape(fShapeY, shapeDimC); + // return flag must be equal to 1 since this is a unidirectional broadcast of C->Y + if (r.first > 1) { + throw std::runtime_error("TMVA SOFIE Gemm Op - bias tensor of shape " + ConvertShapeToString(fShapeC) + " cannot be uni-directional broadcasted to " + ConvertDimShapeToString(fShapeY)); + } + fShapeC = ConvertShapeToInt(shapeDimC); + if (fShapeC.empty()) { + throw std::runtime_error("TMVA SOFIE Gemm Op - Error in bias tensor " + ConvertDimShapeToString(shapeDimC) ); + } + } else { + // for the case lengthY == lengthC but shape is different (e.g. Y is (2,3) and is (6)) + if (shapeY != fShapeC) { + throw std::runtime_error("TMVA SOFIE Gemm Op: invalid shape for bias tensor " + ConvertShapeToString(fShapeC)); } } } @@ -291,21 +281,31 @@ namespace SOFIE{ std::string GenerateInitCode() override { std::stringstream out; // generate initialization code for broadcasting of bias tensor - if (fShapeC.size() != fShapeY.size() && fNC != fNC2) { +#if 0 + if (fShapeC.size() != fShapeY.size() && fBroadcastBias) { // we broadcast here always C in Y output, so target shape is the one of Y // no need to call UTILITY::UnidirectionalBroadcastShape. // here in case of parametric shape we need to assume that the parameters will be defined in the initialization code. 
- auto targetShape = fShapeY; - // include a separate scope to avoid defining unique operator temp variables - out << "//--- broadcast bias tensor " << fNC << "for Gemm op\n"; - out << SP << "{\n"; - out << " float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" - << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) << ");\n"; auto length = ConvertDimShapeToLength(fShapeY); // output size - out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNC2 << ");\n"; + // include a separate scope to avoid defining unique operator temp variables + out << "//--- broadcast bias tensor " << fNC << "for Gemm op if needed \n"; + // in case of dynamic tensors check needs to be done at run time + bool isOutDynamic = ConvertShapeToInt(fShapeY).empty(); + if (isOutDynamic) + out << SP << "if (" << length << " > " << ConvertShapeToLength(fShapeC) << ") {\n"; + else + out << SP << "{\n"; + // here we broadcast + out << SP << SP << "float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" + << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) << ");\n"; + + out << SP << SP << "fTensor_" << fNC << ".resize(" << length << ");\n"; + out << SP << SP << "std::copy(data, data + " << length << ", fTensor_" << fNC << ".begin());\n"; + out << SP << SP << "tensor_" << fNC << " = fTensor_" << fNC << ".data();\n"; out << SP << SP << "delete [] data;\n"; out << SP << "}\n"; } +#endif return out.str(); } @@ -316,7 +316,8 @@ namespace SOFIE{ throw std::runtime_error("TMVA SOFIE Gemm Op called to Generate without being initialized first"); } std::stringstream out; - out << "\n//--------- Gemm\n"; + out << "\n//--------- Gemm " << opName << " " << ConvertShapeToString(fShapeA) << " * " << ConvertShapeToString(fShapeB) + << " -> " << ConvertShapeToString(fShapeY) << "\n"; // need to consider case A and B have dim > 2 (for MatMul) int64_t dimA = fShapeA.size(); int64_t dimB = fShapeB.size(); @@ -327,18 +328,20 @@ namespace SOFIE{ auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); auto k = (fAttrTransA ? 
fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + // size of A: if (trasposeA) is m*k else k*m + // size of B n*k std::vector sY = {fShapeY[dimY-2], fShapeY[dimY-1]}; // extra dimensions in case of stacked MatMul - std::vector sA; + std::vector sExtraY; for (int64_t i = 0; i < dimY-2; i++) { - sA.push_back(fShapeY[i]); + sExtraY.push_back(fShapeY[i]); } auto lengthGemm = ConvertDimShapeToLength(sY); // size of the Gemm operation - auto lengthExtra = ConvertDimShapeToLength(sA); // extra length in case input tensors are of dim>2 (MatMul) + auto lengthExtra_Y = ConvertDimShapeToLength(sExtraY); // extra length in case input tensors are of dim>2 (MatMul) // case bias is present if (!fNC.empty()){ - if (fNC2 == fNC) { + if (!fBroadcastBias) { // add a check in case broadcasting was not needed or done outside of session // C should have smaller dimension of Y if (!fIsDynamic) { @@ -347,7 +350,7 @@ namespace SOFIE{ + ConvertShapeToString(fShapeC) + " output length " + lengthGemm); } else { // add a dynamic check (C should not be a dynamic tensor) - out << SP << "assert(" << lengthGemm << " != " << ConvertShapeToLength(fShapeC) << ");\n"; + out << SP << "assert(" << lengthGemm << " == " << ConvertShapeToLength(fShapeC) << ");\n"; } } } else { @@ -360,31 +363,83 @@ namespace SOFIE{ // include MatMul case where we stack the Gemm operations // exclude case where we have only 1's in the additional dims - bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra) > 1); + bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra_Y) > 1); + // compute input offset for stack multiplications + std::string lengthExtra_A; + std::string lengthExtra_B; + std::string increment_A; + std::string increment_B; + + if (doStackMul) { + std::vector sA(fShapeA.begin(), fShapeA.begin()+dimA-2); + std::vector sB(fShapeB.begin(), fShapeB.begin()+dimB-2); + std::vector mA = {fShapeA[dimA-2], fShapeA[dimA-1]}; + std::vector mB = {fShapeA[dimB-2], fShapeB[dimB-1]}; + lengthExtra_A = ConvertDimShapeToLength(sA); + lengthExtra_B = ConvertDimShapeToLength(sB); + // size of A performing matmul is m*k and n*k for B + increment_A = ConvertDimShapeToLength(mA); + increment_B = ConvertDimShapeToLength(mB); + } + bool extraA = (doStackMul && lengthExtra_A != "1"); + bool extraB = (doStackMul && lengthExtra_B != "1"); if (doStackMul) { - out << SP << "size_t " << opName << "_yoffset = 0;\n"; // needed if we stack the gemm operations - out << SP << "for (int i = 0; i < " << lengthExtra << "; i++){\n"; + out << SP << "size_t " << opName << "_y_offset = 0;\n"; // needed if we stack the gemm operations + if (extraA) + out << SP << "size_t " << opName << "_A_offset = 0;\n"; + if (extraB) + out << SP << "size_t " << opName << "_B_offset = 0;\n"; + out << SP << "for (size_t i = 0; i < " << lengthExtra_Y << "; i++){\n"; out << SP; } + // do the bias broadcasting + if (fBroadcastBias) { + out << SP << "for (size_t j = 0; j < " << sY[0] << "; j++) { \n"; + out << SP << SP << "size_t y_index = "; + if (doStackMul) // add offset in caseof stack multiplications (not sure if bias is present in these cases) + out << opName << "_y_offset + "; + if (sY[1].GetVal() != "1") + out << sY[1] << " * j;\n"; + else + out << "j;\n"; + + out << SP << SP << "for (size_t k = 0; k < " << sY[1] << "; k++) { \n"; + std::string bias_index; + if (fShapeC[0] == 1 && fShapeC[1] == sY[1].dim) + bias_index = "k"; + else if (fShapeC[1] == 1 && fShapeC[0] == sY[0].dim) + bias_index = "j"; + else if (fShapeC[0] == 1 && fShapeC[1] == 1) // scalar 
case + bias_index = "0"; + else { + throw std::runtime_error("TMVA SOFIE Gemm Op - invalid shape for bias tensor " + ConvertShapeToString(fShapeC)); + } + + out << SP << SP << SP << "tensor_" << fNY << "[y_index + k] = " << "tensor_" << fNC << "[" << bias_index << "];\n"; + out << SP << SP << "}\n"; + out << SP << "}\n"; + } if (fType == "float"){ out << SP << "TMVA::Experimental::SOFIE::Gemm_Call(" << "tensor_" << fNY; - if (doStackMul) out << " + " << opName << "_yoffset"; + if (doStackMul) out << " + " << opName << "_y_offset"; out << ", " << (fAttrTransB ? "true, " : "false, ") << (fAttrTransA ? "true, " : "false, ") << n << ", " << m << ", " << k << ", "; - out << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ","; - out << "tensor_" << fNB << ", " << "tensor_" << fNA << ", "; - out << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ","; - // in the case of bias - if (!fNC.empty()) - out << "tensor_" << fNC2; - else + out << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ", tensor_" << fNB; + if (extraB) out << " + " << opName << "_B_offset"; + out << ", tensor_" << fNA; + if (extraA) out << " + " << opName << "_A_offset"; + out << ", " << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ","; + // in the case of bias and no broadcasting needed + if (!fNC.empty() && !fBroadcastBias) + out << "tensor_" << fNC; + else out << "nullptr"; - out << ");\n"; + out << ");\n"; if(fActivation == EActivationType::RELU){ out << SP << "for (int id = 0; id < " << ConvertDimShapeToLength(fShapeY) << " ; id++){\n"; @@ -394,7 +449,12 @@ namespace SOFIE{ } if (doStackMul) { - out << SP << SP << opName << "_yoffset += " << lengthGemm << ";\n"; + out << SP << SP << opName << "_y_offset += " << lengthGemm << ";\n"; + if (lengthExtra_A != "1") + out << SP << SP << opName << "_A_offset += " << increment_A << ";\n"; + if (lengthExtra_B != "1") + out << SP << SP << opName << "_B_offset += " << increment_B << ";\n"; + out << "}\n"; // end of loop on the stacked multiplications } diff --git a/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx b/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx index 239c5332172b0..f98ce201d400d 100644 --- a/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx @@ -14,6 +14,7 @@ namespace SOFIE { template class ROperator_LayerNormalization : public ROperator { private: + bool fCastToFloat = false; // flag to indicate if operation 1 are in floats (to be impl) int fAttrAxis; float fAttrEpsilon; size_t fAttrStashType; @@ -31,7 +32,7 @@ private: std::vector fShapeX; std::vector fShapeScale; - std::vector fShapeB; // shape of input Bias (B) is assumed to be fully defined + std::vector fShapeB; std::vector fShapeY; std::vector fShapeMean; std::vector fShapeInvStdDev; @@ -40,8 +41,8 @@ private: size_t fSize; // Size of the input // size_t fAxisDim; - std::vector fNormalizedShape; - std::vector fAxesShape; + std::vector fNormalizedShape; // shape from X[ axis,...,N-1] + std::vector fAxesShape; // shape from X[0,..,axis-1] // lengths in string format std::string fLength; // Length of the input std::string fNormalizedLength; @@ -79,7 +80,7 @@ public: void Initialize(RModel& model) override { if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found."); + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Tensor " + fNX + " not found."); } bool isDynamic = 
model.IsDynamicTensor(fNX); fShapeX = model.GetDimTensorShape(fNX); @@ -104,8 +105,7 @@ public: // Type of mean and std ETensorType type = (fAttrStashType == 1) ? ETensorType::FLOAT : model.GetTensorType(fNX); // Mean - if (fNMean.empty()) { - fNMean = "Mean" + fNX; + if (!fNMean.empty()) { // cannot use initializer list with one element since it is ambiguous if (isDynamic) // add size_t(-1) to indicate that shape is an expression @@ -114,29 +114,60 @@ public: model.AddIntermediateTensor(fNMean, type, std::vector(1,std::stoi(fAxesLength))); } // Inverse Standard Deviation - if (fNInvStdDev.empty()) { - fNInvStdDev = "InvStdDev" + fNX; + if (!fNInvStdDev.empty()) { if (isDynamic) model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,Dim{fAxesLength,std::size_t(-1)})); else model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,std::stoi(fAxesLength))); } + // if the mean and stdev names are empty they are not requested in the output list and no intermediate tensors are added for them // Cast X to float if (fAttrStashType == 1 && model.GetTensorType(fNX) != ETensorType::FLOAT) { - fNCastedX = "Casted" + fNX; - model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX); - fNNormalizedX = "Normalized" + fNX; - model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX); + fCastToFloat = true; + fType = "float"; + // fNCastedX = "Casted" + fNX; + // model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX); + // fNNormalizedX = "Normalized" + fNX; + // model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX); + } + // scale shape + fShapeScale = model.GetDimTensorShape(fNScale); + // prepend 1's to the scale shape if dimensions are missing + size_t dimScale = fShapeScale.size(); + if (dimScale < fSize) { + for (size_t i = 0; i < fSize-dimScale; i++) + fShapeScale.insert(fShapeScale.begin(), Dim{1}); + } + // check also that the shape is now consistent + for (size_t i = 0; i < fSize; i++) { + if (fShapeScale[i].dim != 1 && fShapeScale[i] != fShapeX[i]) + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Scale Tensor has invalid shape " + ConvertDimShapeToString(fShapeScale)); } - // Broadcast the bias if (!fNB.empty()) { - fShapeB = model.GetTensorShape(fNB); - size_t lengthB = ConvertShapeToLength(fShapeB); - if (isDynamic || lengthB < static_cast(std::stoi(fLength))) { - fNBroadcastedB = "Broadcasted" + fNB; - model.AddIntermediateTensor(fNBroadcastedB, ConvertStringToType(fType), fShapeX); + fShapeB = model.GetDimTensorShape(fNB); + // prepend 1's to the bias shape if dimensions are missing + size_t dimB = fShapeB.size(); + if (dimB < fShapeX.size()) { + for (size_t i = 0; i < fSize-dimB; i++) + fShapeB.insert(fShapeB.begin(), Dim{1}); + } + for (size_t i = 0; i < fSize; i++) { + if (fShapeB[i].dim != 1 && fShapeB[i] != fShapeX[i]) + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Bias Tensor has invalid shape " + ConvertDimShapeToString(fShapeB)); } } + + if (model.Verbose()) + std::cout << "LayerNormalization - bias and scale shapes : " << ConvertDimShapeToString(fShapeB) << " " << ConvertDimShapeToString(fShapeScale) << std::endl; + + // // Broadcast the bias + // if (!fNB.empty()) { + // fShapeB = model.GetTensorShape(fNB); + // size_t lengthB = ConvertShapeToLength(fShapeB); + // if (isDynamic || lengthB < static_cast(std::stoi(fLength))) { + // fNBroadcastedB = "Broadcasted" + fNB; + // model.AddIntermediateTensor(fNBroadcastedB, ConvertStringToType(fType), fShapeX); + // } + // } model.AddNeededStdLib("cmath"); } @@ -162,10 +193,6 @@ public: throw std::runtime_error("TMVA::SOFIE LayerNormalization operator " + opName + " called to generate without being
initialized first."); } - if (fShapeX.size() > 5) { - throw std::runtime_error("TMVA::SOFIE LayerNormalization operator not " - "implemented for input tensor of size > 5."); - } std::stringstream out; @@ -179,10 +206,32 @@ public: } auto strides = UTILITY::ComputeStrideFromShape(fShapeX); - std::string InputIndex = "axis_0 * " + strides[0].GetVal(); + std::string inputIndex = "axis_0 * " + strides[0].GetVal(); for (size_t i = 1; i < fSize; i++) { - InputIndex += " + axis_" + std::to_string(i) + " * " + strides[i].GetVal(); + inputIndex += " + axis_" + std::to_string(i); + if (i < fSize-1) inputIndex += " * " + strides[i].GetVal(); } + auto scaleStrides = UTILITY::ComputeStrideFromShape(fShapeScale); + std::string scaleIndex; + for (size_t i = 0; i < fSize; i++) { + if (fShapeScale[i].dim != 1) { + if (!scaleIndex.empty()) scaleIndex += " + "; + scaleIndex += "axis_" + std::to_string(i); + if ( scaleStrides[i].dim != 1) scaleIndex += " * " + scaleStrides[i].GetVal(); + } + } + if (scaleIndex.empty()) scaleIndex = "0"; + + auto biasStrides = UTILITY::ComputeStrideFromShape(fShapeB); + std::string biasIndex; + for (size_t i = 0; i < fSize; i++) { + if (fShapeB[i].dim != 1) { + if (!biasIndex.empty()) biasIndex += " + "; + biasIndex += "axis_" + std::to_string(i); + if ( biasStrides[i].dim != 1) biasIndex += " * " + biasStrides[i].GetVal(); + } + } + if (biasIndex.empty()) biasIndex = "0"; auto axesStrides = UTILITY::ComputeStrideFromShape(fAxesShape); std::string axesIndex = "axis_" + std::to_string(0) + " * " + axesStrides[0].GetVal(); @@ -190,51 +239,42 @@ public: axesIndex += " + axis_" + std::to_string(i) + " * " + axesStrides[i].GetVal(); } - auto normalizedStrides = UTILITY::ComputeStrideFromShape(fNormalizedShape); - std::string normalizedIndex = "axis_" + std::to_string(fAxis) + " * " + normalizedStrides[0].GetVal(); - for (size_t i = fAxis + 1; i < fSize; i++) { - normalizedIndex += " + axis_" + std::to_string(i) + " * " + normalizedStrides[i - fAxis].GetVal(); - } - if (!fNCastedX.empty()) { - // Cast X to float - out << SP << "for (size_t i = 0; i < " << fLength << "; i++) {\n"; - out << SP << SP << "tensor_" << fNCastedX << "[i] = " << "static_cast(tensor_" << fNX; - out << "[i]);\n"; - out << SP << "}\n"; - } + // compute mean and std-dev. 
Save in tensors if requested out << SP << "// Compute the mean\n"; - // Loop over the normalized dimensions + // Loop over all the dims in [0, fAxis) for (size_t i = 0; i < fAxis; i++) { std::string iIdx = "axis_" + std::to_string(i); out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] << "; " << iIdx << "++) {\n"; } - out << SP << SP << fType << " sum = 0.;\n"; - // loop over all the dims in [0, fAxis) + out << SP << SP << fType << " mean = 0.;\n"; + // loop over the normalized dimensions (fAxis,....,N-1) for (size_t j = fAxis; j < fSize; j++) { std::string jIdx = "axis_" + std::to_string(j); out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx << "++) {\n"; } - out << SP << SP << SP << "sum += tensor_" << fNX << "[" << InputIndex << "];\n"; + out << SP << SP << SP << "mean += tensor_" << fNX << "[" << inputIndex << "];\n"; for (size_t j = fAxis; j < fSize; j++) { out << SP << SP << "}\n"; } - out << SP << SP << "tensor_" << fNMean << "[" << axesIndex << "] = sum / " << fType << "("; - out << fNormalizedLength << ");\n"; - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } + out << SP << SP << "mean /= " << fType << "(" << fNormalizedLength << ");\n"; + + // for (size_t i = fAxis; i < fSize; i++) { + // out << SP << "}\n"; + // } + // tensor_" << fNMean << "[" << axesIndex << "] out << SP << "// Compute the inverse Standard Deviation\n"; // Loop over the normalized dimensions - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } + // for (size_t i = 0; i < fAxis; i++) { + // std::string iIdx = "axis_" + std::to_string(i); + // out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] + // << "; " << iIdx << "++){\n"; + // } + // Set sum = 0 out << SP << SP << fType << " sum = 0.;\n"; // loop over all the dims in [0, fAxis) @@ -243,92 +283,63 @@ public: out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx << "++){\n"; } - out << SP << SP << SP << "float tmp = tensor_" << fNX << "[" << InputIndex << "] - tensor_" - << fNMean << "[" << axesIndex << "];\n"; + out << SP << SP << SP << "float tmp = tensor_" << fNX << "[" << inputIndex << "] - mean;\n"; out << SP << SP << SP << "sum += tmp*tmp;\n"; for (size_t j = fAxis; j < fSize; j++) { out << SP << SP << "}\n"; } - out << SP << SP << "tensor_" << fNInvStdDev << "[" << axesIndex << "] = 1 / std::sqrt("; + out << SP << SP << fType << " invStdDev = 1 / std::sqrt("; out << "sum / " << fType << "(" << fNormalizedLength << ") + " << fAttrEpsilon << ");\n"; - for (size_t i = 0; i < fAxis; i++) { - out << SP << "}\n"; - } - if (!fNCastedX.empty()) { - out << "// NormalizedX = InvStdDev * (CastedX - Mean)\n"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNNormalizedX << "[" << InputIndex << "] = tensor_"; - out << fNInvStdDev << "[" << axesIndex << "] * (tensor_" << fNCastedX << "[" << InputIndex; - out << "] - 
tensor_" << fNMean << "[" << axesIndex << "])\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - out << "// Y = Scale o NormalizedX"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale; - out << "[" << axesIndex << "] * static_cast<" << fType << ">(tensor_" << fNCastedX << "[" << InputIndex; - out << "]);\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - } else { - out << SP << "// Y = Scale o InvStdDev (X - Mean)\n"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale; - out << "[" << normalizedIndex << "] * tensor_" << fNInvStdDev << "[" << axesIndex; - out << "] * (tensor_" << fNX << "[" << InputIndex << "] - tensor_" << fNMean << "["; - out << axesIndex << "]);\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } + // for (size_t i = 0; i < fAxis; i++) { + // out << SP << "}\n"; + // } + + // set output mean and invStdDev if requested + if (!fNMean.empty()) + out << SP << SP << "tensor_" << fNMean << "[" << axesIndex << "] = mean;\n"; + if (!fNInvStdDev.empty()) + out << SP << SP << "tensor_" << fNInvStdDev << "[" << axesIndex << "] = invStdDev;\n"; + + // scale and add bias + + out << SP << "// Y = Scale o InvStdDev (X - Mean)\n"; + // for (size_t i = 0; i < fAxis; i++) { + // std::string iIdx = "axis_" + std::to_string(i); + // out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] + // << "; " << iIdx << "++){\n"; + // } + + for (size_t j = fAxis; j < fSize; j++) { + std::string jIdx = "axis_" + std::to_string(j); + out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx + << "++){\n"; } + out << SP << SP << SP << "tensor_" << fNY << "[" << inputIndex << "] = tensor_" << fNScale; + out << "[" << scaleIndex << "] * invStdDev * (tensor_" << fNX << "[" << inputIndex << "] - mean)"; - if (!fNB.empty()) { - std::string bias = "tensor_" + (fNBroadcastedB.empty() ? 
fNB : fNBroadcastedB); - out << SP << "// Add the bias to Y\n"; - out << SP << "int " << opName << "_n = " << fLength << ";\n"; - out << SP << "float " << opName << "_alpha = 1.;\n"; - out << SP << "int " << opName << "_inc = 1;\n"; - out << SP << "BLAS::saxpy_(&" << opName << "_n, &" << opName << "_alpha, " << bias << ", &"; - out << opName << "_inc, " << "tensor_" << fNY << ", &" << opName << "_inc);\n"; + // add bias if needed + if (!fNB.empty()) + // assume bias has index as scale + out << " + tensor_" << fNB << "[" << biasIndex << "]"; + out << ";\n"; + + for (size_t j = fAxis; j < fSize; j++) { + out << SP << SP << "}\n"; } + for (size_t i = fAxis; i < fSize; i++) { + out << SP << "}\n"; + } + + // if (!fNB.empty()) { + // std::string bias = "tensor_" + (fNBroadcastedB.empty() ? fNB : fNBroadcastedB); + // out << SP << "// Add the bias to Y\n"; + // out << SP << "int " << opName << "_n = " << fLength << ";\n"; + // out << SP << "float " << opName << "_alpha = 1.;\n"; + // out << SP << "int " << opName << "_inc = 1;\n"; + // out << SP << "BLAS::saxpy_(&" << opName << "_n, &" << opName << "_alpha, " << bias << ", &"; + // out << opName << "_inc, " << "tensor_" << fNY << ", &" << opName << "_inc);\n"; + // } return out.str(); } diff --git a/tmva/sofie/inc/TMVA/ROperator_Range.hxx b/tmva/sofie/inc/TMVA/ROperator_Range.hxx index 9cac15a14fc52..b91e45dd6d84b 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Range.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Range.hxx @@ -37,15 +37,10 @@ public: } static_assert( (std::is_same_v || std::is_same_v), "TMVA::SOFIE - Unsupported type by Range operator"); - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; //suggest copy to compiler - return ret; + { + fInputTensorNames = { fNStart, fNLimit, fNDelta }; + fOutputTensorNames = { fNOutput }; + } } void Initialize(RModel& model) override { @@ -63,32 +58,89 @@ public: std::runtime_error("TMVA SOFIE Range Op Input Tensor " + fNDelta + "is not found in model"); } ETensorType type = ConvertStringToType(fType); - if (model.IsInitializedTensor(fNStart) && model.IsInitializedTensor(fNDelta) && model.IsInitializedTensor(fNLimit)) { - T * start = static_cast(model.GetInitializedTensorData(fNStart).get()); - T * limit = static_cast(model.GetInitializedTensorData(fNLimit).get()); - T * delta = static_cast(model.GetInitializedTensorData(fNDelta).get()); - if (!start || !delta || !limit) - std::runtime_error("TMVA SOFIE Range Op Input Tensor has invalid input data"); - T a = *start; - T b = *limit; - T d = *delta; - int number_of_elements = std::max( static_cast(std::ceil( (b - a) / d )) , 0. 
); + + + + auto analyzeInput = [&](const std::string & tName, T & value, Dim & dim) { + int ftype = 0; // type of input (0 intermediate, 1 constant , 2 shape) + if (model.IsInitializedTensor(tName)) { + T * data = static_cast(model.GetInitializedTensorData(tName).get()); + if (!data) + throw std::runtime_error("TMVA SOFIE Range Op Input Tensor has invalid input data"); + value = *data; + ftype = 1; + } else if (model.IsShapeTensor(tName)) { + auto data = model.GetShapeTensorValues(tName); + dim = data[0]; + if (!dim.isParam) { + value = static_cast(dim.dim); + ftype = 1; + } else + ftype = 2; + } + return ftype; + }; + + T start_value; + T limit_value; + T delta_value; + Dim start_dim; + Dim limit_dim; + Dim delta_dim; + int res1 = analyzeInput(fNStart, start_value, start_dim); + int res2 = analyzeInput(fNLimit, limit_value, limit_dim); + int res3 = analyzeInput(fNDelta, delta_value, delta_dim); + if (res1 == 0 || res2 == 0 || res3 == 0) { + // cannot know at compile time - need to do it fully at run time + // + fShape = {Dim{"range_size_" + fNStart + "_" + fNLimit}}; + model.AddDynamicTensor(fNOutput, type, fShape); + } else if (res1 == 1 && res2 == 1 && res3 == 1) { + size_t number_of_elements = std::max(static_cast(std::ceil((limit_value - start_value) / delta_value )) , 0 ); + fIsOutputConstant = true; + + // compute output std::vector output(number_of_elements); - for (int i=0; i shape = {static_cast(number_of_elements)}; + std::vector shape = {number_of_elements}; model.AddConstantTensor(fNOutput,shape, output.data()); - fIsOutputConstant = true; - // set the input tensor not writable - model.SetNotWritableInitializedTensor(fNStart); - model.SetNotWritableInitializedTensor(fNDelta); - model.SetNotWritableInitializedTensor(fNLimit); - } - else { - fShape = {Dim{"range_size"}}; - model.AddDynamicTensor(fNOutput, type, fShape); + fShape = ConvertShapeToDim(shape); + + } else { // case of a shape tensor + std::string start = (res1 == 1) ? std::to_string(start_value) : start_dim.GetVal(); + std::string limit = (res2 == 1) ? std::to_string(limit_value) : limit_dim.GetVal(); + std::string delta = (res3 == 1) ?
std::to_string(delta_value) : delta_dim.GetVal(); + std::stringstream s; + if (type == ETensorType::FLOAT ) { + if (res3 == 1 && delta_value == 1) + s << "std::max(std::ceil("<< limit << " - " << start << "),0.0f)"; + else + s << "std::max(std::ceil(("<< limit << " - " << start << ")/" << delta << "),0.0f)"; + } else if (type == ETensorType::INT64 ) { + if (delta == "1") { + if (start == "0") + s << limit; + else + s << "std::max((" << limit << " - " << start << "),0L)"; + } else { + if (start == "0") + s << "((" << limit << ")/" << delta << ")"; + else + s << "std::max(((" << limit << " - " << start << ")/" << delta << "),0L)"; + } + } else { + throw + std::runtime_error("TMVA SOFIE Range Op Input Tensor " + ConvertTypeToString(type) + " is not supported"); + } + + + fShape = { Dim {s.str(), static_cast(-1)} }; + model.AddDynamicTensor(fNOutput,type, fShape); } + + if (model.Verbose()) { std::cout << "Range -> output is " << fNOutput << " : " << ConvertShapeToString(fShape); if (fIsOutputConstant) std::cout << " : " << ConvertValuesToString(model.GetTensorData(fNOutput)); @@ -96,27 +148,31 @@ } } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override { std::stringstream out; - out << "\n//------ Range\n"; + out << "\n//------ Range " << opName << "---> " << ConvertDimShapeToString(fShape) << "\n"; if (fIsOutputConstant) return out.str(); - OpName = "op_" + OpName; + opName = "op_" + opName; if (fShape.empty()) { throw std::runtime_error("TMVA SOFIE Range operator called to Generate without being initialized first"); } - std::string sizeName = fShape[0].param; - out << SP << "size_t " << sizeName << " = static_cast(std::max(std::ceil((static_cast(*tensor_" << fNLimit << ") - static_cast(*tensor_" << fNStart << ")) / static_cast(*tensor_" << fNDelta << ")), 0.0f));\n"; - out << SP << "if (" << sizeName << " > " << "fTensor_" << fNOutput << ".size() ){\n"; - out << SP << SP << "fTensor_" << fNOutput << ".resize(" << sizeName << ");\n"; - // need to re-initialized pointer to tensor data - out << SP << SP << "tensor_" << fNOutput << " = fTensor_" << fNOutput << ".data();\n"; - out << SP << "}\n"; - out << SP << "for (size_t i = 0; i < " << sizeName << "; i++) {\n"; - out << SP << SP << "fTensor_" << fNOutput << "[i] = *tensor_" << fNStart << " + i * (*tensor_" << fNDelta << ");\n"; + std::string outputSizeVar; + std::string outputSize = fShape[0].param; + if (outputSize.find("range_size") != std::string::npos) { + outputSizeVar = outputSize; + outputSize = "static_cast(std::max(std::ceil((static_cast(*tensor_" + fNLimit + + ") - static_cast(*tensor_" + fNStart + ")) / static_cast(*tensor_" + fNDelta + ")), 0.0f))"; + } else { + outputSizeVar = "range_" + opName; + } + out << SP << "size_t " <<
reducedLength << " = (" << inputLength << ") / (" << outputLength << ");\n"; } else { int rLength = std::stoi(inputLength) / std::stoi(outputLength); reducedLength = std::to_string(rLength); diff --git a/tmva/sofie/inc/TMVA/ROperator_Reshape.hxx b/tmva/sofie/inc/TMVA/ROperator_Reshape.hxx index 2634b68dbc875..a3ed28c4860bc 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Reshape.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Reshape.hxx @@ -108,6 +108,9 @@ public: if (IsInteger(tmp_length) && IsInteger(input_length)) output_shape[i] = Dim{static_cast(std::stoi(input_length) / std::stoi(tmp_length))}; + else if (IsInteger(tmp_length) && std::stoi(tmp_length) == 1) { + output_shape[i] = Dim{input_length, static_cast(-1)}; + } else { //we can try simplifying expression if tmp_length is integer and part of input_length // contains tmp_length @@ -243,7 +246,7 @@ public: // check if optional tensor exists defining shape or axes if (!fNInput2.empty()) { if (model.CheckIfTensorAlreadyExist(fNInput2)) { - if (model.IsConstantTensor(fNInput2) || model.IsInitializedTensor(fNInput2)) { + if (model.IsInitializedTensor(fNInput2)) { // assume input shape is an initialized tensor auto dptr = model.GetInitializedTensorData(fNInput2); auto values = static_cast(dptr.get()); @@ -260,6 +263,9 @@ public: fShapeOutput = ShapeInference({fShapeInput})[0]; // set flag to not write tensor in weight file. Its data will be hard-coded in way model is constructed model.SetNotWritableInitializedTensor(fNInput2); + } else if (model.IsShapeTensor(fNInput2)) { + auto shapeData = model.GetShapeTensorValues(fNInput2); + fShapeOutput = shapeData; } else { // we cannot get shape at initialization time but at run-time fDynamicShape = true; diff --git a/tmva/sofie/inc/TMVA/ROperator_ScatterElements.hxx b/tmva/sofie/inc/TMVA/ROperator_ScatterElements.hxx index 626debd13038e..2525ea32629df 100644 --- a/tmva/sofie/inc/TMVA/ROperator_ScatterElements.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_ScatterElements.hxx @@ -136,6 +136,17 @@ public: return strst.str(); }; + auto tensorIndexOpt = [](const std::vector & sdx, const std::vector & idx) { + std::stringstream strst; + int dims = idx.size(); + for (int i = 0; i < dims-1; i++) { + strst << sdx[i]; + strst << " + "; + } + strst << idx[dims-1]; + return strst.str(); + }; + // copy first input in output (maybe can be avoided??) 
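    // Illustrative note, assuming a rank-3 index tensor with loop variables i0, i1, i2 as generated
    // below: tensorIndexOpt(sdx, idx) emits the flattened index "s0 + s1 + i2", where s0 = strideI[0]*i0
    // and s1 = strideI[1]*i1 are hoisted to their respective loop levels, instead of re-multiplying every
    // stride inside the innermost loop as the generic tensorIndex() lambda above does.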
out << SP << "std::copy(tensor_" << fNX << ", tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n"; @@ -143,14 +154,24 @@ public: // loop on tensor rank int dims = fShapeY.size(); std::vector idx(dims); + std::vector sdx(dims); // stride for indices for (int i = 0; i < dims; i++) { idx[i] = std::string("i") + std::to_string(i); + sdx[i] = std::string("s") + std::to_string(i); for (int j = 0; j <= i; j++) out << SP; out << "for (int " << idx[i] << " = 0; " << idx[i] << " < " << fShapeI[i] << "; " << idx[i] << "++) {\n"; + if (i < dims-1) { + for (int j = 0; j <= i+1 ; j++) out << SP; + if (strideI[i].GetVal() != "1") + out << "int "<< sdx[i] << " = " << strideI[i] << " * " << idx[i] << ";\n"; + else + out << "int "<< sdx[i] << " = " << idx[i] << ";\n"; + } } // correct index for specific axis for (int j = 0; j <= dims; j++) out << SP; - out << "int updateIndex = " << tensorIndex(strideI,idx) << ";\n"; + // can use optimised formula for indices since the loop above is on fShapeI + out << "int updateIndex = " << tensorIndexOpt(sdx,idx) << ";\n"; for (int j = 0; j <= dims; j++) out << SP; out << "int iAxis = tensor_" << fNI << "[updateIndex];\n"; for (int j = 0; j <= dims; j++) out << SP; diff --git a/tmva/sofie/inc/TMVA/ROperator_Slice.hxx b/tmva/sofie/inc/TMVA/ROperator_Slice.hxx index b23e3b0a86d21..4e3c1319bd772 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Slice.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Slice.hxx @@ -25,6 +25,7 @@ private: bool fIsStartUndef = false; bool fIsEndUndef = false; bool fIsStepUndef = false; + bool fIdentitySlice = false; std::string fNData; // input data tensor name std::string fNOutput; // output data name std::vector fNames; // tensor names for meta(axis) information @@ -235,6 +236,8 @@ public: if (iend < 0) { std::string send = std::string("(") + fShapeInput[fAxes[i]].param + "-" + std::to_string(-iend) +")"; fEnd[fAxes[i]] = Dim{send,size_t(-1)}; + } else if (iend == std::numeric_limits::max()){ + fEnd[fAxes[i]] = fShapeInput[fAxes[i]]; } else { fEnd[fAxes[i]] = Dim{size_t(iend)}; } @@ -330,27 +333,58 @@ public: } } else { + // check if Slice is just an Identity operator in case start = 0, end = input_shape and step=1 + size_t ndim = fShapeInput.size(); + fIdentitySlice = fShapeOutput.size() == ndim; + for (size_t idim = 0; idim < ndim; idim++) { + if (!fIdentitySlice) break; + fIdentitySlice &= (fStart[idim].GetVal() == "0"); + fIdentitySlice &= (fSteps[idim].GetVal() == "1"); + fIdentitySlice &= (fEnd[idim].GetVal() == fShapeOutput[idim].GetVal()); + } + model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput); + if (fIdentitySlice) model.AddAliasTensor(fNOutput, fNData); + if (model.Verbose()) { - std::cout << "Slice ---> " << fNOutput << " " << ConvertShapeToString(fShapeOutput) << std::endl; + std::cout << "Slice " << fNData << " " << ConvertShapeToString(fShapeInput) + << "---> " << fNOutput << " " << ConvertShapeToString(fShapeOutput); + if (fIdentitySlice) std::cout << " (using alias tensor since slice is an identity) "; + std::cout << std::endl; + } } } - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; //no op for constant tensors + std::string Generate(std::string opName) override { - OpName = "op_" + OpName; if (fShapeInput.empty() || fShapeOutput.empty()){ throw std::runtime_error("TMVA SOFIE Slice Op called to Generate without being initialized first"); } std::stringstream out; - //std::string opName = "Slice"; - out << SP << "///------- Slice operator\n" << std::endl; - 
// loop on the dimensions depending no the orders + out << "///------- Slice operator " << opName << "---> " << fNOutput << " " + << ConvertDimShapeToString(fShapeOutput) << "\n" << std::endl; + if (fIsOutputConstant) return out.str(); //no op for constant tensors + size_t ndim = fShapeInput.size(); + // check if Slice is just an Identity operator in case start = 0, end = input_shape and step=1 + bool identitySlice = fShapeInput.size() == fShapeOutput.size(); + for (size_t idim = 0; idim < ndim; idim++) { + if (!identitySlice) break; + identitySlice &= (fStart[idim].GetVal() == "0"); + identitySlice &= (fSteps[idim].GetVal() == "1"); + identitySlice &= (fEnd[idim].GetVal() == fShapeOutput[idim].GetVal()); + } + + if (identitySlice) { + out << "/// Slice is just an identity (copy pointers) \n"; + out << SP << "tensor_" << fNOutput << " = tensor_" << fNData << ";\n"; + return out.str(); + } + + // loop on the dimensions depending no the orders auto strides = UTILITY::ComputeStrideFromShape(fShapeInput); diff --git a/tmva/sofie/inc/TMVA/ROperator_Tile.hxx b/tmva/sofie/inc/TMVA/ROperator_Tile.hxx index 1086f72eae71c..9b291b40e0854 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Tile.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Tile.hxx @@ -20,8 +20,8 @@ private: std::string fNRepeats; std::string fNInput; std::string fNY; - std::vectorfShapeInput; - std::vector fShapeY; + std::vectorfShapeInput; + std::vector fShapeY; public: ROperator_Tile(){} @@ -35,13 +35,18 @@ public: return input; } - std::vector> ShapeInference(std::vector> input) override { - std::vector ret = input[0]; - - for(size_t i=0; i < input[1].size(); i++) { - ret[i]=ret[i]*input[1][i]; + std::vector DoShapeInference(const std::vector & input, const std::vector repeat) { + std::vector ret = input; + for(size_t i=0; i < repeat.size(); i++) { + if (repeat[i] != 1) { + if (ret[i].isParam) { + ret[i] = Dim{ std::string(ret[i].GetVal() + "*" + std::to_string(repeat[i])), static_cast(-1) }; + } else { + ret[i]=Dim { ret[i].dim *repeat[i] }; + } + } } - return {ret}; + return ret; } void Initialize(RModel& model) override { @@ -52,7 +57,7 @@ public: if (model.CheckIfTensorAlreadyExist(fNRepeats) == false){ throw std::runtime_error("TMVA SOFIE Tile Op Input Tensor is not found in model"); } - fShapeInput=model.GetTensorShape(fNInput); + fShapeInput=model.GetDimTensorShape(fNInput); // if repeats vector is not initialized we cannot deduce shape of output // not support for time being this case @@ -79,12 +84,12 @@ public: std::copy(repeats_data, repeats_data + num_elements, repeats_vector.begin()); - fShapeY = ShapeInference({fShapeInput,repeats_vector})[0]; + fShapeY = DoShapeInference(fShapeInput,repeats_vector); model.AddIntermediateTensor(fNY, model.GetTensorType(fNInput), fShapeY); if (model.Verbose()) - std::cout << "Tile: " << fNInput << " " << ConvertShapeToString(fShapeInput) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) + std::cout << "Tile: " << fNInput << " " << ConvertDimShapeToString(fShapeInput) << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) << " given repeats " << ConvertShapeToString(repeats_vector) << std::endl; } @@ -103,9 +108,9 @@ public: std::string output = "tensor_" + fNY; out << "///-------- Tile operator\n"; out << "{\n"; // add scope to re-use same names - out << "const int input_shape[" << fShapeInput.size() << "] = " << ConvertShapeToString(fShapeInput) << ";\n"; + out << "const size_t input_shape[" << fShapeInput.size() << "] = " << ConvertDimShapeToString(fShapeInput) << 
";\n"; - out << "int inputLength = " << ConvertShapeToLength(fShapeInput) << ";\n"; + out << "int inputLength = " << ConvertDimShapeToLength(fShapeInput) << ";\n"; out << "int s = 1;\n"; // loop from inverse dim order out << "for (int i = " << fShapeInput.size()-1 << "; i >=0; i--) {\n"; diff --git a/tmva/sofie/inc/TMVA/ROperator_TopK.hxx b/tmva/sofie/inc/TMVA/ROperator_TopK.hxx index 0869437bb6b0c..edee91de8eb57 100644 --- a/tmva/sofie/inc/TMVA/ROperator_TopK.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_TopK.hxx @@ -19,13 +19,13 @@ private: int fAttrLargest; int fAttrSorted; - size_t fK; + Dim fK; std::string fNK; std::string fNX; std::string fNVal; std::string fNInd; - std::vector fShapeX; - std::vector fShapeY; + std::vector fShapeX; + std::vector fShapeY; std::string fType; public: @@ -43,23 +43,10 @@ public: } std::vector TypeInference(std::vector input) override { - ETensorType ret = input[0]; - return {ret, ret}; - } - - std::vector> ShapeInference(std::vector> input) override { - if (input.size() != 2) { - throw std::runtime_error("TMVA SOFIE TopK Op Shape Inference needs exactly 2 input tensors"); - } - - auto shape = input[0]; // Shape format: [ m x n x o x p ... ] - - // set the dimension at the specified axis to k (fAttrAxis is checked before that is in the correct range - shape[fAttrAxis] = fK; // Modified shape: [ m x n x k x p ... ] - return {shape, shape}; + ETensorType ret = input[0]; + return {ret, ret}; } - void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false) { // input must be a graph input, or already initialized intermediate tensor @@ -70,10 +57,10 @@ public: throw std::runtime_error("TMVA SOFIE TopK Op Input Tensor i.e. K is not found in model"); } - fShapeX = model.GetTensorShape(fNX); + fShapeX = model.GetDimTensorShape(fNX); auto fShapeK = model.GetTensorShape(fNK); auto kptr = static_cast(model.GetInitializedTensorData(fNK).get()); - fK = *kptr; + size_t kval = *kptr; model.SetNotWritableInitializedTensor(fNK); fAttrAxis = fAttrAxis < 0 ? fShapeX.size() + fAttrAxis : fAttrAxis; if(static_cast(fAttrAxis) >= fShapeX.size()){ @@ -81,14 +68,25 @@ public: std::runtime_error("TMVA::SOFIE ONNX TopK op axis = "+ std::to_string(fAttrAxis) +" value exeeds size of tensor " +fNX+" of size "+fShapeX.size()+" ."); } // fK cannot be larger that axis dimension - fK = std::min(fK, fShapeX[fAttrAxis]); + if (fShapeX[fAttrAxis].isParam) + fK = Dim{std::string("std::min(size_t(" + std::to_string(kval) + "), " + fShapeX[fAttrAxis].GetVal() + ")" ), static_cast(-1) }; + else + fK = Dim { std::min(kval, fShapeX[fAttrAxis].dim) }; + + // output shape is equal to input shape apart for value in fAttrAxis + fShapeY = fShapeX; + fShapeY[fAttrAxis] = Dim{fK}; - fShapeY = ShapeInference({fShapeX, fShapeK})[0]; model.AddIntermediateTensor(fNVal, model.GetTensorType(fNX), fShapeY); // output indices should be an int64 tensor model.AddIntermediateTensor(fNInd, ETensorType::INT64, fShapeY); fType = ConvertTypeToString(model.GetTensorType(fNX)); + + if (model.Verbose()) { + std::cout << "TopK " << fNX << " " << ConvertShapeToString(fShapeX) + << "---> " << fNVal << " " << ConvertShapeToString(fShapeY) << std::endl; + } } std::string Generate(std::string OpName) override { @@ -101,19 +99,20 @@ public: size_t axis = fAttrAxis < 0 ? 
size + fAttrAxis : fAttrAxis; out << "\n" << SP << "//------ TopK\n"; - size_t length=ConvertShapeToLength(fShapeX); + auto length=ConvertDimShapeToLength(fShapeX); auto strideX = UTILITY::ComputeStrideFromShape(fShapeX); auto strideY = UTILITY::ComputeStrideFromShape(fShapeY); // we perform loop on dimension before sorted axis and after sorted axis - size_t n_before = (axis>0) ? length/strideX[axis-1] : 1; - size_t n_after = strideX[axis]; - size_t n_elements = fShapeX[axis]; // number of elements to be sorted + std::vector shape_before(fShapeX.begin(), fShapeX.begin() + axis); // input shape before axis + std::string n_before = (axis>0) ? ConvertDimShapeToLength(shape_before) : "1"; + std::string n_after = strideX[axis].GetVal(); + std::string n_elements = fShapeX[axis].GetVal(); // number of elements to be sorted // } out << SP << "{\n"; // to define a separate scope for the operator code out << SP << "std::vector> elements(" << n_elements << ");\n"; // loop on elements before - if (n_before > 1) { + if (n_before != "1") { out << SP << "for (size_t i = 0; i < " << n_before << "; i++) {\n"; out << SP << SP << "size_t xoffset = i*" << strideX[axis-1] << ";\n"; out << SP << SP << "size_t yoffset = i*" << strideY[axis-1] << ";\n"; @@ -122,7 +121,7 @@ public: out << SP << "size_t xoffset = 0;\n"; out << SP << "size_t yoffset = 0;\n"; } - if (n_after > 1) + if (n_after != "1") out << SP << "for (size_t j = 0; j < " << n_after << "; j++) {\n"; else out << SP << "const size_t j = 0;\n"; @@ -149,8 +148,8 @@ public: out << SP << SP << SP << "tensor_" << fNVal << "[yoffset + " << strideY[axis] << "*l + j] = elements[l].first;\n"; out << SP << SP << SP << "tensor_" << fNInd << "[yoffset + " << strideY[axis] << "*l + j] = elements[l].second;\n"; out << SP << SP << "}\n"; - if (n_after > 1) out << SP << SP << "}\n"; - if (n_before> 1) out << SP << "}\n"; + if (n_after != "1") out << SP << SP << "}\n"; + if (n_before != "1") out << SP << "}\n"; out << SP << "}\n"; // end operator scope return out.str(); } diff --git a/tmva/sofie/inc/TMVA/SOFIE_common.hxx b/tmva/sofie/inc/TMVA/SOFIE_common.hxx index 2dae4f7d03ce7..68a74d08fd93a 100644 --- a/tmva/sofie/inc/TMVA/SOFIE_common.hxx +++ b/tmva/sofie/inc/TMVA/SOFIE_common.hxx @@ -252,8 +252,14 @@ public: bool IsConstantTensor() const { return fConstant;} // query if tensor needs to be written in a weight file. Constant tensors are not written in a file bool IsWeightTensor() const { return !fConstant && !fIsNotWritable;} + // check if a Tensor is Writable (need to be written in the file or in the generated code (e.g. as a constant tensor) + // if an initialized tensors is used in a constant operator at compile time does not need to be written and can be omitted in + // the generated code + bool IsNotWritable() const { return fIsNotWritable; } // set not writable initialized tensors - i.e. 
tensor that must not be written in a file void SetNotWritable() { fIsNotWritable = true;} + // set as constant (needed for non-float initialized tensors) + void SetConstant() { fConstant = true;} template T const *data() const @@ -805,6 +811,22 @@ void ReadTensorFromStream(std::istream &is, T &target, std::string const &expect } } + +// code for the memory greeding allocations +struct TensorLifeInfo { + int begin; // start time (op index) lifetime + int end; // end time lifetime + size_t size; // size of tensors in bytes +}; + +struct MemoryResult { + std::size_t total_bytes = 0; // total memory needed + std::vector offsets; // resulted offsets for each tensor +}; + +/// Greedy best-fit planner with coalescing free list. +MemoryResult OrganizeMemory(const std::vector & tensorsInfo ); + } // namespace SOFIE } // namespace Experimental } // namespace TMVA diff --git a/tmva/sofie/src/RFunction.cxx b/tmva/sofie/src/RFunction.cxx index a6df8dcb43e61..505d84187ca9a 100644 --- a/tmva/sofie/src/RFunction.cxx +++ b/tmva/sofie/src/RFunction.cxx @@ -26,7 +26,7 @@ RFunction_Update::RFunction_Update(FunctionTarget target, GraphType gType): fTar throw std::runtime_error("Invalid target for Update function"); } fType = FunctionType::UPDATE; - function_block = std::make_unique(fFuncName); + fFunction_block = std::make_unique(fFuncName); if(fGraphType == GraphType::GNN) { if(fTarget == FunctionTarget::EDGES) { @@ -49,25 +49,23 @@ RFunction_Update::RFunction_Update(FunctionTarget target, GraphType gType): fTar // add input tensors, order of provided shapes must be the same as in fInputTensors void RFunction_Update::AddInputTensors(const std::vector>& inputShapes) { for(long unsigned int i=0; iAddInputTensorInfo(fInputTensors[i],ETensorType::FLOAT, inputShapes[i]); - function_block->AddInputTensorName(fInputTensors[i]); + fFunction_block->AddInputTensorInfo(fInputTensors[i],ETensorType::FLOAT, inputShapes[i]); + fFunction_block->AddInputTensorName(fInputTensors[i]); } } void RFunction_Update::AddInputTensors(const std::vector>& inputShapes) { for(long unsigned int i=0; iAddInputTensorInfo(fInputTensors[i],ETensorType::FLOAT, inputShapes[i]); - function_block->AddInputTensorName(fInputTensors[i]); + fFunction_block->AddInputTensorInfo(fInputTensors[i],ETensorType::FLOAT, inputShapes[i]); + fFunction_block->AddInputTensorName(fInputTensors[i]); } } -std::string RFunction_Update::GenerateModel(const std::string& filename, long read_pos, long block_size) { - function_block->SetFilename(filename); +std::string RFunction_Update::GenerateModel(const std::string& filename, long read_pos, long block_size, bool verbose) { + fFunction_block->SetFilename(filename); // use batch size as block size in RModel::generate - function_block->PrintRequiredInputTensors(); - function_block->PrintDynamicTensors(); - function_block->Generate(Options::kGNNComponent,block_size,read_pos); + fFunction_block->Generate(Options::kGNNComponent,block_size,read_pos, verbose); std::string modelGenerationString; - modelGenerationString = "\n//--------- GNN_Update_Function---"+fFuncName+"\n"+function_block->ReturnGenerated(); + modelGenerationString = "\n//--------- GNN_Update_Function---"+fFuncName+"\n"+fFunction_block->ReturnGenerated(); return modelGenerationString; } diff --git a/tmva/sofie/src/RFunction_MLP.cxx b/tmva/sofie/src/RFunction_MLP.cxx index 32148cae36794..c41135de49902 100644 --- a/tmva/sofie/src/RFunction_MLP.cxx +++ b/tmva/sofie/src/RFunction_MLP.cxx @@ -20,9 +20,9 @@ RFunction_MLP::RFunction_MLP(FunctionTarget target, Int_t 
numLayers, Activation throw std::runtime_error("TMVA SOFIE GNN doesn't currently supports the provided activation function for " + fFuncName + " update."); } - function_block->AddOutputTensorNameList({fFuncName + "Relu" + std::to_string(fNumLayers)}); + fFunction_block->AddOutputTensorNameList({fFuncName + "Relu" + std::to_string(fNumLayers)}); } else { - function_block->AddOutputTensorNameList({fFuncName + "Gemm" + std::to_string(fNumLayers)}); + fFunction_block->AddOutputTensorNameList({fFuncName + "Gemm" + std::to_string(fNumLayers)}); } } @@ -32,7 +32,7 @@ void RFunction_MLP::Initialize() { if(fGraphType == GraphType::GNN) { std::unique_ptr op_concat; op_concat.reset(new ROperator_Concat(fInputTensors,1,0,fFuncName+"InputConcat")); - function_block->AddOperator(std::move(op_concat)); + fFunction_block->AddOperator(std::move(op_concat)); fGemmInput = fFuncName+"InputConcat"; } else if(fGraphType == GraphType::GraphIndependent) { @@ -43,24 +43,24 @@ void RFunction_MLP::Initialize() { for(int i=0; i(1.0,beta,0,0,fGemmInput,UTILITY::Clean_name(fKernelTensors[i]),UTILITY::Clean_name(fBiasTensors[i]),fFuncName+"Gemm"+std::to_string(i))); - function_block->AddOperator(std::move(op_gemm)); + fFunction_block->AddOperator(std::move(op_gemm)); fGemmInput = fFuncName+"Gemm"+i; if (fActivationFunction == Activation::RELU) { std::unique_ptr op_relu; op_relu.reset(new ROperator_Relu(fFuncName+"Gemm"+std::to_string(i), fFuncName+"Relu"+std::to_string(i))); - function_block->AddOperator(std::move(op_relu)); + fFunction_block->AddOperator(std::move(op_relu)); fGemmInput = fFuncName+"Relu"+i; } } double beta = (fBiasTensors.back().empty()) ? 0. : 1.; op_gemm.reset(new ROperator_Gemm(1.0,beta,0,0,fGemmInput,UTILITY::Clean_name(fKernelTensors.back()),UTILITY::Clean_name(fBiasTensors.back()),fFuncName+"Gemm"+std::to_string(fNumLayers))); - function_block->AddOperator(std::move(op_gemm)); + fFunction_block->AddOperator(std::move(op_gemm)); if(fActivateFinal) { if (fActivationFunction == Activation::RELU) { std::unique_ptr op_relu; op_relu.reset(new ROperator_Relu(fFuncName+"Gemm"+std::to_string(fNumLayers), fFuncName+"Relu"+std::to_string(fNumLayers))); - function_block->AddOperator(std::move(op_relu)); + fFunction_block->AddOperator(std::move(op_relu)); } } @@ -68,7 +68,7 @@ void RFunction_MLP::Initialize() { if(fAddlOp.size()) { for(auto &i:fAddlOp) { std::unique_ptr tmp(i); - function_block->AddOperator(std::move(tmp)); + fFunction_block->AddOperator(std::move(tmp)); } } } diff --git a/tmva/sofie/src/RModel.cxx b/tmva/sofie/src/RModel.cxx index 2fa6df3f04f8f..32da75fdc045b 100644 --- a/tmva/sofie/src/RModel.cxx +++ b/tmva/sofie/src/RModel.cxx @@ -9,6 +9,7 @@ #endif #include "TMVA/RModel.hxx" +#include "TMVA/RModelProfiler.hxx" #include "TMVA/SOFIE_common.hxx" namespace TMVA { @@ -164,19 +165,19 @@ void RModel::AddOperator(std::unique_ptr op, int order_execution) { fOperators.insert(fOperators.begin() + order_execution, std::move(op)); } else { fOperators.push_back(std::move(op)); + order_execution = fOperators.size()-1; } - // storing the last usage of tensors which are input to - // operators (but are not inputs to the model, i.e. they are intermediate - // tensors). This information is needed to keep a check on when a - // particular intermediate tensor can be flushed to free up memory for reuse. 
+ // storing the last usage of tensors which are input to the operator + // (excluding tensors which are inputs to the model or the initialized (weights) tensors) + // We call this function during parsing so we don't have yet initialized the operators for(size_t index = 0; index & s fShapeTensors[tensor_name] = std::make_pair(shape_values, scalar); } +void RModel::AddAliasTensor(const std::string & name, const std::string & origin){ + // add an alias tensor to origin + auto tensor_name = UTILITY::Clean_name(name); + auto origin_name = UTILITY::Clean_name(origin); + if (fAliasTensors.count(tensor_name) != 0) { + throw std::runtime_error("TMVA-SOFIE: alias tensor with name " + tensor_name + " already exists \n"); + } + fAliasTensors[tensor_name] = origin_name; +} + bool RModel::IsShapeTensor(const std::string & tensor_name) const { return fShapeTensors.count(tensor_name) != 0; } +bool RModel::IsAliasTensor(const std::string & tensor_name) const { + return fAliasTensors.count(tensor_name) != 0; +} + const std::vector & RModel::GetShapeTensorValues(const std::string & tensor_name) const { //if (!IsShapeTensor(tensor_name) ) return std::vector{}; return fShapeTensors.at(tensor_name).first; @@ -222,6 +237,7 @@ bool RModel::IsInitializedTensor(const std::string& tensorName) const { return fInitializedTensors.find(name) != fInitializedTensors.end(); } bool RModel::IsConstantTensor(const std::string& tensorName) const { + // a constant tensor is an initialized tensor but has the constant flag set std::string name = UTILITY::Clean_name(tensorName); auto itr = fInitializedTensors.find(name); if (itr == fInitializedTensors.end()) return false; @@ -355,6 +371,11 @@ std::string RModel::AllocateIntermediateMemory(std::span fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) continue; + // case of alias tensor + if (IsAliasTensor(name)) { + continue; + } + auto tensor_size = GetTypeSize(GetTensorType(name)) * ConvertShapeToLength(GetTensorShape(name)); // important fill the pair in the ordered output tensors with the string view and not the string TensorMemoryInfo tmi = {it, tensor_size}; @@ -434,9 +455,14 @@ void RModel::CheckAndFlushIntermediateMemory(std::span o chunk != fIntermediateMemoryInfo.available_stack.end(); chunk++) { if (fVerbose) std::cout << "-- free chunk " << chunk->first << " size = " << chunk->second << std::endl; } - for (auto &it : op_input_tensors) { + for (auto &iv : op_input_tensors) { // last occurrence of the tensor is reached => flush it from memory - if (fVerbose) std::cout << ".. input tensors : " << it; + if (fVerbose) std::cout << ".. 
input tensors : " << iv; + + // for alias tensors replace name with its alias + std::string it{iv}; // convert view to string + if (IsAliasTensor(it)) + it = fAliasTensors[it]; if (fIntermediateTensorFrequencyLookup[it] == op_idx) { if (fVerbose) std::cout << " flash condition is met - looping on chunks to find matching one \n"; for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); @@ -522,6 +548,7 @@ void RModel::Initialize(const std::map & inputParams, bool fIntermediateTensorInfos.clear(); fDynamicTensorInfos.clear(); + // loop on inputs and see if shape can be full specified // if the batch size is provided it can be used to specify the full shape // Add the full specified tensors in fReadyInputTensors collection @@ -576,19 +603,6 @@ void RModel::Initialize(const std::map & inputParams, bool PrintDynamicTensors(); } - // check if there are initialized tensors to write in a weight file - // support for the time being only weight of FLOAT type - if (fUseWeightFile) { - bool modelHasWeights = false; - for (auto &i : fInitializedTensors) { - if (i.second.type() == ETensorType::FLOAT) { - modelHasWeights = true; - break; - } - } - if (!modelHasWeights) - fUseWeightFile = false; - } // Go through model and initialize each operator int i = 0; @@ -602,16 +616,49 @@ void RModel::Initialize(const std::map & inputParams, bool fOperators[op_idx]->Initialize(*this); for(auto &it:fOperators[op_idx]->GetOpOutputTensors()){ std::string name = std::string{it}; + // check if tensor is not an initialized or output tensor and it is not already in the list if (fIntermediateTensorFrequencyLookup.find(it) == fIntermediateTensorFrequencyLookup.end() && std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end() && - fInitializedTensors.find(name) == fInitializedTensors.end() && - fDynamicTensorInfos.find(name) == fDynamicTensorInfos.end()){ + fInitializedTensors.find(name) == fInitializedTensors.end()) + { fIntermediateTensorFrequencyLookup[it] = op_idx; } } i++; } + // loop on initialized tensors and make the integers as constant to be + // not written in a weight file + for (auto &it : fInitializedTensors) { + if (it.second.IsWeightTensor() && it.second.type() != ETensorType::FLOAT) + it.second.SetConstant(); + } + + // check if there are initialized tensors to write in a weight file + // support for the time being only weight of FLOAT type + if (fUseWeightFile) { + bool modelHasWeights = false; + for (auto &it : fInitializedTensors) { + if (it.second.IsWeightTensor()) { + modelHasWeights = true; + break; + } + } + if (!modelHasWeights) + fUseWeightFile = false; + } + + // update fIntermediateTensorFrequencyLookup for alias tensors + for (auto & it : fAliasTensors) { + if (fIntermediateTensorFrequencyLookup.find(it.first) == fIntermediateTensorFrequencyLookup.end()) continue; + if (fIntermediateTensorFrequencyLookup.find(it.second) == fIntermediateTensorFrequencyLookup.end() ) + fIntermediateTensorFrequencyLookup[it.second] = fIntermediateTensorFrequencyLookup[it.first]; + else { + // take the largest one + fIntermediateTensorFrequencyLookup[it.second] = std::max(fIntermediateTensorFrequencyLookup[it.second],fIntermediateTensorFrequencyLookup[it.first] ); + } + } + fIsInitialized = true; } @@ -653,7 +700,8 @@ std::string GenerateConstantTensorCode(const std::pair 100) ? false : true; + // also for weights which can be broadcasted do not use stack but allocate as a std::vector + bool allocateOnStack = (length > 100 || t.second.IsWeightTensor()) ? 
false : true; const T *data = t.second.data(); @@ -676,7 +724,7 @@ std::string GenerateConstantTensorCode(const std::pair(i); fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 4; @@ -723,7 +773,8 @@ void RModel::GenerateIntermediateTensorInfo() { if (!fIntermediateTensorInfos.empty()) { std::string tensor_declaration_block = ""; for (auto &i : fIntermediateTensorInfos) { - if (i.second.type == ETensorType::BOOL) { + bool is_alias = (IsAliasTensor(i.first)); + if (i.second.type == ETensorType::BOOL && !is_alias) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; tensor_declaration_block += "std::uint8_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; continue; @@ -734,7 +785,7 @@ void RModel::GenerateIntermediateTensorInfo() { bool not_in_output_names = (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) == fOutputTensorNames.end()); - if ((not_in_freq_map && not_in_output_names) || (!not_in_freq_map && !is_extended && not_in_output_names)) { + if (((not_in_freq_map && not_in_output_names) || (!not_in_freq_map && !is_extended && not_in_output_names) ) && !is_alias) { size_t length = ConvertShapeToLength(i.second.shape); if (i.second.type == ETensorType::FLOAT) { @@ -753,6 +804,10 @@ void RModel::GenerateIntermediateTensorInfo() { fOtherTensorSize += 8 * length; } } + if (is_alias) { + tensor_declaration_block += ConvertTypeToString(i.second.type) + " * tensor_" + i.first + " = nullptr;\n"; + } + } if (tensor_declaration_block.length()) { @@ -763,17 +818,10 @@ void RModel::GenerateIntermediateTensorInfo() { if (!fDynamicTensorInfos.empty()) { fGC += "//--- declare the dynamic tensors\n"; for (auto &i : fDynamicTensorInfos) { - if (i.second.type == ETensorType::FLOAT) { - fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "float * tensor_" + i.first + " = nullptr;\n"; - } else if (i.second.type == ETensorType::DOUBLE) { - fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "double * tensor_" + i.first + " = nullptr;\n"; - } else if (i.second.type == ETensorType::INT64) { - fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "int64_t * tensor_" + i.first + " = nullptr;\n"; - } + fGC += ConvertTypeToString(i.second.type) + " * tensor_" + i.first + " = nullptr;\n"; } + fGC += "//--- dynamic tensors pool\n"; + fGC += "std::vector fDynamicMemoryPool;\n"; } } @@ -791,14 +839,87 @@ void RModel::GenerateOperatorDeclarations() { void RModel::GenerateDynamicTensorInfo() { + // generate code for allocating dynamic tensors using the greedy memory allocations + if (fDynamicTensorInfos.empty()) + return; + + if (fVerbose) { + std::cout << "generating code for dynamic tensor management" << std::endl; + PrintDynamicTensors(); + } + std::stringstream out; + out << "// dynamic tensor memory management\n"; + out << SP << "std::vector dynamicTensorInfos;\n"; + out << SP << "dynamicTensorInfos.reserve(" << fDynamicTensorInfos.size() << ");\n"; + + // loop on all the operators to find begin/end life of the tensors + int op_index = 0; + std::vector> tensors; + tensors.reserve(fDynamicTensorInfos.size()); + for (auto & op : fOperators) { + // loop on output tensors - + for (auto &it : op->GetOpOutputTensors()) { + if (fVerbose) { + auto op_ptr = op.get(); + std::cout << "Looping on operator " << op_index << " " << typeid(*op_ptr).name() << std::endl; + } + // check if is a dynamic tensor and not an alias tensor + std::string name = std::string(it); + 
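                // Illustrative summary of the bookkeeping below, based on the declarations in SOFIE_common.hxx:
                // every dynamic tensor is recorded as a TensorLifeInfo {begin, end, size} interval, where begin is
                // the index of the producing operator, end is one past its last recorded use and size is in bytes.
                // OrganizeMemory() turns these intervals into a MemoryResult whose offsets let tensors with
                // non-overlapping lifetimes share the same region of fDynamicMemoryPool.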
if ( fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end() && !IsAliasTensor(name)) { + auto tensor_size = ConvertDimShapeToLength(GetDimTensorShape(name)); + auto type = GetTensorType(name); + size_t type_size = GetTypeSize(type); + int begin = op_index; + int end = fOperators.size(); + // look for end + auto it_lookup = fIntermediateTensorFrequencyLookup.find(name); + if (it_lookup != fIntermediateTensorFrequencyLookup.end()) + end = it_lookup->second + 1; // end is last time used + 1 + // // some tensors (like xcol in convolutions) are just used within the operators + // if (end == 0 && begin > 0) end = begin+1; + + if (begin> end) { + std::cout << "op " << op_index << "tensor_" << name << " begin " << begin << " " << " end " << end << std::endl; + throw std::runtime_error("TMVA-SOFIE: RModel::GenerateDynamicTensorInfo: tensor_" + name + " has end before begin"); + } + + // write in code + out << SP << "dynamicTensorInfos.push_back( {" << begin << ", " << end << ", " << type_size << "* (" << tensor_size << ") });" + << " // tensor_" << name << std::endl; + tensors.push_back({name,type}); + } + } + op_index++; // increment operator index + } + out << "\n" << SP << "auto memory_result = OrganizeMemory(dynamicTensorInfos);\n\n"; + out << "// allocating now the memory\n"; + out << SP << "fDynamicMemoryPool = std::vector(memory_result.total_bytes);\n"; + out << SP << "int idx = 0;\n"; + for (auto & it : tensors) { + out << SP << "tensor_" << it.first << " = reinterpret_cast<" << ConvertTypeToString(it.second) << " *>(fDynamicMemoryPool.data() + memory_result.offsets[idx++]);\n"; + } + // check that all dynamic tensors are covered + bool missingTensor = false; for (auto &i : fDynamicTensorInfos) { - auto length = ConvertDynamicShapeToLength(i.second.shape); - out << SP << "if (" << length << " > 0) {\n"; - out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; - out << SP << SP << "tensor_" << i.first << " = fTensor_" << i.first << ".data();\n"; - out << SP << "}\n"; + if (IsAliasTensor(i.first)) continue; + if (std::find(tensors.begin(), tensors.end(), std::pair{i.first, i.second.type}) == tensors.end()) { + std::cout << "Dynamic tensors " << i.first << " is not in list of operator input/output " << std::endl; + missingTensor = true; + } } + if (missingTensor) + throw std::runtime_error("TMVA-SOFIE: RModel::GenerateDynamicTensorInfo - some tensors are not in input/output list"); + + + + // for (auto &i : fDynamicTensorInfos) { + // auto length = ConvertDynamicShapeToLength(i.second.shape); + // out << SP << "if (" << length << " > 0) {\n"; + // out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; + // out << SP << SP << "tensor_" << i.first << " = fTensor_" << i.first << ".data();\n"; + // out << SP << "}\n"; + // } fGC += out.str(); } @@ -941,7 +1062,7 @@ void RModel::GenerateSessionCode() CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx); } - // to check remaining unused fragments after memory allocation (lesser the better) + // to check remaining unused fragments after memory allocation (lesser the better) // for (const auto &it: fIntermediateMemoryInfo.available_stack){ // std::cout<<"chunk_idx: "<GenerateSessionMembersCode(opName); + fGC += fOperators[id]->GenerateSessionMembersCode(opName); } fGC += "\n"; // here add initialization and reading of weight tensors @@ -996,6 +1117,8 @@ void RModel::GenerateSessionCode() // add initialization of shape parameters // assume all parameters are of type size_t if 
(!fDimShapeNames.empty()) { + // sort first the shape parameters in alphabetical order to avoid a random order + std::sort(fDimShapeNames.begin(), fDimShapeNames.end() ); for (auto &p : fDimShapeNames) { fGC += ",\n"; fGC += " size_t " + p + " = " + fShapeParams[p]; @@ -1021,23 +1144,28 @@ void RModel::GenerateSessionCode() fGC += "}\n\n"; } - fGC += doInferSignature + "{\n"; - fGC += "\n"; + if (fProfile) { + RModelProfiler profiler(*this); + profiler.Generate(); + fGC += fProfilerGC; + } else { + fGC += doInferSignature + "{\n"; + fGC += "\n"; - // generate the inference code - if (fVerbose) - std::cout << "Generating main inference code for " << fName << std::endl; + // generate the inference code + if (fVerbose) + std::cout << "Generating main inference code for " << fName << std::endl; - if (fOutputTensorNames.size() == 0) - throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); + if (fOutputTensorNames.size() == 0) + throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); - for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { - if (fVerbose) + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) std::cout << "Generating code for operator .... " << op_idx << std::endl; - fGC += (fOperators[op_idx]->Generate(std::to_string(op_idx))); - } + fGC += (fOperators[op_idx]->Generate(std::to_string(op_idx))); + } - fGC += SP + "using TMVA::Experimental::SOFIE::UTILITY::FillOutput;\n\n"; + fGC += SP + "using TMVA::Experimental::SOFIE::UTILITY::FillOutput;\n\n"; for (std::string const &name : fOutputTensorNames) { // need to check is size is the same (don't want to return a vector with @@ -1048,7 +1176,8 @@ void RModel::GenerateSessionCode() fGC += SP + "FillOutput(tensor_" + name + ", output_tensor_" + name + ", " + n + ");\n"; } - fGC += "}\n\n"; + fGC += "}\n\n"; + } // generate the inference overload that returns an output struct GenerateOutput(); @@ -1061,9 +1190,11 @@ void RModel::GenerateSessionCode() void RModel::Generate(std::underlying_type_t options, int batchSize, long pos, bool verbose) { + bool profile = (options & static_cast>(Options::kProfile)); fVerbose = verbose; fBatchSize = batchSize; fReadPos = pos; + fProfile = profile; // session flag is used in operator initialize if (static_cast>(Options::kNoSession) & options) { @@ -1083,14 +1214,21 @@ void RModel::Generate(std::underlying_type_t options, int batchSize, lo "TMVA-SOFIE: RModel::Generate: cannot use a separate weight file without generating a Session class"); } - if (static_cast>(Options::kGNN) & options) + if (static_cast>(Options::kGNN) & options) fIsGNN = true; - if (static_cast>(Options::kGNNComponent) & options) + if (static_cast>(Options::kGNNComponent) & options) fIsGNNComponent = true; // initialize the model including all operators and sub-graphs Initialize(batchSize, verbose); + // if having dynamic tensor we need to have a Session + if (!fDynamicTensorInfos.empty()) { + fUseSession = true; + if (verbose) + std::cout << "Warning: Force having a Session since model has dynamic tensors " << std::endl; + } + std::string hgname; if (!fIsGNNComponent && !fIsSubGraph) { fGC.clear(); @@ -1099,13 +1237,13 @@ void RModel::Generate(std::underlying_type_t options, int batchSize, lo // generate first code for the subgraphs for (auto &graph : fSubGraphs) { - if (fVerbose) + if (fVerbose) std::cout << "generate session code for subgraph " << graph->fName << std::endl; graph->GenerateSessionCode(); fGC += graph->fGC; } - if (fVerbose) + if (fVerbose) 
std::cout << "generate Main session code - model " << fName << std::endl; // generate main session code @@ -1120,7 +1258,9 @@ void RModel::Generate(std::underlying_type_t options, int batchSize, lo void RModel::ReadInitializedTensorsFromFile(long pos) { // generate the code to read initialized tensors from a text data file if (fWeightFile == WeightFileType::Text) { - if (fInitializedTensors.empty()) return; + // check if there are tensors to write + + if (!fUseWeightFile) return; fGC += " std::ifstream f;\n"; fGC += " f.open(filename);\n"; @@ -1143,7 +1283,7 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { std::string length = std::to_string(ConvertShapeToLength(i.second.shape())); fGC += " ReadTensorFromStream(f, " + tensor_name + ", \"" + tensor_name + "\", " + length + ");\n"; } else { - std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file"); + throw std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file"); } } fGC += " f.close();\n"; @@ -1288,7 +1428,7 @@ long RModel::WriteInitializedTensorsToFile(std::string filename) { } } else { - std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be written to a file"); + throw std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be written to a file"); } if (f.fail()) std::runtime_error("tmva-sofie failed to write tensor data to file for " + tensor_name); @@ -1301,7 +1441,7 @@ long RModel::WriteInitializedTensorsToFile(std::string filename) { } } -void RModel::PrintRequiredInputTensors() { +void RModel::PrintRequiredInputTensors() const { std::cout << "Model requires following inputs:\n"; for (auto& inputInfo: fInputTensorInfos) { std::cout << "Parametrised Tensor name: " << inputInfo.first << "\t"; @@ -1331,7 +1471,7 @@ void RModel::PrintRequiredInputTensors() { std::cout << "\n"; } -void RModel::PrintInitializedTensors() { +void RModel::PrintInitializedTensors() const { std::cout << "Model initialized the following tensors:\n"; for (auto& it: fInitializedTensors) { std::cout << "Tensor name: \"" << it.first << "\"\t"; @@ -1349,7 +1489,7 @@ void RModel::PrintInitializedTensors() { std::cout << "\n"; } -void RModel::PrintIntermediateTensors() { +void RModel::PrintIntermediateTensors() const { std::cout << "Model specify the following intermediate tensors:\n"; for (auto& it: fIntermediateTensorInfos) { std::cout << "Tensor name: \"" << it.first << "\"\t"; @@ -1364,7 +1504,7 @@ void RModel::PrintIntermediateTensors() { std::cout << "\n"; } -void RModel::PrintDynamicTensors() { +void RModel::PrintDynamicTensors() const { std::cout << "Model specify the following dynamic tensors:\n"; for (auto& it: fDynamicTensorInfos) { std::cout << "Tensor name: \"" << it.first << "\"\t"; @@ -1379,14 +1519,16 @@ void RModel::PrintDynamicTensors() { std::cout << "\n"; } -void RModel::PrintOutputTensors() { +void RModel::PrintOutputTensors() const { std::cout << "Model specify the following output tensors:\n"; for (auto& it: fOutputTensorNames) { std::cout << "Tensor name: \"" << it << "\"\t"; - if (!IsDynamicTensor(it)) - std::cout << "shape: " << ConvertShapeToString(GetTensorShape(it)) << std::endl; - else - std::cout << "shape: " << ConvertShapeToString(GetDynamicTensorShape(it)) << std::endl; + try { + auto shape = GetDimTensorShape(it); + std::cout 
<< "with shape: " << ConvertShapeToString(shape) << std::endl; + } catch (...) { + std::cout << "with shape not yet defined" << std::endl; + } } std::cout << "\n"; } diff --git a/tmva/sofie/src/RModelProfiler.cxx b/tmva/sofie/src/RModelProfiler.cxx new file mode 100644 index 0000000000000..c56d4127e99b7 --- /dev/null +++ b/tmva/sofie/src/RModelProfiler.cxx @@ -0,0 +1,176 @@ +#include "TMVA/RModelProfiler.hxx" +#include "TMVA/SOFIE_common.hxx" + +namespace TMVA { +namespace Experimental { +namespace SOFIE { + +// The constructor now just registers the necessary C++ libraries. +RModelProfiler::RModelProfiler(RModel &model) : fModel(model) +{ + fModel.AddNeededStdLib("chrono"); // for timing operators + fModel.AddNeededStdLib("vector"); // for storing profiling results + fModel.AddNeededStdLib("string"); // for operator names + fModel.AddNeededStdLib("map"); // for the results map + fModel.AddNeededStdLib("iostream"); // for printing results + fModel.AddNeededStdLib("iomanip"); // for printing results +} + +// This function generates the helper functions inside the Session struct. +void RModelProfiler::GenerateUtilityFunctions() +{ + auto &gc = fModel.fProfilerGC; + + // Generate PrintProfilingResults function + gc += " // generate code for printing operator results. By default order according to time (from higher to lower)\n"; + gc += " void PrintProfilingResults(bool order = true) const {\n"; + gc += " if (fProfilingResults.empty()) {\n"; + gc += " std::cout << \"No profiling results to display.\" << std::endl;\n"; + gc += " return;\n"; + gc += " }\n"; + gc += "\n"; + gc += " // compute summary statistics of profiling results and sort them in decreasing time\n"; + gc += " std::vector> averageResults;\n"; + gc += " std::cout << \"\\n\" << std::string(50, '=') << std::endl;\n"; + gc += " std::cout << \" AVERAGE PROFILING RESULTS\" << std::endl;\n"; + gc += " std::cout << std::string(50, '=') << std::endl;\n"; + gc += " for (const auto& op : fProfilingResults) {\n"; + gc += " double sum = 0.0;\n"; + gc += " double sum2 = 0.0;\n"; + gc += " for (double time : op.second) {\n"; + gc += " sum += time;\n"; + gc += " sum2 += time*time;\n"; + gc += " }\n"; + gc += " double average = sum / op.second.size();\n"; + gc += " double stddev = std::sqrt(( sum2 - sum *average)/ (op.second.size()-1));\n"; + gc += " averageResults.push_back({op.first, average, stddev, op.second.size()});\n"; + gc += " }\n"; + gc += "\n"; + gc += " // sort average results in decreasing time\n"; + gc += " std::sort(averageResults.begin(), averageResults.end(),\n"; + gc += " []( std::tuple a, std::tuple b) {return std::get<1>(a) > std::get<1>(b); });\n"; + gc += "\n"; + gc += " for (const auto & r : averageResults) {\n"; + gc += " std::cout << \" \" << std::left << std::setw(20) << std::get<0>(r)\n"; + gc += " << \": \" << std::fixed << std::setprecision(6) << std::get<1>(r) << \" +/- \" \n"; + gc += " << std::get<2>(r)/std::sqrt(std::get<3>(r)) << \" us\"\n"; + gc += " << \" (over \" << std::get<3>(r) << \" runs)\" << std::endl;\n"; + gc += " }\n"; + gc += " std::cout << std::string(50, '=') << \"\\n\" << std::endl;\n"; + gc += " }\n"; + gc += "\n"; + + // Generate ResetProfilingResults function + gc += " void ResetProfilingResults() {\n"; + gc += " fProfilingResults.clear();\n"; + gc += " }\n"; + gc += "\n"; + + // Generate GetOpAvgTime function + gc += " std::map GetOpAvgTime() const {\n"; + gc += " if (fProfilingResults.empty()) {\n"; + gc += " return {};\n"; + gc += " }\n"; + gc += "\n"; + gc += " std::map avg;\n"; + gc += " 
for (const auto& op : fProfilingResults) {\n"; + gc += " double mean = 0.0;\n"; + gc += " for (double time : op.second) {\n"; + gc += " mean += time;\n"; + gc += " }\n"; + gc += " mean /= op.second.size();\n"; + gc += " avg[op.first] = mean;\n"; + gc += " }\n"; + gc += "\n"; + gc += " return avg;\n"; + gc += " }\n"; + gc += "\n"; + + // Generate GetOpVariance function + gc += " std::map GetOpVariance() const {\n"; + gc += " if (fProfilingResults.empty()) {\n"; + gc += " return {};\n"; + gc += " }\n"; + gc += "\n"; + gc += " std::map variance;\n"; + gc += " for (const auto& op : fProfilingResults) {\n"; + gc += " // Var[X] = E[X^2] - E[X]^2\n"; + gc += " double mean = 0.0, mean2 = 0.0;\n"; + gc += " for (double time : op.second) {\n"; + gc += " mean += time;\n"; + gc += " mean2 += time * time;\n"; + gc += " }\n"; + gc += " mean /= op.second.size();\n"; + gc += " mean2 /= op.second.size();\n"; + gc += " variance[op.first] = mean2 - mean * mean;\n"; + gc += " }\n"; + gc += "\n"; + gc += " return variance;\n"; + gc += " }\n"; +} + +// Main generation function for the profiler. +void RModelProfiler::Generate() +{ + // Clear the profiler's code string to start fresh. + fModel.fProfilerGC.clear(); + auto &gc = fModel.fProfilerGC; + + // 1. Add the data member to the Session struct to store results. + gc += "public:\n"; + gc += " // Maps an operator name to a vector of its execution times (in microseconds).\n"; + gc += " std::map> fProfilingResults;\n\n"; + + // 2. Generate and add the utility functions like PrintProfilingResults. + GenerateUtilityFunctions(); + + // 3. Generate the signature for the profiled doInfer method. + std::string doInferSignature = fModel.GenerateInferSignature(); + if (!doInferSignature.empty()) doInferSignature += ", "; + for (auto const &name : fModel.GetOutputTensorNames()) { + doInferSignature += " std::vector<" + ConvertTypeToString(fModel.GetTensorType(name)) + "> &output_tensor_" + name + ","; + } + if (!fModel.GetOutputTensorNames().empty()) { + doInferSignature.back() = ' '; + } + gc += "void doInfer(" + doInferSignature + ") {\n"; + + // 4. Generate the body of the doInfer method with timing instrumentation. + gc += " // Timer variable for profiling\n"; + gc += " std::chrono::steady_clock::time_point tp_start, tp_overall_start;\n\n"; + gc += " tp_overall_start = std::chrono::steady_clock::now();\n\n"; + + for (size_t op_idx = 0; op_idx < fModel.fOperators.size(); ++op_idx) { + const auto& op = fModel.fOperators[op_idx]; + gc += " // -- Profiling for operator " + op->name + " --\n"; + gc += " tp_start = std::chrono::steady_clock::now();\n\n"; + + // Add the actual operator inference code + gc += op->Generate(std::to_string(op_idx)); + + // Add the code to stop the timer and store the result + gc += "\n fProfilingResults[\"" + op->name + "\"].push_back(\n"; + gc += " std::chrono::duration_cast>(\n"; + gc += " std::chrono::steady_clock::now() - tp_start).count());\n\n"; + } + + // 5. Generate the code to fill the output tensors. + gc += " using TMVA::Experimental::SOFIE::UTILITY::FillOutput;\n\n"; + for (std::string const &name : fModel.GetOutputTensorNames()) { + bool isIntermediate = fModel.fIntermediateTensorInfos.count(name) > 0; + std::string n = isIntermediate ? 
std::to_string(ConvertShapeToLength(fModel.GetTensorShape(name))) + : ConvertDynamicShapeToLength(fModel.GetDynamicTensorShape(name)); + gc += " FillOutput(tensor_" + name + ", output_tensor_" + name + ", " + n + ");\n"; + } + + gc += "\n // -- Record overall inference time --\n"; + gc += " fProfilingResults[\"Overall_Time\"].push_back(\n"; + gc += " std::chrono::duration_cast>(\n"; + gc += " std::chrono::steady_clock::now() - tp_overall_start).count());\n"; + + gc += "}\n\n"; // End of doInfer function +} + +} // namespace SOFIE +} // namespace Experimental +} // namespace TMVA diff --git a/tmva/sofie/src/SOFIE_common.cxx b/tmva/sofie/src/SOFIE_common.cxx index c107b489be19e..54fed04ba42b1 100644 --- a/tmva/sofie/src/SOFIE_common.cxx +++ b/tmva/sofie/src/SOFIE_common.cxx @@ -4,6 +4,8 @@ #include #include #include +#include +#include namespace TMVA { namespace Experimental { @@ -89,7 +91,7 @@ std::string ConvertTypeToString(ETensorType type){ return "double"; } case ETensorType::BOOL : { - return "bool"; + return "uint8_t"; } default:{ return "other_" + std::to_string( (int) type); @@ -130,7 +132,7 @@ std::string ConvertDimShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; for (size_t i = 0; i < shape.size(); i++) { - out << shape[i].GetVal(); + out << shape[i]; if (i < shape.size()-1) out << " , "; } out << " }"; @@ -412,14 +414,15 @@ std::pair> UTILITY::MultidirectionalBroadcastShape(std + " to a common shape."); } } -// unidirectional broadcast- only B changes +// unidirectional broadcast- of shape A to target B std::vector UTILITY::UnidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) { - auto ret = UTILITY::MultidirectionalBroadcastShape(shapeA, shapeB); + auto ret = UTILITY::MultidirectionalBroadcastShape(shapeB, shapeA); if (ret.first > 1) { - std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape " - + ConvertShapeToString(shapeA) + " and " + ConvertShapeToString(shapeB) - + " to a common shape."); + throw + std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape " + + ConvertShapeToString(shapeA) + " to " + ConvertShapeToString(shapeB) + + " in a common shape."); } return ret.second; } @@ -547,6 +550,130 @@ std::vector UTILITY::ComputeStrideFromShape(const std::vector & shape) return strides; } +struct FreeBlock { + std::size_t offset; + std::size_t size; + bool operator<(const FreeBlock& other) const { + // order by offset for deterministic coalescing + return offset < other.offset; + } +}; + +struct MemoryEvent { + int t; // time (i.e. operator index) + int type; // 0 = END first, 1 = START + int idx; // tensor index + bool operator<(const MemoryEvent& o) const { + if (t != o.t) return t < o.t; + return type < o.type; // END before START at the same time + } +}; + +/// Greedy best-fit planner with coalescing free list. +MemoryResult OrganizeMemory(const std::vector & tensorsInfo ) +{ + // Basic validation + for (const auto &t : tensorsInfo) { + if (!(t.end > t.begin)) { + throw std::runtime_error("Each tensor must have end > begin."); + } + } + + // Build events: free before allocate at equal times. 
+ std::vector events; + events.reserve(tensorsInfo.size() * 2); + for (int i = 0; i < (int)tensorsInfo.size(); ++i) { + events.push_back({tensorsInfo[i].end, 0, i}); // END + events.push_back({tensorsInfo[i].begin, 1, i}); // START + } + std::sort(events.begin(), events.end()); + + std::vector tensorsOffset(tensorsInfo.size()); + + // Free list ordered by offset (for O(log n) coalescing) + // and faster insert/erase with respect to a vector + std::set free_list; + + // Bookkeeping: size/offset map for frees. + std::unordered_map live_size; + std::unordered_map live_offset; + + std::size_t total_bytes = 0; + + auto allocate_best_fit = [&](std::size_t need) -> std::size_t { + // Find the *smallest* block whose size >= need (best-fit). + // Since free_list is ordered by offset, we scan to find best by size. + // (For very large sets you could maintain a multimap by size as well.) + auto best = free_list.end(); + for (auto it = free_list.begin(); it != free_list.end(); ++it) { + if (it->size >= need) { + if (best == free_list.end() || it->size < best->size) + best = it; + } + } + if (best != free_list.end()) { + std::size_t off = best->offset; + if (best->size == need) { + free_list.erase(best); + } else { + FreeBlock updated{best->offset + need, best->size - need}; + free_list.erase(best); + free_list.insert(updated); + } + return off; + } + // No free block large enough; grow the heap. + std::size_t off = total_bytes; + total_bytes += need; + return off; + }; + + auto try_coalesce = [&](std::set::iterator it) { + // Coalesce with previous + if (it != free_list.begin()) { + auto prev = std::prev(it); + if (prev->offset + prev->size == it->offset) { + FreeBlock merged{prev->offset, prev->size + it->size}; + free_list.erase(prev); + it = free_list.erase(it); + it = free_list.insert(merged).first; + } + } + // Coalesce with next + auto next = std::next(it); + if (next != free_list.end() && it->offset + it->size == next->offset) { + FreeBlock merged{it->offset, it->size + next->size}; + free_list.erase(next); + it = free_list.erase(it); + free_list.insert(merged); + } + }; + + // Sweep through time. 
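+    // Illustration of the sweep below (hypothetical lifetimes, not taken from a real model):
+    //   given A{begin=0, end=2, size=100}, B{begin=1, end=3, size=50}, C{begin=2, end=4, size=100},
+    //   A is placed at offset 0 and B at offset 100 (the heap grows to 150 bytes);
+    //   at t = 2 the END event of A is processed before the START of C, so the best-fit
+    //   search reuses A's freed block and C also gets offset 0.
+    //   Result: total_bytes = 150, offsets = {A: 0, B: 100, C: 0}.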
+ for (const auto &e : events) { + if (e.type == 0) { // END: free + auto it_sz = live_size.find(e.idx); + auto it_off = live_offset.find(e.idx); + if (it_sz != live_size.end() && it_off != live_offset.end()) { + FreeBlock fb{it_off->second, it_sz->second}; + // Insert and coalesce with neighbors + auto it = free_list.insert(fb).first; + try_coalesce(it); + live_size.erase(it_sz); + live_offset.erase(it_off); + } + } else { // START: allocate + auto &t = tensorsInfo[e.idx]; + std::size_t off = allocate_best_fit(t.size); + tensorsOffset[e.idx] = off; + live_size[e.idx] = t.size; + live_offset[e.idx] = off; + } + } + + return MemoryResult{total_bytes, std::move(tensorsOffset)}; +} + } // namespace SOFIE } // namespace Experimental } // namespace TMVA diff --git a/tmva/sofie/test/TestCustomModelsFromONNX.cxx b/tmva/sofie/test/TestCustomModelsFromONNX.cxx index 5b77caf2aed1d..401afb8257e25 100644 --- a/tmva/sofie/test/TestCustomModelsFromONNX.cxx +++ b/tmva/sofie/test/TestCustomModelsFromONNX.cxx @@ -323,6 +323,8 @@ #include "ScatterElements_FromONNX.hxx" +#include "MatMul_Stacked_FromONNX.hxx" + #include "gtest/gtest.h" constexpr float DEFAULT_TOLERANCE = 1e-3f; @@ -2856,7 +2858,7 @@ TEST(ONNX, RangeFloat) { float start = 1.; float limit = 10.; float delta = 2.; - TMVA_SOFIE_RangeFloat::Session s("RangeFloat_FromONNX.dat"); + TMVA_SOFIE_RangeFloat::Session s("RangeFloat_FromONNX.dat",5); std::vector output(s.infer(&start, &limit, &delta)); // Checking the output size @@ -2875,7 +2877,7 @@ TEST(ONNX, RangeInt) { int64_t start = 1; int64_t limit = 10; int64_t delta = 2; - TMVA_SOFIE_RangeInt::Session s("RangeInt_FromONNX.dat"); + TMVA_SOFIE_RangeInt::Session s("RangeInt_FromONNX.dat",5); std::vector output(s.infer(&start, &limit, &delta)); // Checking the output size @@ -2947,7 +2949,7 @@ TEST(ONNX, Where) { // test also the broadcast of boolean tensors std::vector input1 = {1,2}; std::vector input2 = {3,4,5,6}; - bool cond[] = {true, false, true}; // need to pass arrays for booleans + uint8_t cond[] = {true, false, true}; // need to pass arrays for booleans std::vector correct = {1,2,5,6,1,2}; TMVA_SOFIE_Where::Session s("Where_FromONNX.dat"); std::vector output(s.infer(input1.data(), input2.data(), cond)); @@ -3214,3 +3216,24 @@ TEST(ONNX, ScatterElements) EXPECT_LE(std::abs(output[i] - correct_output[i]), DEFAULT_TOLERANCE); } } + +TEST(ONNX, MatMul_Stacked) +{ + // test a stacked (batched) MatMul with a dynamic batch dimension + std::vector input1 = {1,2,3,4,5,6,7,8}; // input tensor shape is (2,2,2) + std::vector input2 = {2,3}; // shape is (2,1) + + std::vector correct_output = {8,18, 28,38}; + + // model is dynamic, use N = 2 + TMVA_SOFIE_MatMul_Stacked::Session s("MatMul_Stacked_FromONNX.dat", 2); + + auto output = s.infer(2, input1.data(), input2.data()); + + // Checking output size + EXPECT_EQ(output.size(), correct_output.size()); + // Checking output + for (size_t i = 0; i < output.size(); ++i) { + EXPECT_LE(std::abs(output[i] - correct_output[i]), DEFAULT_TOLERANCE); + } +} diff --git a/tmva/sofie/test/TestCustomModelsFromROOT.cxx b/tmva/sofie/test/TestCustomModelsFromROOT.cxx index d077aede3e2e6..7e3c8c9c2fc09 100644 --- a/tmva/sofie/test/TestCustomModelsFromROOT.cxx +++ b/tmva/sofie/test/TestCustomModelsFromROOT.cxx @@ -891,7 +891,8 @@ TEST(ROOT, RangeFloat) { float start = 1.; float limit = 10.; float delta = 2.; - std::vector output = TMVA_SOFIE_RangeFloat::infer(&start, &limit, &delta); + TMVA_SOFIE_RangeFloat::Session s("",5); + std::vector output(s.infer(&start, &limit, &delta)); // 
Checking the output size EXPECT_EQ(output.size(), sizeof(RangeFloat_ExpectedOutput::outputs) / sizeof(float)); @@ -909,7 +910,8 @@ TEST(ROOT, RangeInt) { int64_t start = 1; int64_t limit = 10; int64_t delta = 2; - std::vector output = TMVA_SOFIE_RangeInt::infer(&start, &limit, &delta); + TMVA_SOFIE_RangeInt::Session s("",5); + std::vector output(s.infer(&start, &limit, &delta)); // Checking the output size EXPECT_EQ(output.size(), sizeof(RangeInt_ExpectedOutput::outputs) / sizeof(int64_t)); diff --git a/tmva/sofie/test/input_models/MatMul_Stacked.onnx b/tmva/sofie/test/input_models/MatMul_Stacked.onnx new file mode 100644 index 0000000000000..19c39ee2adddd --- /dev/null +++ b/tmva/sofie/test/input_models/MatMul_Stacked.onnx @@ -0,0 +1,19 @@ + + onnx-example:„ + +input1 +input2output"MatMulAddGraphZ +input1 + +N + +Z +input2 +  + +b +output + +N + +B \ No newline at end of file diff --git a/tmva/sofie_parsers/src/RModelParser_ONNX.cxx b/tmva/sofie_parsers/src/RModelParser_ONNX.cxx index 7b4ade2b6bc09..4903c8d1c6511 100644 --- a/tmva/sofie_parsers/src/RModelParser_ONNX.cxx +++ b/tmva/sofie_parsers/src/RModelParser_ONNX.cxx @@ -731,7 +731,8 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & std::cout << "\t" << i << " " << nodesOrder[i] << " parsing operator " << op_type << std::endl; } - std::unique_ptr op = ParseOperator(i, graph, nodesOrder, nodesChildren[i]); + std::unique_ptr op = ParseOperator(i, graph, nodesOrder, nodesChildren[nodesOrder[i]]); + if (!op) { if (verbose) { std::cout << "\t\tskipping operator since it is fused with previous one" << std::endl; @@ -739,6 +740,12 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & // for skipping the fused nodes like Add after MatMul continue; } + const auto &nodeproto = graph.node(nodesOrder[i]); + op->name = nodeproto.name(); + if (op->name.empty()) { + op->name = op_type + "_" + std::to_string(i); + } + rmodel.AddOperator(std::move(op), node_order_exec++); } diff --git a/tutorials/machine_learning/TMVA_SOFIE_ONNX.C b/tutorials/machine_learning/TMVA_SOFIE_ONNX.C index 8c192789e1210..878167db8c791 100644 --- a/tutorials/machine_learning/TMVA_SOFIE_ONNX.C +++ b/tutorials/machine_learning/TMVA_SOFIE_ONNX.C @@ -19,7 +19,7 @@ void TMVA_SOFIE_ONNX(std::string inputFile = ""){ SOFIE::RModel model = parser.Parse(inputFile, true); //Generating inference code - model.Generate(); + model.Generate(SOFIE::Options::kProfile); // write the code in a file (by default Linear_16.hxx and Linear_16.dat model.OutputGenerated();
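// A minimal usage sketch for the profiled Session generated with Options::kProfile above.
// Hypothetical names: the tutorial's default output is Linear_16.hxx / Linear_16.dat, and the
// input size used here is made up; adapt both to the actual model being parsed.
//
//   #include "Linear_16.hxx"
//   TMVA_SOFIE_Linear_16::Session s("Linear_16.dat");
//   std::vector<float> x(100, 1.f);                     // dummy input, sized for the model
//   for (int i = 0; i < 100; ++i) s.infer(x.data());    // each call records per-operator times
//   s.PrintProfilingResults();                          // mean time per operator, slowest first
//   auto avgTimes = s.GetOpAvgTime();                   // operator name -> mean time in microseconds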