From 9edf0fafa307dbfcba33ef3d0b583df2ae2092bf Mon Sep 17 00:00:00 2001 From: moneta Date: Mon, 10 Nov 2025 17:30:32 +0100 Subject: [PATCH 01/12] [tmva][sofie] Apply fixes for supporting Dynamic tensors Add missing support for Dynamic tensors for some operators. With this commit a full support for dynamic tensor is available for ParticleNet model. Fix also a bug in Concat operator when the concat axis is not the first one --- tmva/sofie/inc/TMVA/ROperator_BasicNary.hxx | 189 ++++++--- tmva/sofie/inc/TMVA/ROperator_Cast.hxx | 21 +- tmva/sofie/inc/TMVA/ROperator_Comparision.hxx | 362 +++++++++++------- tmva/sofie/inc/TMVA/ROperator_Concat.hxx | 27 +- tmva/sofie/inc/TMVA/ROperator_Constant.hxx | 4 +- tmva/sofie/inc/TMVA/ROperator_Conv.hxx | 15 +- tmva/sofie/inc/TMVA/ROperator_Gather.hxx | 9 +- tmva/sofie/inc/TMVA/ROperator_Range.hxx | 134 +++++-- tmva/sofie/inc/TMVA/ROperator_Reduce.hxx | 2 +- tmva/sofie/inc/TMVA/ROperator_Reshape.hxx | 8 +- tmva/sofie/inc/TMVA/ROperator_Slice.hxx | 14 +- tmva/sofie/inc/TMVA/ROperator_Tile.hxx | 31 +- tmva/sofie/inc/TMVA/ROperator_TopK.hxx | 59 ++- tmva/sofie/inc/TMVA/SOFIE_common.hxx | 6 + tmva/sofie/src/RModel.cxx | 22 +- 15 files changed, 598 insertions(+), 305 deletions(-) diff --git a/tmva/sofie/inc/TMVA/ROperator_BasicNary.hxx b/tmva/sofie/inc/TMVA/ROperator_BasicNary.hxx index bcc0e52a40ca3..f73bd34e53386 100644 --- a/tmva/sofie/inc/TMVA/ROperator_BasicNary.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_BasicNary.hxx @@ -23,10 +23,11 @@ struct NaryOperatorTraits { static const std::string Name() {return "Max";} static std::string Op(const std::string& res, std::vector& inputs) { std::stringstream out; - out << "\t" << "\t" << res << " = " << inputs[0] << ";\n"; + out << res << " = std::max({ " << inputs[0]; for (size_t i = 1; i < inputs.size(); i++) { - out << "\t" << "\t" << res << " = std::max(" << res << ", " << inputs[i] << ");\n"; + out << ", " << inputs[i]; } + out << "});\n"; return out.str(); } }; @@ -36,10 +37,11 @@ struct NaryOperatorTraits { static const std::string Name() {return "Min";} static std::string Op(const std::string& res, std::vector& inputs) { std::stringstream out; - out << "\t" << "\t" << res << " = " << inputs[0] << ";\n"; + out << res << " = std::min({ " << inputs[0]; for (size_t i = 1; i < inputs.size(); i++) { - out << "\t" << "\t" << res << " = std::min(" << res << ", " << inputs[i] << ");\n"; + out << ", " << inputs[i]; } + out << "});\n"; return out.str(); } }; @@ -52,7 +54,7 @@ struct NaryOperatorTraits { static const std::string Name() {return "Mean";} static std::string Op(const std::string& res, std::vector& inputs) { std::stringstream out; - out << "\t" << "\t" << res << " = (" << inputs[0]; + out << res << " = (" << inputs[0]; for (size_t i = 1; i < inputs.size(); i++) { out << " + " << inputs[i]; } @@ -66,7 +68,7 @@ struct NaryOperatorTraits { static const std::string Name() {return "Sum";} static std::string Op(const std::string& res, std::vector& inputs) { std::stringstream out; - out << "\t" << "\t" << res << " = " << inputs[0]; + out << res << " = " << inputs[0]; for (size_t i = 1; i < inputs.size(); i++) { out << " + " << inputs[i]; } @@ -83,10 +85,11 @@ private: std::vector fNInputs; std::string fNY; - std::vector> fShapeInputs; + std::vector> fShapeInputs; std::vector fNBroadcastedInputs; std::vector fShapeY; + std::vector fDimShapeY; bool fBroadcast = false; @@ -119,64 +122,164 @@ public: } void Initialize(RModel& model) override { + std::vector> inputShapes; for (auto &it : fNInputs) { if 
(!model.CheckIfTensorAlreadyExist(it)) { throw std::runtime_error("TMVA SOFIE BasicNary Op Input Tensor " + it + " is not found in model"); } - fShapeInputs.push_back(model.GetTensorShape(it)); + fShapeInputs.push_back(model.GetDimTensorShape(it)); + if (fNInputs.size()> 2) { + if (model.IsDimInputTensor(it)) + throw std::runtime_error("TMVA SOFIE BasicNary : supports only 2 inputs for dynamic tensors"); + else + inputShapes.push_back(model.GetTensorShape(it)); + } } // Find the common shape of the input tensors - fShapeY = UTILITY::MultidirectionalBroadcastShape(fShapeInputs); - model.AddIntermediateTensor(fNY, model.GetTensorType(fNInputs[0]), fShapeY); - // Broadcasting - size_t N = fNInputs.size(); - fNBroadcastedInputs.reserve(N); - for (size_t i = 0; i < N; i++) { - if (!UTILITY::AreSameShape(model.GetTensorShape(fNInputs[i]), fShapeY)) { - fBroadcast = true; - std::string name = "Broadcasted" + fNInputs[i]; - model.AddIntermediateTensor(name, model.GetTensorType(fNInputs[0]), fShapeY); - fNBroadcastedInputs.emplace_back("tensor_" + name); - } else { - fNBroadcastedInputs.emplace_back("tensor_" + fNInputs[i]); + if (fShapeInputs.size() > 2 ) { + // support dynamic tensors now for input list of size=2 + auto shapeY = UTILITY::MultidirectionalBroadcastShape(inputShapes); + fDimShapeY = ConvertShapeToDim(shapeY); + } else if (fShapeInputs.size() == 2 ) { + auto ret = UTILITY::MultidirectionalBroadcastShape(fShapeInputs[0], fShapeInputs[1]); + // use same code as in BinaryOperator (need to extend for input sizes > 2) + fBroadcast = ret.first; + fDimShapeY = ret.second; + // case of all parametric shapes and MultiDirectionalBroadcastShape return the max of the 2 + // need to do before we declare the output tensor shape and the broadcasted ones + if (ret.first & 4) { + // check if one of the parameter is an input dimension + // define function to find this + auto IsInputDimParam = [&](const std::string &p) { + auto inputNames = model.GetInputTensorNames(); + for (auto &input : inputNames) { + for (auto &i_s : model.GetDimTensorShape(input)) { + if (i_s.isParam && i_s.param == p) + return true; + } + } + return false; + }; + auto & shapeA = fShapeInputs[0]; + auto & shapeB = fShapeInputs[1]; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + auto &s = fDimShapeY[i]; + if (s.isParam && s.param.find("std::max") != std::string::npos) { + if (IsInputDimParam(shapeA[i].param)) { + // case dim is 1 we indicate that the input parameter is equal to 1 + if (shapeA[i].dim != 1) + s = shapeA[i]; + else + s = shapeB[i]; + } else if (IsInputDimParam(shapeB[i].param)) { + if (shapeB[i].dim != 1) + s = shapeB[i]; + else + s = shapeA[i]; + } + } + } } + } else if (fShapeInputs.size() == 1 ) { + fDimShapeY = fShapeInputs[0]; } + if (!fShapeY.empty()) + model.AddIntermediateTensor(fNY, model.GetTensorType(fNInputs[0]), fShapeY); + else + model.AddIntermediateTensor(fNY, model.GetTensorType(fNInputs[0]), fDimShapeY); + + fType = ConvertTypeToString(model.GetTensorType(fNInputs[0])); + + if (model.Verbose()) { + std::cout << NaryOperatorTraits::Name() << " : "; + if (fNInputs.size() == 2) + std::cout << ConvertShapeToString(fShapeInputs[0]) << " , " + << ConvertShapeToString(fShapeInputs[1]); + std::cout << " --> " << ConvertShapeToString(fDimShapeY) << std::endl; + } } std::string Generate(std::string OpName) override { OpName = "op_" + OpName; - if (fShapeY.empty()) { + if (fDimShapeY.empty()) { throw std::runtime_error("TMVA SOFIE BasicNary called to Generate without being initialized first"); } 
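// A minimal standalone sketch of the multidirectional (numpy-style) broadcast rule
// relied on by the Initialize code above, written for plain numeric dims. The real
// UTILITY::MultidirectionalBroadcastShape works on SOFIE Dim objects that can be
// symbolic; the helper name below is illustrative only, not the library API.
#include <algorithm>
#include <stdexcept>
#include <vector>

std::vector<size_t> BroadcastShapeSketch(std::vector<size_t> a, std::vector<size_t> b)
{
   // right-align the two shapes by padding the shorter one with leading 1s
   if (a.size() < b.size()) a.insert(a.begin(), b.size() - a.size(), 1);
   if (b.size() < a.size()) b.insert(b.begin(), a.size() - b.size(), 1);
   std::vector<size_t> out(a.size());
   for (size_t i = 0; i < a.size(); ++i) {
      if (a[i] == b[i] || a[i] == 1 || b[i] == 1)
         out[i] = std::max(a[i], b[i]);   // equal dims, or one side broadcast from 1
      else
         throw std::runtime_error("shapes are not broadcastable");
   }
   return out;   // e.g. {5,1,3} and {4,3} -> {5,4,3}
}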
std::stringstream out; - size_t length = ConvertShapeToLength(fShapeY); + auto length = ConvertDimShapeToLength(fDimShapeY); out << SP << "\n//------ BasicNary operator\n"; - if (fBroadcast) { - for (size_t i = 0; i < fNInputs.size(); i++) { - if (fNBroadcastedInputs[i] != fNInputs[i]) { - out << SP << SP << "// Broadcasting " << fNInputs[i] << " to " << ConvertShapeToString(fShapeY) << "\n"; - out << SP << SP << "{\n"; - out << SP << SP << SP << fType << "* data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<" << fType << ">(tensor_" + fNInputs[i] << ", " << ConvertShapeToString(fShapeInputs[i]); - out << ", " << ConvertShapeToString(fShapeY) << ");\n"; - out << SP << SP << SP << "std::copy(data, data + " << length << ", " << fNBroadcastedInputs[i] << ");\n"; - out << SP << SP << SP << "delete[] data;\n"; - out << SP << SP << "}\n"; - } - } - } - if (fNInputs.size() == 1) { + int nInputs = fNInputs.size(); + + if (nInputs == 1) { out << SP << "std::copy(tensor_" << fNInputs[0] << ", tensor_" << fNInputs[0] << " + "; out << length << ", tensor_" << fNY << ");\n"; } else { - std::vector inputs(fNBroadcastedInputs.size()); - for (size_t i = 0; i < fNBroadcastedInputs.size(); i++) { - inputs[i] = fNBroadcastedInputs[i] + "[id]"; + + // implement operator without broadcasting, but using loos on all indices + std::vector> inputStrides(nInputs); + for (int i = 0; i < nInputs; i++) + inputStrides[i] = UTILITY::ComputeStrideFromShape(fShapeInputs[i]); + + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + + // make loop on output indices + std::string compute_idx_Y; + int nloop = 0; + if (fDimShapeY.empty() || + std::all_of(fDimShapeY.begin(), fDimShapeY.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_Y = "0"; + } else { + for (size_t i = 0; i < fDimShapeY.size(); ++i) { + if (fDimShapeY[i].dim != 1 && fDimShapeY[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) out << SP; + out << "for (size_t idx_" << i << " = 0; idx_" << i << " < " << fDimShapeY[i] + << "; ++idx_" << i << "){\n"; + compute_idx_Y += "idx_" + std::to_string(i); + if (stridesY[i].GetVal() != "1") + compute_idx_Y += " * " + stridesY[i].GetVal(); + compute_idx_Y += " + "; + } + } + // remove last 3 characters " + " + for (int j = 0; j < 3; j++) + compute_idx_Y.pop_back(); + } + // find indices for input tensors + std::vector inputs(nInputs); + for (int ipt = 0; ipt < nInputs; ipt++ ) { + std::string compute_idx_X; + auto & shape = fShapeInputs[ipt]; + auto & stride = inputStrides[ipt]; + if (shape.empty() || + std::all_of(shape.begin(), shape.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_X = "0"; + } else { + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i].dim == 1 || shape[i].GetVal() == "1") + continue; + compute_idx_X += "idx_" + std::to_string(i + (fDimShapeY.size() - shape.size())); + if (stride[i].GetVal() != "1") + compute_idx_X += " * " + stride[i].GetVal(); + compute_idx_X += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_X.pop_back(); + } + inputs[ipt] = "tensor_" + fNInputs[ipt] + "[" + compute_idx_X + "]"; + } + + // perform the operation + for (int j = 0; j < nloop + 1; j++) out << SP; + std::string output = "tensor_" + fNY + "[" + compute_idx_Y + "]"; + out << NaryOperatorTraits::Op(output, inputs); + + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; } - out << SP << "for (size_t id = 0; id < " << length << "; id++) 
{\n"; - out << NaryOperatorTraits::Op("tensor_" + fNY + "[id]", inputs); - out << SP << "}\n"; } return out.str(); } diff --git a/tmva/sofie/inc/TMVA/ROperator_Cast.hxx b/tmva/sofie/inc/TMVA/ROperator_Cast.hxx index f48e27ee4f264..8267bb8a7e4f4 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Cast.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Cast.hxx @@ -46,7 +46,7 @@ public: throw std::runtime_error("TMVA SOFIE Cast Op Input Tensor is not found in model"); } fShape = model.GetDimTensorShape(fNX); - // shoud we add a check if the same type + // should we add a check if the same type auto inputType = model.GetTensorType(fNX); if (model.IsInitializedTensor(fNX)) { fIsOutputConstant = true; @@ -57,29 +57,30 @@ public: } else fIsOutputConstant = false; + } else if (model.IsShapeTensor(fNX) && ConvertStringToType(fAttrType) == ETensorType::INT64) { + auto shapeData = model.GetShapeTensorValues(fNX); + model.AddShapeTensor(fNY, shapeData, fShape.size() == 0); + fIsOutputConstant = true; } if (!fIsOutputConstant) model.AddIntermediateTensor(fNY, ConvertStringToType(fAttrType), fShape); if (model.Verbose()) { - std::cout << "Cast : " << ConvertTypeToString(inputType) << " " << fNX << " -> " << fAttrType << " for " << fNY; + std::cout << "Cast : " << ConvertTypeToString(inputType) << " " << fNX << " -> " << fAttrType << " for " << fNY + << " shape " << ConvertDimShapeToString(fShape); if (fIsOutputConstant) std::cout << " (constant) "; std::cout << std::endl; } } - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; + std::string Generate(std::string opName) override { + + // output shape can be empty if is a scalar - OpName = "op_" + OpName; - if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Cast called to Generate without being initialized first"); - } std::stringstream out; auto length = ConvertDimShapeToLength(fShape); - // out << SP << ETensorType << " " << OpName << "_attr = " << fattr << ";\n"; - out << "\n//------ CAST\n"; + out << "\n//------ CAST " << opName << " ---> " << fNY << " " << ConvertDimShapeToString(fShape) << "\n"; // no generated code for constant outputs if (fIsOutputConstant) return out.str(); diff --git a/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx b/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx index 0d365ae517de5..40c8923676aaf 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx @@ -65,11 +65,10 @@ private: std::vector fDimShapeX1; std::vector fDimShapeX2; std::vector fShapeY; - std::string fNBroadcastedX1; - std::string fNBroadcastedX2; + std::vector fDimShapeY; ETensorType fTensorType1 = ETensorType::UNDEFINED; ETensorType fTensorType2 = ETensorType::UNDEFINED; - bool fBroadcast = false; + int fBroadcastFlag = 0; public: @@ -115,136 +114,175 @@ public: } fTensorType1 = model.GetTensorType(fNX1); fTensorType2 = model.GetTensorType(fNX2); - bool broadcast = !UTILITY::AreSameShape(fShapeX1, fShapeX2); - if (broadcast) { - // Y is the common shape of A and B - fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeX1, fShapeX2); - bool broadcastX1 = !UTILITY::AreSameShape(fShapeX1, fShapeY); - bool broadcastX2 = !UTILITY::AreSameShape(fShapeX2, fShapeY); - // Broadcast A to Y - if (broadcastX1) { - if (model.IsInitializedTensor(fNX1)) { - auto data = model.GetInitializedTensorData(fNX1); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX1, fShapeY), - std::default_delete()); - // Update the data and the shape of A - 
model.UpdateInitializedTensor(fNX1, model.GetTensorType(fNX1), fShapeY, broadcastedData); - fShapeX1 = fShapeY; - } else { - // Add an intermediate tensor for broadcasting A - fNBroadcastedX1 = "Broadcasted" + fNX1; - model.AddIntermediateTensor(fNBroadcastedX1, model.GetTensorType(fNX1), fShapeY); + // case of non dynamic tensors + if (!fShapeX1.empty() && !fShapeX2.empty()) { + bool broadcastX1 = false; + bool broadcastX2 = false; + if (UTILITY::AreSameShape(fShapeX1, fShapeX2)) { + // no broadcast needed + fShapeY = fShapeX1; + } else { + // Y is the common shape of A and B + fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeX1, fShapeX2); + broadcastX1 = !UTILITY::AreSameShape(fShapeX1, fShapeY); + broadcastX2 = !UTILITY::AreSameShape(fShapeX2, fShapeY); + } + + + // analyze case of constant tensors or shape tensors (which have known shapes but data as Dim values + // normal case with non-dynamic tensor is also here + T *data1 = nullptr; + T *data2 = nullptr; + std::unique_ptr broadcastedData1; + std::unique_ptr broadcastedData2; + // data for shape tensors + std::vector shapeData1; + std::vector shapeData2; + size_t length = ConvertShapeToLength(fShapeY); + bool *outData = new bool[length]; + if (model.IsInitializedTensor(fNX1)) { + data1 = static_cast(model.GetInitializedTensorData(fNX1).get()); + if (broadcastX1) { + broadcastedData1 = std::unique_ptr( + UTILITY::UnidirectionalBroadcast(data1, fShapeX1, fShapeY)); + data1 = broadcastedData1.get(); } + + } else if (model.IsShapeTensor(fNX1)) { + shapeData1 = model.GetShapeTensorValues(fNX1); } - // Broadcast B to Y - if (broadcastX2) { - if (model.IsInitializedTensor(fNX2)) { - auto data = model.GetInitializedTensorData(fNX2); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX2, fShapeY), - std::default_delete()); - // Update the data and the shape of B - model.UpdateInitializedTensor(fNX2, model.GetTensorType(fNX2), fShapeY, broadcastedData); - fShapeX2 = fShapeY; - } else { - // Add an intermediate tensor for broadcasting B - fNBroadcastedX2 = "Broadcasted" + fNX2; - model.AddIntermediateTensor(fNBroadcastedX2, model.GetTensorType(fNX2), fShapeY); + if (model.IsInitializedTensor(fNX2)) { + data2 = static_cast(model.GetInitializedTensorData(fNX2).get()); + if (broadcastX2) { + broadcastedData2 = std::unique_ptr( + UTILITY::UnidirectionalBroadcast(data2, fShapeX2, fShapeY)); + data2 = broadcastedData2.get(); } + } else if (model.IsShapeTensor(fNX2)) { + shapeData2 = model.GetShapeTensorValues(fNX2); } - } else { - fShapeY = fShapeX1; - } - // case of constant tensors - T * data1 = nullptr; - T * data2 = nullptr; - std::vector shapeData1; - std::vector shapeData2; - size_t length = ConvertShapeToLength(fShapeY); - bool * outData = new bool[length]; - if (model.IsInitializedTensor(fNX1)) { - data1 = static_cast(model.GetInitializedTensorData(fNX1).get()); - } else if (model.IsShapeTensor(fNX1)) { - shapeData1 = model.GetShapeTensorValues(fNX1); - } - if (model.IsInitializedTensor(fNX2)) { - data2 = static_cast(model.GetInitializedTensorData(fNX2).get()); - } else if (model.IsShapeTensor(fNX2)) { - shapeData2 = model.GetShapeTensorValues(fNX2); - } - if (data1 && data2) { - fIsOutputConstant = true; - for (size_t i = 0; i < length; i++) - outData[i] = ComparisionTrait::Result(data1[i], data2[i]); - model.AddConstantTensor(fNY, fShapeY, outData); - if (model.Verbose()) - std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << 
ConvertValuesToString(length,outData) << std::endl; - } else if ((data1 || !shapeData1.empty()) && (data2 || !shapeData2.empty())) { - fIsOutputConstant = true; - if (data1 && !data2) { - // data 1 is constant and data2 is shape - for (size_t i = 0; i < length; i++) { - if (shapeData2[i].isParam) { - if (shapeData2[i].dim == size_t(-1) || data1[i] > 0) { - fIsOutputConstant = false; - break; - } else { - // assume a comparison is done with .dim = 0 - shapeData2[i].dim = 0; + if (data1 && data2) { + fIsOutputConstant = true; + for (size_t i = 0; i < length; i++) + outData[i] = ComparisionTrait::Result(data1[i], data2[i]); + model.AddConstantTensor(fNY, fShapeY, outData); + if (model.Verbose()) + std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << " : " << ConvertValuesToString(length, outData) + << std::endl; + } else if ((data1 || !shapeData1.empty()) && (data2 || !shapeData2.empty())) { + fIsOutputConstant = true; + if (data1 && !data2) { + // data 1 is constant and data2 is shape + for (size_t i = 0; i < length; i++) { + if (shapeData2[i].isParam) { + if (shapeData2[i].dim == size_t(-1) || data1[i] > 0) { + fIsOutputConstant = false; + break; + } else { + // assume a comparison is done with .dim = 0 + shapeData2[i].dim = 0; + } } + outData[i] = ComparisionTrait::Result(data1[i], static_cast(shapeData2[i].dim)); } - outData[i] = ComparisionTrait::Result(data1[i], static_cast(shapeData2[i].dim)); - } - } else if (!data1 && data2) { - // data 1 is shape and dat2 is constant - for (size_t i = 0; i < length; i++) { - if (shapeData1[i].isParam) { - if (shapeData1[i].dim == size_t(-1) || data2[i] > 0) { + } else if (!data1 && data2) { + // data 1 is shape and dat2 is constant + for (size_t i = 0; i < length; i++) { + if (shapeData1[i].isParam) { + if (shapeData1[i].dim == size_t(-1) || data2[i] > 0) { + fIsOutputConstant = false; + break; + } else { + // assume a comparison is done with .dim = 0 + shapeData1[i].dim = 0; + } + } + outData[i] = ComparisionTrait::Result(static_cast(shapeData1[i].dim), data2[i]); + } + } else if (!shapeData1.empty() && !shapeData2.empty()) { + // both data1 and data2 are shape tensors + for (size_t i = 0; i < length; i++) { + if (!shapeData1[i].isParam && !shapeData2[i].isParam) { + outData[i] = ComparisionTrait::Result(shapeData1[i].dim, shapeData2[i].dim); + } else if (shapeData1[i].isParam && shapeData2[i].isParam) { + if (shapeData1[i].param == shapeData2[i].param) + outData[i] = ComparisionTrait::Result(1, 1); // comparison of two equal value + else { + fIsOutputConstant = false; + break; + } + } else { fIsOutputConstant = false; break; - } else { - // assume a comparison is done with .dim = 0 - shapeData1[i].dim = 0; } } - outData[i] = ComparisionTrait::Result(static_cast(shapeData1[i].dim), data2[i]); } - } else if (!shapeData1.empty() && !shapeData2.empty() ) { - // both data1 and data2 are shape tensors - for (size_t i = 0; i < length; i++) { - if (!shapeData1[i].isParam && !shapeData2[i].isParam) { - outData[i] = ComparisionTrait::Result(shapeData1[i].dim, shapeData2[i].dim); - } - else if (shapeData1[i].isParam && shapeData2[i].isParam) { - if (shapeData1[i].param == shapeData2[i].param) - outData[i] = ComparisionTrait::Result(1,1); // comparison of two equal value - else { - fIsOutputConstant = false; - break; + if (fIsOutputConstant) { + model.AddConstantTensor(fNY, fShapeY, outData); + if (model.Verbose()) + std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " + << 
ConvertShapeToString(fShapeY) << " : " << ConvertValuesToString(length, outData) + << " (constant) " << std::endl; + } + } + delete[] outData; + // case of non constant output (no constant or shape tensors) + if (!fIsOutputConstant && !fShapeY.empty()) { + model.AddIntermediateTensor(fNY, ETensorType::BOOL, fShapeY); + fDimShapeY = ConvertShapeToDim(fShapeY); + if (model.Verbose()) + std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << std::endl; + } + } else { + // case of dynamic tensors + // case A or B have dynamic shapes. We need to broadcast if shape are not same + auto ret = UTILITY::MultidirectionalBroadcastShape(fDimShapeX1, fDimShapeX2); + fBroadcastFlag = ret.first; + fDimShapeY = ret.second; + // case of all parametric shapes and MultiDirectionalBroadcastShape return the max of the 2 + // need to do before we declare the output tensor shape and the broadcasted ones + if (ret.first & 4) { + // check if one of the parameter is an input dimension + // define function to find this + auto IsInputDimParam = [&](const std::string &p) { + auto inputNames = model.GetInputTensorNames(); + for (auto &input : inputNames) { + for (auto &i_s : model.GetDimTensorShape(input)) { + if (i_s.isParam && i_s.param == p) + return true; } } - else { - fIsOutputConstant = false; - break; + return false; + }; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + auto &s = fDimShapeY[i]; + if (s.isParam && s.param.find("std::max") != std::string::npos) { + if (IsInputDimParam(fDimShapeX1[i].param)) { + // case dim is 1 we indicate that the input parameter is equal to 1 + if (fDimShapeX1[i].dim != 1) + s = fDimShapeX1[i]; + else + s = fDimShapeX2[i]; + } else if (IsInputDimParam(fDimShapeX2[i].param)) { + if (fDimShapeX2[i].dim != 1) + s = fDimShapeX2[i]; + else + s = fDimShapeX1[i]; + } } } } - if (fIsOutputConstant) { - model.AddConstantTensor(fNY, fShapeY, outData); - if (model.Verbose()) - std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(length,outData) << " (constant) " << std::endl; + model.AddIntermediateTensor(fNY, ETensorType::BOOL, fDimShapeY); + if (model.Verbose()) { + std::cout << ComparisionTrait::Name() << " : " << fNX1 << " " << ConvertShapeToString(fDimShapeX1) << " , " + << fNX2 << " " << ConvertShapeToString(fDimShapeX2) << " --> " + << fNY << " " << ConvertShapeToString(fDimShapeY) << std::endl; + model.PrintIntermediateTensors(); } } - delete [] outData; - if (!fIsOutputConstant) { - model.AddIntermediateTensor(fNY, ETensorType::BOOL , fShapeY); - if (model.Verbose()) - std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << std::endl; - } // check if this is not output operators to add a specific line for definining the tensor_xxx variable const auto & outputTensorNames = model.GetOutputTensorNames(); @@ -257,39 +295,85 @@ public: if (fIsOutputConstant) return ""; opName = "op_" + opName; - if (fShapeY.empty()) { + if (fDimShapeY.empty()) { throw std::runtime_error("TMVA SOFIE Comparision Op called to Generate without being initialized first"); } std::stringstream out; out << SP << "\n//------ " << ComparisionTrait::Name() << " " << opName << " --> " << ConvertShapeToString(fShapeY) << "\n"; - size_t length = ConvertShapeToLength(fShapeY); - // Broadcast A if it's uninitialized - if (!fNBroadcastedX1.empty()) { - std::string type1 = ConvertTypeToString(fTensorType1); - out << SP << "// Broadcasting 
uninitialized tensor " << fNX1 << "\n"; - out << SP << "{\n"; - out << SP << SP << type1 << "* data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<" << type1 << ">(tensor_" << fNX1 << ", " << ConvertShapeToString(fShapeX1) << ", " << ConvertShapeToString(fShapeY) << ");\n"; - out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNBroadcastedX1 << ");\n"; - out << SP << SP << "delete[] data;\n"; - out << SP << "}\n"; + + // need to add check if tensors are compatible as in binary operator + + // use same code as Binary operator + auto stridesA = UTILITY::ComputeStrideFromShape(fDimShapeX1); + auto stridesB = UTILITY::ComputeStrideFromShape(fDimShapeX2); + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + + std::string compute_idx_X1, compute_idx_X2, compute_idx_Y; + if (fDimShapeX1.empty() || + std::all_of(fDimShapeX1.begin(), fDimShapeX1.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_X1 = "0"; + } else { + for (size_t i = 0; i < fDimShapeX1.size(); ++i) { + if (fDimShapeX1[i].dim == 1 || fDimShapeX1[i].GetVal() == "1") + continue; + compute_idx_X1 += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeX1.size())); + if (stridesA[i].GetVal() != "1") + compute_idx_X1 += " * " + stridesA[i].GetVal(); + compute_idx_X1 += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_X1.pop_back(); + } + if (fDimShapeX2.empty() || + std::all_of(fDimShapeX2.begin(), fDimShapeX2.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_X2 = "0"; + } else { + for (size_t i = 0; i < fDimShapeX2.size(); ++i) { + if (fDimShapeX2[i].dim == 1 || fDimShapeX2[i].GetVal() == "1") + continue; + compute_idx_X2 += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeX2.size())); + if (stridesB[i].GetVal() != "1") + compute_idx_X2 += " * " + stridesB[i].GetVal(); + compute_idx_X2 += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_X2.pop_back(); } - // Broadcast B if it's uninitialized - if (!fNBroadcastedX2.empty()) { - std::string type2 = ConvertTypeToString(fTensorType2); - out << SP << "// Broadcasting uninitialized tensor " << fNX2 << "\n"; - out << SP << "{\n"; - out << SP << SP << type2 << "* data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<" << type2 << ">(tensor_" << fNX2 << ", " << ConvertShapeToString(fShapeX2) << ", " << ConvertShapeToString(fShapeY) << ");\n"; - out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNBroadcastedX2 << ");\n"; - out << SP << SP << "delete[] data;\n"; - out << SP << "}\n"; + int nloop = 0; + if (fDimShapeY.empty() || + std::all_of(fDimShapeY.begin(), fDimShapeY.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_Y = "0"; + } else { + for (size_t i = 0; i < fDimShapeY.size(); ++i) { + if (fDimShapeY[i].dim != 1 && fDimShapeY[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) out << SP; + out << "for (size_t idx_" << i << " = 0; idx_" << i << " < " << fDimShapeY[i] + << "; ++idx_" << i << "){\n"; + compute_idx_Y += "idx_" + std::to_string(i); + if (stridesY[i].GetVal() != "1") + compute_idx_Y += " * " + stridesY[i].GetVal(); + compute_idx_Y += " + "; + } + } + // remove last 3 characters " + " + for (int j = 0; j < 3; j++) + compute_idx_Y.pop_back(); + } + for (int j = 0; j < nloop + 1; j++) out << SP; + out << "tensor_" << fNY << "[" << compute_idx_Y << "] = " + << ComparisionTrait::Op( "tensor_" + fNX1 + "[" + 
compute_idx_X1 + "]" , + "tensor_" + fNX2 + "[" + compute_idx_X2 + "]") << " ;\n"; + + + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; } - const std::string& nameX1 = fNBroadcastedX1.empty()? fNX1 : fNBroadcastedX1; - const std::string& nameX2 = fNBroadcastedX2.empty()? fNX2 : fNBroadcastedX2; - out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "fTensor_" << fNY << "[id] = " << ComparisionTrait::Op( "tensor_" + nameX1 + "[id]" , "tensor_" + nameX2 + "[id]") << " ;\n"; - out << SP << "}\n"; // since output is a boolean need to add the tensor_xxx variable since it is not defined as a pointer to a boolean std::vector if (!fIsModelOutput) out << SP << "const std::vector & tensor_" << fNY << " = fTensor_" << fNY << ";\n"; diff --git a/tmva/sofie/inc/TMVA/ROperator_Concat.hxx b/tmva/sofie/inc/TMVA/ROperator_Concat.hxx index ad855341dfc17..d8155195c9f49 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Concat.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Concat.hxx @@ -123,7 +123,7 @@ concat_dim = inputs[i][iaxis]; else if (inputs[i][iaxis].isParam || concat_dim.isParam) { concat_dim = - Dim{ concat_dim.GetVal() + std::string("+ ") + inputs[i][iaxis].GetVal(), + Dim{ concat_dim.GetVal() + std::string(" + ") + inputs[i][iaxis].GetVal(), static_cast(-1)}; } else { concat_dim = Dim { concat_dim.dim + inputs[i][iaxis].dim }; @@ -156,7 +156,7 @@ } // output shape for concatenated axis - ret[fAxis] = Dim{concat_dim}; + ret[fAxis] = concat_dim; } // case of stacking (not supported yet) @@ -205,7 +205,7 @@ size_t inputLength = ConvertShapeToLength(inputShape); std::copy(inputData, inputData + inputLength, outputData.begin() + offset ); offset += inputLength; - // data do not need to be written as a weight + // data do not need to be written in teh generated code model.SetNotWritableInitializedTensor(input); } model.AddConstantTensor(fOutput, outputShape, outputData.data()); @@ -221,15 +221,18 @@ std::vector inputData; auto inputShape = model.GetTensorShape(input); // shape is not dynamic size_t inputLength = ConvertShapeToLength(inputShape); // shape can be a scalar - if (model.IsShapeTensor(input)) + if (model.IsShapeTensor(input)) { inputData = model.GetShapeTensorValues(input); - else if (model.IsConstantTensor(input)) { + } else if (model.IsInitializedTensor(input)) { inputData.resize(inputLength); auto intData = static_cast(model.GetInitializedTensorData(input).get()); for (size_t i = 0; i < inputData.size(); i++) inputData[i] = Dim{ static_cast(intData[i])}; } - std::cout << "concatenating input data " << inputLength << " " << inputData[0] << std::endl; + else { + // this should not happen + throw std::runtime_error("TMVA SOFIE Concat Operator- invalid input type for shape output type"); + } std::copy(inputData.begin(), inputData.end(), outputData.begin() + offset ); offset += inputLength; } @@ -251,13 +254,15 @@ } std::string Generate(std::string opName) override { - if (fIsOutputConstant) return ""; opName = "op_" + opName; + std::stringstream out; + out<<"\n//--------- Concat " << opName << " --> " << fOutput << " " << ConvertShapeToString(fOutputShape) << "\n"; + + if (fIsOutputConstant) return out.str(); + if(fOutputShape.empty()){ throw std::runtime_error("TMVA SOFIE Concat called to Generate without being initialized first"); } - std::stringstream out; - out<<"\n//--------- Concat " << opName << " --> " << ConvertShapeToString(fOutputShape) << "\n"; // special case when memory is contiguous bool hasShapeOnes = true; 
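// The operators touched in this patch (BasicNary, Comparision, Concat) index their
// inputs through row-major strides from UTILITY::ComputeStrideFromShape, skipping
// size-1 (broadcast) dimensions. A small numeric sketch of that indexing scheme; the
// generated code builds the same expressions as strings from Dim values, and the
// helper names here are illustrative assumptions.
#include <cstddef>
#include <vector>

std::vector<size_t> RowMajorStrides(const std::vector<size_t> &shape)
{
   std::vector<size_t> s(shape.size(), 1);
   for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i)
      s[i] = s[i + 1] * shape[i + 1];
   return s;   // e.g. {2,3,4} -> {12,4,1}
}

// Flat offset of an output multi-index into an input whose shape is right-aligned
// with the output shape and may contain size-1 (broadcast) dimensions.
size_t BroadcastOffset(const std::vector<size_t> &outIdx, const std::vector<size_t> &inShape)
{
   auto stride = RowMajorStrides(inShape);
   size_t shift = outIdx.size() - inShape.size();
   size_t offset = 0;
   for (size_t i = 0; i < inShape.size(); ++i)
      if (inShape[i] != 1)                  // broadcast dimensions contribute nothing
         offset += outIdx[i + shift] * stride[i];
   return offset;
}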
for(int i = 0; i0) - out << SP << SP << SP << "idxOut += " << fInputShapes[j-1][fAxis].GetVal() << ";\n"; + out << SP << SP << SP << "idxOut += " << inStrides[j-1][fAxis-1].GetVal() << ";\n"; out << SP << SP << SP << "int idxIn" << j <<" = "; for (int k = 0; k < fAxis; k++) { if (k > 0) out << " + "; out << inStrides[j][k].GetVal() << "*i" << k; } out << ";\n"; - out << SP << SP << SP << "for (size_t iC = 0; iC < " << fInputShapes[j][fAxis].GetVal() << "; ++iC) {\n"; + out << SP << SP << SP << "for (size_t iC = 0; iC < " << inStrides[j][fAxis-1].GetVal() << "; ++iC) {\n"; out << SP << SP << SP << SP << "tensor_" << fOutput << "[idxOut+iC] = tensor_" << fInputs[j] << "[idxIn" << j << "+iC];\n"; out << SP << SP << SP << "}\n"; // concatenate the axis values diff --git a/tmva/sofie/inc/TMVA/ROperator_Constant.hxx b/tmva/sofie/inc/TMVA/ROperator_Constant.hxx index 1cf5d13f5cd6f..3b339e3440488 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Constant.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Constant.hxx @@ -136,9 +136,9 @@ public: std::stringstream out; if (fIsOutputConstant) { if (fNX.empty()) - out << "// ---- Constant (no-op) " << opName << " --> " << ConvertShapeToString(fDimOutputShape) << "\n"; + out << "// ---- Constant (no-op) " << opName << " --> " << fNY << " " << ConvertShapeToString(fDimOutputShape) << "\n"; else - out << "// ---- ConstantOfShape (no-op) " << opName << " --> " << ConvertShapeToString(fDimOutputShape) << "\n"; + out << "// ---- ConstantOfShape (no-op) " << opName << " --> " << fNY << " " << ConvertShapeToString(fDimOutputShape) << "\n"; return out.str(); } // Only ConstantOfShape might require generation code diff --git a/tmva/sofie/inc/TMVA/ROperator_Conv.hxx b/tmva/sofie/inc/TMVA/ROperator_Conv.hxx index 95f226ca91d4b..2681eeb2dd84c 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Conv.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Conv.hxx @@ -297,16 +297,25 @@ public: } } } - // output channel size can be parametric + // output channel size can be parametric and is an expression std::vector outputDims = std::vector(fShapeY.begin()+2, fShapeY.end()); - auto outputChannelSize = ConvertDimShapeToLength(outputDims); // size/channel = D * H * W + //check if shape is not parametric + std::vector outputInts = ConvertShapeToInt(outputDims); + Dim channelDim; + if (outputInts.empty()) { + auto outputChannelSize = ConvertDimShapeToLength(outputDims); // size/channel = D * H * W + channelDim = Dim{ outputChannelSize, static_cast(-1)}; + } else { + size_t outputChannelSize = ConvertShapeToLength(outputInts); + channelDim = Dim{ outputChannelSize }; + } size_t kernelSize = fAttrKernelShape[0]; for (size_t i = 1; i < fDim; i++) { kernelSize *= fAttrKernelShape[i]; } std::vector shape1 = {fShapeW[0], fShapeW[1], kernelSize}; - std::vector shape2 = {Dim{fShapeW[1]}, Dim{kernelSize}, Dim{outputChannelSize}}; + std::vector shape2 = {Dim{fShapeW[1]}, Dim{kernelSize}, channelDim }; model.AddIntermediateTensor(fNX +"_f", ConvertStringToType(fType), shape1 ); model.AddIntermediateTensor(fNX +"_xcol", ConvertStringToType(fType), shape2 ); convK = fNX +"_f"; diff --git a/tmva/sofie/inc/TMVA/ROperator_Gather.hxx b/tmva/sofie/inc/TMVA/ROperator_Gather.hxx index 81411b8ebf71a..1d51c59380dae 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Gather.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Gather.hxx @@ -153,13 +153,14 @@ public: } std::string Generate(std::string opName) override { + opName = "op_" + opName; + std::stringstream out; + out << "//--------- Gather " << opName << " --> " << fNY << " " << 
ConvertShapeToString(fShapeY) << "\n"; if (fIsOutputConstant) { // no code to generate here for constant output. Tensor output is defined in Session constructor - return "//---------------------------------------\n"; + out << "//--------------------(constant)----------\n"; + return out.str(); } - opName = "op_" + opName; - std::stringstream out; - out << "//--------- Gather " << opName << " --> " << ConvertShapeToString(fShapeY) << "\n"; // The shape of the output is q + r - 1 size_t r = fShapeX.size(); // Indices of shape q diff --git a/tmva/sofie/inc/TMVA/ROperator_Range.hxx b/tmva/sofie/inc/TMVA/ROperator_Range.hxx index 9cac15a14fc52..7c138c3b3def5 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Range.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Range.hxx @@ -39,15 +39,6 @@ public: "TMVA::SOFIE - Unsupported type by Range operator"); } - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; //suggest copy to compiler - return ret; - } - void Initialize(RModel& model) override { //input must be a graph input, or already initialized intermediate tensor if (!model.CheckIfTensorAlreadyExist(fNStart)) { @@ -63,32 +54,94 @@ public: std::runtime_error("TMVA SOFIE Range Op Input Tensor " + fNDelta + "is not found in model"); } ETensorType type = ConvertStringToType(fType); - if (model.IsInitializedTensor(fNStart) && model.IsInitializedTensor(fNDelta) && model.IsInitializedTensor(fNLimit)) { - T * start = static_cast(model.GetInitializedTensorData(fNStart).get()); - T * limit = static_cast(model.GetInitializedTensorData(fNLimit).get()); - T * delta = static_cast(model.GetInitializedTensorData(fNDelta).get()); - if (!start || !delta || !limit) - std::runtime_error("TMVA SOFIE Range Op Input Tensor has invalid input data"); - T a = *start; - T b = *limit; - T d = *delta; - int number_of_elements = std::max( static_cast(std::ceil( (b - a) / d )) , 0. 
); + + + + auto analyzeInput = [&](const std::string & tName, T & value, Dim & dim) { + int ftype = 0; // type of input (0 intermediate, 1 constant , 2 shape) + if (model.IsInitializedTensor(tName)) { + T * data = static_cast(model.GetInitializedTensorData(tName).get()); + if (!data) + std::runtime_error("TMVA SOFIE Range Op Input Tensor has invalid input data"); + value = *data; + ftype = 1; + } else if (model.IsShapeTensor(tName)) { + auto data = model.GetShapeTensorValues(tName); + dim = data[0]; + if (!dim.isParam) { + value = static_cast(dim.dim); + ftype = 1; + } else + ftype = 2; + } + return ftype; + }; + + T start_value; + T limit_value; + T delta_value; + Dim start_dim; + Dim limit_dim; + Dim delta_dim; + int res1 = analyzeInput(fNStart, start_value, start_dim); + int res2 = analyzeInput(fNLimit, limit_value, limit_dim); + int res3 = analyzeInput(fNDelta, delta_value, delta_dim); + if (res1 == 0 || res2 == 0 || res3 == 0) { + // cannot know at compile time- need to do fully at run time + // + fShape = {Dim{"range_size_" + fNStart + "_" + fNLimit}}; + model.AddDynamicTensor(fNOutput, type, fShape); + } else if (res1 == 1 && res2 == 1 && res3 == 1) { + size_t number_of_elements = std::max(static_cast(std::ceil((limit_value - start_value) / delta_value )) , 0 ); + fIsOutputConstant = true; + + // compute output std::vector output(number_of_elements); - for (int i=0; i shape = {static_cast(number_of_elements)}; + std::vector shape = {number_of_elements}; model.AddConstantTensor(fNOutput,shape, output.data()); - fIsOutputConstant = true; - // set the input tensor not writable + fShape = ConvertShapeToDim(shape); + + // set the input tensor not writable model.SetNotWritableInitializedTensor(fNStart); model.SetNotWritableInitializedTensor(fNDelta); model.SetNotWritableInitializedTensor(fNLimit); + + } else { // case of a shape tensor + std::string start = (res1 == 1) ? std::to_string(start_value) : start_dim.GetVal(); + std::string limit = (res2 == 1) ? std::to_string(limit_value) : limit_dim.GetVal(); + std::string delta = (res3 == 1) ? 
std::to_string(delta_value) : delta_dim.GetVal(); + std::stringstream s; + if (type == ETensorType::FLOAT ) { + if (delta_value == 1) + s << "std::max(std::ceil("<< limit << " - " << start << "),0.0f)"; + else + s << "std::max(std::ceil(("<< limit << " - " << start << ")/" << delta << "),0.0f)"; + } else if (type == ETensorType::INT64 ) { + if (delta == "1") { + if (start == "0") + s << limit; + else + s << "std::max((" << limit << " - " << start << "),0L)"; + } else { + if (start == "0") + s << "((" << limit << ")/" << delta << ")"; + else + s << "std::max((" << limit << " - " << start << ")/"<< delta << "),0L)"; + } + } else { + throw + std::runtime_error("TMVA SOFIE Range Op Input Tensor " + ConvertTypeToString(type) + "is not supported"); + } + + + fShape = { Dim {s.str(), static_cast(-1)} }; + model.AddDynamicTensor(fNOutput,type, fShape); } - else { - fShape = {Dim{"range_size"}}; - model.AddDynamicTensor(fNOutput, type, fShape); - } + + if (model.Verbose()) { std::cout << "Range -> output is " << fNOutput << " : " << ConvertShapeToString(fShape); if (fIsOutputConstant) std::cout << " : " << ConvertValuesToString(model.GetTensorData(fNOutput)); @@ -96,26 +149,31 @@ public: } } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override { std::stringstream out; - out << "\n//------ Range\n"; + out << "\n//------ Range " << opName << "---> " << ConvertDimShapeToString(fShape) << "\n"; if (fIsOutputConstant) return out.str(); - OpName = "op_" + OpName; + opName = "op_" + opName; if (fShape.empty()) { throw std::runtime_error("TMVA SOFIE Range operator called to Generate without being initialized first"); } std::string sizeName = fShape[0].param; - out << SP << "size_t " << sizeName << " = static_cast(std::max(std::ceil((static_cast(*tensor_" << fNLimit << ") - static_cast(*tensor_" << fNStart << ")) / static_cast(*tensor_" << fNDelta << ")), 0.0f));\n"; - out << SP << "if (" << sizeName << " > " << "fTensor_" << fNOutput << ".size() ){\n"; - out << SP << SP << "fTensor_" << fNOutput << ".resize(" << sizeName << ");\n"; + if (sizeName.find("range_size") != std::string::npos) + sizeName = "static_cast(std::max(std::ceil((static_cast(*tensor_" + fNLimit + + ") - static_cast(*tensor_" + fNStart + ")) / static_cast(*tensor_" + fNDelta + ")), 0.0f))"; + out << SP << "{\n"; + out << SP << SP << "size_t range" << " = " << sizeName << ";\n"; + out << SP << SP << "if ( range > " << "fTensor_" << fNOutput << ".size() ){\n"; + out << SP << SP << SP << "fTensor_" << fNOutput << ".resize(range);\n"; // need to re-initialized pointer to tensor data - out << SP << SP << "tensor_" << fNOutput << " = fTensor_" << fNOutput << ".data();\n"; - out << SP << "}\n"; - out << SP << "for (size_t i = 0; i < " << sizeName << "; i++) {\n"; - out << SP << SP << "fTensor_" << fNOutput << "[i] = *tensor_" << fNStart << " + i * (*tensor_" << fNDelta << ");\n"; + out << SP << SP << SP << "tensor_" << fNOutput << " = fTensor_" << fNOutput << ".data();\n"; + out << SP << SP << "}\n"; + out << SP << SP << "for (size_t i = 0; i < range; i++) {\n"; + out << SP << SP << SP << "fTensor_" << fNOutput << "[i] = *tensor_" << fNStart << " + i * (*tensor_" << fNDelta << ");\n"; + out << SP << SP << "}\n"; out << SP << "}\n"; return out.str(); } diff --git a/tmva/sofie/inc/TMVA/ROperator_Reduce.hxx b/tmva/sofie/inc/TMVA/ROperator_Reduce.hxx index 1204770d3d321..1da588e965a01 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Reduce.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Reduce.hxx @@ -166,7 
+166,7 @@ public: std::string reducedLength; if (fInputDimShape) { reducedLength = "reducedLength_" + opName; - out << SP << "size_t " << reducedLength << " = " << inputLength << " / " << outputLength << ";\n"; + out << SP << "size_t " << reducedLength << " = (" << inputLength << ") / (" << outputLength << ");\n"; } else { int rLength = std::stoi(inputLength) / std::stoi(outputLength); reducedLength = std::to_string(rLength); diff --git a/tmva/sofie/inc/TMVA/ROperator_Reshape.hxx b/tmva/sofie/inc/TMVA/ROperator_Reshape.hxx index 2634b68dbc875..a3ed28c4860bc 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Reshape.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Reshape.hxx @@ -108,6 +108,9 @@ public: if (IsInteger(tmp_length) && IsInteger(input_length)) output_shape[i] = Dim{static_cast(std::stoi(input_length) / std::stoi(tmp_length))}; + else if (IsInteger(tmp_length) && std::stoi(tmp_length) == 1) { + output_shape[i] = Dim{input_length, static_cast(-1)}; + } else { //we can try simplifying expression if tmp_length is integer and part of input_length // contains tmp_length @@ -243,7 +246,7 @@ public: // check if optional tensor exists defining shape or axes if (!fNInput2.empty()) { if (model.CheckIfTensorAlreadyExist(fNInput2)) { - if (model.IsConstantTensor(fNInput2) || model.IsInitializedTensor(fNInput2)) { + if (model.IsInitializedTensor(fNInput2)) { // assume input shape is an initialized tensor auto dptr = model.GetInitializedTensorData(fNInput2); auto values = static_cast(dptr.get()); @@ -260,6 +263,9 @@ public: fShapeOutput = ShapeInference({fShapeInput})[0]; // set flag to not write tensor in weight file. Its data will be hard-coded in way model is constructed model.SetNotWritableInitializedTensor(fNInput2); + } else if (model.IsShapeTensor(fNInput2)) { + auto shapeData = model.GetShapeTensorValues(fNInput2); + fShapeOutput = shapeData; } else { // we cannot get shape at initialization time but at run-time fDynamicShape = true; diff --git a/tmva/sofie/inc/TMVA/ROperator_Slice.hxx b/tmva/sofie/inc/TMVA/ROperator_Slice.hxx index b23e3b0a86d21..3add774b0d8d4 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Slice.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Slice.hxx @@ -235,6 +235,8 @@ public: if (iend < 0) { std::string send = std::string("(") + fShapeInput[fAxes[i]].param + "-" + std::to_string(-iend) +")"; fEnd[fAxes[i]] = Dim{send,size_t(-1)}; + } else if (iend == std::numeric_limits::max()){ + fEnd[fAxes[i]] = fShapeInput[fAxes[i]]; } else { fEnd[fAxes[i]] = Dim{size_t(iend)}; } @@ -332,23 +334,23 @@ public: else { model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput); if (model.Verbose()) { - std::cout << "Slice ---> " << fNOutput << " " << ConvertShapeToString(fShapeOutput) << std::endl; + std::cout << "Slice " << fNData << " " << ConvertShapeToString(fShapeInput) + << "---> " << fNOutput << " " << ConvertShapeToString(fShapeOutput) << std::endl; } } } - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; //no op for constant tensors + std::string Generate(std::string opName) override { - OpName = "op_" + OpName; if (fShapeInput.empty() || fShapeOutput.empty()){ throw std::runtime_error("TMVA SOFIE Slice Op called to Generate without being initialized first"); } std::stringstream out; - //std::string opName = "Slice"; - out << SP << "///------- Slice operator\n" << std::endl; + out << "///------- Slice operator " << opName << "---> " << fNOutput << " " + << ConvertDimShapeToString(fShapeOutput) << "\n" << std::endl; + if (fIsOutputConstant) 
return out.str(); //no op for constant tensors // loop on the dimensions depending no the orders size_t ndim = fShapeInput.size(); auto strides = UTILITY::ComputeStrideFromShape(fShapeInput); diff --git a/tmva/sofie/inc/TMVA/ROperator_Tile.hxx b/tmva/sofie/inc/TMVA/ROperator_Tile.hxx index 1086f72eae71c..9b291b40e0854 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Tile.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Tile.hxx @@ -20,8 +20,8 @@ private: std::string fNRepeats; std::string fNInput; std::string fNY; - std::vectorfShapeInput; - std::vector fShapeY; + std::vectorfShapeInput; + std::vector fShapeY; public: ROperator_Tile(){} @@ -35,13 +35,18 @@ public: return input; } - std::vector> ShapeInference(std::vector> input) override { - std::vector ret = input[0]; - - for(size_t i=0; i < input[1].size(); i++) { - ret[i]=ret[i]*input[1][i]; + std::vector DoShapeInference(const std::vector & input, const std::vector repeat) { + std::vector ret = input; + for(size_t i=0; i < repeat.size(); i++) { + if (repeat[i] != 1) { + if (ret[i].isParam) { + ret[i] = Dim{ std::string(ret[i].GetVal() + "*" + std::to_string(repeat[i])), static_cast(-1) }; + } else { + ret[i]=Dim { ret[i].dim *repeat[i] }; + } + } } - return {ret}; + return ret; } void Initialize(RModel& model) override { @@ -52,7 +57,7 @@ public: if (model.CheckIfTensorAlreadyExist(fNRepeats) == false){ throw std::runtime_error("TMVA SOFIE Tile Op Input Tensor is not found in model"); } - fShapeInput=model.GetTensorShape(fNInput); + fShapeInput=model.GetDimTensorShape(fNInput); // if repeats vector is not initialized we cannot deduce shape of output // not support for time being this case @@ -79,12 +84,12 @@ public: std::copy(repeats_data, repeats_data + num_elements, repeats_vector.begin()); - fShapeY = ShapeInference({fShapeInput,repeats_vector})[0]; + fShapeY = DoShapeInference(fShapeInput,repeats_vector); model.AddIntermediateTensor(fNY, model.GetTensorType(fNInput), fShapeY); if (model.Verbose()) - std::cout << "Tile: " << fNInput << " " << ConvertShapeToString(fShapeInput) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) + std::cout << "Tile: " << fNInput << " " << ConvertDimShapeToString(fShapeInput) << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) << " given repeats " << ConvertShapeToString(repeats_vector) << std::endl; } @@ -103,9 +108,9 @@ public: std::string output = "tensor_" + fNY; out << "///-------- Tile operator\n"; out << "{\n"; // add scope to re-use same names - out << "const int input_shape[" << fShapeInput.size() << "] = " << ConvertShapeToString(fShapeInput) << ";\n"; + out << "const size_t input_shape[" << fShapeInput.size() << "] = " << ConvertDimShapeToString(fShapeInput) << ";\n"; - out << "int inputLength = " << ConvertShapeToLength(fShapeInput) << ";\n"; + out << "int inputLength = " << ConvertDimShapeToLength(fShapeInput) << ";\n"; out << "int s = 1;\n"; // loop from inverse dim order out << "for (int i = " << fShapeInput.size()-1 << "; i >=0; i--) {\n"; diff --git a/tmva/sofie/inc/TMVA/ROperator_TopK.hxx b/tmva/sofie/inc/TMVA/ROperator_TopK.hxx index 0869437bb6b0c..edee91de8eb57 100644 --- a/tmva/sofie/inc/TMVA/ROperator_TopK.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_TopK.hxx @@ -19,13 +19,13 @@ private: int fAttrLargest; int fAttrSorted; - size_t fK; + Dim fK; std::string fNK; std::string fNX; std::string fNVal; std::string fNInd; - std::vector fShapeX; - std::vector fShapeY; + std::vector fShapeX; + std::vector fShapeY; std::string fType; public: @@ -43,23 +43,10 @@ public: } 
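// The Tile DoShapeInference above multiplies each (possibly symbolic) dimension by
// its repeat count. A purely numeric sketch of the same rule; the helper name is an
// illustrative assumption, not part of the operator interface.
#include <cstddef>
#include <vector>

std::vector<size_t> TileShapeSketch(const std::vector<size_t> &input,
                                    const std::vector<size_t> &repeats)
{
   // ONNX Tile: the repeats vector has the same rank as the input tensor
   std::vector<size_t> out(input.size());
   for (size_t i = 0; i < input.size(); ++i)
      out[i] = input[i] * repeats[i];   // e.g. {2,3} with repeats {1,4} -> {2,12}
   return out;
}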
std::vector TypeInference(std::vector input) override { - ETensorType ret = input[0]; - return {ret, ret}; - } - - std::vector> ShapeInference(std::vector> input) override { - if (input.size() != 2) { - throw std::runtime_error("TMVA SOFIE TopK Op Shape Inference needs exactly 2 input tensors"); - } - - auto shape = input[0]; // Shape format: [ m x n x o x p ... ] - - // set the dimension at the specified axis to k (fAttrAxis is checked before that is in the correct range - shape[fAttrAxis] = fK; // Modified shape: [ m x n x k x p ... ] - return {shape, shape}; + ETensorType ret = input[0]; + return {ret, ret}; } - void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false) { // input must be a graph input, or already initialized intermediate tensor @@ -70,10 +57,10 @@ public: throw std::runtime_error("TMVA SOFIE TopK Op Input Tensor i.e. K is not found in model"); } - fShapeX = model.GetTensorShape(fNX); + fShapeX = model.GetDimTensorShape(fNX); auto fShapeK = model.GetTensorShape(fNK); auto kptr = static_cast(model.GetInitializedTensorData(fNK).get()); - fK = *kptr; + size_t kval = *kptr; model.SetNotWritableInitializedTensor(fNK); fAttrAxis = fAttrAxis < 0 ? fShapeX.size() + fAttrAxis : fAttrAxis; if(static_cast(fAttrAxis) >= fShapeX.size()){ @@ -81,14 +68,25 @@ public: std::runtime_error("TMVA::SOFIE ONNX TopK op axis = "+ std::to_string(fAttrAxis) +" value exeeds size of tensor " +fNX+" of size "+fShapeX.size()+" ."); } // fK cannot be larger that axis dimension - fK = std::min(fK, fShapeX[fAttrAxis]); + if (fShapeX[fAttrAxis].isParam) + fK = Dim{std::string("std::min(size_t(" + std::to_string(kval) + "), " + fShapeX[fAttrAxis].GetVal() + ")" ), static_cast(-1) }; + else + fK = Dim { std::min(kval, fShapeX[fAttrAxis].dim) }; + + // output shape is equal to input shape apart for value in fAttrAxis + fShapeY = fShapeX; + fShapeY[fAttrAxis] = Dim{fK}; - fShapeY = ShapeInference({fShapeX, fShapeK})[0]; model.AddIntermediateTensor(fNVal, model.GetTensorType(fNX), fShapeY); // output indices should be an int64 tensor model.AddIntermediateTensor(fNInd, ETensorType::INT64, fShapeY); fType = ConvertTypeToString(model.GetTensorType(fNX)); + + if (model.Verbose()) { + std::cout << "TopK " << fNX << " " << ConvertShapeToString(fShapeX) + << "---> " << fNVal << " " << ConvertShapeToString(fShapeY) << std::endl; + } } std::string Generate(std::string OpName) override { @@ -101,19 +99,20 @@ public: size_t axis = fAttrAxis < 0 ? size + fAttrAxis : fAttrAxis; out << "\n" << SP << "//------ TopK\n"; - size_t length=ConvertShapeToLength(fShapeX); + auto length=ConvertDimShapeToLength(fShapeX); auto strideX = UTILITY::ComputeStrideFromShape(fShapeX); auto strideY = UTILITY::ComputeStrideFromShape(fShapeY); // we perform loop on dimension before sorted axis and after sorted axis - size_t n_before = (axis>0) ? length/strideX[axis-1] : 1; - size_t n_after = strideX[axis]; - size_t n_elements = fShapeX[axis]; // number of elements to be sorted + std::vector shape_before(fShapeX.begin(), fShapeX.begin() + axis); // input shape before axis + std::string n_before = (axis>0) ? 
ConvertDimShapeToLength(shape_before) : "1"; + std::string n_after = strideX[axis].GetVal(); + std::string n_elements = fShapeX[axis].GetVal(); // number of elements to be sorted // } out << SP << "{\n"; // to define a separate scope for the operator code out << SP << "std::vector> elements(" << n_elements << ");\n"; // loop on elements before - if (n_before > 1) { + if (n_before != "1") { out << SP << "for (size_t i = 0; i < " << n_before << "; i++) {\n"; out << SP << SP << "size_t xoffset = i*" << strideX[axis-1] << ";\n"; out << SP << SP << "size_t yoffset = i*" << strideY[axis-1] << ";\n"; @@ -122,7 +121,7 @@ public: out << SP << "size_t xoffset = 0;\n"; out << SP << "size_t yoffset = 0;\n"; } - if (n_after > 1) + if (n_after != "1") out << SP << "for (size_t j = 0; j < " << n_after << "; j++) {\n"; else out << SP << "const size_t j = 0;\n"; @@ -149,8 +148,8 @@ public: out << SP << SP << SP << "tensor_" << fNVal << "[yoffset + " << strideY[axis] << "*l + j] = elements[l].first;\n"; out << SP << SP << SP << "tensor_" << fNInd << "[yoffset + " << strideY[axis] << "*l + j] = elements[l].second;\n"; out << SP << SP << "}\n"; - if (n_after > 1) out << SP << SP << "}\n"; - if (n_before> 1) out << SP << "}\n"; + if (n_after != "1") out << SP << SP << "}\n"; + if (n_before != "1") out << SP << "}\n"; out << SP << "}\n"; // end operator scope return out.str(); } diff --git a/tmva/sofie/inc/TMVA/SOFIE_common.hxx b/tmva/sofie/inc/TMVA/SOFIE_common.hxx index 2dae4f7d03ce7..dfa46a44c03b0 100644 --- a/tmva/sofie/inc/TMVA/SOFIE_common.hxx +++ b/tmva/sofie/inc/TMVA/SOFIE_common.hxx @@ -252,8 +252,14 @@ public: bool IsConstantTensor() const { return fConstant;} // query if tensor needs to be written in a weight file. Constant tensors are not written in a file bool IsWeightTensor() const { return !fConstant && !fIsNotWritable;} + // check if a Tensor is Writable (need to be written in teh file or in the generated code (e.g. as a costant tensor) + // if an initialized tensors is used in a constant operator at compile time does not need to be written and can be omitted in + // the generated code + bool IsNotWritable() const { return fIsNotWritable; } // set not writable initialized tensors - i.e. 
tensor that must not be written in a file void SetNotWritable() { fIsNotWritable = true;} + // set as constant (needed for non-flot initialized tensors) + void SetConstant() { fConstant = true;} template T const *data() const diff --git a/tmva/sofie/src/RModel.cxx b/tmva/sofie/src/RModel.cxx index 2fa6df3f04f8f..32a1d3f235e11 100644 --- a/tmva/sofie/src/RModel.cxx +++ b/tmva/sofie/src/RModel.cxx @@ -222,6 +222,7 @@ bool RModel::IsInitializedTensor(const std::string& tensorName) const { return fInitializedTensors.find(name) != fInitializedTensors.end(); } bool RModel::IsConstantTensor(const std::string& tensorName) const { + // a constant tensor is an initialized tensor but has the constant flag set std::string name = UTILITY::Clean_name(tensorName); auto itr = fInitializedTensors.find(name); if (itr == fInitializedTensors.end()) return false; @@ -522,6 +523,7 @@ void RModel::Initialize(const std::map & inputParams, bool fIntermediateTensorInfos.clear(); fDynamicTensorInfos.clear(); + // loop on inputs and see if shape can be full specified // if the batch size is provided it can be used to specify the full shape // Add the full specified tensors in fReadyInputTensors collection @@ -581,7 +583,7 @@ void RModel::Initialize(const std::map & inputParams, bool if (fUseWeightFile) { bool modelHasWeights = false; for (auto &i : fInitializedTensors) { - if (i.second.type() == ETensorType::FLOAT) { + if (i.second.IsWeightTensor()) { modelHasWeights = true; break; } @@ -612,6 +614,13 @@ void RModel::Initialize(const std::map & inputParams, bool i++; } + // loop on initialized tensors and make the integers as constant to be + // not written in a weight file + for (auto &i : fInitializedTensors) { + if (i.second.IsWeightTensor() && i.second.type() != ETensorType::FLOAT) + i.second.SetConstant(); + } + fIsInitialized = true; } @@ -684,9 +693,11 @@ std::string GenerateConstantTensorCode(const std::pair(i); @@ -772,6 +783,9 @@ void RModel::GenerateIntermediateTensorInfo() { } else if (i.second.type == ETensorType::INT64) { fGC += "std::vector fTensor_" + i.first + ";\n"; fGC += "int64_t * tensor_" + i.first + " = nullptr;\n"; + } else if (i.second.type == ETensorType::BOOL) { + fGC += "std::vector fTensor_" + i.first + ";\n"; + fGC += "uint8_t * tensor_" + i.first + " = nullptr;\n"; } } } @@ -1143,7 +1157,7 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { std::string length = std::to_string(ConvertShapeToLength(i.second.shape())); fGC += " ReadTensorFromStream(f, " + tensor_name + ", \"" + tensor_name + "\", " + length + ");\n"; } else { - std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file"); + throw std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file"); } } fGC += " f.close();\n"; @@ -1288,7 +1302,7 @@ long RModel::WriteInitializedTensorsToFile(std::string filename) { } } else { - std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be written to a file"); + throw std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be written to a file"); } if (f.fail()) std::runtime_error("tmva-sofie failed to write tensor data to file for " + tensor_name); From aa4d008b35d8dc2eae37a4c1a12b4b3e19ece00b Mon Sep 17 00:00:00 2001 From: moneta Date: Mon, 10 Nov 2025 23:16:35 +0100 Subject: [PATCH 02/12] 
[tmva][sofie] Remove special case handling bool outputs Since we use now for boolean tensors a std::vector it is not needed to have a special treatment when the output ttype of the operator is a boolean (e.g. in Comparison) --- tmva/sofie/inc/TMVA/ROperator_Comparision.hxx | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx b/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx index 40c8923676aaf..734434357a149 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx @@ -56,7 +56,6 @@ template class ROperator_Comparision final : public ROperator{ private: - bool fIsModelOutput = false; std::string fNX1; std::string fNX2; std::string fNY; @@ -283,12 +282,6 @@ public: model.PrintIntermediateTensors(); } } - - // check if this is not output operators to add a specific line for definining the tensor_xxx variable - const auto & outputTensorNames = model.GetOutputTensorNames(); - fIsModelOutput = false; - if (std::find(outputTensorNames.begin(), outputTensorNames.end(), fNY) != outputTensorNames.end()) - fIsModelOutput = true; } std::string Generate(std::string opName) override { @@ -374,9 +367,6 @@ public: out << "}\n"; } - // since output is a boolean need to add the tensor_xxx variable since it is not defined as a pointer to a boolean std::vector - if (!fIsModelOutput) - out << SP << "const std::vector & tensor_" << fNY << " = fTensor_" << fNY << ";\n"; return out.str(); } From 6b5c35ba01fab8ad89e2e175a7b2e66144383057 Mon Sep 17 00:00:00 2001 From: moneta Date: Wed, 12 Nov 2025 09:30:56 +0100 Subject: [PATCH 03/12] [tmva][sofie] Add support for greedy memory allocation for dynammic tensors Add a new function in SOFIE_common OrganizeMemory which computes the total memory and the offset for each tensor given tensor begin /end life and size. Fix also some small issue with dynamic tensor. One is for the bias of Gemm and Conv. The broadcasting of bias is done for dynamic tensor in the Session constructor only if needed. For the broadcasted tensor there is no need to create a new tensor, but the existing one is resized to the broadcasted needed size using vector::resize --- .../inc/TMVA/ROperator_BatchNormalization.hxx | 6 +- tmva/sofie/inc/TMVA/ROperator_Constant.hxx | 5 +- tmva/sofie/inc/TMVA/ROperator_Conv.hxx | 35 +++-- tmva/sofie/inc/TMVA/ROperator_Gemm.hxx | 52 ++++--- tmva/sofie/inc/TMVA/ROperator_Range.hxx | 17 ++- tmva/sofie/inc/TMVA/SOFIE_common.hxx | 16 +++ tmva/sofie/src/RModel.cxx | 115 ++++++++++++---- tmva/sofie/src/SOFIE_common.cxx | 128 +++++++++++++++++- 8 files changed, 305 insertions(+), 69 deletions(-) diff --git a/tmva/sofie/inc/TMVA/ROperator_BatchNormalization.hxx b/tmva/sofie/inc/TMVA/ROperator_BatchNormalization.hxx index f2d31796bbbcd..c37e7fc4b68de 100644 --- a/tmva/sofie/inc/TMVA/ROperator_BatchNormalization.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_BatchNormalization.hxx @@ -141,8 +141,8 @@ public: } } - std::string Generate(std::string OpName) override { - OpName = "op_" + OpName; + std::string Generate(std::string opName) override { + opName = "op_" + opName; if (fShapeX.empty()){ throw std::runtime_error("TMVA SOFIE Batch Normalization called to Generate without being initialized first"); } @@ -158,7 +158,7 @@ public: spatial_dim = ConvertDimShapeToLength( spatialShape); } - out << "\n\n//---- BatchNorm" << (fActivation == EActivationType::RELU ? " + ReLU" : "") << "\n"; + out << "\n\n//---- BatchNorm" << (fActivation == EActivationType::RELU ? 
" + ReLU " : " ") << opName << "\n"; out << SP << "{\n"; out << SP << " size_t i = 0;\n"; out << SP << " for (size_t n = 0; n < " << batchSize << "; ++n) {\n"; diff --git a/tmva/sofie/inc/TMVA/ROperator_Constant.hxx b/tmva/sofie/inc/TMVA/ROperator_Constant.hxx index 3b339e3440488..93f3c43feceb9 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Constant.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Constant.hxx @@ -128,6 +128,7 @@ public: } } else { model.AddIntermediateTensor(fNY, ConvertStringToType(TensorType::Name()), fDimOutputShape); + fOutputTensorNames.emplace_back(fNY); } } @@ -153,9 +154,7 @@ public: } auto length = ConvertDimShapeToLength(fDimOutputShape); // vector is already allocated- fill with values - out << SP << "if (" << length << " > fTensor_" << fNY << ".size())\n"; - out << SP << SP << "fTensor_" << fNY << ".resize(" << length << ");\n"; - out << SP << "std::fill(fTensor_" << fNY << ".begin(), fTensor_" << fNY << ".end(), " << fValues[0] << ");\n"; + out << SP << "std::fill(tensor_" << fNY << ", tensor_" << fNY << " + " << length << ", " << fValues[0] << ");\n"; return out.str(); } }; diff --git a/tmva/sofie/inc/TMVA/ROperator_Conv.hxx b/tmva/sofie/inc/TMVA/ROperator_Conv.hxx index 2681eeb2dd84c..823e7fa04717e 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Conv.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Conv.hxx @@ -20,6 +20,8 @@ template class ROperator_Conv final : public ROperator { private: + bool fBroadcastBias = false; + std::string fAttrAutopad; std::vector fAttrDilations; size_t fAttrGroup; @@ -30,7 +32,6 @@ private: std::string fNX; std::string fNW; std::string fNB; - std::string fNB2; // bias tensor name after broadcasting std::string fNY; std::string convK; @@ -262,6 +263,9 @@ public: std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNB + " is not found in model"); } fShapeB = model.GetTensorShape(fNB); + if (fShapeB.size() != 1) + throw + std::runtime_error("TMVA SOFIE Conv op : invalid shape for Bias tensor (is not 1D)"); std::vector targetShape(fShapeY.begin() + 1, fShapeY.end()); auto shapeDimB = model.GetDimTensorShape(fNB); bool broadcast_needed = !UTILITY::AreSameShape(shapeDimB, targetShape); @@ -278,7 +282,9 @@ public: if (fType != "float") throw std::runtime_error("TMVA SOFIE Conv op: Broadcasting for non-float type tensors is not supported"); // here is the actual broadcasting + fBroadcastBias = true; if (!fUseSession) { + // do here broadcasting std::vector shape(fDim + 1, 1); shape[0] = fShapeB[0]; auto intTargetShape = ConvertShapeToInt(targetShape); @@ -287,13 +293,6 @@ public: std::default_delete()); model.UpdateInitializedTensor(fNB, model.GetTensorType(fNB), intTargetShape, new_data_ptr); fShapeB = model.GetTensorShape(fNB); - fNB2 = fNB; // use same name - } - else { - // In case of session add broadcasting code in Session constructor and in GenerateInitCode - // we need to add a new intermediate tensor for broadcasted bias tensor - fNB2 = fNB + "bcast"; - model.AddIntermediateTensor(fNB2, model.GetTensorType(fNB), targetShape); } } } @@ -334,15 +333,25 @@ public: std::string GenerateInitCode() override { std::stringstream out; // Generate initialization code for broadcasting of bias tensor - if (!fNB2.empty()) { + if (fBroadcastBias) { // include a separate scope to avoid defining unique operator temp variables std::vector shape(fDim + 1, 1); + // bias (is a 1D tensor) shape[0] = fShapeB[0]; std::vector targetShape(fShapeY.begin() + 1, fShapeY.end()); - out << SP << "{\n"; + out << "//--- broadcast bias tensor " << fNB << "for Conv op if needed \n"; + 
// in case of dynamic tensors check needs to be done at run time + bool isOutDynamic = ConvertShapeToInt(targetShape).empty(); + auto length = ConvertDimShapeToLength(targetShape); + if (isOutDynamic) + out << SP << "if (" << length << " > " << ConvertShapeToLength(shape) << ") {\n"; + else + out << SP << "{\n"; out << SP << SP << "float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" << fNB << ", " << ConvertShapeToString(shape) << ", " << ConvertShapeToString(fShapeY) << ");\n"; - out << SP << SP << "std::copy(data, data + " << ConvertDimShapeToLength(targetShape) << ", tensor_" << fNB2 << ");\n"; + out << SP << SP << "fTensor_" << fNB << ".resize(" << length << ");\n"; + out << SP << SP << "tensor_" << fNB << " = fTensor_" << fNB << ".data();\n"; + out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNB << ");\n"; out << SP << SP << "delete[] data;\n"; out << SP << "}\n"; } @@ -562,13 +571,13 @@ public: out << SP << SP << "}\n"; // end of group loop } - if (fNB2 != "") { + if (fNB != "") { out << SP << "int " << OpName << "_size = " << outputBatchStride << ";\n"; out << SP << "float " << OpName << "_gamma = 1.0;\n"; out << SP << "int " << OpName << "_incx = 1;\n"; out << SP << "int " << OpName << "_incy = 1;\n"; - out << SP << "BLAS::saxpy_(&" << OpName << "_size, &" << OpName << "_gamma, tensor_" << fNB2 << ", &" + out << SP << "BLAS::saxpy_(&" << OpName << "_size, &" << OpName << "_gamma, tensor_" << fNB << ", &" << OpName << "_incx, tensor_" << fNY << " + out_offset, &" << OpName << "_incy);\n"; } diff --git a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx index d954720396151..1c8b51d991af2 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx @@ -24,6 +24,7 @@ namespace SOFIE{ private: bool fIsDynamic = false; + bool fBroadcastBias = false; float fAttrAlpha = 1.0; float fAttrBeta = 1.0; @@ -33,7 +34,6 @@ namespace SOFIE{ std::string fNA; std::string fNB; std::string fNC = ""; - std::string fNC2; // bias tensor name after broadcasting std::string fNY; std::string fType; EActivationType fActivation; @@ -222,7 +222,6 @@ namespace SOFIE{ throw std::runtime_error("TMVA SOFIE Gemm Op Input Tensor" + fNC + " is dynamic and is not supported"); } fShapeC = model.GetTensorShape(fNC); - fNC2 = fNC; size_t lengthC = ConvertShapeToLength(fShapeC); size_t lengthY = ConvertShapeToLength(shapeY); // for dynamic outputs broadcasting is always done @@ -230,6 +229,7 @@ namespace SOFIE{ if (broadcast_needed) { + fBroadcastBias = true; if (!model.UseSession()) { // without session dynamic tensors not supported in Gemm if (fIsDynamic) { @@ -246,14 +246,18 @@ namespace SOFIE{ fShapeC = shapeY; } } else { - // In case of session add broadcasting code in Session constructor and in GenerateInitCode - // we need to add a new intermediate tensor for broadcasted bias tensor - fNC2 = fNC + "bcast"; - if (!fIsDynamic) { - model.AddIntermediateTensor(fNC2, model.GetTensorType(fNC), shapeY); - } - else - model.AddDynamicTensor(fNC2,model.GetTensorType(fNC), fShapeY); + // /d to add a new intermediate tensor for broadcasted bias tensor + // fNC2 = fNC + "bcast"; + // if (!fIsDynamic) { + // model.AddIntermed/ In case of session add broadcasting code in Session constructor and in GenerateInitCode + // // we neeiateTensor(fNC2, model.GetTensorType(fNC), shapeY); + // } + // else + // model.AddDynamicTensor(fNC2,model.GetTensorType(fNC), fShapeY); + // // do not add to lists of input/output 
tensors since broadcasted tensors are special + // // and we manage their memory separatly + // //fInputTensorNames.emplace_back(fNC2); + // //fOutputTensorNames.emplace_back(fNC2); } } } @@ -291,18 +295,26 @@ namespace SOFIE{ std::string GenerateInitCode() override { std::stringstream out; // generate initialization code for broadcasting of bias tensor - if (fShapeC.size() != fShapeY.size() && fNC != fNC2) { + if (fShapeC.size() != fShapeY.size() && fBroadcastBias) { // we broadcast here always C in Y output, so target shape is the one of Y // no need to call UTILITY::UnidirectionalBroadcastShape. // here in case of parametric shape we need to assume that the parameters will be defined in the initialization code. - auto targetShape = fShapeY; - // include a separate scope to avoid defining unique operator temp variables - out << "//--- broadcast bias tensor " << fNC << "for Gemm op\n"; - out << SP << "{\n"; - out << " float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" - << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) << ");\n"; auto length = ConvertDimShapeToLength(fShapeY); // output size - out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNC2 << ");\n"; + // include a separate scope to avoid defining unique operator temp variables + out << "//--- broadcast bias tensor " << fNC << "for Gemm op if needed \n"; + // in case of dynamic tensors check needs to be done at run time + bool isOutDynamic = ConvertShapeToInt(fShapeY).empty(); + if (isOutDynamic) + out << SP << "if (" << length << " > " << ConvertShapeToLength(fShapeC) << ") {\n"; + else + out << SP << "{\n"; + // here we broadcast + out << SP << SP << "float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" + << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) << ");\n"; + + out << SP << SP << "fTensor_" << fNC << ".resize(" << length << ");\n"; + out << SP << SP << "tensor_" << fNC << " = fTensor_" << fNC << ".data();\n"; + out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNC << ");\n"; out << SP << SP << "delete [] data;\n"; out << SP << "}\n"; } @@ -338,7 +350,7 @@ namespace SOFIE{ // case bias is present if (!fNC.empty()){ - if (fNC2 == fNC) { + if (!fBroadcastBias) { // add a check in case broadcasting was not needed or done outside of session // C should have smaller dimension of Y if (!fIsDynamic) { @@ -381,7 +393,7 @@ namespace SOFIE{ out << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ","; // in the case of bias if (!fNC.empty()) - out << "tensor_" << fNC2; + out << "tensor_" << fNC; else out << "nullptr"; out << ");\n"; diff --git a/tmva/sofie/inc/TMVA/ROperator_Range.hxx b/tmva/sofie/inc/TMVA/ROperator_Range.hxx index 7c138c3b3def5..16d2cb689d518 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Range.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Range.hxx @@ -37,6 +37,10 @@ public: } static_assert( (std::is_same_v || std::is_same_v), "TMVA::SOFIE - Unsupported type by Range operator"); + { + fInputTensorNames = { fNStart, fNLimit, fNDelta }; + fOutputTensorNames = { fNOutput }; + } } void Initialize(RModel& model) override { @@ -166,13 +170,14 @@ public: ") - static_cast(*tensor_" + fNStart + ")) / static_cast(*tensor_" + fNDelta + ")), 0.0f))"; out << SP << "{\n"; out << SP << SP << "size_t range" << " = " << sizeName << ";\n"; - out << SP << SP << "if ( range > " << "fTensor_" << fNOutput << ".size() ){\n"; - out << SP << SP 
<< SP << "fTensor_" << fNOutput << ".resize(range);\n"; - // need to re-initialized pointer to tensor data - out << SP << SP << SP << "tensor_" << fNOutput << " = fTensor_" << fNOutput << ".data();\n"; - out << SP << SP << "}\n"; + if (sizeName != fShape[0].param) { + out << SP << SP << "if ( range > " << "fTensor_" << fNOutput << ".size() ){\n"; + // we should probably resize the tensor here + out << SP << SP << SP << "throw std::runtime_error(\"wrong size allocated for output of range\");\n"; + out << SP << SP << "}\n"; + } out << SP << SP << "for (size_t i = 0; i < range; i++) {\n"; - out << SP << SP << SP << "fTensor_" << fNOutput << "[i] = *tensor_" << fNStart << " + i * (*tensor_" << fNDelta << ");\n"; + out << SP << SP << SP << "tensor_" << fNOutput << "[i] = *tensor_" << fNStart << " + i * (*tensor_" << fNDelta << ");\n"; out << SP << SP << "}\n"; out << SP << "}\n"; return out.str(); diff --git a/tmva/sofie/inc/TMVA/SOFIE_common.hxx b/tmva/sofie/inc/TMVA/SOFIE_common.hxx index dfa46a44c03b0..7abb7df68d997 100644 --- a/tmva/sofie/inc/TMVA/SOFIE_common.hxx +++ b/tmva/sofie/inc/TMVA/SOFIE_common.hxx @@ -811,6 +811,22 @@ void ReadTensorFromStream(std::istream &is, T &target, std::string const &expect } } + +// code for the memory greeding allocations +struct TensorLifeInfo { + int begin; // start time (op index) lifetime + int end; // end time lifetime + size_t size; // size of tensors in bytes +}; + +struct MemoryResult { + std::size_t total_bytes = 0; // total memory needed + std::vector offsets; // resulted offsets for each tensor +}; + +/// Greedy best-fit planner with coalescing free list. +MemoryResult OrganizeMemory(const std::vector & tensorsInfo ); + } // namespace SOFIE } // namespace Experimental } // namespace TMVA diff --git a/tmva/sofie/src/RModel.cxx b/tmva/sofie/src/RModel.cxx index 32a1d3f235e11..d7ab2b4ad39af 100644 --- a/tmva/sofie/src/RModel.cxx +++ b/tmva/sofie/src/RModel.cxx @@ -167,16 +167,15 @@ void RModel::AddOperator(std::unique_ptr op, int order_execution) { } // storing the last usage of tensors which are input to - // operators (but are not inputs to the model, i.e. they are intermediate - // tensors). This information is needed to keep a check on when a - // particular intermediate tensor can be flushed to free up memory for reuse. 
+ // operators (but are not inputs to the model or they are not initialized) + // We call this function during parsing so we don't have yet initialized the operators for(size_t index = 0; index & inputParams, bool fOperators[op_idx]->Initialize(*this); for(auto &it:fOperators[op_idx]->GetOpOutputTensors()){ std::string name = std::string{it}; + // check if tensor is not an initialized or output tensor and it is not already in the list if (fIntermediateTensorFrequencyLookup.find(it) == fIntermediateTensorFrequencyLookup.end() && std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end() && - fInitializedTensors.find(name) == fInitializedTensors.end() && - fDynamicTensorInfos.find(name) == fDynamicTensorInfos.end()){ + fInitializedTensors.find(name) == fInitializedTensors.end()) + { fIntermediateTensorFrequencyLookup[it] = op_idx; } } @@ -616,9 +616,9 @@ void RModel::Initialize(const std::map & inputParams, bool // loop on initialized tensors and make the integers as constant to be // not written in a weight file - for (auto &i : fInitializedTensors) { - if (i.second.IsWeightTensor() && i.second.type() != ETensorType::FLOAT) - i.second.SetConstant(); + for (auto &it : fInitializedTensors) { + if (it.second.IsWeightTensor() && it.second.type() != ETensorType::FLOAT) + it.second.SetConstant(); } fIsInitialized = true; @@ -775,19 +775,21 @@ void RModel::GenerateIntermediateTensorInfo() { fGC += "//--- declare the dynamic tensors\n"; for (auto &i : fDynamicTensorInfos) { if (i.second.type == ETensorType::FLOAT) { - fGC += "std::vector fTensor_" + i.first + ";\n"; + //fGC += "std::vector fTensor_" + i.first + ";\n"; fGC += "float * tensor_" + i.first + " = nullptr;\n"; } else if (i.second.type == ETensorType::DOUBLE) { - fGC += "std::vector fTensor_" + i.first + ";\n"; + //fGC += "std::vector fTensor_" + i.first + ";\n"; fGC += "double * tensor_" + i.first + " = nullptr;\n"; } else if (i.second.type == ETensorType::INT64) { - fGC += "std::vector fTensor_" + i.first + ";\n"; + //fGC += "std::vector fTensor_" + i.first + ";\n"; fGC += "int64_t * tensor_" + i.first + " = nullptr;\n"; } else if (i.second.type == ETensorType::BOOL) { - fGC += "std::vector fTensor_" + i.first + ";\n"; + //fGC += "std::vector fTensor_" + i.first + ";\n"; fGC += "uint8_t * tensor_" + i.first + " = nullptr;\n"; } } + fGC += "//--- dynamic tensors pool\n"; + fGC += "std::vector fDynamicMemoryPool;\n"; } } @@ -805,14 +807,81 @@ void RModel::GenerateOperatorDeclarations() { void RModel::GenerateDynamicTensorInfo() { + // generate code for allocating dynamic tensors using the greedy memory allocations + if (fDynamicTensorInfos.empty()) + return; + std::stringstream out; + out << "// dynamic tensor memory management\n"; + out << SP << "std::vector dynamicTensorInfos;\n"; + out << SP << "dynamicTensorInfos.reserve(" << fDynamicTensorInfos.size() << ");\n"; + + // loop on all the operators to find begin/end life of the tensors + int op_index = 0; + std::vector> tensors; + tensors.reserve(fDynamicTensorInfos.size()); + for (auto & op : fOperators) { + // loop on output tensors - + for (auto &it : op->GetOpOutputTensors()) { + if (fVerbose) { + auto op_ptr = op.get(); + std::cout << "Looping on operator " << op_index << " " << typeid(*op_ptr).name() << std::endl; + } + // check if is a dynamic tensor + std::string name = std::string(it); + if ( fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end() ) { + auto tensor_size = ConvertDimShapeToLength(GetDimTensorShape(name)); + auto type = 
GetTensorType(name); + size_t type_size = GetTypeSize(type); + int begin = op_index; + int end = fOperators.size(); + // look for end + auto it_lookup = fIntermediateTensorFrequencyLookup.find(name); + if (it_lookup != fIntermediateTensorFrequencyLookup.end()) + end = it_lookup->second + 1; // end is last time used + 1 + // // some tensors (like xcol in convolutions) are just used within the operators + // if (end == 0 && begin > 0) end = begin+1; + + if (begin> end) { + std::cout << "op " << op_index << "tensor_" << name << " begin " << begin << " " << " end " << end << std::endl; + throw std::runtime_error("TMVA-SOFIE: RModel::GenerateDynamicTensorInfo: tensor_" + name + " has end before begin"); + } + + // write in code + out << SP << "dynamicTensorInfos.push_back( {" << begin << ", " << end << ", " << type_size << "* (" << tensor_size << ") });" + << " // tensor_" << name << std::endl; + tensors.push_back({name,type}); + } + } + op_index++; // increment operator index + } + out << "\n" << SP << "auto memory_result = OrganizeMemory(dynamicTensorInfos);\n\n"; + out << "// allocating now the memory\n"; + out << SP << "fDynamicMemoryPool = std::vector(memory_result.total_bytes);\n"; + out << SP << "int idx = 0;\n"; + for (auto & it : tensors) { + out << SP << "tensor_" << it.first << " = reinterpret_cast<" << ConvertTypeToString(it.second) << " *>(fDynamicMemoryPool.data() + memory_result.offsets[idx++]);\n"; + } + // check that all dynamic tensors are covered + bool missingTensor = false; for (auto &i : fDynamicTensorInfos) { - auto length = ConvertDynamicShapeToLength(i.second.shape); - out << SP << "if (" << length << " > 0) {\n"; - out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; - out << SP << SP << "tensor_" << i.first << " = fTensor_" << i.first << ".data();\n"; - out << SP << "}\n"; + if (std::find(tensors.begin(), tensors.end(), std::pair{i.first, i.second.type}) == tensors.end()) { + std::cout << "Dynamic tensors " << i.first << " is not in list of operator input/output " << std::endl; + missingTensor = true; + } } + if (missingTensor) + throw std::runtime_error("TMVA-SOFIE: RModel::GenerateDynamicTensorInfo - some tensors are not in input/output list"); + + + + // for (auto &i : fDynamicTensorInfos) { + // auto length = ConvertDynamicShapeToLength(i.second.shape); + // out << SP << "if (" << length << " > 0) {\n"; + // out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; + // out << SP << SP << "tensor_" << i.first << " = fTensor_" << i.first << ".data();\n"; + // out << SP << "}\n"; + // } fGC += out.str(); } diff --git a/tmva/sofie/src/SOFIE_common.cxx b/tmva/sofie/src/SOFIE_common.cxx index c107b489be19e..1ff510842643a 100644 --- a/tmva/sofie/src/SOFIE_common.cxx +++ b/tmva/sofie/src/SOFIE_common.cxx @@ -4,6 +4,8 @@ #include #include #include +#include +#include namespace TMVA { namespace Experimental { @@ -89,7 +91,7 @@ std::string ConvertTypeToString(ETensorType type){ return "double"; } case ETensorType::BOOL : { - return "bool"; + return "uint8_t"; } default:{ return "other_" + std::to_string( (int) type); @@ -547,6 +549,130 @@ std::vector UTILITY::ComputeStrideFromShape(const std::vector & shape) return strides; } +struct FreeBlock { + std::size_t offset; + std::size_t size; + bool operator<(const FreeBlock& other) const { + // order by offset for deterministic coalescing + return offset < other.offset; + } +}; + +struct MemoryEvent { + int t; // time (i.e. 
operator index) + int type; // 0 = END first, 1 = START + int idx; // tensor index + bool operator<(const MemoryEvent& o) const { + if (t != o.t) return t < o.t; + return type < o.type; // END before START at the same time + } +}; + +/// Greedy best-fit planner with coalescing free list. +MemoryResult OrganizeMemory(const std::vector & tensorsInfo ) +{ + // Basic validation + for (const auto &t : tensorsInfo) { + if (!(t.end > t.begin)) { + throw std::runtime_error("Each tensor must have end > begin."); + } + } + + // Build events: free before allocate at equal times. + std::vector events; + events.reserve(tensorsInfo.size() * 2); + for (int i = 0; i < (int)tensorsInfo.size(); ++i) { + events.push_back({tensorsInfo[i].end, 0, i}); // END + events.push_back({tensorsInfo[i].begin, 1, i}); // START + } + std::sort(events.begin(), events.end()); + + std::vector tensorsOffset(tensorsInfo.size()); + + // Free list ordered by offset (for O(log n) coalescing) + // and faster insert/erase with respect to a vector + std::set free_list; + + // Bookkeeping: size/offset map for frees. + std::unordered_map live_size; + std::unordered_map live_offset; + + std::size_t total_bytes = 0; + + auto allocate_best_fit = [&](std::size_t need) -> std::size_t { + // Find the *smallest* block whose size >= need (best-fit). + // Since free_list is ordered by offset, we scan to find best by size. + // (For very large sets you could maintain a multimap by size as well.) + auto best = free_list.end(); + for (auto it = free_list.begin(); it != free_list.end(); ++it) { + if (it->size >= need) { + if (best == free_list.end() || it->size < best->size) + best = it; + } + } + if (best != free_list.end()) { + std::size_t off = best->offset; + if (best->size == need) { + free_list.erase(best); + } else { + FreeBlock updated{best->offset + need, best->size - need}; + free_list.erase(best); + free_list.insert(updated); + } + return off; + } + // No free block large enough; grow the heap. + std::size_t off = total_bytes; + total_bytes += need; + return off; + }; + + auto try_coalesce = [&](std::set::iterator it) { + // Coalesce with previous + if (it != free_list.begin()) { + auto prev = std::prev(it); + if (prev->offset + prev->size == it->offset) { + FreeBlock merged{prev->offset, prev->size + it->size}; + free_list.erase(prev); + it = free_list.erase(it); + it = free_list.insert(merged).first; + } + } + // Coalesce with next + auto next = std::next(it); + if (next != free_list.end() && it->offset + it->size == next->offset) { + FreeBlock merged{it->offset, it->size + next->size}; + free_list.erase(next); + it = free_list.erase(it); + free_list.insert(merged); + } + }; + + // Sweep through time. 
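// Note (illustrative only, sizes are hypothetical and not taken from a real model):
// the events are processed in time order with END sorted before START at equal
// times, so a tensor whose lifetime ends at operator t can hand its block to one
// whose lifetime begins at the same t. For example, with
//   A{begin=0, end=2, size=4000}, B{begin=1, end=3, size=16000}, C{begin=2, end=4, size=4000}
// the sweep does:
//   t=0 START A -> no free block, grow heap, A at offset 0        (total 4000)
//   t=1 START B -> grow heap, B at offset 4000                    (total 20000)
//   t=2 END A   -> free [0,4000); START C -> best fit reuses it, C at offset 0
//   t=3 END B, t=4 END C -> blocks returned and coalesced, total stays 20000
// so C reuses A's storage and the pool never grows beyond A + B.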
+ for (const auto &e : events) { + if (e.type == 0) { // END: free + auto it_sz = live_size.find(e.idx); + auto it_off = live_offset.find(e.idx); + if (it_sz != live_size.end() && it_off != live_offset.end()) { + FreeBlock fb{it_off->second, it_sz->second}; + // Insert and coalesce with neighbors + auto it = free_list.insert(fb).first; + try_coalesce(it); + live_size.erase(it_sz); + live_offset.erase(it_off); + } + } else { // START: allocate + auto &t = tensorsInfo[e.idx]; + std::size_t off = allocate_best_fit(t.size); + tensorsOffset[e.idx] = off; + live_size[e.idx] = t.size; + live_offset[e.idx] = off; + } + } + + return MemoryResult{total_bytes, std::move(tensorsOffset)}; +} + } // namespace SOFIE } // namespace Experimental } // namespace TMVA From 556f0d701718a577ebe6757dfc55e45fbd7940d4 Mon Sep 17 00:00:00 2001 From: moneta Date: Fri, 14 Nov 2025 10:41:42 +0100 Subject: [PATCH 04/12] [tmva][sofie] Fix an issue in genereting code for dynamic tensor when broadcasting The assert that was generated when broadcasting dynamic tensors was not correct --- tmva/sofie/inc/TMVA/ROperator_Gemm.hxx | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx index 1c8b51d991af2..1a0fa7b16868b 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx @@ -207,13 +207,7 @@ namespace SOFIE{ } fShapeY = DynamicShapeInference({fShapeA, fShapeB}); - std::vector shapeY; - if (!fIsDynamic) { - shapeY = ConvertShapeToInt(fShapeY); - if (shapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Gemm Op " + fNY + " has invalid shape" + ConvertShapeToString(fShapeY)); - } - } + std::vector shapeY = ConvertShapeToInt(fShapeY); // bias is normally not dynamic (not support it for time being) if (fNC != ""){ @@ -225,7 +219,11 @@ namespace SOFIE{ size_t lengthC = ConvertShapeToLength(fShapeC); size_t lengthY = ConvertShapeToLength(shapeY); // for dynamic outputs broadcasting is always done - bool broadcast_needed = lengthC != lengthY; + bool broadcast_needed = false; + if (fIsDynamic && shapeY.empty()) + broadcast_needed = true; + else + broadcast_needed = lengthC != lengthY; if (broadcast_needed) { @@ -359,7 +357,7 @@ namespace SOFIE{ + ConvertShapeToString(fShapeC) + " output length " + lengthGemm); } else { // add a dynamic check (C should not be a dynamic tensor) - out << SP << "assert(" << lengthGemm << " != " << ConvertShapeToLength(fShapeC) << ");\n"; + out << SP << "assert(" << lengthGemm << " == " << ConvertShapeToLength(fShapeC) << ");\n"; } } } else { From 3e6691e478f45ea545606ece67768e02570907bf Mon Sep 17 00:00:00 2001 From: moneta Date: Wed, 17 Dec 2025 22:32:27 +0100 Subject: [PATCH 05/12] [tmva][sofie] Fix stacked MatMul and speedup LayerNorm Apply also other fixes for the SOFIE tests and add a new test for StackMul --- tmva/sofie/inc/TMVA/ROperator_Gather.hxx | 2 - tmva/sofie/inc/TMVA/ROperator_Gemm.hxx | 46 ++- .../inc/TMVA/ROperator_LayerNormalization.hxx | 265 +++++++++--------- tmva/sofie/inc/TMVA/ROperator_Range.hxx | 29 +- tmva/sofie/src/RModel.cxx | 33 ++- tmva/sofie/test/TestCustomModelsFromONNX.cxx | 29 +- .../test/input_models/MatMul_Stacked.onnx | 19 ++ 7 files changed, 251 insertions(+), 172 deletions(-) create mode 100644 tmva/sofie/test/input_models/MatMul_Stacked.onnx diff --git a/tmva/sofie/inc/TMVA/ROperator_Gather.hxx b/tmva/sofie/inc/TMVA/ROperator_Gather.hxx index 1d51c59380dae..0d50c0747c028 100644 --- 
a/tmva/sofie/inc/TMVA/ROperator_Gather.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Gather.hxx @@ -72,8 +72,6 @@ public: // empty shape Indices is a scalar value for the indices size_t indicesLength = ConvertShapeToLength(model.GetTensorShape(fNIndices)); int64_t* indicesData = static_cast(model.GetInitializedTensorData(fNIndices).get()); - //flag index tensor as not writable (not sure this is needed since index tensor might be used in generated code) - model.SetNotWritableInitializedTensor(fNIndices); // update indices data in case of negative dim values for (size_t i = 0; i < indicesLength; i++) { // move this at generation time? diff --git a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx index 1a0fa7b16868b..47bc5392fede4 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx @@ -337,6 +337,8 @@ namespace SOFIE{ auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); auto k = (fAttrTransA ? fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + // size of A: if (trasposeA) is m*k else k*m + // size of B n*k std::vector sY = {fShapeY[dimY-2], fShapeY[dimY-1]}; // extra dimensions in case of stacked MatMul std::vector sA; @@ -371,9 +373,32 @@ namespace SOFIE{ // include MatMul case where we stack the Gemm operations // exclude case where we have only 1's in the additional dims bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra) > 1); + // compute input offset for stack multiplications + std::string lengthExtra_A; + std::string lengthExtra_B; + std::string increment_A; + std::string increment_B; + + if (doStackMul) { + std::vector sA(fShapeA.begin(), fShapeA.begin()+dimA-2); + std::vector sB(fShapeB.begin(), fShapeB.begin()+dimB-2); + std::vector mA = {fShapeA[dimA-2], fShapeA[dimA-1]}; + std::vector mB = {fShapeA[dimB-2], fShapeB[dimB-1]}; + lengthExtra_A = ConvertDimShapeToLength(sA); + lengthExtra_B = ConvertDimShapeToLength(sB); + // size of A performing matmul is m*k and n*k for B + increment_A = ConvertDimShapeToLength(mA); + increment_B = ConvertDimShapeToLength(mB); + } + bool extraA = (doStackMul && lengthExtra_A != "1"); + bool extraB = (doStackMul && lengthExtra_B != "1"); if (doStackMul) { - out << SP << "size_t " << opName << "_yoffset = 0;\n"; // needed if we stack the gemm operations - out << SP << "for (int i = 0; i < " << lengthExtra << "; i++){\n"; + out << SP << "size_t " << opName << "_y_offset = 0;\n"; // needed if we stack the gemm operations + if (extraA) + out << SP << "size_t " << opName << "_A_offset = 0;\n"; + if (extraB) + out << SP << "size_t " << opName << "_B_offset = 0;\n"; + out << SP << "for (size_t i = 0; i < " << lengthExtra << "; i++){\n"; out << SP; } @@ -381,14 +406,16 @@ namespace SOFIE{ out << SP << "TMVA::Experimental::SOFIE::Gemm_Call(" << "tensor_" << fNY; - if (doStackMul) out << " + " << opName << "_yoffset"; + if (doStackMul) out << " + " << opName << "_y_offset"; out << ", " << (fAttrTransB ? "true, " : "false, ") << (fAttrTransA ? 
"true, " : "false, ") << n << ", " << m << ", " << k << ", "; - out << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ","; - out << "tensor_" << fNB << ", " << "tensor_" << fNA << ", "; - out << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ","; + out << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ", tensor_" << fNB; + if (extraB) out << " + " << opName << "_B_offset"; + out << ", tensor_" << fNA; + if (extraA) out << " + " << opName << "_A_offset"; + out << ", " << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ","; // in the case of bias if (!fNC.empty()) out << "tensor_" << fNC; @@ -404,7 +431,12 @@ namespace SOFIE{ } if (doStackMul) { - out << SP << SP << opName << "_yoffset += " << lengthGemm << ";\n"; + out << SP << SP << opName << "_y_offset += " << lengthGemm << ";\n"; + if (lengthExtra_A != "1") + out << SP << SP << opName << "_A_offset += " << increment_A << ";\n"; + if (lengthExtra_B != "1") + out << SP << SP << opName << "_B_offset += " << increment_B << ";\n"; + out << "}\n"; // end of loop on the stacked multiplications } diff --git a/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx b/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx index 239c5332172b0..f98ce201d400d 100644 --- a/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx @@ -14,6 +14,7 @@ namespace SOFIE { template class ROperator_LayerNormalization : public ROperator { private: + bool fCastToFloat = false; // flag to indicate if operation 1 are in floats (to be impl) int fAttrAxis; float fAttrEpsilon; size_t fAttrStashType; @@ -31,7 +32,7 @@ private: std::vector fShapeX; std::vector fShapeScale; - std::vector fShapeB; // shape of input Bias (B) is assumed to be fully defined + std::vector fShapeB; std::vector fShapeY; std::vector fShapeMean; std::vector fShapeInvStdDev; @@ -40,8 +41,8 @@ private: size_t fSize; // Size of the input // size_t fAxisDim; - std::vector fNormalizedShape; - std::vector fAxesShape; + std::vector fNormalizedShape; // shape from X[ axis,...,N-1] + std::vector fAxesShape; // shape from X[0,..,axis-1] // lengths in string format std::string fLength; // Length of the input std::string fNormalizedLength; @@ -79,7 +80,7 @@ public: void Initialize(RModel& model) override { if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found."); + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Tensor " + fNX + " not found."); } bool isDynamic = model.IsDynamicTensor(fNX); fShapeX = model.GetDimTensorShape(fNX); @@ -104,8 +105,7 @@ public: // Type of mean and std ETensorType type = (fAttrStashType == 1) ? 
ETensorType::FLOAT : model.GetTensorType(fNX); // Mean - if (fNMean.empty()) { - fNMean = "Mean" + fNX; + if (!fNMean.empty()) { // cannot use initializer list with one element since it is ambiguous if (isDynamic) // add size_t(-1) to indicate that shape is an expression @@ -114,29 +114,60 @@ public: model.AddIntermediateTensor(fNMean, type, std::vector(1,std::stoi(fAxesLength))); } // Inverse Standard Deviation - if (fNInvStdDev.empty()) { - fNInvStdDev = "InvStdDev" + fNX; + if (!fNInvStdDev.empty()) { if (isDynamic) model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,Dim{fAxesLength,std::size_t(-1)})); else model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,std::stoi(fAxesLength))); } + // if mean and stdev are not empty they are not defined in the output list // Cast X to float if (fAttrStashType == 1 && model.GetTensorType(fNX) != ETensorType::FLOAT) { - fNCastedX = "Casted" + fNX; - model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX); - fNNormalizedX = "Normalized" + fNX; - model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX); + fCastToFloat = true; + fType = "float"; + // fNCastedX = "Casted" + fNX; + // model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX); + // fNNormalizedX = "Normalized" + fNX; + // model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX); + } + // scale shape + fShapeScale = model.GetDimTensorShape(fNScale); + // appends 1 to scale shapes if missing + size_t dimScale = fShapeScale.size(); + if (dimScale < fSize) { + for (size_t i = 0; i < fSize-dimScale; i++) + fShapeScale.insert(fShapeScale.begin(), Dim{1}); + } + // check also shape if consistent now + for (size_t i = 0; i < fSize; i++) { + if (fShapeScale[i].dim != 1 && fShapeScale[i] != fShapeX[i]) + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Scale Tensor has invalid shape " + ConvertDimShapeToString(fShapeScale)); } - // Broadcast the bias if (!fNB.empty()) { - fShapeB = model.GetTensorShape(fNB); - size_t lengthB = ConvertShapeToLength(fShapeB); - if (isDynamic || lengthB < static_cast(std::stoi(fLength))) { - fNBroadcastedB = "Broadcasted" + fNB; - model.AddIntermediateTensor(fNBroadcastedB, ConvertStringToType(fType), fShapeX); + fShapeB = model.GetDimTensorShape(fNB); + // appends 1 to bias shapes if missing + size_t dimB = fShapeB.size(); + if (dimB < fShapeX.size()) { + for (size_t i = 0; i < fSize-dimB; i++) + fShapeB.insert(fShapeB.begin(), Dim{1}); + } + for (size_t i = 0; i < fSize; i++) { + if (fShapeB[i].dim != 1 && fShapeB[i] != fShapeX[i]) + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Bias Tensor has invalid shape " + ConvertDimShapeToString(fShapeScale)); } } + + std::cout << "bias + scale " << ConvertDimShapeToString(fShapeB) << " " << ConvertDimShapeToString(fShapeScale) << std::endl; + + // // Broadcast the bias + // if (!fNB.empty()) { + // fShapeB = model.GetTensorShape(fNB); + // size_t lengthB = ConvertShapeToLength(fShapeB); + // if (isDynamic || lengthB < static_cast(std::stoi(fLength))) { + // fNBroadcastedB = "Broadcasted" + fNB; + // model.AddIntermediateTensor(fNBroadcastedB, ConvertStringToType(fType), fShapeX); + // } + // } model.AddNeededStdLib("cmath"); } @@ -162,10 +193,6 @@ public: throw std::runtime_error("TMVA::SOFIE LayerNormalization operator " + opName + " called to generate without being initialized first."); } - if (fShapeX.size() > 5) { - throw std::runtime_error("TMVA::SOFIE LayerNormalization operator not " - "implemented for input tensor of size 
> 5."); - } std::stringstream out; @@ -179,10 +206,32 @@ public: } auto strides = UTILITY::ComputeStrideFromShape(fShapeX); - std::string InputIndex = "axis_0 * " + strides[0].GetVal(); + std::string inputIndex = "axis_0 * " + strides[0].GetVal(); for (size_t i = 1; i < fSize; i++) { - InputIndex += " + axis_" + std::to_string(i) + " * " + strides[i].GetVal(); + inputIndex += " + axis_" + std::to_string(i); + if (i < fSize-1) inputIndex += " * " + strides[i].GetVal(); } + auto scaleStrides = UTILITY::ComputeStrideFromShape(fShapeScale); + std::string scaleIndex; + for (size_t i = 0; i < fSize; i++) { + if (fShapeScale[i].dim != 1) { + if (!scaleIndex.empty()) scaleIndex += " + "; + scaleIndex += "axis_" + std::to_string(i); + if ( scaleStrides[i].dim != 1) scaleIndex += " * " + scaleStrides[i].GetVal(); + } + } + if (scaleIndex.empty()) scaleIndex = "0"; + + auto biasStrides = UTILITY::ComputeStrideFromShape(fShapeB); + std::string biasIndex; + for (size_t i = 0; i < fSize; i++) { + if (fShapeB[i].dim != 1) { + if (!biasIndex.empty()) biasIndex += " + "; + biasIndex += "axis_" + std::to_string(i); + if ( biasStrides[i].dim != 1) biasIndex += " * " + biasStrides[i].GetVal(); + } + } + if (biasIndex.empty()) biasIndex = "0"; auto axesStrides = UTILITY::ComputeStrideFromShape(fAxesShape); std::string axesIndex = "axis_" + std::to_string(0) + " * " + axesStrides[0].GetVal(); @@ -190,51 +239,42 @@ public: axesIndex += " + axis_" + std::to_string(i) + " * " + axesStrides[i].GetVal(); } - auto normalizedStrides = UTILITY::ComputeStrideFromShape(fNormalizedShape); - std::string normalizedIndex = "axis_" + std::to_string(fAxis) + " * " + normalizedStrides[0].GetVal(); - for (size_t i = fAxis + 1; i < fSize; i++) { - normalizedIndex += " + axis_" + std::to_string(i) + " * " + normalizedStrides[i - fAxis].GetVal(); - } - if (!fNCastedX.empty()) { - // Cast X to float - out << SP << "for (size_t i = 0; i < " << fLength << "; i++) {\n"; - out << SP << SP << "tensor_" << fNCastedX << "[i] = " << "static_cast(tensor_" << fNX; - out << "[i]);\n"; - out << SP << "}\n"; - } + // compute mean and std-dev. 
Save in tensors if requested out << SP << "// Compute the mean\n"; - // Loop over the normalized dimensions + // Loop over all the dims in [0, fAxis) for (size_t i = 0; i < fAxis; i++) { std::string iIdx = "axis_" + std::to_string(i); out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] << "; " << iIdx << "++) {\n"; } - out << SP << SP << fType << " sum = 0.;\n"; - // loop over all the dims in [0, fAxis) + out << SP << SP << fType << " mean = 0.;\n"; + // loop over the normalized dimensions (fAxis,....,N-1) for (size_t j = fAxis; j < fSize; j++) { std::string jIdx = "axis_" + std::to_string(j); out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx << "++) {\n"; } - out << SP << SP << SP << "sum += tensor_" << fNX << "[" << InputIndex << "];\n"; + out << SP << SP << SP << "mean += tensor_" << fNX << "[" << inputIndex << "];\n"; for (size_t j = fAxis; j < fSize; j++) { out << SP << SP << "}\n"; } - out << SP << SP << "tensor_" << fNMean << "[" << axesIndex << "] = sum / " << fType << "("; - out << fNormalizedLength << ");\n"; - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } + out << SP << SP << "mean /= " << fType << "(" << fNormalizedLength << ");\n"; + + // for (size_t i = fAxis; i < fSize; i++) { + // out << SP << "}\n"; + // } + // tensor_" << fNMean << "[" << axesIndex << "] out << SP << "// Compute the inverse Standard Deviation\n"; // Loop over the normalized dimensions - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } + // for (size_t i = 0; i < fAxis; i++) { + // std::string iIdx = "axis_" + std::to_string(i); + // out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] + // << "; " << iIdx << "++){\n"; + // } + // Set sum = 0 out << SP << SP << fType << " sum = 0.;\n"; // loop over all the dims in [0, fAxis) @@ -243,92 +283,63 @@ public: out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx << "++){\n"; } - out << SP << SP << SP << "float tmp = tensor_" << fNX << "[" << InputIndex << "] - tensor_" - << fNMean << "[" << axesIndex << "];\n"; + out << SP << SP << SP << "float tmp = tensor_" << fNX << "[" << inputIndex << "] - mean;\n"; out << SP << SP << SP << "sum += tmp*tmp;\n"; for (size_t j = fAxis; j < fSize; j++) { out << SP << SP << "}\n"; } - out << SP << SP << "tensor_" << fNInvStdDev << "[" << axesIndex << "] = 1 / std::sqrt("; + out << SP << SP << fType << " invStdDev = 1 / std::sqrt("; out << "sum / " << fType << "(" << fNormalizedLength << ") + " << fAttrEpsilon << ");\n"; - for (size_t i = 0; i < fAxis; i++) { - out << SP << "}\n"; - } - if (!fNCastedX.empty()) { - out << "// NormalizedX = InvStdDev * (CastedX - Mean)\n"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNNormalizedX << "[" << InputIndex << "] = tensor_"; - out << fNInvStdDev << "[" << axesIndex << "] * (tensor_" << fNCastedX << "[" << InputIndex; - out << "] - 
tensor_" << fNMean << "[" << axesIndex << "])\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - out << "// Y = Scale o NormalizedX"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale; - out << "[" << axesIndex << "] * static_cast<" << fType << ">(tensor_" << fNCastedX << "[" << InputIndex; - out << "]);\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - } else { - out << SP << "// Y = Scale o InvStdDev (X - Mean)\n"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale; - out << "[" << normalizedIndex << "] * tensor_" << fNInvStdDev << "[" << axesIndex; - out << "] * (tensor_" << fNX << "[" << InputIndex << "] - tensor_" << fNMean << "["; - out << axesIndex << "]);\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } + // for (size_t i = 0; i < fAxis; i++) { + // out << SP << "}\n"; + // } + + // set output mean and invStdDev if requested + if (!fNMean.empty()) + out << SP << SP << "tensor_" << fNMean << "[" << axesIndex << "] = mean;\n"; + if (!fNInvStdDev.empty()) + out << SP << SP << "tensor_" << fNInvStdDev << "[" << axesIndex << "] = invStdDev;\n"; + + // scale and add bias + + out << SP << "// Y = Scale o InvStdDev (X - Mean)\n"; + // for (size_t i = 0; i < fAxis; i++) { + // std::string iIdx = "axis_" + std::to_string(i); + // out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] + // << "; " << iIdx << "++){\n"; + // } + + for (size_t j = fAxis; j < fSize; j++) { + std::string jIdx = "axis_" + std::to_string(j); + out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx + << "++){\n"; } + out << SP << SP << SP << "tensor_" << fNY << "[" << inputIndex << "] = tensor_" << fNScale; + out << "[" << scaleIndex << "] * invStdDev * (tensor_" << fNX << "[" << inputIndex << "] - mean)"; - if (!fNB.empty()) { - std::string bias = "tensor_" + (fNBroadcastedB.empty() ? 
fNB : fNBroadcastedB); - out << SP << "// Add the bias to Y\n"; - out << SP << "int " << opName << "_n = " << fLength << ";\n"; - out << SP << "float " << opName << "_alpha = 1.;\n"; - out << SP << "int " << opName << "_inc = 1;\n"; - out << SP << "BLAS::saxpy_(&" << opName << "_n, &" << opName << "_alpha, " << bias << ", &"; - out << opName << "_inc, " << "tensor_" << fNY << ", &" << opName << "_inc);\n"; + // add bias if needed + if (!fNB.empty()) + // assume bias has index as scale + out << " + tensor_" << fNB << "[" << biasIndex << "]"; + out << ";\n"; + + for (size_t j = fAxis; j < fSize; j++) { + out << SP << SP << "}\n"; } + for (size_t i = fAxis; i < fSize; i++) { + out << SP << "}\n"; + } + + // if (!fNB.empty()) { + // std::string bias = "tensor_" + (fNBroadcastedB.empty() ? fNB : fNBroadcastedB); + // out << SP << "// Add the bias to Y\n"; + // out << SP << "int " << opName << "_n = " << fLength << ";\n"; + // out << SP << "float " << opName << "_alpha = 1.;\n"; + // out << SP << "int " << opName << "_inc = 1;\n"; + // out << SP << "BLAS::saxpy_(&" << opName << "_n, &" << opName << "_alpha, " << bias << ", &"; + // out << opName << "_inc, " << "tensor_" << fNY << ", &" << opName << "_inc);\n"; + // } return out.str(); } diff --git a/tmva/sofie/inc/TMVA/ROperator_Range.hxx b/tmva/sofie/inc/TMVA/ROperator_Range.hxx index 16d2cb689d518..b91e45dd6d84b 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Range.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Range.hxx @@ -108,11 +108,6 @@ public: model.AddConstantTensor(fNOutput,shape, output.data()); fShape = ConvertShapeToDim(shape); - // set the input tensor not writable - model.SetNotWritableInitializedTensor(fNStart); - model.SetNotWritableInitializedTensor(fNDelta); - model.SetNotWritableInitializedTensor(fNLimit); - } else { // case of a shape tensor std::string start = (res1 == 1) ? std::to_string(start_value) : start_dim.GetVal(); std::string limit = (res2 == 1) ? 
std::to_string(limit_value) : limit_dim.GetVal(); @@ -164,22 +159,20 @@ public: throw std::runtime_error("TMVA SOFIE Range operator called to Generate without being initialized first"); } - std::string sizeName = fShape[0].param; - if (sizeName.find("range_size") != std::string::npos) - sizeName = "static_cast(std::max(std::ceil((static_cast(*tensor_" + fNLimit + + std::string outputSizeVar; + std::string outputSize = fShape[0].param; + if (outputSize.find("range_size") != std::string::npos) { + outputSizeVar = outputSize; + outputSize = "static_cast(std::max(std::ceil((static_cast(*tensor_" + fNLimit + ") - static_cast(*tensor_" + fNStart + ")) / static_cast(*tensor_" + fNDelta + ")), 0.0f))"; - out << SP << "{\n"; - out << SP << SP << "size_t range" << " = " << sizeName << ";\n"; - if (sizeName != fShape[0].param) { - out << SP << SP << "if ( range > " << "fTensor_" << fNOutput << ".size() ){\n"; - // we should probably resize the tensor here - out << SP << SP << SP << "throw std::runtime_error(\"wrong size allocated for output of range\");\n"; - out << SP << SP << "}\n"; + } else { + outputSizeVar = "range_" + opName; } - out << SP << SP << "for (size_t i = 0; i < range; i++) {\n"; - out << SP << SP << SP << "tensor_" << fNOutput << "[i] = *tensor_" << fNStart << " + i * (*tensor_" << fNDelta << ");\n"; - out << SP << SP << "}\n"; + out << SP << "size_t " << outputSizeVar << " = " << outputSize << ";\n"; + out << SP << "for (size_t i = 0; i < " << outputSizeVar << "; i++) {\n"; + out << SP << SP << "tensor_" << fNOutput << "[i] = *tensor_" << fNStart << " + i * (*tensor_" << fNDelta << ");\n"; out << SP << "}\n"; + return out.str(); } }; diff --git a/tmva/sofie/src/RModel.cxx b/tmva/sofie/src/RModel.cxx index d7ab2b4ad39af..2f80138265ee7 100644 --- a/tmva/sofie/src/RModel.cxx +++ b/tmva/sofie/src/RModel.cxx @@ -577,19 +577,6 @@ void RModel::Initialize(const std::map & inputParams, bool PrintDynamicTensors(); } - // check if there are initialized tensors to write in a weight file - // support for the time being only weight of FLOAT type - if (fUseWeightFile) { - bool modelHasWeights = false; - for (auto &i : fInitializedTensors) { - if (i.second.IsWeightTensor()) { - modelHasWeights = true; - break; - } - } - if (!modelHasWeights) - fUseWeightFile = false; - } // Go through model and initialize each operator int i = 0; @@ -621,6 +608,20 @@ void RModel::Initialize(const std::map & inputParams, bool it.second.SetConstant(); } + // check if there are initialized tensors to write in a weight file + // support for the time being only weight of FLOAT type + if (fUseWeightFile) { + bool modelHasWeights = false; + for (auto &i : fInitializedTensors) { + if (i.second.IsWeightTensor()) { + modelHasWeights = true; + break; + } + } + if (!modelHasWeights) + fUseWeightFile = false; + } + fIsInitialized = true; } @@ -698,7 +699,7 @@ void RModel::GenerateInitializedTensorInfo() // here are constant tensor or initialized ones which are not weights (e.g. 
int64_t tensors ) for (auto &i : fInitializedTensors) { if (i.second.IsNotWritable()) continue; - if (!fUseWeightFile || i.second.IsConstantTensor()) { + if (!fUseWeightFile || i.second.IsConstantTensor() || !i.second.IsWeightTensor() ) { if (i.second.type() == ETensorType::FLOAT) { fGC += GenerateConstantTensorCode(i); fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 4; @@ -1203,7 +1204,9 @@ void RModel::Generate(std::underlying_type_t options, int batchSize, lo void RModel::ReadInitializedTensorsFromFile(long pos) { // generate the code to read initialized tensors from a text data file if (fWeightFile == WeightFileType::Text) { - if (fInitializedTensors.empty()) return; + // check if there are tensors to write + + if (!fUseWeightFile) return; fGC += " std::ifstream f;\n"; fGC += " f.open(filename);\n"; diff --git a/tmva/sofie/test/TestCustomModelsFromONNX.cxx b/tmva/sofie/test/TestCustomModelsFromONNX.cxx index 5b77caf2aed1d..401afb8257e25 100644 --- a/tmva/sofie/test/TestCustomModelsFromONNX.cxx +++ b/tmva/sofie/test/TestCustomModelsFromONNX.cxx @@ -323,6 +323,8 @@ #include "ScatterElements_FromONNX.hxx" +#include "MatMul_Stacked_FromONNX.hxx" + #include "gtest/gtest.h" constexpr float DEFAULT_TOLERANCE = 1e-3f; @@ -2856,7 +2858,7 @@ TEST(ONNX, RangeFloat) { float start = 1.; float limit = 10.; float delta = 2.; - TMVA_SOFIE_RangeFloat::Session s("RangeFloat_FromONNX.dat"); + TMVA_SOFIE_RangeFloat::Session s("RangeFloat_FromONNX.dat",5); std::vector output(s.infer(&start, &limit, &delta)); // Checking the output size @@ -2875,7 +2877,7 @@ TEST(ONNX, RangeInt) { int64_t start = 1; int64_t limit = 10; int64_t delta = 2; - TMVA_SOFIE_RangeInt::Session s("RangeInt_FromONNX.dat"); + TMVA_SOFIE_RangeInt::Session s("RangeInt_FromONNX.dat",5); std::vector output(s.infer(&start, &limit, &delta)); // Checking the output size @@ -2947,7 +2949,7 @@ TEST(ONNX, Where) { // test also the broadcast of boolean tensors std::vector input1 = {1,2}; std::vector input2 = {3,4,5,6}; - bool cond[] = {true, false, true}; // need to pass arrays for booleans + uint8_t cond[] = {true, false, true}; // need to pass arrays for booleans std::vector correct = {1,2,5,6,1,2}; TMVA_SOFIE_Where::Session s("Where_FromONNX.dat"); std::vector output(s.infer(input1.data(), input2.data(), cond)); @@ -3214,3 +3216,24 @@ TEST(ONNX, ScatterElements) EXPECT_LE(std::abs(output[i] - correct_output[i]), DEFAULT_TOLERANCE); } } + +TEST(ONNX, MatMul_Stacked) +{ + // test scatter elements (similar test as in ONNX doc) + std::vector input1 = {1,2,3,4,5,6,7,8}; // input tensor shape is (2,2,2) + std::vector input2 = {2,3}; // shape is (2,1) + + std::vector correct_output = {8,18, 28,38}; + + // model is dynamic , use N = 2 + TMVA_SOFIE_MatMul_Stacked::Session s("MatMul_Stacked_FromONNX.dat", 2); + + auto output = s.infer(2, input1.data(), input2.data()); + + // Checking output size + EXPECT_EQ(output.size(), correct_output.size()); + // Checking output + for (size_t i = 0; i < output.size(); ++i) { + EXPECT_LE(std::abs(output[i] - correct_output[i]), DEFAULT_TOLERANCE); + } +} diff --git a/tmva/sofie/test/input_models/MatMul_Stacked.onnx b/tmva/sofie/test/input_models/MatMul_Stacked.onnx new file mode 100644 index 0000000000000..19c39ee2adddd --- /dev/null +++ b/tmva/sofie/test/input_models/MatMul_Stacked.onnx @@ -0,0 +1,19 @@ + + onnx-example:„ + +input1 +input2output"MatMulAddGraphZ +input1 + +N + +Z +input2 +  + +b +output + +N + +B \ No newline at end of file From 2fba9e5ea8f00baceda6a6e10d153911e049a71d Mon Sep 17 
00:00:00 2001 From: moneta Date: Sat, 20 Dec 2025 17:41:21 +0100 Subject: [PATCH 06/12] [tmva][sofie] Fix issue with order execution of tensors in GNN models The order execution ws not set for tensor inputs to operators added using the GNN Sofie classes. This is now fixed and the correct memory mangement can be performed. --- tmva/sofie/inc/TMVA/RFunction.hxx | 6 +++--- tmva/sofie/inc/TMVA/RModel.hxx | 10 +++++----- tmva/sofie/inc/TMVA/SOFIE_common.hxx | 4 ++-- tmva/sofie/src/RFunction.cxx | 20 +++++++++----------- tmva/sofie/src/RFunction_MLP.cxx | 16 ++++++++-------- tmva/sofie/src/RModel.cxx | 26 +++++++++++++++++--------- 6 files changed, 44 insertions(+), 38 deletions(-) diff --git a/tmva/sofie/inc/TMVA/RFunction.hxx b/tmva/sofie/inc/TMVA/RFunction.hxx index 1cca39aa7ff3e..9247bd4180d26 100644 --- a/tmva/sofie/inc/TMVA/RFunction.hxx +++ b/tmva/sofie/inc/TMVA/RFunction.hxx @@ -32,7 +32,7 @@ public: class RFunction_Update: public RFunction { protected: - std::shared_ptr function_block; + std::shared_ptr fFunction_block; FunctionTarget fTarget; GraphType fGraphType; std::vector fInputTensors; @@ -50,9 +50,9 @@ public: void AddInputTensors(const std::vector>& inputShapes); void AddInputTensors(const std::vector>& inputShapes); std::shared_ptr GetFunctionBlock() { - return function_block; + return fFunction_block; } - std::string GenerateModel(const std::string& filename, long read_pos = 0, long block_size = -1); + std::string GenerateModel(const std::string& filename, long read_pos = 0, long block_size = -1, bool verbose = false); std::string Generate(const std::vector& inputPtrs); FunctionTarget GetFunctionTarget() { return fTarget; diff --git a/tmva/sofie/inc/TMVA/RModel.hxx b/tmva/sofie/inc/TMVA/RModel.hxx index 996c51020270f..2a68bcb3593d3 100644 --- a/tmva/sofie/inc/TMVA/RModel.hxx +++ b/tmva/sofie/inc/TMVA/RModel.hxx @@ -205,8 +205,8 @@ public: void ReadInitializedTensorsFromFile(long); long WriteInitializedTensorsToFile(std::string filename = ""); - void PrintIntermediateTensors(); - void PrintOutputTensors(); + void PrintIntermediateTensors() const; + void PrintOutputTensors() const; void OutputGenerated(std::string filename = "", bool append = false); std::vector GetOutputTensorNames() { return fOutputTensorNames; } void SetFilename(std::string filename) { fName = filename; } @@ -224,9 +224,9 @@ public: } */ - void PrintRequiredInputTensors(); - void PrintInitializedTensors(); - void PrintDynamicTensors(); + void PrintRequiredInputTensors() const; + void PrintInitializedTensors() const; + void PrintDynamicTensors() const; void HeadInitializedTensors(std::string name, int n_print = 50); bool UseSession() const { return fUseSession; } diff --git a/tmva/sofie/inc/TMVA/SOFIE_common.hxx b/tmva/sofie/inc/TMVA/SOFIE_common.hxx index 7abb7df68d997..68a74d08fd93a 100644 --- a/tmva/sofie/inc/TMVA/SOFIE_common.hxx +++ b/tmva/sofie/inc/TMVA/SOFIE_common.hxx @@ -252,13 +252,13 @@ public: bool IsConstantTensor() const { return fConstant;} // query if tensor needs to be written in a weight file. Constant tensors are not written in a file bool IsWeightTensor() const { return !fConstant && !fIsNotWritable;} - // check if a Tensor is Writable (need to be written in teh file or in the generated code (e.g. as a costant tensor) + // check if a Tensor is Writable (need to be written in the file or in the generated code (e.g. 
as a constant tensor) // if an initialized tensors is used in a constant operator at compile time does not need to be written and can be omitted in // the generated code bool IsNotWritable() const { return fIsNotWritable; } // set not writable initialized tensors - i.e. tensor that must not be written in a file void SetNotWritable() { fIsNotWritable = true;} - // set as constant (needed for non-flot initialized tensors) + // set as constant (needed for non-float initialized tensors) void SetConstant() { fConstant = true;} template diff --git a/tmva/sofie/src/RFunction.cxx b/tmva/sofie/src/RFunction.cxx index a6df8dcb43e61..505d84187ca9a 100644 --- a/tmva/sofie/src/RFunction.cxx +++ b/tmva/sofie/src/RFunction.cxx @@ -26,7 +26,7 @@ RFunction_Update::RFunction_Update(FunctionTarget target, GraphType gType): fTar throw std::runtime_error("Invalid target for Update function"); } fType = FunctionType::UPDATE; - function_block = std::make_unique(fFuncName); + fFunction_block = std::make_unique(fFuncName); if(fGraphType == GraphType::GNN) { if(fTarget == FunctionTarget::EDGES) { @@ -49,25 +49,23 @@ RFunction_Update::RFunction_Update(FunctionTarget target, GraphType gType): fTar // add input tensors, order of provided shapes must be the same as in fInputTensors void RFunction_Update::AddInputTensors(const std::vector>& inputShapes) { for(long unsigned int i=0; iAddInputTensorInfo(fInputTensors[i],ETensorType::FLOAT, inputShapes[i]); - function_block->AddInputTensorName(fInputTensors[i]); + fFunction_block->AddInputTensorInfo(fInputTensors[i],ETensorType::FLOAT, inputShapes[i]); + fFunction_block->AddInputTensorName(fInputTensors[i]); } } void RFunction_Update::AddInputTensors(const std::vector>& inputShapes) { for(long unsigned int i=0; iAddInputTensorInfo(fInputTensors[i],ETensorType::FLOAT, inputShapes[i]); - function_block->AddInputTensorName(fInputTensors[i]); + fFunction_block->AddInputTensorInfo(fInputTensors[i],ETensorType::FLOAT, inputShapes[i]); + fFunction_block->AddInputTensorName(fInputTensors[i]); } } -std::string RFunction_Update::GenerateModel(const std::string& filename, long read_pos, long block_size) { - function_block->SetFilename(filename); +std::string RFunction_Update::GenerateModel(const std::string& filename, long read_pos, long block_size, bool verbose) { + fFunction_block->SetFilename(filename); // use batch size as block size in RModel::generate - function_block->PrintRequiredInputTensors(); - function_block->PrintDynamicTensors(); - function_block->Generate(Options::kGNNComponent,block_size,read_pos); + fFunction_block->Generate(Options::kGNNComponent,block_size,read_pos, verbose); std::string modelGenerationString; - modelGenerationString = "\n//--------- GNN_Update_Function---"+fFuncName+"\n"+function_block->ReturnGenerated(); + modelGenerationString = "\n//--------- GNN_Update_Function---"+fFuncName+"\n"+fFunction_block->ReturnGenerated(); return modelGenerationString; } diff --git a/tmva/sofie/src/RFunction_MLP.cxx b/tmva/sofie/src/RFunction_MLP.cxx index 32148cae36794..c41135de49902 100644 --- a/tmva/sofie/src/RFunction_MLP.cxx +++ b/tmva/sofie/src/RFunction_MLP.cxx @@ -20,9 +20,9 @@ RFunction_MLP::RFunction_MLP(FunctionTarget target, Int_t numLayers, Activation throw std::runtime_error("TMVA SOFIE GNN doesn't currently supports the provided activation function for " + fFuncName + " update."); } - function_block->AddOutputTensorNameList({fFuncName + "Relu" + std::to_string(fNumLayers)}); + fFunction_block->AddOutputTensorNameList({fFuncName + "Relu" + 
std::to_string(fNumLayers)}); } else { - function_block->AddOutputTensorNameList({fFuncName + "Gemm" + std::to_string(fNumLayers)}); + fFunction_block->AddOutputTensorNameList({fFuncName + "Gemm" + std::to_string(fNumLayers)}); } } @@ -32,7 +32,7 @@ void RFunction_MLP::Initialize() { if(fGraphType == GraphType::GNN) { std::unique_ptr op_concat; op_concat.reset(new ROperator_Concat(fInputTensors,1,0,fFuncName+"InputConcat")); - function_block->AddOperator(std::move(op_concat)); + fFunction_block->AddOperator(std::move(op_concat)); fGemmInput = fFuncName+"InputConcat"; } else if(fGraphType == GraphType::GraphIndependent) { @@ -43,24 +43,24 @@ void RFunction_MLP::Initialize() { for(int i=0; i(1.0,beta,0,0,fGemmInput,UTILITY::Clean_name(fKernelTensors[i]),UTILITY::Clean_name(fBiasTensors[i]),fFuncName+"Gemm"+std::to_string(i))); - function_block->AddOperator(std::move(op_gemm)); + fFunction_block->AddOperator(std::move(op_gemm)); fGemmInput = fFuncName+"Gemm"+i; if (fActivationFunction == Activation::RELU) { std::unique_ptr op_relu; op_relu.reset(new ROperator_Relu(fFuncName+"Gemm"+std::to_string(i), fFuncName+"Relu"+std::to_string(i))); - function_block->AddOperator(std::move(op_relu)); + fFunction_block->AddOperator(std::move(op_relu)); fGemmInput = fFuncName+"Relu"+i; } } double beta = (fBiasTensors.back().empty()) ? 0. : 1.; op_gemm.reset(new ROperator_Gemm(1.0,beta,0,0,fGemmInput,UTILITY::Clean_name(fKernelTensors.back()),UTILITY::Clean_name(fBiasTensors.back()),fFuncName+"Gemm"+std::to_string(fNumLayers))); - function_block->AddOperator(std::move(op_gemm)); + fFunction_block->AddOperator(std::move(op_gemm)); if(fActivateFinal) { if (fActivationFunction == Activation::RELU) { std::unique_ptr op_relu; op_relu.reset(new ROperator_Relu(fFuncName+"Gemm"+std::to_string(fNumLayers), fFuncName+"Relu"+std::to_string(fNumLayers))); - function_block->AddOperator(std::move(op_relu)); + fFunction_block->AddOperator(std::move(op_relu)); } } @@ -68,7 +68,7 @@ void RFunction_MLP::Initialize() { if(fAddlOp.size()) { for(auto &i:fAddlOp) { std::unique_ptr tmp(i); - function_block->AddOperator(std::move(tmp)); + fFunction_block->AddOperator(std::move(tmp)); } } } diff --git a/tmva/sofie/src/RModel.cxx b/tmva/sofie/src/RModel.cxx index 2f80138265ee7..d6c2e31a20893 100644 --- a/tmva/sofie/src/RModel.cxx +++ b/tmva/sofie/src/RModel.cxx @@ -164,6 +164,7 @@ void RModel::AddOperator(std::unique_ptr op, int order_execution) { fOperators.insert(fOperators.begin() + order_execution, std::move(op)); } else { fOperators.push_back(std::move(op)); + order_execution = fOperators.size()-1; } // storing the last usage of tensors which are input to @@ -812,6 +813,11 @@ void RModel::GenerateDynamicTensorInfo() if (fDynamicTensorInfos.empty()) return; + if (fVerbose) { + std::cout << "generating code for dynamic tensor management" << std::endl; + PrintDynamicTensors(); + } + std::stringstream out; out << "// dynamic tensor memory management\n"; out << SP << "std::vector dynamicTensorInfos;\n"; @@ -1387,7 +1393,7 @@ long RModel::WriteInitializedTensorsToFile(std::string filename) { } } -void RModel::PrintRequiredInputTensors() { +void RModel::PrintRequiredInputTensors() const { std::cout << "Model requires following inputs:\n"; for (auto& inputInfo: fInputTensorInfos) { std::cout << "Parametrised Tensor name: " << inputInfo.first << "\t"; @@ -1417,7 +1423,7 @@ void RModel::PrintRequiredInputTensors() { std::cout << "\n"; } -void RModel::PrintInitializedTensors() { +void RModel::PrintInitializedTensors() const { std::cout << 
"Model initialized the following tensors:\n"; for (auto& it: fInitializedTensors) { std::cout << "Tensor name: \"" << it.first << "\"\t"; @@ -1435,7 +1441,7 @@ void RModel::PrintInitializedTensors() { std::cout << "\n"; } -void RModel::PrintIntermediateTensors() { +void RModel::PrintIntermediateTensors() const { std::cout << "Model specify the following intermediate tensors:\n"; for (auto& it: fIntermediateTensorInfos) { std::cout << "Tensor name: \"" << it.first << "\"\t"; @@ -1450,7 +1456,7 @@ void RModel::PrintIntermediateTensors() { std::cout << "\n"; } -void RModel::PrintDynamicTensors() { +void RModel::PrintDynamicTensors() const { std::cout << "Model specify the following dynamic tensors:\n"; for (auto& it: fDynamicTensorInfos) { std::cout << "Tensor name: \"" << it.first << "\"\t"; @@ -1465,14 +1471,16 @@ void RModel::PrintDynamicTensors() { std::cout << "\n"; } -void RModel::PrintOutputTensors() { +void RModel::PrintOutputTensors() const { std::cout << "Model specify the following output tensors:\n"; for (auto& it: fOutputTensorNames) { std::cout << "Tensor name: \"" << it << "\"\t"; - if (!IsDynamicTensor(it)) - std::cout << "shape: " << ConvertShapeToString(GetTensorShape(it)) << std::endl; - else - std::cout << "shape: " << ConvertShapeToString(GetDynamicTensorShape(it)) << std::endl; + try { + auto shape = GetDimTensorShape(it); + std::cout << "with shape: " << ConvertShapeToString(shape) << std::endl; + } catch (...) { + std::cout << "with shape not yet defined" << std::endl; + } } std::cout << "\n"; } From 55fb3e7eb17f095a2960e384fe68afe89e528083 Mon Sep 17 00:00:00 2001 From: moneta Date: Mon, 5 Jan 2026 18:36:47 +0100 Subject: [PATCH 07/12] [tmva][sofie] Apply fixes for the TestCustiomModelsFrom ROOT SOme fixes are needed for the test, since the session is not used for this tests. Need also to force using Session in case of Dynamic tensors Fix also a warning in Gemm operator and RModel --- tmva/sofie/inc/TMVA/ROperator_Conv.hxx | 2 +- tmva/sofie/inc/TMVA/ROperator_Gemm.hxx | 13 +++++++------ tmva/sofie/src/RModel.cxx | 16 ++++++++++++---- tmva/sofie/test/TestCustomModelsFromROOT.cxx | 6 ++++-- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/tmva/sofie/inc/TMVA/ROperator_Conv.hxx b/tmva/sofie/inc/TMVA/ROperator_Conv.hxx index 823e7fa04717e..87d1ad0a0bf67 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Conv.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Conv.hxx @@ -350,8 +350,8 @@ public: out << SP << SP << "float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" << fNB << ", " << ConvertShapeToString(shape) << ", " << ConvertShapeToString(fShapeY) << ");\n"; out << SP << SP << "fTensor_" << fNB << ".resize(" << length << ");\n"; + out << SP << SP << "std::copy(data, data + " << length << ", fTensor_" << fNB << ".begin());\n"; out << SP << SP << "tensor_" << fNB << " = fTensor_" << fNB << ".data();\n"; - out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNB << ");\n"; out << SP << SP << "delete[] data;\n"; out << SP << "}\n"; } diff --git a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx index 47bc5392fede4..9f911756196a8 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx @@ -107,6 +107,7 @@ namespace SOFIE{ if (input[0].size() > 2 && input[1].size() == input[0].size()) { // in case of dim > 2 first dimensions are equal to the input ones not // equal to 1 (e.g. 
(1,2,3) * (2,3,4) -> (2,2,4)) + // here could probably use the Broadcasting function UTILITY::MultidirectionalBroadcastShape for (size_t i = 0; i < input[0].size()-2; i++) { Dim valueA = input[0][i]; Dim valueB = input[1][i]; @@ -311,8 +312,8 @@ namespace SOFIE{ << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) << ");\n"; out << SP << SP << "fTensor_" << fNC << ".resize(" << length << ");\n"; + out << SP << SP << "std::copy(data, data + " << length << ", fTensor_" << fNC << ".begin());\n"; out << SP << SP << "tensor_" << fNC << " = fTensor_" << fNC << ".data();\n"; - out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNC << ");\n"; out << SP << SP << "delete [] data;\n"; out << SP << "}\n"; } @@ -341,12 +342,12 @@ namespace SOFIE{ // size of B n*k std::vector sY = {fShapeY[dimY-2], fShapeY[dimY-1]}; // extra dimensions in case of stacked MatMul - std::vector sA; + std::vector sExtraY; for (int64_t i = 0; i < dimY-2; i++) { - sA.push_back(fShapeY[i]); + sExtraY.push_back(fShapeY[i]); } auto lengthGemm = ConvertDimShapeToLength(sY); // size of the Gemm operation - auto lengthExtra = ConvertDimShapeToLength(sA); // extra length in case input tensors are of dim>2 (MatMul) + auto lengthExtra_Y = ConvertDimShapeToLength(sExtraY); // extra length in case input tensors are of dim>2 (MatMul) // case bias is present if (!fNC.empty()){ @@ -372,7 +373,7 @@ namespace SOFIE{ // include MatMul case where we stack the Gemm operations // exclude case where we have only 1's in the additional dims - bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra) > 1); + bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra_Y) > 1); // compute input offset for stack multiplications std::string lengthExtra_A; std::string lengthExtra_B; @@ -398,7 +399,7 @@ namespace SOFIE{ out << SP << "size_t " << opName << "_A_offset = 0;\n"; if (extraB) out << SP << "size_t " << opName << "_B_offset = 0;\n"; - out << SP << "for (size_t i = 0; i < " << lengthExtra << "; i++){\n"; + out << SP << "for (size_t i = 0; i < " << lengthExtra_Y << "; i++){\n"; out << SP; } diff --git a/tmva/sofie/src/RModel.cxx b/tmva/sofie/src/RModel.cxx index d6c2e31a20893..dbd9c3666e01f 100644 --- a/tmva/sofie/src/RModel.cxx +++ b/tmva/sofie/src/RModel.cxx @@ -613,8 +613,8 @@ void RModel::Initialize(const std::map & inputParams, bool // support for the time being only weight of FLOAT type if (fUseWeightFile) { bool modelHasWeights = false; - for (auto &i : fInitializedTensors) { - if (i.second.IsWeightTensor()) { + for (auto &it : fInitializedTensors) { + if (it.second.IsWeightTensor()) { modelHasWeights = true; break; } @@ -664,7 +664,8 @@ std::string GenerateConstantTensorCode(const std::pair 100) ? false : true; + // also for weights which can be broadcasted do not use stack but allocate as a std::vector + bool allocateOnStack = (length > 100 || t.second.IsWeightTensor()) ? 
false : true; const T *data = t.second.data(); @@ -687,7 +688,7 @@ std::string GenerateConstantTensorCode(const std::pair options, int batchSize, lo // initialize the model including all operators and sub-graphs Initialize(batchSize, verbose); + // if having dynamic tensor we need to have a Session + if (!fDynamicTensorInfos.empty()) { + fUseSession = true; + if (verbose) + std::cout << "Warning: Force having a Session since model has dynamic tensors " << std::endl; + } + std::string hgname; if (!fIsGNNComponent && !fIsSubGraph) { fGC.clear(); diff --git a/tmva/sofie/test/TestCustomModelsFromROOT.cxx b/tmva/sofie/test/TestCustomModelsFromROOT.cxx index d077aede3e2e6..7e3c8c9c2fc09 100644 --- a/tmva/sofie/test/TestCustomModelsFromROOT.cxx +++ b/tmva/sofie/test/TestCustomModelsFromROOT.cxx @@ -891,7 +891,8 @@ TEST(ROOT, RangeFloat) { float start = 1.; float limit = 10.; float delta = 2.; - std::vector output = TMVA_SOFIE_RangeFloat::infer(&start, &limit, &delta); + TMVA_SOFIE_RangeFloat::Session s("",5); + std::vector output(s.infer(&start, &limit, &delta)); // Checking the output size EXPECT_EQ(output.size(), sizeof(RangeFloat_ExpectedOutput::outputs) / sizeof(float)); @@ -909,7 +910,8 @@ TEST(ROOT, RangeInt) { int64_t start = 1; int64_t limit = 10; int64_t delta = 2; - std::vector output = TMVA_SOFIE_RangeInt::infer(&start, &limit, &delta); + TMVA_SOFIE_RangeInt::Session s("",5); + std::vector output(s.infer(&start, &limit, &delta)); // Checking the output size EXPECT_EQ(output.size(), sizeof(RangeInt_ExpectedOutput::outputs) / sizeof(int64_t)); From 4b87a64e41794d425de06ec55a9d02ea792076fb Mon Sep 17 00:00:00 2001 From: moneta Date: Wed, 7 Jan 2026 13:28:30 +0100 Subject: [PATCH 08/12] [tmva][sofie] Sort in alphabetical order the shape parameter in Session ctor Do an alphabetical order of Session shape parameters for dynamic tensors, otherwise they may get a random order. 
Observed different order on different platforms. Add some small improvements in the generated code (add number and shape information) when generating Gemm code --- tmva/sofie/inc/TMVA/ROperator_BasicBinary.hxx | 2 +- tmva/sofie/inc/TMVA/ROperator_Gemm.hxx | 3 ++- tmva/sofie/src/RModel.cxx | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tmva/sofie/inc/TMVA/ROperator_BasicBinary.hxx b/tmva/sofie/inc/TMVA/ROperator_BasicBinary.hxx index 1c4f20363ebe2..491b669554118 100644 --- a/tmva/sofie/inc/TMVA/ROperator_BasicBinary.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_BasicBinary.hxx @@ -192,7 +192,7 @@ public: dataY[i] = BinaryOperatorTrait::Func(dataA[i], dataB[i]); } model.AddConstantTensor(fNY, fShapeY, dataY.data()); - // flag tensors to not be written in the weight file + // flag tensors to not be written in the generated code or weight file model.SetNotWritableInitializedTensor(nameA); model.SetNotWritableInitializedTensor(nameB); fIsOutputConstant = true; diff --git a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx index 9f911756196a8..2c2df2aa37830 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx @@ -327,7 +327,8 @@ namespace SOFIE{ throw std::runtime_error("TMVA SOFIE Gemm Op called to Generate without being initialized first"); } std::stringstream out; - out << "\n//--------- Gemm\n"; + out << "\n//--------- Gemm " << opName << " " << ConvertShapeToString(fShapeA) << " * " << ConvertShapeToString(fShapeB) + << " -> " << ConvertShapeToString(fShapeY) << "\n"; // need to consider case A and B have dim > 2 (for MatMul) int64_t dimA = fShapeA.size(); int64_t dimB = fShapeB.size(); diff --git a/tmva/sofie/src/RModel.cxx b/tmva/sofie/src/RModel.cxx index dbd9c3666e01f..089e656fedbd1 100644 --- a/tmva/sofie/src/RModel.cxx +++ b/tmva/sofie/src/RModel.cxx @@ -1087,6 +1087,8 @@ void RModel::GenerateSessionCode() // add initialization of shape parameters // assume all parameters are of type size_t if (!fDimShapeNames.empty()) { + // sort first the shape parameters in alphabetical order to avoid a random order + std::sort(fDimShapeNames.begin(), fDimShapeNames.end() ); for (auto &p : fDimShapeNames) { fGC += ",\n"; fGC += " size_t " + p + " = " + fShapeParams[p]; From b95b9a7b989fa0158d71b9b662178cd0ac7110bd Mon Sep 17 00:00:00 2001 From: moneta Date: Thu, 8 Jan 2026 17:03:06 +0100 Subject: [PATCH 09/12] [tmva][sofie] Do not perform broadcast of bias tensor in Gemm in Session ctor Avoid creating a broadcasted bias tensor which uses lots of memory. Do broadcasting of the bias on the fly before computing Gemm by using the output tensor.
This saves a large amount of memory on models using large Gemm calss like the atlas GNN model used for tracking --- tmva/sofie/inc/TMVA/ROperator_Gemm.hxx | 82 +++++++++++++++----------- tmva/sofie/src/SOFIE_common.cxx | 11 ++-- 2 files changed, 55 insertions(+), 38 deletions(-) diff --git a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx index 2c2df2aa37830..a18914b8892a8 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx @@ -219,7 +219,7 @@ namespace SOFIE{ fShapeC = model.GetTensorShape(fNC); size_t lengthC = ConvertShapeToLength(fShapeC); size_t lengthY = ConvertShapeToLength(shapeY); - // for dynamic outputs broadcasting is always done + // for dynamic outputs broadcasting is always needed bool broadcast_needed = false; if (fIsDynamic && shapeY.empty()) broadcast_needed = true; @@ -229,34 +229,21 @@ namespace SOFIE{ if (broadcast_needed) { fBroadcastBias = true; - if (!model.UseSession()) { - // without session dynamic tensors not supported in Gemm - if (fIsDynamic) { - throw std::runtime_error("TMVA SOFIE Gemm Op: dynamic tensors not supported without a session"); - } - auto original_data = model.GetInitializedTensorData(fNC); - auto targetShape = UTILITY::UnidirectionalBroadcastShape(fShapeC, shapeY); - if (fType == "float") { - std::shared_ptr new_data_ptr(UTILITY::UnidirectionalBroadcast( - static_cast(original_data.get()), fShapeC, targetShape), - std::default_delete()); - - model.UpdateInitializedTensor(fNC, model.GetTensorType(fNC), shapeY, new_data_ptr); - fShapeC = shapeY; - } - } else { - // /d to add a new intermediate tensor for broadcasted bias tensor - // fNC2 = fNC + "bcast"; - // if (!fIsDynamic) { - // model.AddIntermed/ In case of session add broadcasting code in Session constructor and in GenerateInitCode - // // we neeiateTensor(fNC2, model.GetTensorType(fNC), shapeY); - // } - // else - // model.AddDynamicTensor(fNC2,model.GetTensorType(fNC), fShapeY); - // // do not add to lists of input/output tensors since broadcasted tensors are special - // // and we manage their memory separatly - // //fInputTensorNames.emplace_back(fNC2); - // //fOutputTensorNames.emplace_back(fNC2); + // check if broadcasting is compatible and note that prepend 1 to shapeC + auto shapeDimC = ConvertShapeToDim(fShapeC); + auto r = UTILITY::MultidirectionalBroadcastShape(fShapeY, shapeDimC); + // return flag must be equal to 1 since this is a unidirectional broadcast of C->Y + if (r.first > 1) { + throw std::runtime_error("TMVA SOFIE Gemm Op - bias tensor of shape " + ConvertShapeToString(fShapeC) + " cannot be uni-directional broadcasted to " + ConvertDimShapeToString(fShapeY)); + } + fShapeC = ConvertShapeToInt(shapeDimC); + if (fShapeC.empty()) { + throw std::runtime_error("TMVA SOFIE Gemm Op - Error in bias tensor " + ConvertDimShapeToString(shapeDimC) ); + } + } else { + // for the case lengthY == lengthC but shape is different (e.g. Y is (2,3) and is (6)) + if (shapeY != fShapeC) { + throw std::runtime_error("TMVA SOFIE Gemm Op: invalid shape for bias tensor " + ConvertShapeToString(fShapeC)); } } } @@ -294,6 +281,7 @@ namespace SOFIE{ std::string GenerateInitCode() override { std::stringstream out; // generate initialization code for broadcasting of bias tensor +#if 0 if (fShapeC.size() != fShapeY.size() && fBroadcastBias) { // we broadcast here always C in Y output, so target shape is the one of Y // no need to call UTILITY::UnidirectionalBroadcastShape. 
@@ -317,6 +305,7 @@ namespace SOFIE{ out << SP << SP << "delete [] data;\n"; out << SP << "}\n"; } +#endif return out.str(); } @@ -403,6 +392,33 @@ namespace SOFIE{ out << SP << "for (size_t i = 0; i < " << lengthExtra_Y << "; i++){\n"; out << SP; } + // do the bias broadcasting + if (fBroadcastBias) { + out << SP << "for (size_t j = 0; j < " << sY[0] << "; j++) { \n"; + out << SP << SP << "size_t y_index = "; + if (doStackMul) // add offset in caseof stack multiplications (not sure if bias is present in these cases) + out << opName << "_y_offset + "; + if (sY[1].GetVal() != "1") + out << sY[1] << " * j;\n"; + else + out << "j;\n"; + + out << SP << SP << "for (size_t k = 0; k < " << sY[1] << "; k++) { \n"; + std::string bias_index; + if (fShapeC[0] == 1 && fShapeC[1] == sY[1].dim) + bias_index = "k"; + else if (fShapeC[1] == 1 && fShapeC[0] == sY[0].dim) + bias_index = "j"; + else if (fShapeC[0] == 1 && fShapeC[1] == 1) // scalar case + bias_index = "0"; + else { + throw std::runtime_error("TMVA SOFIE Gemm Op - invalid shape for bias tensor " + ConvertShapeToString(fShapeC)); + } + + out << SP << SP << SP << "tensor_" << fNY << "[y_index + k] = " << "tensor_" << fNC << "[" << bias_index << "];\n"; + out << SP << SP << "}\n"; + out << SP << "}\n"; + } if (fType == "float"){ @@ -418,12 +434,12 @@ namespace SOFIE{ out << ", tensor_" << fNA; if (extraA) out << " + " << opName << "_A_offset"; out << ", " << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ","; - // in the case of bias - if (!fNC.empty()) + // in the case of bias and no broadcasting needed + if (!fNC.empty() && !fBroadcastBias) out << "tensor_" << fNC; - else + else out << "nullptr"; - out << ");\n"; + out << ");\n"; if(fActivation == EActivationType::RELU){ out << SP << "for (int id = 0; id < " << ConvertDimShapeToLength(fShapeY) << " ; id++){\n"; diff --git a/tmva/sofie/src/SOFIE_common.cxx b/tmva/sofie/src/SOFIE_common.cxx index 1ff510842643a..f659d0e1a2fe6 100644 --- a/tmva/sofie/src/SOFIE_common.cxx +++ b/tmva/sofie/src/SOFIE_common.cxx @@ -414,14 +414,15 @@ std::pair> UTILITY::MultidirectionalBroadcastShape(std + " to a common shape."); } } -// unidirectional broadcast- only B changes +// unidirectional broadcast- of shape A to target B std::vector UTILITY::UnidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) { - auto ret = UTILITY::MultidirectionalBroadcastShape(shapeA, shapeB); + auto ret = UTILITY::MultidirectionalBroadcastShape(shapeB, shapeA); if (ret.first > 1) { - std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape " - + ConvertShapeToString(shapeA) + " and " + ConvertShapeToString(shapeB) - + " to a common shape."); + throw + std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape " + + ConvertShapeToString(shapeA) + " to " + ConvertShapeToString(shapeB) + + " in a common shape."); } return ret.second; } From 6282757cc6b3d5afb60e311a5abe7e87cffbf98a Mon Sep 17 00:00:00 2001 From: moneta Date: Thu, 8 Jan 2026 19:09:39 +0100 Subject: [PATCH 10/12] [tmva][sofie] Add alias tensors Add alias tensors to cope with identity operators. In this case just a pointer assignment is performed by the operator. 
Exclude these tensors in the allocation and take care of them in the dynamic memory pool. Optimise Slice operator when slice is an identity and also ScatterElements --- tmva/sofie/inc/TMVA/RModel.hxx | 5 ++ tmva/sofie/inc/TMVA/ROperator_Constant.hxx | 8 ++- .../inc/TMVA/ROperator_ScatterElements.hxx | 23 +++++- tmva/sofie/inc/TMVA/ROperator_Slice.hxx | 36 +++++++++- tmva/sofie/src/RModel.cxx | 71 +++++++++++++------ tmva/sofie/src/SOFIE_common.cxx | 2 +- 6 files changed, 118 insertions(+), 27 deletions(-) diff --git a/tmva/sofie/inc/TMVA/RModel.hxx b/tmva/sofie/inc/TMVA/RModel.hxx index 2a68bcb3593d3..13d95935d9600 100644 --- a/tmva/sofie/inc/TMVA/RModel.hxx +++ b/tmva/sofie/inc/TMVA/RModel.hxx @@ -30,6 +30,7 @@ private: std::unordered_map fDynamicTensorInfos; std::unordered_map, bool>> fShapeTensors; // constant tensors describing a shape std::unordered_map fShapeParams; // parameters defining the dynamic shape (e.g. batch size), store also its default value + std::unordered_map fAliasTensors; // list of alias tensors std::vector fDimShapeNames; // parameter names used to define the shapes std::vector fOutputTensorNames; std::vector fInputTensorNames; // input tensor names using ONNX order @@ -82,6 +83,8 @@ public: void AddConstantTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data); + void AddAliasTensor(const std::string & tensor_name, const std::string & orig_tensor_name); + template void AddConstantTensor(const std::string & name, const std::vector & shape, const T * data) { @@ -130,6 +133,8 @@ public: bool IsReadyInputTensor(const std::string &name) const; /// check if a tensor is a shape tensor bool IsShapeTensor(const std::string & name) const; + /// check if a tensor is a alias tensor + bool IsAliasTensor(const std::string & name) const; // Add intermediate tensor void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector dim_shape); diff --git a/tmva/sofie/inc/TMVA/ROperator_Constant.hxx b/tmva/sofie/inc/TMVA/ROperator_Constant.hxx index 93f3c43feceb9..7c824f1abe6e3 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Constant.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Constant.hxx @@ -123,8 +123,12 @@ public: if (model.Verbose()) { std::cout << "adding constant tensor " << fNY << " with shape " << ConvertShapeToString(fShape) << " and values ["; - for (auto v : fValues) std::cout << " " << v; - std::cout << "]" << std::endl; + if (!fIsConstantOfShape) { + for (auto v : fValues) std::cout << " " << v; + std::cout << "]" << std::endl; + } else { // for constant of shape is enough to print one value + std::cout << "... " << fValues[0] << " ....]" << std::endl; + } } } else { model.AddIntermediateTensor(fNY, ConvertStringToType(TensorType::Name()), fDimOutputShape); diff --git a/tmva/sofie/inc/TMVA/ROperator_ScatterElements.hxx b/tmva/sofie/inc/TMVA/ROperator_ScatterElements.hxx index 626debd13038e..2525ea32629df 100644 --- a/tmva/sofie/inc/TMVA/ROperator_ScatterElements.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_ScatterElements.hxx @@ -136,6 +136,17 @@ public: return strst.str(); }; + auto tensorIndexOpt = [](const std::vector & sdx, const std::vector & idx) { + std::stringstream strst; + int dims = idx.size(); + for (int i = 0; i < dims-1; i++) { + strst << sdx[i]; + strst << " + "; + } + strst << idx[dims-1]; + return strst.str(); + }; + // copy first input in output (maybe can be avoided??)
out << SP << "std::copy(tensor_" << fNX << ", tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n"; @@ -143,14 +154,24 @@ public: // loop on tensor rank int dims = fShapeY.size(); std::vector idx(dims); + std::vector sdx(dims); // stride for indices for (int i = 0; i < dims; i++) { idx[i] = std::string("i") + std::to_string(i); + sdx[i] = std::string("s") + std::to_string(i); for (int j = 0; j <= i; j++) out << SP; out << "for (int " << idx[i] << " = 0; " << idx[i] << " < " << fShapeI[i] << "; " << idx[i] << "++) {\n"; + if (i < dims-1) { + for (int j = 0; j <= i+1 ; j++) out << SP; + if (strideI[i].GetVal() != "1") + out << "int "<< sdx[i] << " = " << strideI[i] << " * " << idx[i] << ";\n"; + else + out << "int "<< sdx[i] << " = " << idx[i] << ";\n"; + } } // correct index for specific axis for (int j = 0; j <= dims; j++) out << SP; - out << "int updateIndex = " << tensorIndex(strideI,idx) << ";\n"; + // can use optimised formula for indices since the loop above is on fShapeI + out << "int updateIndex = " << tensorIndexOpt(sdx,idx) << ";\n"; for (int j = 0; j <= dims; j++) out << SP; out << "int iAxis = tensor_" << fNI << "[updateIndex];\n"; for (int j = 0; j <= dims; j++) out << SP; diff --git a/tmva/sofie/inc/TMVA/ROperator_Slice.hxx b/tmva/sofie/inc/TMVA/ROperator_Slice.hxx index 3add774b0d8d4..4e3c1319bd772 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Slice.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Slice.hxx @@ -25,6 +25,7 @@ private: bool fIsStartUndef = false; bool fIsEndUndef = false; bool fIsStepUndef = false; + bool fIdentitySlice = false; std::string fNData; // input data tensor name std::string fNOutput; // output data name std::vector fNames; // tensor names for meta(axis) information @@ -332,10 +333,25 @@ public: } } else { + // check if Slice is just an Identity operator in case start = 0, end = input_shape and step=1 + size_t ndim = fShapeInput.size(); + fIdentitySlice = fShapeOutput.size() == ndim; + for (size_t idim = 0; idim < ndim; idim++) { + if (!fIdentitySlice) break; + fIdentitySlice &= (fStart[idim].GetVal() == "0"); + fIdentitySlice &= (fSteps[idim].GetVal() == "1"); + fIdentitySlice &= (fEnd[idim].GetVal() == fShapeOutput[idim].GetVal()); + } + model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput); + if (fIdentitySlice) model.AddAliasTensor(fNOutput, fNData); + if (model.Verbose()) { std::cout << "Slice " << fNData << " " << ConvertShapeToString(fShapeInput) - << "---> " << fNOutput << " " << ConvertShapeToString(fShapeOutput) << std::endl; + << "---> " << fNOutput << " " << ConvertShapeToString(fShapeOutput); + if (fIdentitySlice) std::cout << " (using alias tensor since slice is an identity) "; + std::cout << std::endl; + } } } @@ -351,8 +367,24 @@ public: out << "///------- Slice operator " << opName << "---> " << fNOutput << " " << ConvertDimShapeToString(fShapeOutput) << "\n" << std::endl; if (fIsOutputConstant) return out.str(); //no op for constant tensors - // loop on the dimensions depending no the orders + size_t ndim = fShapeInput.size(); + // check if Slice is just an Identity operator in case start = 0, end = input_shape and step=1 + bool identitySlice = fShapeInput.size() == fShapeOutput.size(); + for (size_t idim = 0; idim < ndim; idim++) { + if (!identitySlice) break; + identitySlice &= (fStart[idim].GetVal() == "0"); + identitySlice &= (fSteps[idim].GetVal() == "1"); + identitySlice &= (fEnd[idim].GetVal() == fShapeOutput[idim].GetVal()); + } + + if (identitySlice) { + out << "/// Slice is just an identity 
(copy pointers) \n"; + out << SP << "tensor_" << fNOutput << " = tensor_" << fNData << ";\n"; + return out.str(); + } + + // loop on the dimensions depending no the orders auto strides = UTILITY::ComputeStrideFromShape(fShapeInput); diff --git a/tmva/sofie/src/RModel.cxx b/tmva/sofie/src/RModel.cxx index 089e656fedbd1..5e22f48b8f2be 100644 --- a/tmva/sofie/src/RModel.cxx +++ b/tmva/sofie/src/RModel.cxx @@ -167,8 +167,8 @@ void RModel::AddOperator(std::unique_ptr op, int order_execution) { order_execution = fOperators.size()-1; } - // storing the last usage of tensors which are input to - // operators (but are not inputs to the model or they are not initialized) + // storing the last usage of tensors which are input to the operator + // (excluding tensors which are inputs to the model or the initialized (weights) tensors) // We call this function during parsing so we don't have yet initialized the operators for(size_t index = 0; index & s fShapeTensors[tensor_name] = std::make_pair(shape_values, scalar); } +void RModel::AddAliasTensor(const std::string & name, const std::string & origin){ + // add an alias tensor to origin + auto tensor_name = UTILITY::Clean_name(name); + auto origin_name = UTILITY::Clean_name(origin); + if (fAliasTensors.count(tensor_name) != 0) { + throw std::runtime_error("TMVA-SOFIE: alias tensor with name " + tensor_name + " already exists \n"); + } + fAliasTensors[tensor_name] = origin_name; +} + bool RModel::IsShapeTensor(const std::string & tensor_name) const { return fShapeTensors.count(tensor_name) != 0; } +bool RModel::IsAliasTensor(const std::string & tensor_name) const { + return fAliasTensors.count(tensor_name) != 0; +} + const std::vector & RModel::GetShapeTensorValues(const std::string & tensor_name) const { //if (!IsShapeTensor(tensor_name) ) return std::vector{}; return fShapeTensors.at(tensor_name).first; @@ -356,6 +370,11 @@ std::string RModel::AllocateIntermediateMemory(std::span fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) continue; + // case of alias tensor + if (IsAliasTensor(name)) { + continue; + } + auto tensor_size = GetTypeSize(GetTensorType(name)) * ConvertShapeToLength(GetTensorShape(name)); // important fill the pair in the ordered output tensors with the string view and not the string TensorMemoryInfo tmi = {it, tensor_size}; @@ -435,9 +454,14 @@ void RModel::CheckAndFlushIntermediateMemory(std::span o chunk != fIntermediateMemoryInfo.available_stack.end(); chunk++) { if (fVerbose) std::cout << "-- free chunk " << chunk->first << " size = " << chunk->second << std::endl; } - for (auto &it : op_input_tensors) { + for (auto &iv : op_input_tensors) { // last occurrence of the tensor is reached => flush it from memory - if (fVerbose) std::cout << ".. input tensors : " << it; + if (fVerbose) std::cout << ".. 
input tensors : " << iv; + + // for alias tensors replace name with its alias + std::string it{iv}; // convert view to string + if (IsAliasTensor(it)) + it = fAliasTensors[it]; if (fIntermediateTensorFrequencyLookup[it] == op_idx) { if (fVerbose) std::cout << " flash condition is met - looping on chunks to find matching one \n"; for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); @@ -623,6 +647,17 @@ void RModel::Initialize(const std::map & inputParams, bool fUseWeightFile = false; } + // update fIntermediateTensorFrequencyLookup for alias tensors + for (auto & it : fAliasTensors) { + if (fIntermediateTensorFrequencyLookup.find(it.first) == fIntermediateTensorFrequencyLookup.end()) continue; + if (fIntermediateTensorFrequencyLookup.find(it.second) == fIntermediateTensorFrequencyLookup.end() ) + fIntermediateTensorFrequencyLookup[it.second] = fIntermediateTensorFrequencyLookup[it.first]; + else { + // take the largest one + fIntermediateTensorFrequencyLookup[it.second] = std::max(fIntermediateTensorFrequencyLookup[it.second],fIntermediateTensorFrequencyLookup[it.first] ); + } + } + fIsInitialized = true; } @@ -737,7 +772,8 @@ void RModel::GenerateIntermediateTensorInfo() { if (!fIntermediateTensorInfos.empty()) { std::string tensor_declaration_block = ""; for (auto &i : fIntermediateTensorInfos) { - if (i.second.type == ETensorType::BOOL) { + bool is_alias = (IsAliasTensor(i.first)); + if (i.second.type == ETensorType::BOOL && !is_alias) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; tensor_declaration_block += "std::uint8_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; continue; @@ -748,7 +784,7 @@ void RModel::GenerateIntermediateTensorInfo() { bool not_in_output_names = (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) == fOutputTensorNames.end()); - if ((not_in_freq_map && not_in_output_names) || (!not_in_freq_map && !is_extended && not_in_output_names)) { + if (((not_in_freq_map && not_in_output_names) || (!not_in_freq_map && !is_extended && not_in_output_names) ) && !is_alias) { size_t length = ConvertShapeToLength(i.second.shape); if (i.second.type == ETensorType::FLOAT) { @@ -767,6 +803,10 @@ void RModel::GenerateIntermediateTensorInfo() { fOtherTensorSize += 8 * length; } } + if (is_alias) { + tensor_declaration_block += ConvertTypeToString(i.second.type) + " * tensor_" + i.first + " = nullptr;\n"; + } + } if (tensor_declaration_block.length()) { @@ -777,19 +817,7 @@ void RModel::GenerateIntermediateTensorInfo() { if (!fDynamicTensorInfos.empty()) { fGC += "//--- declare the dynamic tensors\n"; for (auto &i : fDynamicTensorInfos) { - if (i.second.type == ETensorType::FLOAT) { - //fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "float * tensor_" + i.first + " = nullptr;\n"; - } else if (i.second.type == ETensorType::DOUBLE) { - //fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "double * tensor_" + i.first + " = nullptr;\n"; - } else if (i.second.type == ETensorType::INT64) { - //fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "int64_t * tensor_" + i.first + " = nullptr;\n"; - } else if (i.second.type == ETensorType::BOOL) { - //fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "uint8_t * tensor_" + i.first + " = nullptr;\n"; - } + fGC += ConvertTypeToString(i.second.type) + " * tensor_" + i.first + " = nullptr;\n"; } fGC += "//--- dynamic tensors pool\n"; fGC += "std::vector 
fDynamicMemoryPool;\n"; @@ -835,9 +863,9 @@ void RModel::GenerateDynamicTensorInfo() auto op_ptr = op.get(); std::cout << "Looping on operator " << op_index << " " << typeid(*op_ptr).name() << std::endl; } - // check if is a dynamic tensor + // check if is a dynamic tensor and not an alias tensor std::string name = std::string(it); - if ( fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end() ) { + if ( fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end() && !IsAliasTensor(name)) { auto tensor_size = ConvertDimShapeToLength(GetDimTensorShape(name)); auto type = GetTensorType(name); size_t type_size = GetTypeSize(type); @@ -873,6 +901,7 @@ void RModel::GenerateDynamicTensorInfo() // check that all dynamic tensors are covered bool missingTensor = false; for (auto &i : fDynamicTensorInfos) { + if (IsAliasTensor(i.first)) continue; if (std::find(tensors.begin(), tensors.end(), std::pair{i.first, i.second.type}) == tensors.end()) { std::cout << "Dynamic tensors " << i.first << " is not in list of operator input/output " << std::endl; missingTensor = true; diff --git a/tmva/sofie/src/SOFIE_common.cxx b/tmva/sofie/src/SOFIE_common.cxx index f659d0e1a2fe6..54fed04ba42b1 100644 --- a/tmva/sofie/src/SOFIE_common.cxx +++ b/tmva/sofie/src/SOFIE_common.cxx @@ -132,7 +132,7 @@ std::string ConvertDimShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; for (size_t i = 0; i < shape.size(); i++) { - out << shape[i].GetVal(); + out << shape[i]; if (i < shape.size()-1) out << " , "; } out << " }"; From 08bfc2897c5bfe6b852111dbe4c1743d96483da6 Mon Sep 17 00:00:00 2001 From: Olia Date: Mon, 11 Aug 2025 12:46:25 +0200 Subject: [PATCH 11/12] Time Profiler for Sofie --- tmva/sofie/CMakeLists.txt | 2 + tmva/sofie/inc/TMVA/RModel.hxx | 9 +- tmva/sofie/inc/TMVA/RModelProfiler.hxx | 42 +++++ tmva/sofie/inc/TMVA/RModel_Base.hxx | 1 + tmva/sofie/inc/TMVA/ROperator.hxx | 3 + tmva/sofie/src/RModel.cxx | 49 +++--- tmva/sofie/src/RModelProfiler.cxx | 161 +++++++++++++++++++ tmva/sofie_parsers/src/RModelParser_ONNX.cxx | 9 +- tutorials/machine_learning/TMVA_SOFIE_ONNX.C | 2 +- 9 files changed, 255 insertions(+), 23 deletions(-) create mode 100644 tmva/sofie/inc/TMVA/RModelProfiler.hxx create mode 100644 tmva/sofie/src/RModelProfiler.cxx diff --git a/tmva/sofie/CMakeLists.txt b/tmva/sofie/CMakeLists.txt index c807d1b7b8c27..f56d2350ecadd 100644 --- a/tmva/sofie/CMakeLists.txt +++ b/tmva/sofie/CMakeLists.txt @@ -22,6 +22,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ROOTTMVASofie TMVA/OperatorList.hxx TMVA/RModel_Base.hxx TMVA/RModel.hxx + TMVA/RModelProfiler.hxx TMVA/ROperator.hxx TMVA/ROperator_BasicUnary.hxx TMVA/ROperator_BasicBinary.hxx @@ -77,6 +78,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ROOTTMVASofie SOURCES src/RModel_Base.cxx src/RModel.cxx + src/RModelProfiler.cxx src/RModel_GNN.cxx src/RModel_GraphIndependent.cxx src/RFunction.cxx diff --git a/tmva/sofie/inc/TMVA/RModel.hxx b/tmva/sofie/inc/TMVA/RModel.hxx index 13d95935d9600..a82c58c75b2e2 100644 --- a/tmva/sofie/inc/TMVA/RModel.hxx +++ b/tmva/sofie/inc/TMVA/RModel.hxx @@ -11,16 +11,23 @@ namespace SOFIE { class RModel final : public RModel_Base { + friend class RModelProfiler; + private: bool fIsInitialized = false; bool fIsSubGraph = false; + bool fProfile = false; + int fVerbose = 0; int fBatchSize = -1; long fReadPos = 0; // reading file position + size_t fConstantTensorSize = 0; // size (in Bytes) of the allocated constant tensors size_t fWeightsTensorSize = 0; // size (in Bytes) of the allocated weight tensors size_t fOtherTensorSize = 
0; // size (in Bytes) of intermediate tensors which are not managed by the memory pool + std::string fProfilerGC = ""; + OptimizationLevel fOptimizationLevel = OptimizationLevel::kExtended; std::unordered_map fInputTensorInfos; // input tensors where shape may not fully defined or other graph inputs? @@ -157,7 +164,7 @@ public: void Initialize(int batchSize = -1, bool verbose = false); void Initialize(const std::map & inputParams, bool verbose = false); - void Generate(std::underlying_type_t options, int batchSize = -1, long pos = 0, bool verbose = false); + void Generate(std::underlying_type_t options, int batchSize = -1, long pos = 0, bool verbose = false); void Generate(Options options = Options::kDefault, int batchSize = -1, int pos = 0, bool verbose = false) { Generate(static_cast>(options), batchSize, pos, verbose); diff --git a/tmva/sofie/inc/TMVA/RModelProfiler.hxx b/tmva/sofie/inc/TMVA/RModelProfiler.hxx new file mode 100644 index 0000000000000..fd9c8c7d0267d --- /dev/null +++ b/tmva/sofie/inc/TMVA/RModelProfiler.hxx @@ -0,0 +1,42 @@ +#ifndef TMVA_SOFIE_RMODELPROFILER +#define TMVA_SOFIE_RMODELPROFILER + +#include "TMVA/RModel.hxx" + +namespace TMVA { +namespace Experimental { +namespace SOFIE { + +/// \class RModelProfiler +/// \brief A helper class to generate profiled inference code for an RModel. +/// +/// This class instruments the generated C++ code to measure the execution +/// time of each operator. It is invoked when the RModel::Generate is called +/// with the Options::kProfile flag. +class RModelProfiler { +private: + RModel &fModel; + + void GenerateUtilityFunctions(); + +public: + // The profiler must be constructed with a model to work on. + RModelProfiler() = delete; + RModelProfiler(RModel &model); + ~RModelProfiler() = default; + + // There is no point in copying or moving an RModelProfiler + RModelProfiler(const RModelProfiler &other) = delete; + RModelProfiler(RModelProfiler &&other) = delete; + RModelProfiler &operator=(const RModelProfiler &other) = delete; + RModelProfiler &operator=(RModelProfiler &&other) = delete; + + // Main function to generate the profiled code. + void Generate(); +}; + +} // namespace SOFIE +} // namespace Experimental +} // namespace TMVA + +#endif // TMVA_SOFIE_RMODELPROFILER diff --git a/tmva/sofie/inc/TMVA/RModel_Base.hxx b/tmva/sofie/inc/TMVA/RModel_Base.hxx index 2cbcc6cc8ea41..2ab5dacaac57f 100644 --- a/tmva/sofie/inc/TMVA/RModel_Base.hxx +++ b/tmva/sofie/inc/TMVA/RModel_Base.hxx @@ -26,6 +26,7 @@ enum class Options { kRootBinaryWeightFile = 0x4, kGNN = 0x8, kGNNComponent = 0x10, + kProfile = 0x20, }; // Optimization levels inspired by ONNXRuntime. 
diff --git a/tmva/sofie/inc/TMVA/ROperator.hxx b/tmva/sofie/inc/TMVA/ROperator.hxx index f0afd9c4374c1..200cd3f2976fe 100644 --- a/tmva/sofie/inc/TMVA/ROperator.hxx +++ b/tmva/sofie/inc/TMVA/ROperator.hxx @@ -37,6 +37,9 @@ public: //virtual void Forward_blas() = 0; virtual ~ROperator(){} + std::string name = "UnnamedOperator"; + const std::string &GetOperatorName() { return name; }; + protected: const std::string SP = " "; ///< space used to correctly indent the generated C++ code diff --git a/tmva/sofie/src/RModel.cxx b/tmva/sofie/src/RModel.cxx index 5e22f48b8f2be..32da75fdc045b 100644 --- a/tmva/sofie/src/RModel.cxx +++ b/tmva/sofie/src/RModel.cxx @@ -9,6 +9,7 @@ #endif #include "TMVA/RModel.hxx" +#include "TMVA/RModelProfiler.hxx" #include "TMVA/SOFIE_common.hxx" namespace TMVA { @@ -1061,7 +1062,7 @@ void RModel::GenerateSessionCode() CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx); } - // to check remaining unused fragments after memory allocation (lesser the better) + // to check remaining unused fragments after memory allocation (lesser the better) // for (const auto &it: fIntermediateMemoryInfo.available_stack){ // std::cout<<"chunk_idx: "<GenerateSessionMembersCode(opName); + fGC += fOperators[id]->GenerateSessionMembersCode(opName); } fGC += "\n"; // here add initialization and reading of weight tensors @@ -1143,23 +1144,28 @@ void RModel::GenerateSessionCode() fGC += "}\n\n"; } - fGC += doInferSignature + "{\n"; - fGC += "\n"; + if (fProfile) { + RModelProfiler profiler(*this); + profiler.Generate(); + fGC += fProfilerGC; + } else { + fGC += doInferSignature + "{\n"; + fGC += "\n"; - // generate the inference code - if (fVerbose) - std::cout << "Generating main inference code for " << fName << std::endl; + // generate the inference code + if (fVerbose) + std::cout << "Generating main inference code for " << fName << std::endl; - if (fOutputTensorNames.size() == 0) - throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); + if (fOutputTensorNames.size() == 0) + throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); - for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { - if (fVerbose) + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) std::cout << "Generating code for operator .... 
" << op_idx << std::endl; - fGC += (fOperators[op_idx]->Generate(std::to_string(op_idx))); - } + fGC += (fOperators[op_idx]->Generate(std::to_string(op_idx))); + } - fGC += SP + "using TMVA::Experimental::SOFIE::UTILITY::FillOutput;\n\n"; + fGC += SP + "using TMVA::Experimental::SOFIE::UTILITY::FillOutput;\n\n"; for (std::string const &name : fOutputTensorNames) { // need to check is size is the same (don't want to return a vector with @@ -1170,7 +1176,8 @@ void RModel::GenerateSessionCode() fGC += SP + "FillOutput(tensor_" + name + ", output_tensor_" + name + ", " + n + ");\n"; } - fGC += "}\n\n"; + fGC += "}\n\n"; + } // generate the inference overload that returns an output struct GenerateOutput(); @@ -1183,9 +1190,11 @@ void RModel::GenerateSessionCode() void RModel::Generate(std::underlying_type_t options, int batchSize, long pos, bool verbose) { + bool profile = (options & static_cast>(Options::kProfile)); fVerbose = verbose; fBatchSize = batchSize; fReadPos = pos; + fProfile = profile; // session flag is used in operator initialize if (static_cast>(Options::kNoSession) & options) { @@ -1205,9 +1214,9 @@ void RModel::Generate(std::underlying_type_t options, int batchSize, lo "TMVA-SOFIE: RModel::Generate: cannot use a separate weight file without generating a Session class"); } - if (static_cast>(Options::kGNN) & options) + if (static_cast>(Options::kGNN) & options) fIsGNN = true; - if (static_cast>(Options::kGNNComponent) & options) + if (static_cast>(Options::kGNNComponent) & options) fIsGNNComponent = true; // initialize the model including all operators and sub-graphs @@ -1228,13 +1237,13 @@ void RModel::Generate(std::underlying_type_t options, int batchSize, lo // generate first code for the subgraphs for (auto &graph : fSubGraphs) { - if (fVerbose) + if (fVerbose) std::cout << "generate session code for subgraph " << graph->fName << std::endl; graph->GenerateSessionCode(); fGC += graph->fGC; } - if (fVerbose) + if (fVerbose) std::cout << "generate Main session code - model " << fName << std::endl; // generate main session code diff --git a/tmva/sofie/src/RModelProfiler.cxx b/tmva/sofie/src/RModelProfiler.cxx new file mode 100644 index 0000000000000..76386e6de817d --- /dev/null +++ b/tmva/sofie/src/RModelProfiler.cxx @@ -0,0 +1,161 @@ +#include "TMVA/RModelProfiler.hxx" +#include "TMVA/SOFIE_common.hxx" + +namespace TMVA { +namespace Experimental { +namespace SOFIE { + +// The constructor now just registers the necessary C++ libraries. +RModelProfiler::RModelProfiler(RModel &model) : fModel(model) +{ + fModel.AddNeededStdLib("chrono"); // for timing operators + fModel.AddNeededStdLib("vector"); // for storing profiling results + fModel.AddNeededStdLib("string"); // for operator names + fModel.AddNeededStdLib("map"); // for the results map + fModel.AddNeededStdLib("iostream"); // for printing results + fModel.AddNeededStdLib("iomanip"); // for printing results +} + +// This function generates the helper functions inside the Session struct. 
+void RModelProfiler::GenerateUtilityFunctions() +{ + auto &gc = fModel.fProfilerGC; + + // Generate PrintProfilingResults function + gc += " void PrintProfilingResults() const {\n"; + gc += " if (fProfilingResults.empty()) {\n"; + gc += " std::cout << \"No profiling results to display.\" << std::endl;\n"; + gc += " return;\n"; + gc += " }\n"; + gc += "\n"; + gc += " std::cout << \"\\n\" << std::string(50, '=') << std::endl;\n"; + gc += " std::cout << \" AVERAGE PROFILING RESULTS\" << std::endl;\n"; + gc += " std::cout << std::string(50, '=') << std::endl;\n"; + gc += " for (const auto& op : fProfilingResults) {\n"; + gc += " double sum = 0.0;\n"; + gc += " for (double time : op.second) {\n"; + gc += " sum += time;\n"; + gc += " }\n"; + gc += " double average = sum / op.second.size();\n"; + gc += " std::cout << \" \" << std::left << std::setw(20) << op.first\n"; + gc += " << \": \" << std::fixed << std::setprecision(6) << average << \" us\"\n"; + gc += " << \" (over \" << op.second.size() << \" runs)\" << std::endl;\n"; + gc += " }\n"; + gc += " std::cout << std::string(50, '=') << \"\\n\" << std::endl;\n"; + gc += " }\n"; + gc += "\n"; + + // Generate ResetProfilingResults function + gc += " void ResetProfilingResults() {\n"; + gc += " fProfilingResults.clear();\n"; + gc += " }\n"; + gc += "\n"; + + // Generate GetOpAvgTime function + gc += " std::map GetOpAvgTime() const {\n"; + gc += " if (fProfilingResults.empty()) {\n"; + gc += " return {};\n"; + gc += " }\n"; + gc += "\n"; + gc += " std::map avg;\n"; + gc += " for (const auto& op : fProfilingResults) {\n"; + gc += " double mean = 0.0;\n"; + gc += " for (double time : op.second) {\n"; + gc += " mean += time;\n"; + gc += " }\n"; + gc += " mean /= op.second.size();\n"; + gc += " avg[op.first] = mean;\n"; + gc += " }\n"; + gc += "\n"; + gc += " return avg;\n"; + gc += " }\n"; + gc += "\n"; + + // Generate GetOpVariance function + gc += " std::map GetOpVariance() const {\n"; + gc += " if (fProfilingResults.empty()) {\n"; + gc += " return {};\n"; + gc += " }\n"; + gc += "\n"; + gc += " std::map variance;\n"; + gc += " for (const auto& op : fProfilingResults) {\n"; + gc += " // Var[X] = E[X^2] - E[X]^2\n"; + gc += " double mean = 0.0, mean2 = 0.0;\n"; + gc += " for (double time : op.second) {\n"; + gc += " mean += time;\n"; + gc += " mean2 += time * time;\n"; + gc += " }\n"; + gc += " mean /= op.second.size();\n"; + gc += " mean2 /= op.second.size();\n"; + gc += " variance[op.first] = mean2 - mean * mean;\n"; + gc += " }\n"; + gc += "\n"; + gc += " return variance;\n"; + gc += " }\n"; +} + +// Main generation function for the profiler. +void RModelProfiler::Generate() +{ + // Clear the profiler's code string to start fresh. + fModel.fProfilerGC.clear(); + auto &gc = fModel.fProfilerGC; + + // 1. Add the data member to the Session struct to store results. + gc += "public:\n"; + gc += " // Maps an operator name to a vector of its execution times (in microseconds).\n"; + gc += " std::map> fProfilingResults;\n\n"; + + // 2. Generate and add the utility functions like PrintProfilingResults. + GenerateUtilityFunctions(); + + // 3. Generate the signature for the profiled doInfer method. 
+ std::string doInferSignature = fModel.GenerateInferSignature(); + if (!doInferSignature.empty()) doInferSignature += ", "; + for (auto const &name : fModel.GetOutputTensorNames()) { + doInferSignature += " std::vector<" + ConvertTypeToString(fModel.GetTensorType(name)) + "> &output_tensor_" + name + ","; + } + if (!fModel.GetOutputTensorNames().empty()) { + doInferSignature.back() = ' '; + } + gc += "void doInfer(" + doInferSignature + ") {\n"; + + // 4. Generate the body of the doInfer method with timing instrumentation. + gc += " // Timer variable for profiling\n"; + gc += " std::chrono::steady_clock::time_point tp_start, tp_overall_start;\n\n"; + gc += " tp_overall_start = std::chrono::steady_clock::now();\n\n"; + + for (size_t op_idx = 0; op_idx < fModel.fOperators.size(); ++op_idx) { + const auto& op = fModel.fOperators[op_idx]; + gc += " // -- Profiling for operator " + op->name + " --\n"; + gc += " tp_start = std::chrono::steady_clock::now();\n\n"; + + // Add the actual operator inference code + gc += op->Generate(std::to_string(op_idx)); + + // Add the code to stop the timer and store the result + gc += "\n fProfilingResults[\"" + op->name + "\"].push_back(\n"; + gc += " std::chrono::duration_cast>(\n"; + gc += " std::chrono::steady_clock::now() - tp_start).count());\n\n"; + } + + // 5. Generate the code to fill the output tensors. + gc += " using TMVA::Experimental::SOFIE::UTILITY::FillOutput;\n\n"; + for (std::string const &name : fModel.GetOutputTensorNames()) { + bool isIntermediate = fModel.fIntermediateTensorInfos.count(name) > 0; + std::string n = isIntermediate ? std::to_string(ConvertShapeToLength(fModel.GetTensorShape(name))) + : ConvertDynamicShapeToLength(fModel.GetDynamicTensorShape(name)); + gc += " FillOutput(tensor_" + name + ", output_tensor_" + name + ", " + n + ");\n"; + } + + gc += "\n // -- Record overall inference time --\n"; + gc += " fProfilingResults[\"Overall_Time\"].push_back(\n"; + gc += " std::chrono::duration_cast>(\n"; + gc += " std::chrono::steady_clock::now() - tp_overall_start).count());\n"; + + gc += "}\n\n"; // End of doInfer function +} + +} // namespace SOFIE +} // namespace Experimental +} // namespace TMVA diff --git a/tmva/sofie_parsers/src/RModelParser_ONNX.cxx b/tmva/sofie_parsers/src/RModelParser_ONNX.cxx index 7b4ade2b6bc09..4903c8d1c6511 100644 --- a/tmva/sofie_parsers/src/RModelParser_ONNX.cxx +++ b/tmva/sofie_parsers/src/RModelParser_ONNX.cxx @@ -731,7 +731,8 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & std::cout << "\t" << i << " " << nodesOrder[i] << " parsing operator " << op_type << std::endl; } - std::unique_ptr op = ParseOperator(i, graph, nodesOrder, nodesChildren[i]); + std::unique_ptr op = ParseOperator(i, graph, nodesOrder, nodesChildren[nodesOrder[i]]); + if (!op) { if (verbose) { std::cout << "\t\tskipping operator since it is fused with previous one" << std::endl; @@ -739,6 +740,12 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & // for skipping the fused nodes like Add after MatMul continue; } + const auto &nodeproto = graph.node(nodesOrder[i]); + op->name = nodeproto.name(); + if (op->name.empty()) { + op->name = op_type + "_" + std::to_string(i); + } + rmodel.AddOperator(std::move(op), node_order_exec++); } diff --git a/tutorials/machine_learning/TMVA_SOFIE_ONNX.C b/tutorials/machine_learning/TMVA_SOFIE_ONNX.C index 8c192789e1210..878167db8c791 100644 --- a/tutorials/machine_learning/TMVA_SOFIE_ONNX.C +++ 
b/tutorials/machine_learning/TMVA_SOFIE_ONNX.C @@ -19,7 +19,7 @@ void TMVA_SOFIE_ONNX(std::string inputFile = ""){ SOFIE::RModel model = parser.Parse(inputFile, true); //Generating inference code - model.Generate(); + model.Generate(SOFIE::Options::kProfile); // write the code in a file (by default Linear_16.hxx and Linear_16.dat model.OutputGenerated(); From 32d38a2855b91550f5ec5d3ebdf0121fe0e4f57a Mon Sep 17 00:00:00 2001 From: moneta Date: Wed, 7 Jan 2026 16:00:54 +0100 Subject: [PATCH 12/12] [tmva][sofie] Imporve RModel profiler Compute also the error on the average when printing results and sort them in decreasing order in time --- tmva/sofie/src/RModelProfiler.cxx | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/tmva/sofie/src/RModelProfiler.cxx b/tmva/sofie/src/RModelProfiler.cxx index 76386e6de817d..c56d4127e99b7 100644 --- a/tmva/sofie/src/RModelProfiler.cxx +++ b/tmva/sofie/src/RModelProfiler.cxx @@ -22,24 +22,39 @@ void RModelProfiler::GenerateUtilityFunctions() auto &gc = fModel.fProfilerGC; // Generate PrintProfilingResults function - gc += " void PrintProfilingResults() const {\n"; + gc += " // generate code for printing operator results. By default order according to time (from higher to lower)\n"; + gc += " void PrintProfilingResults(bool order = true) const {\n"; gc += " if (fProfilingResults.empty()) {\n"; gc += " std::cout << \"No profiling results to display.\" << std::endl;\n"; gc += " return;\n"; gc += " }\n"; gc += "\n"; + gc += " // compute summary statistics of profiling results and sort them in decreasing time\n"; + gc += " std::vector> averageResults;\n"; gc += " std::cout << \"\\n\" << std::string(50, '=') << std::endl;\n"; gc += " std::cout << \" AVERAGE PROFILING RESULTS\" << std::endl;\n"; gc += " std::cout << std::string(50, '=') << std::endl;\n"; gc += " for (const auto& op : fProfilingResults) {\n"; gc += " double sum = 0.0;\n"; + gc += " double sum2 = 0.0;\n"; gc += " for (double time : op.second) {\n"; gc += " sum += time;\n"; + gc += " sum2 += time*time;\n"; gc += " }\n"; gc += " double average = sum / op.second.size();\n"; - gc += " std::cout << \" \" << std::left << std::setw(20) << op.first\n"; - gc += " << \": \" << std::fixed << std::setprecision(6) << average << \" us\"\n"; - gc += " << \" (over \" << op.second.size() << \" runs)\" << std::endl;\n"; + gc += " double stddev = std::sqrt(( sum2 - sum *average)/ (op.second.size()-1));\n"; + gc += " averageResults.push_back({op.first, average, stddev, op.second.size()});\n"; + gc += " }\n"; + gc += "\n"; + gc += " // sort average results in decreasing time\n"; + gc += " std::sort(averageResults.begin(), averageResults.end(),\n"; + gc += " []( std::tuple a, std::tuple b) {return std::get<1>(a) > std::get<1>(b); });\n"; + gc += "\n"; + gc += " for (const auto & r : averageResults) {\n"; + gc += " std::cout << \" \" << std::left << std::setw(20) << std::get<0>(r)\n"; + gc += " << \": \" << std::fixed << std::setprecision(6) << std::get<1>(r) << \" +/- \" \n"; + gc += " << std::get<2>(r)/std::sqrt(std::get<3>(r)) << \" us\"\n"; + gc += " << \" (over \" << std::get<3>(r) << \" runs)\" << std::endl;\n"; gc += " }\n"; gc += " std::cout << std::string(50, '=') << \"\\n\" << std::endl;\n"; gc += " }\n"; @@ -71,7 +86,7 @@ void RModelProfiler::GenerateUtilityFunctions() gc += " }\n"; gc += "\n"; - // Generate GetOpVariance function + // Generate GetOpVariance function gc += " std::map GetOpVariance() const {\n"; gc += " if (fProfilingResults.empty()) {\n"; gc += " 
return {};\n"; @@ -129,10 +144,10 @@ void RModelProfiler::Generate() const auto& op = fModel.fOperators[op_idx]; gc += " // -- Profiling for operator " + op->name + " --\n"; gc += " tp_start = std::chrono::steady_clock::now();\n\n"; - + // Add the actual operator inference code gc += op->Generate(std::to_string(op_idx)); - + // Add the code to stop the timer and store the result gc += "\n fProfilingResults[\"" + op->name + "\"].push_back(\n"; gc += " std::chrono::duration_cast>(\n";
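For reference, a minimal usage sketch of the profiled Session produced when Options::kProfile is passed to RModel::Generate (as in the TMVA_SOFIE_ONNX.C tutorial change above). This is not part of the patch: the namespace TMVA_SOFIE_Linear_16, the single float* input, its size and the weight-file name are placeholders based on the tutorial's default Linear_16 model and depend on the model actually generated; PrintProfilingResults(), GetOpAvgTime() and GetOpVariance() are the helpers emitted by RModelProfiler::GenerateUtilityFunctions().

#include "Linear_16.hxx"   // header written by model.OutputGenerated(); name depends on the model
#include <iostream>
#include <vector>

int main() {
   // Session class name and weight file follow the generated model (placeholders here)
   TMVA_SOFIE_Linear_16::Session session("Linear_16.dat");
   std::vector<float> input(100, 1.f);   // size must match the model input shape
   for (int i = 0; i < 100; ++i) {
      // each infer call appends per-operator timings to fProfilingResults
      auto output = session.infer(input.data());
      (void) output;
   }
   // per-operator averages, sorted by decreasing time, each with the error on the mean
   session.PrintProfilingResults();
   // or access the mean times (in microseconds) programmatically
   for (const auto &op : session.GetOpAvgTime())
      std::cout << op.first << " : " << op.second << " us\n";
   return 0;
}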