diff --git a/tmva/sofie/CMakeLists.txt b/tmva/sofie/CMakeLists.txt index c807d1b7b8c27..f56d2350ecadd 100644 --- a/tmva/sofie/CMakeLists.txt +++ b/tmva/sofie/CMakeLists.txt @@ -22,6 +22,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ROOTTMVASofie TMVA/OperatorList.hxx TMVA/RModel_Base.hxx TMVA/RModel.hxx + TMVA/RModelProfiler.hxx TMVA/ROperator.hxx TMVA/ROperator_BasicUnary.hxx TMVA/ROperator_BasicBinary.hxx @@ -77,6 +78,7 @@ ROOT_STANDARD_LIBRARY_PACKAGE(ROOTTMVASofie SOURCES src/RModel_Base.cxx src/RModel.cxx + src/RModelProfiler.cxx src/RModel_GNN.cxx src/RModel_GraphIndependent.cxx src/RFunction.cxx diff --git a/tmva/sofie/inc/TMVA/RFunction.hxx b/tmva/sofie/inc/TMVA/RFunction.hxx index 1cca39aa7ff3e..9247bd4180d26 100644 --- a/tmva/sofie/inc/TMVA/RFunction.hxx +++ b/tmva/sofie/inc/TMVA/RFunction.hxx @@ -32,7 +32,7 @@ public: class RFunction_Update: public RFunction { protected: - std::shared_ptr function_block; + std::shared_ptr fFunction_block; FunctionTarget fTarget; GraphType fGraphType; std::vector fInputTensors; @@ -50,9 +50,9 @@ public: void AddInputTensors(const std::vector>& inputShapes); void AddInputTensors(const std::vector>& inputShapes); std::shared_ptr GetFunctionBlock() { - return function_block; + return fFunction_block; } - std::string GenerateModel(const std::string& filename, long read_pos = 0, long block_size = -1); + std::string GenerateModel(const std::string& filename, long read_pos = 0, long block_size = -1, bool verbose = false); std::string Generate(const std::vector& inputPtrs); FunctionTarget GetFunctionTarget() { return fTarget; diff --git a/tmva/sofie/inc/TMVA/RModel.hxx b/tmva/sofie/inc/TMVA/RModel.hxx index 996c51020270f..a82c58c75b2e2 100644 --- a/tmva/sofie/inc/TMVA/RModel.hxx +++ b/tmva/sofie/inc/TMVA/RModel.hxx @@ -11,16 +11,23 @@ namespace SOFIE { class RModel final : public RModel_Base { + friend class RModelProfiler; + private: bool fIsInitialized = false; bool fIsSubGraph = false; + bool fProfile = false; + int fVerbose = 0; int fBatchSize = -1; long fReadPos = 0; // reading file position + size_t fConstantTensorSize = 0; // size (in Bytes) of the allocated constant tensors size_t fWeightsTensorSize = 0; // size (in Bytes) of the allocated weight tensors size_t fOtherTensorSize = 0; // size (in Bytes) of intermediate tensors which are not managed by the memory pool + std::string fProfilerGC = ""; + OptimizationLevel fOptimizationLevel = OptimizationLevel::kExtended; std::unordered_map fInputTensorInfos; // input tensors where shape may not fully defined or other graph inputs? @@ -30,6 +37,7 @@ private: std::unordered_map fDynamicTensorInfos; std::unordered_map, bool>> fShapeTensors; // constant tensors describing a shape std::unordered_map fShapeParams; // parameters defining the dynamic shape (e.g. 
batch size), store also its default value + std::unordered_map fAliasTensors; // list of alias tensors std::vector fDimShapeNames; // parameter names used to define the shapes std::vector fOutputTensorNames; std::vector fInputTensorNames; // input tensor names using ONNX order @@ -82,6 +90,8 @@ public: void AddConstantTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data); + void AddAliasTensor(const std::string & tensor_name, const std::string & orig_tensor_name); + template void AddConstantTensor(const std::string & name, const std::vector & shape, const T * data) { @@ -130,6 +140,8 @@ public: bool IsReadyInputTensor(const std::string &name) const; /// check if a tensor is a shape tensor bool IsShapeTensor(const std::string & name) const; + /// check if a tensor is a alias tensor + bool IsAliasTensor(const std::string & name) const; // Add intermediate tensor void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector dim_shape); @@ -152,7 +164,7 @@ public: void Initialize(int batchSize = -1, bool verbose = false); void Initialize(const std::map & inputParams, bool verbose = false); - void Generate(std::underlying_type_t options, int batchSize = -1, long pos = 0, bool verbose = false); + void Generate(std::underlying_type_t options, int batchSize = -1, long pos = 0, bool verbose = false); void Generate(Options options = Options::kDefault, int batchSize = -1, int pos = 0, bool verbose = false) { Generate(static_cast>(options), batchSize, pos, verbose); @@ -205,8 +217,8 @@ public: void ReadInitializedTensorsFromFile(long); long WriteInitializedTensorsToFile(std::string filename = ""); - void PrintIntermediateTensors(); - void PrintOutputTensors(); + void PrintIntermediateTensors() const; + void PrintOutputTensors() const; void OutputGenerated(std::string filename = "", bool append = false); std::vector GetOutputTensorNames() { return fOutputTensorNames; } void SetFilename(std::string filename) { fName = filename; } @@ -224,9 +236,9 @@ public: } */ - void PrintRequiredInputTensors(); - void PrintInitializedTensors(); - void PrintDynamicTensors(); + void PrintRequiredInputTensors() const; + void PrintInitializedTensors() const; + void PrintDynamicTensors() const; void HeadInitializedTensors(std::string name, int n_print = 50); bool UseSession() const { return fUseSession; } diff --git a/tmva/sofie/inc/TMVA/RModelProfiler.hxx b/tmva/sofie/inc/TMVA/RModelProfiler.hxx new file mode 100644 index 0000000000000..fd9c8c7d0267d --- /dev/null +++ b/tmva/sofie/inc/TMVA/RModelProfiler.hxx @@ -0,0 +1,42 @@ +#ifndef TMVA_SOFIE_RMODELPROFILER +#define TMVA_SOFIE_RMODELPROFILER + +#include "TMVA/RModel.hxx" + +namespace TMVA { +namespace Experimental { +namespace SOFIE { + +/// \class RModelProfiler +/// \brief A helper class to generate profiled inference code for an RModel. +/// +/// This class instruments the generated C++ code to measure the execution +/// time of each operator. It is invoked when the RModel::Generate is called +/// with the Options::kProfile flag. +class RModelProfiler { +private: + RModel &fModel; + + void GenerateUtilityFunctions(); + +public: + // The profiler must be constructed with a model to work on. 
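Note (not part of the patch): a minimal usage sketch of the new profiling path. It assumes the existing RModelParser_ONNX API and an illustrative input file "model.onnx", and relies only on the Options::kProfile flag added in RModel_Base.hxx and the RModel::Generate / OutputGenerated signatures visible in this diff; what timing code RModelProfiler::Generate actually emits is not shown here.

#include "TMVA/RModel.hxx"
#include "TMVA/RModelParser_ONNX.hxx"
#include <type_traits>

using namespace TMVA::Experimental::SOFIE;

int main() {
   RModelParser_ONNX parser;
   RModel model = parser.Parse("model.onnx");   // illustrative input file
   // Options is a bit-flag enum; Generate() also accepts the underlying
   // integer type, so kProfile could be combined with other flags if needed.
   model.Generate(static_cast<std::underlying_type_t<Options>>(Options::kProfile));
   model.OutputGenerated("model_profiled.hxx"); // write the instrumented header
   return 0;
}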
+ RModelProfiler() = delete; + RModelProfiler(RModel &model); + ~RModelProfiler() = default; + + // There is no point in copying or moving an RModelProfiler + RModelProfiler(const RModelProfiler &other) = delete; + RModelProfiler(RModelProfiler &&other) = delete; + RModelProfiler &operator=(const RModelProfiler &other) = delete; + RModelProfiler &operator=(RModelProfiler &&other) = delete; + + // Main function to generate the profiled code. + void Generate(); +}; + +} // namespace SOFIE +} // namespace Experimental +} // namespace TMVA + +#endif // TMVA_SOFIE_RMODELPROFILER diff --git a/tmva/sofie/inc/TMVA/RModel_Base.hxx b/tmva/sofie/inc/TMVA/RModel_Base.hxx index 2cbcc6cc8ea41..2ab5dacaac57f 100644 --- a/tmva/sofie/inc/TMVA/RModel_Base.hxx +++ b/tmva/sofie/inc/TMVA/RModel_Base.hxx @@ -26,6 +26,7 @@ enum class Options { kRootBinaryWeightFile = 0x4, kGNN = 0x8, kGNNComponent = 0x10, + kProfile = 0x20, }; // Optimization levels inspired by ONNXRuntime. diff --git a/tmva/sofie/inc/TMVA/ROperator.hxx b/tmva/sofie/inc/TMVA/ROperator.hxx index f0afd9c4374c1..200cd3f2976fe 100644 --- a/tmva/sofie/inc/TMVA/ROperator.hxx +++ b/tmva/sofie/inc/TMVA/ROperator.hxx @@ -37,6 +37,9 @@ public: //virtual void Forward_blas() = 0; virtual ~ROperator(){} + std::string name = "UnnamedOperator"; + const std::string &GetOperatorName() { return name; }; + protected: const std::string SP = " "; ///< space used to correctly indent the generated C++ code diff --git a/tmva/sofie/inc/TMVA/ROperator_BasicBinary.hxx b/tmva/sofie/inc/TMVA/ROperator_BasicBinary.hxx index 1c4f20363ebe2..491b669554118 100644 --- a/tmva/sofie/inc/TMVA/ROperator_BasicBinary.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_BasicBinary.hxx @@ -192,7 +192,7 @@ public: dataY[i] = BinaryOperatorTrait::Func(dataA[i], dataB[i]); } model.AddConstantTensor(fNY, fShapeY, dataY.data()); - // flag tensors to not be written in the weight file + // flag tensors to not be written in the generated code or weight file model.SetNotWritableInitializedTensor(nameA); model.SetNotWritableInitializedTensor(nameB); fIsOutputConstant = true; diff --git a/tmva/sofie/inc/TMVA/ROperator_BasicNary.hxx b/tmva/sofie/inc/TMVA/ROperator_BasicNary.hxx index bcc0e52a40ca3..f73bd34e53386 100644 --- a/tmva/sofie/inc/TMVA/ROperator_BasicNary.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_BasicNary.hxx @@ -23,10 +23,11 @@ struct NaryOperatorTraits { static const std::string Name() {return "Max";} static std::string Op(const std::string& res, std::vector& inputs) { std::stringstream out; - out << "\t" << "\t" << res << " = " << inputs[0] << ";\n"; + out << res << " = std::max({ " << inputs[0]; for (size_t i = 1; i < inputs.size(); i++) { - out << "\t" << "\t" << res << " = std::max(" << res << ", " << inputs[i] << ");\n"; + out << ", " << inputs[i]; } + out << "});\n"; return out.str(); } }; @@ -36,10 +37,11 @@ struct NaryOperatorTraits { static const std::string Name() {return "Min";} static std::string Op(const std::string& res, std::vector& inputs) { std::stringstream out; - out << "\t" << "\t" << res << " = " << inputs[0] << ";\n"; + out << res << " = std::min({ " << inputs[0]; for (size_t i = 1; i < inputs.size(); i++) { - out << "\t" << "\t" << res << " = std::min(" << res << ", " << inputs[i] << ");\n"; + out << ", " << inputs[i]; } + out << "});\n"; return out.str(); } }; @@ -52,7 +54,7 @@ struct NaryOperatorTraits { static const std::string Name() {return "Mean";} static std::string Op(const std::string& res, std::vector& inputs) { std::stringstream out; - out << "\t" << "\t" << res << " 
= (" << inputs[0]; + out << res << " = (" << inputs[0]; for (size_t i = 1; i < inputs.size(); i++) { out << " + " << inputs[i]; } @@ -66,7 +68,7 @@ struct NaryOperatorTraits { static const std::string Name() {return "Sum";} static std::string Op(const std::string& res, std::vector& inputs) { std::stringstream out; - out << "\t" << "\t" << res << " = " << inputs[0]; + out << res << " = " << inputs[0]; for (size_t i = 1; i < inputs.size(); i++) { out << " + " << inputs[i]; } @@ -83,10 +85,11 @@ private: std::vector fNInputs; std::string fNY; - std::vector> fShapeInputs; + std::vector> fShapeInputs; std::vector fNBroadcastedInputs; std::vector fShapeY; + std::vector fDimShapeY; bool fBroadcast = false; @@ -119,64 +122,164 @@ public: } void Initialize(RModel& model) override { + std::vector> inputShapes; for (auto &it : fNInputs) { if (!model.CheckIfTensorAlreadyExist(it)) { throw std::runtime_error("TMVA SOFIE BasicNary Op Input Tensor " + it + " is not found in model"); } - fShapeInputs.push_back(model.GetTensorShape(it)); + fShapeInputs.push_back(model.GetDimTensorShape(it)); + if (fNInputs.size()> 2) { + if (model.IsDimInputTensor(it)) + throw std::runtime_error("TMVA SOFIE BasicNary : supports only 2 inputs for dynamic tensors"); + else + inputShapes.push_back(model.GetTensorShape(it)); + } } // Find the common shape of the input tensors - fShapeY = UTILITY::MultidirectionalBroadcastShape(fShapeInputs); - model.AddIntermediateTensor(fNY, model.GetTensorType(fNInputs[0]), fShapeY); - // Broadcasting - size_t N = fNInputs.size(); - fNBroadcastedInputs.reserve(N); - for (size_t i = 0; i < N; i++) { - if (!UTILITY::AreSameShape(model.GetTensorShape(fNInputs[i]), fShapeY)) { - fBroadcast = true; - std::string name = "Broadcasted" + fNInputs[i]; - model.AddIntermediateTensor(name, model.GetTensorType(fNInputs[0]), fShapeY); - fNBroadcastedInputs.emplace_back("tensor_" + name); - } else { - fNBroadcastedInputs.emplace_back("tensor_" + fNInputs[i]); + if (fShapeInputs.size() > 2 ) { + // support dynamic tensors now for input list of size=2 + auto shapeY = UTILITY::MultidirectionalBroadcastShape(inputShapes); + fDimShapeY = ConvertShapeToDim(shapeY); + } else if (fShapeInputs.size() == 2 ) { + auto ret = UTILITY::MultidirectionalBroadcastShape(fShapeInputs[0], fShapeInputs[1]); + // use same code as in BinaryOperator (need to extend for input sizes > 2) + fBroadcast = ret.first; + fDimShapeY = ret.second; + // case of all parametric shapes and MultiDirectionalBroadcastShape return the max of the 2 + // need to do before we declare the output tensor shape and the broadcasted ones + if (ret.first & 4) { + // check if one of the parameter is an input dimension + // define function to find this + auto IsInputDimParam = [&](const std::string &p) { + auto inputNames = model.GetInputTensorNames(); + for (auto &input : inputNames) { + for (auto &i_s : model.GetDimTensorShape(input)) { + if (i_s.isParam && i_s.param == p) + return true; + } + } + return false; + }; + auto & shapeA = fShapeInputs[0]; + auto & shapeB = fShapeInputs[1]; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + auto &s = fDimShapeY[i]; + if (s.isParam && s.param.find("std::max") != std::string::npos) { + if (IsInputDimParam(shapeA[i].param)) { + // case dim is 1 we indicate that the input parameter is equal to 1 + if (shapeA[i].dim != 1) + s = shapeA[i]; + else + s = shapeB[i]; + } else if (IsInputDimParam(shapeB[i].param)) { + if (shapeB[i].dim != 1) + s = shapeB[i]; + else + s = shapeA[i]; + } + } + } } + } else if 
(fShapeInputs.size() == 1 ) { + fDimShapeY = fShapeInputs[0]; } + if (!fShapeY.empty()) + model.AddIntermediateTensor(fNY, model.GetTensorType(fNInputs[0]), fShapeY); + else + model.AddIntermediateTensor(fNY, model.GetTensorType(fNInputs[0]), fDimShapeY); + + fType = ConvertTypeToString(model.GetTensorType(fNInputs[0])); + + if (model.Verbose()) { + std::cout << NaryOperatorTraits::Name() << " : "; + if (fNInputs.size() == 2) + std::cout << ConvertShapeToString(fShapeInputs[0]) << " , " + << ConvertShapeToString(fShapeInputs[1]); + std::cout << " --> " << ConvertShapeToString(fDimShapeY) << std::endl; + } } std::string Generate(std::string OpName) override { OpName = "op_" + OpName; - if (fShapeY.empty()) { + if (fDimShapeY.empty()) { throw std::runtime_error("TMVA SOFIE BasicNary called to Generate without being initialized first"); } std::stringstream out; - size_t length = ConvertShapeToLength(fShapeY); + auto length = ConvertDimShapeToLength(fDimShapeY); out << SP << "\n//------ BasicNary operator\n"; - if (fBroadcast) { - for (size_t i = 0; i < fNInputs.size(); i++) { - if (fNBroadcastedInputs[i] != fNInputs[i]) { - out << SP << SP << "// Broadcasting " << fNInputs[i] << " to " << ConvertShapeToString(fShapeY) << "\n"; - out << SP << SP << "{\n"; - out << SP << SP << SP << fType << "* data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<" << fType << ">(tensor_" + fNInputs[i] << ", " << ConvertShapeToString(fShapeInputs[i]); - out << ", " << ConvertShapeToString(fShapeY) << ");\n"; - out << SP << SP << SP << "std::copy(data, data + " << length << ", " << fNBroadcastedInputs[i] << ");\n"; - out << SP << SP << SP << "delete[] data;\n"; - out << SP << SP << "}\n"; - } - } - } - if (fNInputs.size() == 1) { + int nInputs = fNInputs.size(); + + if (nInputs == 1) { out << SP << "std::copy(tensor_" << fNInputs[0] << ", tensor_" << fNInputs[0] << " + "; out << length << ", tensor_" << fNY << ");\n"; } else { - std::vector inputs(fNBroadcastedInputs.size()); - for (size_t i = 0; i < fNBroadcastedInputs.size(); i++) { - inputs[i] = fNBroadcastedInputs[i] + "[id]"; + + // implement operator without broadcasting, but using loos on all indices + std::vector> inputStrides(nInputs); + for (int i = 0; i < nInputs; i++) + inputStrides[i] = UTILITY::ComputeStrideFromShape(fShapeInputs[i]); + + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + + // make loop on output indices + std::string compute_idx_Y; + int nloop = 0; + if (fDimShapeY.empty() || + std::all_of(fDimShapeY.begin(), fDimShapeY.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_Y = "0"; + } else { + for (size_t i = 0; i < fDimShapeY.size(); ++i) { + if (fDimShapeY[i].dim != 1 && fDimShapeY[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) out << SP; + out << "for (size_t idx_" << i << " = 0; idx_" << i << " < " << fDimShapeY[i] + << "; ++idx_" << i << "){\n"; + compute_idx_Y += "idx_" + std::to_string(i); + if (stridesY[i].GetVal() != "1") + compute_idx_Y += " * " + stridesY[i].GetVal(); + compute_idx_Y += " + "; + } + } + // remove last 3 characters " + " + for (int j = 0; j < 3; j++) + compute_idx_Y.pop_back(); + } + // find indices for input tensors + std::vector inputs(nInputs); + for (int ipt = 0; ipt < nInputs; ipt++ ) { + std::string compute_idx_X; + auto & shape = fShapeInputs[ipt]; + auto & stride = inputStrides[ipt]; + if (shape.empty() || + std::all_of(shape.begin(), shape.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_X = 
"0"; + } else { + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i].dim == 1 || shape[i].GetVal() == "1") + continue; + compute_idx_X += "idx_" + std::to_string(i + (fDimShapeY.size() - shape.size())); + if (stride[i].GetVal() != "1") + compute_idx_X += " * " + stride[i].GetVal(); + compute_idx_X += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_X.pop_back(); + } + inputs[ipt] = "tensor_" + fNInputs[ipt] + "[" + compute_idx_X + "]"; + } + + // perform the operation + for (int j = 0; j < nloop + 1; j++) out << SP; + std::string output = "tensor_" + fNY + "[" + compute_idx_Y + "]"; + out << NaryOperatorTraits::Op(output, inputs); + + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; } - out << SP << "for (size_t id = 0; id < " << length << "; id++) {\n"; - out << NaryOperatorTraits::Op("tensor_" + fNY + "[id]", inputs); - out << SP << "}\n"; } return out.str(); } diff --git a/tmva/sofie/inc/TMVA/ROperator_BatchNormalization.hxx b/tmva/sofie/inc/TMVA/ROperator_BatchNormalization.hxx index f2d31796bbbcd..c37e7fc4b68de 100644 --- a/tmva/sofie/inc/TMVA/ROperator_BatchNormalization.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_BatchNormalization.hxx @@ -141,8 +141,8 @@ public: } } - std::string Generate(std::string OpName) override { - OpName = "op_" + OpName; + std::string Generate(std::string opName) override { + opName = "op_" + opName; if (fShapeX.empty()){ throw std::runtime_error("TMVA SOFIE Batch Normalization called to Generate without being initialized first"); } @@ -158,7 +158,7 @@ public: spatial_dim = ConvertDimShapeToLength( spatialShape); } - out << "\n\n//---- BatchNorm" << (fActivation == EActivationType::RELU ? " + ReLU" : "") << "\n"; + out << "\n\n//---- BatchNorm" << (fActivation == EActivationType::RELU ? 
" + ReLU " : " ") << opName << "\n"; out << SP << "{\n"; out << SP << " size_t i = 0;\n"; out << SP << " for (size_t n = 0; n < " << batchSize << "; ++n) {\n"; diff --git a/tmva/sofie/inc/TMVA/ROperator_Cast.hxx b/tmva/sofie/inc/TMVA/ROperator_Cast.hxx index f48e27ee4f264..8267bb8a7e4f4 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Cast.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Cast.hxx @@ -46,7 +46,7 @@ public: throw std::runtime_error("TMVA SOFIE Cast Op Input Tensor is not found in model"); } fShape = model.GetDimTensorShape(fNX); - // shoud we add a check if the same type + // should we add a check if the same type auto inputType = model.GetTensorType(fNX); if (model.IsInitializedTensor(fNX)) { fIsOutputConstant = true; @@ -57,29 +57,30 @@ public: } else fIsOutputConstant = false; + } else if (model.IsShapeTensor(fNX) && ConvertStringToType(fAttrType) == ETensorType::INT64) { + auto shapeData = model.GetShapeTensorValues(fNX); + model.AddShapeTensor(fNY, shapeData, fShape.size() == 0); + fIsOutputConstant = true; } if (!fIsOutputConstant) model.AddIntermediateTensor(fNY, ConvertStringToType(fAttrType), fShape); if (model.Verbose()) { - std::cout << "Cast : " << ConvertTypeToString(inputType) << " " << fNX << " -> " << fAttrType << " for " << fNY; + std::cout << "Cast : " << ConvertTypeToString(inputType) << " " << fNX << " -> " << fAttrType << " for " << fNY + << " shape " << ConvertDimShapeToString(fShape); if (fIsOutputConstant) std::cout << " (constant) "; std::cout << std::endl; } } - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; + std::string Generate(std::string opName) override { + + // output shape can be empty if is a scalar - OpName = "op_" + OpName; - if (fShape.empty()) { - throw std::runtime_error("TMVA SOFIE Cast called to Generate without being initialized first"); - } std::stringstream out; auto length = ConvertDimShapeToLength(fShape); - // out << SP << ETensorType << " " << OpName << "_attr = " << fattr << ";\n"; - out << "\n//------ CAST\n"; + out << "\n//------ CAST " << opName << " ---> " << fNY << " " << ConvertDimShapeToString(fShape) << "\n"; // no generated code for constant outputs if (fIsOutputConstant) return out.str(); diff --git a/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx b/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx index 0d365ae517de5..734434357a149 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Comparision.hxx @@ -56,7 +56,6 @@ template class ROperator_Comparision final : public ROperator{ private: - bool fIsModelOutput = false; std::string fNX1; std::string fNX2; std::string fNY; @@ -65,11 +64,10 @@ private: std::vector fDimShapeX1; std::vector fDimShapeX2; std::vector fShapeY; - std::string fNBroadcastedX1; - std::string fNBroadcastedX2; + std::vector fDimShapeY; ETensorType fTensorType1 = ETensorType::UNDEFINED; ETensorType fTensorType2 = ETensorType::UNDEFINED; - bool fBroadcast = false; + int fBroadcastFlag = 0; public: @@ -115,184 +113,260 @@ public: } fTensorType1 = model.GetTensorType(fNX1); fTensorType2 = model.GetTensorType(fNX2); - bool broadcast = !UTILITY::AreSameShape(fShapeX1, fShapeX2); - if (broadcast) { - // Y is the common shape of A and B - fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeX1, fShapeX2); - bool broadcastX1 = !UTILITY::AreSameShape(fShapeX1, fShapeY); - bool broadcastX2 = !UTILITY::AreSameShape(fShapeX2, fShapeY); - // Broadcast A to Y - if (broadcastX1) { - if (model.IsInitializedTensor(fNX1)) { - auto data = 
model.GetInitializedTensorData(fNX1); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX1, fShapeY), - std::default_delete()); - // Update the data and the shape of A - model.UpdateInitializedTensor(fNX1, model.GetTensorType(fNX1), fShapeY, broadcastedData); - fShapeX1 = fShapeY; - } else { - // Add an intermediate tensor for broadcasting A - fNBroadcastedX1 = "Broadcasted" + fNX1; - model.AddIntermediateTensor(fNBroadcastedX1, model.GetTensorType(fNX1), fShapeY); + // case of non dynamic tensors + if (!fShapeX1.empty() && !fShapeX2.empty()) { + bool broadcastX1 = false; + bool broadcastX2 = false; + if (UTILITY::AreSameShape(fShapeX1, fShapeX2)) { + // no broadcast needed + fShapeY = fShapeX1; + } else { + // Y is the common shape of A and B + fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeX1, fShapeX2); + broadcastX1 = !UTILITY::AreSameShape(fShapeX1, fShapeY); + broadcastX2 = !UTILITY::AreSameShape(fShapeX2, fShapeY); + } + + + // analyze case of constant tensors or shape tensors (which have known shapes but data as Dim values + // normal case with non-dynamic tensor is also here + T *data1 = nullptr; + T *data2 = nullptr; + std::unique_ptr broadcastedData1; + std::unique_ptr broadcastedData2; + // data for shape tensors + std::vector shapeData1; + std::vector shapeData2; + size_t length = ConvertShapeToLength(fShapeY); + bool *outData = new bool[length]; + if (model.IsInitializedTensor(fNX1)) { + data1 = static_cast(model.GetInitializedTensorData(fNX1).get()); + if (broadcastX1) { + broadcastedData1 = std::unique_ptr( + UTILITY::UnidirectionalBroadcast(data1, fShapeX1, fShapeY)); + data1 = broadcastedData1.get(); } + + } else if (model.IsShapeTensor(fNX1)) { + shapeData1 = model.GetShapeTensorValues(fNX1); } - // Broadcast B to Y - if (broadcastX2) { - if (model.IsInitializedTensor(fNX2)) { - auto data = model.GetInitializedTensorData(fNX2); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX2, fShapeY), - std::default_delete()); - // Update the data and the shape of B - model.UpdateInitializedTensor(fNX2, model.GetTensorType(fNX2), fShapeY, broadcastedData); - fShapeX2 = fShapeY; - } else { - // Add an intermediate tensor for broadcasting B - fNBroadcastedX2 = "Broadcasted" + fNX2; - model.AddIntermediateTensor(fNBroadcastedX2, model.GetTensorType(fNX2), fShapeY); + if (model.IsInitializedTensor(fNX2)) { + data2 = static_cast(model.GetInitializedTensorData(fNX2).get()); + if (broadcastX2) { + broadcastedData2 = std::unique_ptr( + UTILITY::UnidirectionalBroadcast(data2, fShapeX2, fShapeY)); + data2 = broadcastedData2.get(); } + } else if (model.IsShapeTensor(fNX2)) { + shapeData2 = model.GetShapeTensorValues(fNX2); } - } else { - fShapeY = fShapeX1; - } - // case of constant tensors - T * data1 = nullptr; - T * data2 = nullptr; - std::vector shapeData1; - std::vector shapeData2; - size_t length = ConvertShapeToLength(fShapeY); - bool * outData = new bool[length]; - if (model.IsInitializedTensor(fNX1)) { - data1 = static_cast(model.GetInitializedTensorData(fNX1).get()); - } else if (model.IsShapeTensor(fNX1)) { - shapeData1 = model.GetShapeTensorValues(fNX1); - } - if (model.IsInitializedTensor(fNX2)) { - data2 = static_cast(model.GetInitializedTensorData(fNX2).get()); - } else if (model.IsShapeTensor(fNX2)) { - shapeData2 = model.GetShapeTensorValues(fNX2); - } - if (data1 && data2) { - fIsOutputConstant = true; - for (size_t i = 0; i < length; i++) - outData[i] = 
ComparisionTrait::Result(data1[i], data2[i]); - model.AddConstantTensor(fNY, fShapeY, outData); - if (model.Verbose()) - std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(length,outData) << std::endl; - } else if ((data1 || !shapeData1.empty()) && (data2 || !shapeData2.empty())) { - fIsOutputConstant = true; - if (data1 && !data2) { - // data 1 is constant and data2 is shape - for (size_t i = 0; i < length; i++) { - if (shapeData2[i].isParam) { - if (shapeData2[i].dim == size_t(-1) || data1[i] > 0) { - fIsOutputConstant = false; - break; - } else { - // assume a comparison is done with .dim = 0 - shapeData2[i].dim = 0; + if (data1 && data2) { + fIsOutputConstant = true; + for (size_t i = 0; i < length; i++) + outData[i] = ComparisionTrait::Result(data1[i], data2[i]); + model.AddConstantTensor(fNY, fShapeY, outData); + if (model.Verbose()) + std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << " : " << ConvertValuesToString(length, outData) + << std::endl; + } else if ((data1 || !shapeData1.empty()) && (data2 || !shapeData2.empty())) { + fIsOutputConstant = true; + if (data1 && !data2) { + // data 1 is constant and data2 is shape + for (size_t i = 0; i < length; i++) { + if (shapeData2[i].isParam) { + if (shapeData2[i].dim == size_t(-1) || data1[i] > 0) { + fIsOutputConstant = false; + break; + } else { + // assume a comparison is done with .dim = 0 + shapeData2[i].dim = 0; + } } + outData[i] = ComparisionTrait::Result(data1[i], static_cast(shapeData2[i].dim)); } - outData[i] = ComparisionTrait::Result(data1[i], static_cast(shapeData2[i].dim)); - } - } else if (!data1 && data2) { - // data 1 is shape and dat2 is constant - for (size_t i = 0; i < length; i++) { - if (shapeData1[i].isParam) { - if (shapeData1[i].dim == size_t(-1) || data2[i] > 0) { + } else if (!data1 && data2) { + // data 1 is shape and dat2 is constant + for (size_t i = 0; i < length; i++) { + if (shapeData1[i].isParam) { + if (shapeData1[i].dim == size_t(-1) || data2[i] > 0) { + fIsOutputConstant = false; + break; + } else { + // assume a comparison is done with .dim = 0 + shapeData1[i].dim = 0; + } + } + outData[i] = ComparisionTrait::Result(static_cast(shapeData1[i].dim), data2[i]); + } + } else if (!shapeData1.empty() && !shapeData2.empty()) { + // both data1 and data2 are shape tensors + for (size_t i = 0; i < length; i++) { + if (!shapeData1[i].isParam && !shapeData2[i].isParam) { + outData[i] = ComparisionTrait::Result(shapeData1[i].dim, shapeData2[i].dim); + } else if (shapeData1[i].isParam && shapeData2[i].isParam) { + if (shapeData1[i].param == shapeData2[i].param) + outData[i] = ComparisionTrait::Result(1, 1); // comparison of two equal value + else { + fIsOutputConstant = false; + break; + } + } else { fIsOutputConstant = false; break; - } else { - // assume a comparison is done with .dim = 0 - shapeData1[i].dim = 0; } } - outData[i] = ComparisionTrait::Result(static_cast(shapeData1[i].dim), data2[i]); } - } else if (!shapeData1.empty() && !shapeData2.empty() ) { - // both data1 and data2 are shape tensors - for (size_t i = 0; i < length; i++) { - if (!shapeData1[i].isParam && !shapeData2[i].isParam) { - outData[i] = ComparisionTrait::Result(shapeData1[i].dim, shapeData2[i].dim); - } - else if (shapeData1[i].isParam && shapeData2[i].isParam) { - if (shapeData1[i].param == shapeData2[i].param) - outData[i] = ComparisionTrait::Result(1,1); // comparison of two equal value - else 
{ - fIsOutputConstant = false; - break; + if (fIsOutputConstant) { + model.AddConstantTensor(fNY, fShapeY, outData); + if (model.Verbose()) + std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << " : " << ConvertValuesToString(length, outData) + << " (constant) " << std::endl; + } + } + delete[] outData; + // case of non constant output (no constant or shape tensors) + if (!fIsOutputConstant && !fShapeY.empty()) { + model.AddIntermediateTensor(fNY, ETensorType::BOOL, fShapeY); + fDimShapeY = ConvertShapeToDim(fShapeY); + if (model.Verbose()) + std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << std::endl; + } + } else { + // case of dynamic tensors + // case A or B have dynamic shapes. We need to broadcast if shape are not same + auto ret = UTILITY::MultidirectionalBroadcastShape(fDimShapeX1, fDimShapeX2); + fBroadcastFlag = ret.first; + fDimShapeY = ret.second; + // case of all parametric shapes and MultiDirectionalBroadcastShape return the max of the 2 + // need to do before we declare the output tensor shape and the broadcasted ones + if (ret.first & 4) { + // check if one of the parameter is an input dimension + // define function to find this + auto IsInputDimParam = [&](const std::string &p) { + auto inputNames = model.GetInputTensorNames(); + for (auto &input : inputNames) { + for (auto &i_s : model.GetDimTensorShape(input)) { + if (i_s.isParam && i_s.param == p) + return true; } } - else { - fIsOutputConstant = false; - break; + return false; + }; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + auto &s = fDimShapeY[i]; + if (s.isParam && s.param.find("std::max") != std::string::npos) { + if (IsInputDimParam(fDimShapeX1[i].param)) { + // case dim is 1 we indicate that the input parameter is equal to 1 + if (fDimShapeX1[i].dim != 1) + s = fDimShapeX1[i]; + else + s = fDimShapeX2[i]; + } else if (IsInputDimParam(fDimShapeX2[i].param)) { + if (fDimShapeX2[i].dim != 1) + s = fDimShapeX2[i]; + else + s = fDimShapeX1[i]; + } } } } - if (fIsOutputConstant) { - model.AddConstantTensor(fNY, fShapeY, outData); - if (model.Verbose()) - std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(length,outData) << " (constant) " << std::endl; + model.AddIntermediateTensor(fNY, ETensorType::BOOL, fDimShapeY); + if (model.Verbose()) { + std::cout << ComparisionTrait::Name() << " : " << fNX1 << " " << ConvertShapeToString(fDimShapeX1) << " , " + << fNX2 << " " << ConvertShapeToString(fDimShapeX2) << " --> " + << fNY << " " << ConvertShapeToString(fDimShapeY) << std::endl; + model.PrintIntermediateTensors(); } } - delete [] outData; - if (!fIsOutputConstant) { - model.AddIntermediateTensor(fNY, ETensorType::BOOL , fShapeY); - if (model.Verbose()) - std::cout << ComparisionTrait::Name() << " op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << std::endl; - } - - // check if this is not output operators to add a specific line for definining the tensor_xxx variable - const auto & outputTensorNames = model.GetOutputTensorNames(); - fIsModelOutput = false; - if (std::find(outputTensorNames.begin(), outputTensorNames.end(), fNY) != outputTensorNames.end()) - fIsModelOutput = true; } std::string Generate(std::string opName) override { if (fIsOutputConstant) return ""; opName = "op_" + opName; - if (fShapeY.empty()) { + if (fDimShapeY.empty()) { throw std::runtime_error("TMVA SOFIE Comparision Op called to Generate 
without being initialized first"); } std::stringstream out; out << SP << "\n//------ " << ComparisionTrait::Name() << " " << opName << " --> " << ConvertShapeToString(fShapeY) << "\n"; - size_t length = ConvertShapeToLength(fShapeY); - // Broadcast A if it's uninitialized - if (!fNBroadcastedX1.empty()) { - std::string type1 = ConvertTypeToString(fTensorType1); - out << SP << "// Broadcasting uninitialized tensor " << fNX1 << "\n"; - out << SP << "{\n"; - out << SP << SP << type1 << "* data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<" << type1 << ">(tensor_" << fNX1 << ", " << ConvertShapeToString(fShapeX1) << ", " << ConvertShapeToString(fShapeY) << ");\n"; - out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNBroadcastedX1 << ");\n"; - out << SP << SP << "delete[] data;\n"; - out << SP << "}\n"; + + // need to add check if tensors are compatible as in binary operator + + // use same code as Binary operator + auto stridesA = UTILITY::ComputeStrideFromShape(fDimShapeX1); + auto stridesB = UTILITY::ComputeStrideFromShape(fDimShapeX2); + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + + std::string compute_idx_X1, compute_idx_X2, compute_idx_Y; + if (fDimShapeX1.empty() || + std::all_of(fDimShapeX1.begin(), fDimShapeX1.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_X1 = "0"; + } else { + for (size_t i = 0; i < fDimShapeX1.size(); ++i) { + if (fDimShapeX1[i].dim == 1 || fDimShapeX1[i].GetVal() == "1") + continue; + compute_idx_X1 += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeX1.size())); + if (stridesA[i].GetVal() != "1") + compute_idx_X1 += " * " + stridesA[i].GetVal(); + compute_idx_X1 += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_X1.pop_back(); } - // Broadcast B if it's uninitialized - if (!fNBroadcastedX2.empty()) { - std::string type2 = ConvertTypeToString(fTensorType2); - out << SP << "// Broadcasting uninitialized tensor " << fNX2 << "\n"; - out << SP << "{\n"; - out << SP << SP << type2 << "* data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<" << type2 << ">(tensor_" << fNX2 << ", " << ConvertShapeToString(fShapeX2) << ", " << ConvertShapeToString(fShapeY) << ");\n"; - out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNBroadcastedX2 << ");\n"; - out << SP << SP << "delete[] data;\n"; - out << SP << "}\n"; + if (fDimShapeX2.empty() || + std::all_of(fDimShapeX2.begin(), fDimShapeX2.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_X2 = "0"; + } else { + for (size_t i = 0; i < fDimShapeX2.size(); ++i) { + if (fDimShapeX2[i].dim == 1 || fDimShapeX2[i].GetVal() == "1") + continue; + compute_idx_X2 += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeX2.size())); + if (stridesB[i].GetVal() != "1") + compute_idx_X2 += " * " + stridesB[i].GetVal(); + compute_idx_X2 += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_X2.pop_back(); } - const std::string& nameX1 = fNBroadcastedX1.empty()? fNX1 : fNBroadcastedX1; - const std::string& nameX2 = fNBroadcastedX2.empty()? 
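The index expressions generated here are driven by UTILITY::ComputeStrideFromShape. As a reference, the sketch below shows the usual row-major stride computation on a plain size_t shape; it is a simplified stand-in (the SOFIE utility works on Dim values and also handles parametric sizes).

#include <cstdio>
#include <vector>

// simplified stand-in for UTILITY::ComputeStrideFromShape, static shapes only
std::vector<size_t> ComputeStrides(const std::vector<size_t> &shape) {
   std::vector<size_t> strides(shape.size(), 1);
   for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i)
      strides[i] = strides[i + 1] * shape[i + 1];
   return strides;
}

int main() {
   auto s = ComputeStrides({2, 3, 4});
   std::printf("%zu %zu %zu\n", s[0], s[1], s[2]); // 12 4 1
   return 0;
}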
fNX2 : fNBroadcastedX2; - - out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "fTensor_" << fNY << "[id] = " << ComparisionTrait::Op( "tensor_" + nameX1 + "[id]" , "tensor_" + nameX2 + "[id]") << " ;\n"; - out << SP << "}\n"; - // since output is a boolean need to add the tensor_xxx variable since it is not defined as a pointer to a boolean std::vector - if (!fIsModelOutput) - out << SP << "const std::vector & tensor_" << fNY << " = fTensor_" << fNY << ";\n"; + int nloop = 0; + if (fDimShapeY.empty() || + std::all_of(fDimShapeY.begin(), fDimShapeY.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_Y = "0"; + } else { + for (size_t i = 0; i < fDimShapeY.size(); ++i) { + if (fDimShapeY[i].dim != 1 && fDimShapeY[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) out << SP; + out << "for (size_t idx_" << i << " = 0; idx_" << i << " < " << fDimShapeY[i] + << "; ++idx_" << i << "){\n"; + compute_idx_Y += "idx_" + std::to_string(i); + if (stridesY[i].GetVal() != "1") + compute_idx_Y += " * " + stridesY[i].GetVal(); + compute_idx_Y += " + "; + } + } + // remove last 3 characters " + " + for (int j = 0; j < 3; j++) + compute_idx_Y.pop_back(); + } + for (int j = 0; j < nloop + 1; j++) out << SP; + out << "tensor_" << fNY << "[" << compute_idx_Y << "] = " + << ComparisionTrait::Op( "tensor_" + fNX1 + "[" + compute_idx_X1 + "]" , + "tensor_" + fNX2 + "[" + compute_idx_X2 + "]") << " ;\n"; + + + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; + } + return out.str(); } diff --git a/tmva/sofie/inc/TMVA/ROperator_Concat.hxx b/tmva/sofie/inc/TMVA/ROperator_Concat.hxx index ad855341dfc17..d8155195c9f49 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Concat.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Concat.hxx @@ -123,7 +123,7 @@ concat_dim = inputs[i][iaxis]; else if (inputs[i][iaxis].isParam || concat_dim.isParam) { concat_dim = - Dim{ concat_dim.GetVal() + std::string("+ ") + inputs[i][iaxis].GetVal(), + Dim{ concat_dim.GetVal() + std::string(" + ") + inputs[i][iaxis].GetVal(), static_cast(-1)}; } else { concat_dim = Dim { concat_dim.dim + inputs[i][iaxis].dim }; @@ -156,7 +156,7 @@ } // output shape for concatenated axis - ret[fAxis] = Dim{concat_dim}; + ret[fAxis] = concat_dim; } // case of stacking (not supported yet) @@ -205,7 +205,7 @@ size_t inputLength = ConvertShapeToLength(inputShape); std::copy(inputData, inputData + inputLength, outputData.begin() + offset ); offset += inputLength; - // data do not need to be written as a weight + // data do not need to be written in teh generated code model.SetNotWritableInitializedTensor(input); } model.AddConstantTensor(fOutput, outputShape, outputData.data()); @@ -221,15 +221,18 @@ std::vector inputData; auto inputShape = model.GetTensorShape(input); // shape is not dynamic size_t inputLength = ConvertShapeToLength(inputShape); // shape can be a scalar - if (model.IsShapeTensor(input)) + if (model.IsShapeTensor(input)) { inputData = model.GetShapeTensorValues(input); - else if (model.IsConstantTensor(input)) { + } else if (model.IsInitializedTensor(input)) { inputData.resize(inputLength); auto intData = static_cast(model.GetInitializedTensorData(input).get()); for (size_t i = 0; i < inputData.size(); i++) inputData[i] = Dim{ static_cast(intData[i])}; } - std::cout << "concatenating input data " << inputLength << " " << inputData[0] << std::endl; + else { + // this should not happen + throw std::runtime_error("TMVA SOFIE Concat Operator- 
invalid input type for shape output type"); + } std::copy(inputData.begin(), inputData.end(), outputData.begin() + offset ); offset += inputLength; } @@ -251,13 +254,15 @@ } std::string Generate(std::string opName) override { - if (fIsOutputConstant) return ""; opName = "op_" + opName; + std::stringstream out; + out<<"\n//--------- Concat " << opName << " --> " << fOutput << " " << ConvertShapeToString(fOutputShape) << "\n"; + + if (fIsOutputConstant) return out.str(); + if(fOutputShape.empty()){ throw std::runtime_error("TMVA SOFIE Concat called to Generate without being initialized first"); } - std::stringstream out; - out<<"\n//--------- Concat " << opName << " --> " << ConvertShapeToString(fOutputShape) << "\n"; // special case when memory is contiguous bool hasShapeOnes = true; for(int i = 0; i0) - out << SP << SP << SP << "idxOut += " << fInputShapes[j-1][fAxis].GetVal() << ";\n"; + out << SP << SP << SP << "idxOut += " << inStrides[j-1][fAxis-1].GetVal() << ";\n"; out << SP << SP << SP << "int idxIn" << j <<" = "; for (int k = 0; k < fAxis; k++) { if (k > 0) out << " + "; out << inStrides[j][k].GetVal() << "*i" << k; } out << ";\n"; - out << SP << SP << SP << "for (size_t iC = 0; iC < " << fInputShapes[j][fAxis].GetVal() << "; ++iC) {\n"; + out << SP << SP << SP << "for (size_t iC = 0; iC < " << inStrides[j][fAxis-1].GetVal() << "; ++iC) {\n"; out << SP << SP << SP << SP << "tensor_" << fOutput << "[idxOut+iC] = tensor_" << fInputs[j] << "[idxIn" << j << "+iC];\n"; out << SP << SP << SP << "}\n"; // concatenate the axis values diff --git a/tmva/sofie/inc/TMVA/ROperator_Constant.hxx b/tmva/sofie/inc/TMVA/ROperator_Constant.hxx index 1cf5d13f5cd6f..7c824f1abe6e3 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Constant.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Constant.hxx @@ -123,11 +123,16 @@ public: if (model.Verbose()) { std::cout << "adding constant tensor " << fNY << " with shape " << ConvertShapeToString(fShape) << " and values ["; - for (auto v : fValues) std::cout << " " << v; - std::cout << "]" << std::endl; + if (!fIsConstantOfShape) { + for (auto v : fValues) std::cout << " " << v; + std::cout << "]" << std::endl; + } else { // for constant of shape is enough to print one value + std::cout << "... 
" << fValues[0] << " ....]" << std::endl; + } } } else { model.AddIntermediateTensor(fNY, ConvertStringToType(TensorType::Name()), fDimOutputShape); + fOutputTensorNames.emplace_back(fNY); } } @@ -136,9 +141,9 @@ public: std::stringstream out; if (fIsOutputConstant) { if (fNX.empty()) - out << "// ---- Constant (no-op) " << opName << " --> " << ConvertShapeToString(fDimOutputShape) << "\n"; + out << "// ---- Constant (no-op) " << opName << " --> " << fNY << " " << ConvertShapeToString(fDimOutputShape) << "\n"; else - out << "// ---- ConstantOfShape (no-op) " << opName << " --> " << ConvertShapeToString(fDimOutputShape) << "\n"; + out << "// ---- ConstantOfShape (no-op) " << opName << " --> " << fNY << " " << ConvertShapeToString(fDimOutputShape) << "\n"; return out.str(); } // Only ConstantOfShape might require generation code @@ -153,9 +158,7 @@ public: } auto length = ConvertDimShapeToLength(fDimOutputShape); // vector is already allocated- fill with values - out << SP << "if (" << length << " > fTensor_" << fNY << ".size())\n"; - out << SP << SP << "fTensor_" << fNY << ".resize(" << length << ");\n"; - out << SP << "std::fill(fTensor_" << fNY << ".begin(), fTensor_" << fNY << ".end(), " << fValues[0] << ");\n"; + out << SP << "std::fill(tensor_" << fNY << ", tensor_" << fNY << " + " << length << ", " << fValues[0] << ");\n"; return out.str(); } }; diff --git a/tmva/sofie/inc/TMVA/ROperator_Conv.hxx b/tmva/sofie/inc/TMVA/ROperator_Conv.hxx index 95f226ca91d4b..87d1ad0a0bf67 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Conv.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Conv.hxx @@ -20,6 +20,8 @@ template class ROperator_Conv final : public ROperator { private: + bool fBroadcastBias = false; + std::string fAttrAutopad; std::vector fAttrDilations; size_t fAttrGroup; @@ -30,7 +32,6 @@ private: std::string fNX; std::string fNW; std::string fNB; - std::string fNB2; // bias tensor name after broadcasting std::string fNY; std::string convK; @@ -262,6 +263,9 @@ public: std::runtime_error("TMVA SOFIE Conv op Input Tensor " + fNB + " is not found in model"); } fShapeB = model.GetTensorShape(fNB); + if (fShapeB.size() != 1) + throw + std::runtime_error("TMVA SOFIE Conv op : invalid shape for Bias tensor (is not 1D)"); std::vector targetShape(fShapeY.begin() + 1, fShapeY.end()); auto shapeDimB = model.GetDimTensorShape(fNB); bool broadcast_needed = !UTILITY::AreSameShape(shapeDimB, targetShape); @@ -278,7 +282,9 @@ public: if (fType != "float") throw std::runtime_error("TMVA SOFIE Conv op: Broadcasting for non-float type tensors is not supported"); // here is the actual broadcasting + fBroadcastBias = true; if (!fUseSession) { + // do here broadcasting std::vector shape(fDim + 1, 1); shape[0] = fShapeB[0]; auto intTargetShape = ConvertShapeToInt(targetShape); @@ -287,26 +293,28 @@ public: std::default_delete()); model.UpdateInitializedTensor(fNB, model.GetTensorType(fNB), intTargetShape, new_data_ptr); fShapeB = model.GetTensorShape(fNB); - fNB2 = fNB; // use same name - } - else { - // In case of session add broadcasting code in Session constructor and in GenerateInitCode - // we need to add a new intermediate tensor for broadcasted bias tensor - fNB2 = fNB + "bcast"; - model.AddIntermediateTensor(fNB2, model.GetTensorType(fNB), targetShape); } } } - // output channel size can be parametric + // output channel size can be parametric and is an expression std::vector outputDims = std::vector(fShapeY.begin()+2, fShapeY.end()); - auto outputChannelSize = ConvertDimShapeToLength(outputDims); // size/channel = D 
* H * W + //check if shape is not parametric + std::vector outputInts = ConvertShapeToInt(outputDims); + Dim channelDim; + if (outputInts.empty()) { + auto outputChannelSize = ConvertDimShapeToLength(outputDims); // size/channel = D * H * W + channelDim = Dim{ outputChannelSize, static_cast(-1)}; + } else { + size_t outputChannelSize = ConvertShapeToLength(outputInts); + channelDim = Dim{ outputChannelSize }; + } size_t kernelSize = fAttrKernelShape[0]; for (size_t i = 1; i < fDim; i++) { kernelSize *= fAttrKernelShape[i]; } std::vector shape1 = {fShapeW[0], fShapeW[1], kernelSize}; - std::vector shape2 = {Dim{fShapeW[1]}, Dim{kernelSize}, Dim{outputChannelSize}}; + std::vector shape2 = {Dim{fShapeW[1]}, Dim{kernelSize}, channelDim }; model.AddIntermediateTensor(fNX +"_f", ConvertStringToType(fType), shape1 ); model.AddIntermediateTensor(fNX +"_xcol", ConvertStringToType(fType), shape2 ); convK = fNX +"_f"; @@ -325,15 +333,25 @@ public: std::string GenerateInitCode() override { std::stringstream out; // Generate initialization code for broadcasting of bias tensor - if (!fNB2.empty()) { + if (fBroadcastBias) { // include a separate scope to avoid defining unique operator temp variables std::vector shape(fDim + 1, 1); + // bias (is a 1D tensor) shape[0] = fShapeB[0]; std::vector targetShape(fShapeY.begin() + 1, fShapeY.end()); - out << SP << "{\n"; + out << "//--- broadcast bias tensor " << fNB << "for Conv op if needed \n"; + // in case of dynamic tensors check needs to be done at run time + bool isOutDynamic = ConvertShapeToInt(targetShape).empty(); + auto length = ConvertDimShapeToLength(targetShape); + if (isOutDynamic) + out << SP << "if (" << length << " > " << ConvertShapeToLength(shape) << ") {\n"; + else + out << SP << "{\n"; out << SP << SP << "float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" << fNB << ", " << ConvertShapeToString(shape) << ", " << ConvertShapeToString(fShapeY) << ");\n"; - out << SP << SP << "std::copy(data, data + " << ConvertDimShapeToLength(targetShape) << ", tensor_" << fNB2 << ");\n"; + out << SP << SP << "fTensor_" << fNB << ".resize(" << length << ");\n"; + out << SP << SP << "std::copy(data, data + " << length << ", fTensor_" << fNB << ".begin());\n"; + out << SP << SP << "tensor_" << fNB << " = fTensor_" << fNB << ".data();\n"; out << SP << SP << "delete[] data;\n"; out << SP << "}\n"; } @@ -553,13 +571,13 @@ public: out << SP << SP << "}\n"; // end of group loop } - if (fNB2 != "") { + if (fNB != "") { out << SP << "int " << OpName << "_size = " << outputBatchStride << ";\n"; out << SP << "float " << OpName << "_gamma = 1.0;\n"; out << SP << "int " << OpName << "_incx = 1;\n"; out << SP << "int " << OpName << "_incy = 1;\n"; - out << SP << "BLAS::saxpy_(&" << OpName << "_size, &" << OpName << "_gamma, tensor_" << fNB2 << ", &" + out << SP << "BLAS::saxpy_(&" << OpName << "_size, &" << OpName << "_gamma, tensor_" << fNB << ", &" << OpName << "_incx, tensor_" << fNY << " + out_offset, &" << OpName << "_incy);\n"; } diff --git a/tmva/sofie/inc/TMVA/ROperator_Gather.hxx b/tmva/sofie/inc/TMVA/ROperator_Gather.hxx index 81411b8ebf71a..0d50c0747c028 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Gather.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Gather.hxx @@ -72,8 +72,6 @@ public: // empty shape Indices is a scalar value for the indices size_t indicesLength = ConvertShapeToLength(model.GetTensorShape(fNIndices)); int64_t* indicesData = static_cast(model.GetInitializedTensorData(fNIndices).get()); - //flag index tensor as not writable 
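The Conv init code above now broadcasts the bias in place instead of writing into a separate broadcast tensor: the owning fTensor_<name> vector is resized, the broadcast data copied in, and the raw tensor_<name> pointer refreshed. A simplified illustration of that pattern with made-up sizes (2 channels, spatial size 3); the real code uses UTILITY::UnidirectionalBroadcast and runs inside the generated Session constructor.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
   std::vector<float> fTensor_bias = {1.f, 2.f};     // per-channel bias (2 channels)
   const float *tensor_bias = fTensor_bias.data();   // raw pointer used by inference code
   const size_t channels = 2, spatial = 3;
   std::vector<float> broadcast(channels * spatial);
   for (size_t c = 0; c < channels; ++c)             // repeat each channel value over space
      for (size_t s = 0; s < spatial; ++s)
         broadcast[c * spatial + s] = fTensor_bias[c];
   fTensor_bias.resize(channels * spatial);          // grow the owning vector ...
   std::copy(broadcast.begin(), broadcast.end(), fTensor_bias.begin());
   tensor_bias = fTensor_bias.data();                // ... and refresh the (possibly moved) pointer
   for (float v : fTensor_bias) std::printf("%g ", v); // 1 1 1 2 2 2
   std::printf("\n");
   return 0;
}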
(not sure this is needed since index tensor might be used in generated code) - model.SetNotWritableInitializedTensor(fNIndices); // update indices data in case of negative dim values for (size_t i = 0; i < indicesLength; i++) { // move this at generation time? @@ -153,13 +151,14 @@ public: } std::string Generate(std::string opName) override { + opName = "op_" + opName; + std::stringstream out; + out << "//--------- Gather " << opName << " --> " << fNY << " " << ConvertShapeToString(fShapeY) << "\n"; if (fIsOutputConstant) { // no code to generate here for constant output. Tensor output is defined in Session constructor - return "//---------------------------------------\n"; + out << "//--------------------(constant)----------\n"; + return out.str(); } - opName = "op_" + opName; - std::stringstream out; - out << "//--------- Gather " << opName << " --> " << ConvertShapeToString(fShapeY) << "\n"; // The shape of the output is q + r - 1 size_t r = fShapeX.size(); // Indices of shape q diff --git a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx index d954720396151..a18914b8892a8 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Gemm.hxx @@ -24,6 +24,7 @@ namespace SOFIE{ private: bool fIsDynamic = false; + bool fBroadcastBias = false; float fAttrAlpha = 1.0; float fAttrBeta = 1.0; @@ -33,7 +34,6 @@ namespace SOFIE{ std::string fNA; std::string fNB; std::string fNC = ""; - std::string fNC2; // bias tensor name after broadcasting std::string fNY; std::string fType; EActivationType fActivation; @@ -107,6 +107,7 @@ namespace SOFIE{ if (input[0].size() > 2 && input[1].size() == input[0].size()) { // in case of dim > 2 first dimensions are equal to the input ones not // equal to 1 (e.g. (1,2,3) * (2,3,4) -> (2,2,4)) + // here could probably use the Broadcasting function UTILITY::MultidirectionalBroadcastShape for (size_t i = 0; i < input[0].size()-2; i++) { Dim valueA = input[0][i]; Dim valueB = input[1][i]; @@ -207,13 +208,7 @@ namespace SOFIE{ } fShapeY = DynamicShapeInference({fShapeA, fShapeB}); - std::vector shapeY; - if (!fIsDynamic) { - shapeY = ConvertShapeToInt(fShapeY); - if (shapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Gemm Op " + fNY + " has invalid shape" + ConvertShapeToString(fShapeY)); - } - } + std::vector shapeY = ConvertShapeToInt(fShapeY); // bias is normally not dynamic (not support it for time being) if (fNC != ""){ @@ -222,38 +217,33 @@ namespace SOFIE{ throw std::runtime_error("TMVA SOFIE Gemm Op Input Tensor" + fNC + " is dynamic and is not supported"); } fShapeC = model.GetTensorShape(fNC); - fNC2 = fNC; size_t lengthC = ConvertShapeToLength(fShapeC); size_t lengthY = ConvertShapeToLength(shapeY); - // for dynamic outputs broadcasting is always done - bool broadcast_needed = lengthC != lengthY; + // for dynamic outputs broadcasting is always needed + bool broadcast_needed = false; + if (fIsDynamic && shapeY.empty()) + broadcast_needed = true; + else + broadcast_needed = lengthC != lengthY; if (broadcast_needed) { - if (!model.UseSession()) { - // without session dynamic tensors not supported in Gemm - if (fIsDynamic) { - throw std::runtime_error("TMVA SOFIE Gemm Op: dynamic tensors not supported without a session"); - } - auto original_data = model.GetInitializedTensorData(fNC); - auto targetShape = UTILITY::UnidirectionalBroadcastShape(fShapeC, shapeY); - if (fType == "float") { - std::shared_ptr new_data_ptr(UTILITY::UnidirectionalBroadcast( - static_cast(original_data.get()), fShapeC, 
targetShape), - std::default_delete()); - - model.UpdateInitializedTensor(fNC, model.GetTensorType(fNC), shapeY, new_data_ptr); - fShapeC = shapeY; - } - } else { - // In case of session add broadcasting code in Session constructor and in GenerateInitCode - // we need to add a new intermediate tensor for broadcasted bias tensor - fNC2 = fNC + "bcast"; - if (!fIsDynamic) { - model.AddIntermediateTensor(fNC2, model.GetTensorType(fNC), shapeY); - } - else - model.AddDynamicTensor(fNC2,model.GetTensorType(fNC), fShapeY); + fBroadcastBias = true; + // check if broadcasting is compatible and note that prepend 1 to shapeC + auto shapeDimC = ConvertShapeToDim(fShapeC); + auto r = UTILITY::MultidirectionalBroadcastShape(fShapeY, shapeDimC); + // return flag must be equal to 1 since this is a unidirectional broadcast of C->Y + if (r.first > 1) { + throw std::runtime_error("TMVA SOFIE Gemm Op - bias tensor of shape " + ConvertShapeToString(fShapeC) + " cannot be uni-directional broadcasted to " + ConvertDimShapeToString(fShapeY)); + } + fShapeC = ConvertShapeToInt(shapeDimC); + if (fShapeC.empty()) { + throw std::runtime_error("TMVA SOFIE Gemm Op - Error in bias tensor " + ConvertDimShapeToString(shapeDimC) ); + } + } else { + // for the case lengthY == lengthC but shape is different (e.g. Y is (2,3) and is (6)) + if (shapeY != fShapeC) { + throw std::runtime_error("TMVA SOFIE Gemm Op: invalid shape for bias tensor " + ConvertShapeToString(fShapeC)); } } } @@ -291,21 +281,31 @@ namespace SOFIE{ std::string GenerateInitCode() override { std::stringstream out; // generate initialization code for broadcasting of bias tensor - if (fShapeC.size() != fShapeY.size() && fNC != fNC2) { +#if 0 + if (fShapeC.size() != fShapeY.size() && fBroadcastBias) { // we broadcast here always C in Y output, so target shape is the one of Y // no need to call UTILITY::UnidirectionalBroadcastShape. // here in case of parametric shape we need to assume that the parameters will be defined in the initialization code. 
- auto targetShape = fShapeY; - // include a separate scope to avoid defining unique operator temp variables - out << "//--- broadcast bias tensor " << fNC << "for Gemm op\n"; - out << SP << "{\n"; - out << " float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" - << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) << ");\n"; auto length = ConvertDimShapeToLength(fShapeY); // output size - out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNC2 << ");\n"; + // include a separate scope to avoid defining unique operator temp variables + out << "//--- broadcast bias tensor " << fNC << "for Gemm op if needed \n"; + // in case of dynamic tensors check needs to be done at run time + bool isOutDynamic = ConvertShapeToInt(fShapeY).empty(); + if (isOutDynamic) + out << SP << "if (" << length << " > " << ConvertShapeToLength(fShapeC) << ") {\n"; + else + out << SP << "{\n"; + // here we broadcast + out << SP << SP << "float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" + << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) << ");\n"; + + out << SP << SP << "fTensor_" << fNC << ".resize(" << length << ");\n"; + out << SP << SP << "std::copy(data, data + " << length << ", fTensor_" << fNC << ".begin());\n"; + out << SP << SP << "tensor_" << fNC << " = fTensor_" << fNC << ".data();\n"; out << SP << SP << "delete [] data;\n"; out << SP << "}\n"; } +#endif return out.str(); } @@ -316,7 +316,8 @@ namespace SOFIE{ throw std::runtime_error("TMVA SOFIE Gemm Op called to Generate without being initialized first"); } std::stringstream out; - out << "\n//--------- Gemm\n"; + out << "\n//--------- Gemm " << opName << " " << ConvertShapeToString(fShapeA) << " * " << ConvertShapeToString(fShapeB) + << " -> " << ConvertShapeToString(fShapeY) << "\n"; // need to consider case A and B have dim > 2 (for MatMul) int64_t dimA = fShapeA.size(); int64_t dimB = fShapeB.size(); @@ -327,18 +328,20 @@ namespace SOFIE{ auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); auto k = (fAttrTransA ? 
fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + // size of A: if (trasposeA) is m*k else k*m + // size of B n*k std::vector sY = {fShapeY[dimY-2], fShapeY[dimY-1]}; // extra dimensions in case of stacked MatMul - std::vector sA; + std::vector sExtraY; for (int64_t i = 0; i < dimY-2; i++) { - sA.push_back(fShapeY[i]); + sExtraY.push_back(fShapeY[i]); } auto lengthGemm = ConvertDimShapeToLength(sY); // size of the Gemm operation - auto lengthExtra = ConvertDimShapeToLength(sA); // extra length in case input tensors are of dim>2 (MatMul) + auto lengthExtra_Y = ConvertDimShapeToLength(sExtraY); // extra length in case input tensors are of dim>2 (MatMul) // case bias is present if (!fNC.empty()){ - if (fNC2 == fNC) { + if (!fBroadcastBias) { // add a check in case broadcasting was not needed or done outside of session // C should have smaller dimension of Y if (!fIsDynamic) { @@ -347,7 +350,7 @@ namespace SOFIE{ + ConvertShapeToString(fShapeC) + " output length " + lengthGemm); } else { // add a dynamic check (C should not be a dynamic tensor) - out << SP << "assert(" << lengthGemm << " != " << ConvertShapeToLength(fShapeC) << ");\n"; + out << SP << "assert(" << lengthGemm << " == " << ConvertShapeToLength(fShapeC) << ");\n"; } } } else { @@ -360,31 +363,83 @@ namespace SOFIE{ // include MatMul case where we stack the Gemm operations // exclude case where we have only 1's in the additional dims - bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra) > 1); + bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra_Y) > 1); + // compute input offset for stack multiplications + std::string lengthExtra_A; + std::string lengthExtra_B; + std::string increment_A; + std::string increment_B; + + if (doStackMul) { + std::vector sA(fShapeA.begin(), fShapeA.begin()+dimA-2); + std::vector sB(fShapeB.begin(), fShapeB.begin()+dimB-2); + std::vector mA = {fShapeA[dimA-2], fShapeA[dimA-1]}; + std::vector mB = {fShapeA[dimB-2], fShapeB[dimB-1]}; + lengthExtra_A = ConvertDimShapeToLength(sA); + lengthExtra_B = ConvertDimShapeToLength(sB); + // size of A performing matmul is m*k and n*k for B + increment_A = ConvertDimShapeToLength(mA); + increment_B = ConvertDimShapeToLength(mB); + } + bool extraA = (doStackMul && lengthExtra_A != "1"); + bool extraB = (doStackMul && lengthExtra_B != "1"); if (doStackMul) { - out << SP << "size_t " << opName << "_yoffset = 0;\n"; // needed if we stack the gemm operations - out << SP << "for (int i = 0; i < " << lengthExtra << "; i++){\n"; + out << SP << "size_t " << opName << "_y_offset = 0;\n"; // needed if we stack the gemm operations + if (extraA) + out << SP << "size_t " << opName << "_A_offset = 0;\n"; + if (extraB) + out << SP << "size_t " << opName << "_B_offset = 0;\n"; + out << SP << "for (size_t i = 0; i < " << lengthExtra_Y << "; i++){\n"; out << SP; } + // do the bias broadcasting + if (fBroadcastBias) { + out << SP << "for (size_t j = 0; j < " << sY[0] << "; j++) { \n"; + out << SP << SP << "size_t y_index = "; + if (doStackMul) // add offset in caseof stack multiplications (not sure if bias is present in these cases) + out << opName << "_y_offset + "; + if (sY[1].GetVal() != "1") + out << sY[1] << " * j;\n"; + else + out << "j;\n"; + + out << SP << SP << "for (size_t k = 0; k < " << sY[1] << "; k++) { \n"; + std::string bias_index; + if (fShapeC[0] == 1 && fShapeC[1] == sY[1].dim) + bias_index = "k"; + else if (fShapeC[1] == 1 && fShapeC[0] == sY[0].dim) + bias_index = "j"; + else if (fShapeC[0] == 1 && fShapeC[1] == 1) // scalar 
case + bias_index = "0"; + else { + throw std::runtime_error("TMVA SOFIE Gemm Op - invalid shape for bias tensor " + ConvertShapeToString(fShapeC)); + } + + out << SP << SP << SP << "tensor_" << fNY << "[y_index + k] = " << "tensor_" << fNC << "[" << bias_index << "];\n"; + out << SP << SP << "}\n"; + out << SP << "}\n"; + } if (fType == "float"){ out << SP << "TMVA::Experimental::SOFIE::Gemm_Call(" << "tensor_" << fNY; - if (doStackMul) out << " + " << opName << "_yoffset"; + if (doStackMul) out << " + " << opName << "_y_offset"; out << ", " << (fAttrTransB ? "true, " : "false, ") << (fAttrTransA ? "true, " : "false, ") << n << ", " << m << ", " << k << ", "; - out << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ","; - out << "tensor_" << fNB << ", " << "tensor_" << fNA << ", "; - out << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ","; - // in the case of bias - if (!fNC.empty()) - out << "tensor_" << fNC2; - else + out << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ", tensor_" << fNB; + if (extraB) out << " + " << opName << "_B_offset"; + out << ", tensor_" << fNA; + if (extraA) out << " + " << opName << "_A_offset"; + out << ", " << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ","; + // in the case of bias and no broadcasting needed + if (!fNC.empty() && !fBroadcastBias) + out << "tensor_" << fNC; + else out << "nullptr"; - out << ");\n"; + out << ");\n"; if(fActivation == EActivationType::RELU){ out << SP << "for (int id = 0; id < " << ConvertDimShapeToLength(fShapeY) << " ; id++){\n"; @@ -394,7 +449,12 @@ namespace SOFIE{ } if (doStackMul) { - out << SP << SP << opName << "_yoffset += " << lengthGemm << ";\n"; + out << SP << SP << opName << "_y_offset += " << lengthGemm << ";\n"; + if (lengthExtra_A != "1") + out << SP << SP << opName << "_A_offset += " << increment_A << ";\n"; + if (lengthExtra_B != "1") + out << SP << SP << opName << "_B_offset += " << increment_B << ";\n"; + out << "}\n"; // end of loop on the stacked multiplications } diff --git a/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx b/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx index 239c5332172b0..f98ce201d400d 100644 --- a/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx @@ -14,6 +14,7 @@ namespace SOFIE { template class ROperator_LayerNormalization : public ROperator { private: + bool fCastToFloat = false; // flag to indicate if operation 1 are in floats (to be impl) int fAttrAxis; float fAttrEpsilon; size_t fAttrStashType; @@ -31,7 +32,7 @@ private: std::vector fShapeX; std::vector fShapeScale; - std::vector fShapeB; // shape of input Bias (B) is assumed to be fully defined + std::vector fShapeB; std::vector fShapeY; std::vector fShapeMean; std::vector fShapeInvStdDev; @@ -40,8 +41,8 @@ private: size_t fSize; // Size of the input // size_t fAxisDim; - std::vector fNormalizedShape; - std::vector fAxesShape; + std::vector fNormalizedShape; // shape from X[ axis,...,N-1] + std::vector fAxesShape; // shape from X[0,..,axis-1] // lengths in string format std::string fLength; // Length of the input std::string fNormalizedLength; @@ -79,7 +80,7 @@ public: void Initialize(RModel& model) override { if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found."); + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Tensor " + fNX + " not found."); } bool isDynamic = 
model.IsDynamicTensor(fNX); fShapeX = model.GetDimTensorShape(fNX); @@ -104,8 +105,7 @@ public: // Type of mean and std ETensorType type = (fAttrStashType == 1) ? ETensorType::FLOAT : model.GetTensorType(fNX); // Mean - if (fNMean.empty()) { - fNMean = "Mean" + fNX; + if (!fNMean.empty()) { // cannot use initializer list with one element since it is ambiguous if (isDynamic) // add size_t(-1) to indicate that shape is an expression @@ -114,29 +114,60 @@ public: model.AddIntermediateTensor(fNMean, type, std::vector(1,std::stoi(fAxesLength))); } // Inverse Standard Deviation - if (fNInvStdDev.empty()) { - fNInvStdDev = "InvStdDev" + fNX; + if (!fNInvStdDev.empty()) { if (isDynamic) model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,Dim{fAxesLength,std::size_t(-1)})); else model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,std::stoi(fAxesLength))); } + // if the mean and stdev names are empty they are not requested in the output list and no intermediate tensors are added for them // Cast X to float if (fAttrStashType == 1 && model.GetTensorType(fNX) != ETensorType::FLOAT) { - fNCastedX = "Casted" + fNX; - model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX); - fNNormalizedX = "Normalized" + fNX; - model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX); + fCastToFloat = true; + fType = "float"; + // fNCastedX = "Casted" + fNX; + // model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX); + // fNNormalizedX = "Normalized" + fNX; + // model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX); + } + // scale shape + fShapeScale = model.GetDimTensorShape(fNScale); + // prepend 1's to the scale shape if dimensions are missing + size_t dimScale = fShapeScale.size(); + if (dimScale < fSize) { + for (size_t i = 0; i < fSize-dimScale; i++) + fShapeScale.insert(fShapeScale.begin(), Dim{1}); + } + // check also that the shape is now consistent + for (size_t i = 0; i < fSize; i++) { + if (fShapeScale[i].dim != 1 && fShapeScale[i] != fShapeX[i]) + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Scale Tensor has invalid shape " + ConvertDimShapeToString(fShapeScale)); } - // Broadcast the bias if (!fNB.empty()) { - fShapeB = model.GetTensorShape(fNB); - size_t lengthB = ConvertShapeToLength(fShapeB); - if (isDynamic || lengthB < static_cast(std::stoi(fLength))) { - fNBroadcastedB = "Broadcasted" + fNB; - model.AddIntermediateTensor(fNBroadcastedB, ConvertStringToType(fType), fShapeX); + fShapeB = model.GetDimTensorShape(fNB); + // prepend 1's to the bias shape if dimensions are missing + size_t dimB = fShapeB.size(); + if (dimB < fShapeX.size()) { + for (size_t i = 0; i < fSize-dimB; i++) + fShapeB.insert(fShapeB.begin(), Dim{1}); + } + for (size_t i = 0; i < fSize; i++) { + if (fShapeB[i].dim != 1 && fShapeB[i] != fShapeX[i]) + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Bias Tensor has invalid shape " + ConvertDimShapeToString(fShapeB)); } } + + if (model.Verbose()) + std::cout << "LayerNormalization - bias and scale shapes : " << ConvertDimShapeToString(fShapeB) << " " << ConvertDimShapeToString(fShapeScale) << std::endl; + + // // Broadcast the bias + // if (!fNB.empty()) { + // fShapeB = model.GetTensorShape(fNB); + // size_t lengthB = ConvertShapeToLength(fShapeB); + // if (isDynamic || lengthB < static_cast(std::stoi(fLength))) { + // fNBroadcastedB = "Broadcasted" + fNB; + // model.AddIntermediateTensor(fNBroadcastedB, ConvertStringToType(fType), fShapeX); + // } + // } model.AddNeededStdLib("cmath"); } @@ -162,10 +193,6 @@ public: throw std::runtime_error("TMVA::SOFIE LayerNormalization operator " + opName + " called to generate without being
initialized first."); } - if (fShapeX.size() > 5) { - throw std::runtime_error("TMVA::SOFIE LayerNormalization operator not " - "implemented for input tensor of size > 5."); - } std::stringstream out; @@ -179,10 +206,32 @@ public: } auto strides = UTILITY::ComputeStrideFromShape(fShapeX); - std::string InputIndex = "axis_0 * " + strides[0].GetVal(); + std::string inputIndex = "axis_0 * " + strides[0].GetVal(); for (size_t i = 1; i < fSize; i++) { - InputIndex += " + axis_" + std::to_string(i) + " * " + strides[i].GetVal(); + inputIndex += " + axis_" + std::to_string(i); + if (i < fSize-1) inputIndex += " * " + strides[i].GetVal(); } + auto scaleStrides = UTILITY::ComputeStrideFromShape(fShapeScale); + std::string scaleIndex; + for (size_t i = 0; i < fSize; i++) { + if (fShapeScale[i].dim != 1) { + if (!scaleIndex.empty()) scaleIndex += " + "; + scaleIndex += "axis_" + std::to_string(i); + if ( scaleStrides[i].dim != 1) scaleIndex += " * " + scaleStrides[i].GetVal(); + } + } + if (scaleIndex.empty()) scaleIndex = "0"; + + auto biasStrides = UTILITY::ComputeStrideFromShape(fShapeB); + std::string biasIndex; + for (size_t i = 0; i < fSize; i++) { + if (fShapeB[i].dim != 1) { + if (!biasIndex.empty()) biasIndex += " + "; + biasIndex += "axis_" + std::to_string(i); + if ( biasStrides[i].dim != 1) biasIndex += " * " + biasStrides[i].GetVal(); + } + } + if (biasIndex.empty()) biasIndex = "0"; auto axesStrides = UTILITY::ComputeStrideFromShape(fAxesShape); std::string axesIndex = "axis_" + std::to_string(0) + " * " + axesStrides[0].GetVal(); @@ -190,51 +239,42 @@ public: axesIndex += " + axis_" + std::to_string(i) + " * " + axesStrides[i].GetVal(); } - auto normalizedStrides = UTILITY::ComputeStrideFromShape(fNormalizedShape); - std::string normalizedIndex = "axis_" + std::to_string(fAxis) + " * " + normalizedStrides[0].GetVal(); - for (size_t i = fAxis + 1; i < fSize; i++) { - normalizedIndex += " + axis_" + std::to_string(i) + " * " + normalizedStrides[i - fAxis].GetVal(); - } - if (!fNCastedX.empty()) { - // Cast X to float - out << SP << "for (size_t i = 0; i < " << fLength << "; i++) {\n"; - out << SP << SP << "tensor_" << fNCastedX << "[i] = " << "static_cast(tensor_" << fNX; - out << "[i]);\n"; - out << SP << "}\n"; - } + // compute mean and std-dev. 
Save in tensors if requested out << SP << "// Compute the mean\n"; - // Loop over the normalized dimensions + // Loop over all the dims in [0, fAxis) for (size_t i = 0; i < fAxis; i++) { std::string iIdx = "axis_" + std::to_string(i); out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] << "; " << iIdx << "++) {\n"; } - out << SP << SP << fType << " sum = 0.;\n"; - // loop over all the dims in [0, fAxis) + out << SP << SP << fType << " mean = 0.;\n"; + // loop over the normalized dimensions (fAxis,....,N-1) for (size_t j = fAxis; j < fSize; j++) { std::string jIdx = "axis_" + std::to_string(j); out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx << "++) {\n"; } - out << SP << SP << SP << "sum += tensor_" << fNX << "[" << InputIndex << "];\n"; + out << SP << SP << SP << "mean += tensor_" << fNX << "[" << inputIndex << "];\n"; for (size_t j = fAxis; j < fSize; j++) { out << SP << SP << "}\n"; } - out << SP << SP << "tensor_" << fNMean << "[" << axesIndex << "] = sum / " << fType << "("; - out << fNormalizedLength << ");\n"; - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } + out << SP << SP << "mean /= " << fType << "(" << fNormalizedLength << ");\n"; + + // for (size_t i = fAxis; i < fSize; i++) { + // out << SP << "}\n"; + // } + // tensor_" << fNMean << "[" << axesIndex << "] out << SP << "// Compute the inverse Standard Deviation\n"; // Loop over the normalized dimensions - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } + // for (size_t i = 0; i < fAxis; i++) { + // std::string iIdx = "axis_" + std::to_string(i); + // out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] + // << "; " << iIdx << "++){\n"; + // } + // Set sum = 0 out << SP << SP << fType << " sum = 0.;\n"; // loop over all the dims in [0, fAxis) @@ -243,92 +283,63 @@ public: out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx << "++){\n"; } - out << SP << SP << SP << "float tmp = tensor_" << fNX << "[" << InputIndex << "] - tensor_" - << fNMean << "[" << axesIndex << "];\n"; + out << SP << SP << SP << "float tmp = tensor_" << fNX << "[" << inputIndex << "] - mean;\n"; out << SP << SP << SP << "sum += tmp*tmp;\n"; for (size_t j = fAxis; j < fSize; j++) { out << SP << SP << "}\n"; } - out << SP << SP << "tensor_" << fNInvStdDev << "[" << axesIndex << "] = 1 / std::sqrt("; + out << SP << SP << fType << " invStdDev = 1 / std::sqrt("; out << "sum / " << fType << "(" << fNormalizedLength << ") + " << fAttrEpsilon << ");\n"; - for (size_t i = 0; i < fAxis; i++) { - out << SP << "}\n"; - } - if (!fNCastedX.empty()) { - out << "// NormalizedX = InvStdDev * (CastedX - Mean)\n"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNNormalizedX << "[" << InputIndex << "] = tensor_"; - out << fNInvStdDev << "[" << axesIndex << "] * (tensor_" << fNCastedX << "[" << InputIndex; - out << "] - 
tensor_" << fNMean << "[" << axesIndex << "])\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - out << "// Y = Scale o NormalizedX"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale; - out << "[" << axesIndex << "] * static_cast<" << fType << ">(tensor_" << fNCastedX << "[" << InputIndex; - out << "]);\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - } else { - out << SP << "// Y = Scale o InvStdDev (X - Mean)\n"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale; - out << "[" << normalizedIndex << "] * tensor_" << fNInvStdDev << "[" << axesIndex; - out << "] * (tensor_" << fNX << "[" << InputIndex << "] - tensor_" << fNMean << "["; - out << axesIndex << "]);\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } + // for (size_t i = 0; i < fAxis; i++) { + // out << SP << "}\n"; + // } + + // set output mean and invStdDev if requested + if (!fNMean.empty()) + out << SP << SP << "tensor_" << fNMean << "[" << axesIndex << "] = mean;\n"; + if (!fNInvStdDev.empty()) + out << SP << SP << "tensor_" << fNInvStdDev << "[" << axesIndex << "] = invStdDev;\n"; + + // scale and add bias + + out << SP << "// Y = Scale o InvStdDev (X - Mean)\n"; + // for (size_t i = 0; i < fAxis; i++) { + // std::string iIdx = "axis_" + std::to_string(i); + // out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] + // << "; " << iIdx << "++){\n"; + // } + + for (size_t j = fAxis; j < fSize; j++) { + std::string jIdx = "axis_" + std::to_string(j); + out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx + << "++){\n"; } + out << SP << SP << SP << "tensor_" << fNY << "[" << inputIndex << "] = tensor_" << fNScale; + out << "[" << scaleIndex << "] * invStdDev * (tensor_" << fNX << "[" << inputIndex << "] - mean)"; - if (!fNB.empty()) { - std::string bias = "tensor_" + (fNBroadcastedB.empty() ? 
fNB : fNBroadcastedB); - out << SP << "// Add the bias to Y\n"; - out << SP << "int " << opName << "_n = " << fLength << ";\n"; - out << SP << "float " << opName << "_alpha = 1.;\n"; - out << SP << "int " << opName << "_inc = 1;\n"; - out << SP << "BLAS::saxpy_(&" << opName << "_n, &" << opName << "_alpha, " << bias << ", &"; - out << opName << "_inc, " << "tensor_" << fNY << ", &" << opName << "_inc);\n"; + // add bias if needed + if (!fNB.empty()) + // assume bias has index as scale + out << " + tensor_" << fNB << "[" << biasIndex << "]"; + out << ";\n"; + + for (size_t j = fAxis; j < fSize; j++) { + out << SP << SP << "}\n"; } + for (size_t i = fAxis; i < fSize; i++) { + out << SP << "}\n"; + } + + // if (!fNB.empty()) { + // std::string bias = "tensor_" + (fNBroadcastedB.empty() ? fNB : fNBroadcastedB); + // out << SP << "// Add the bias to Y\n"; + // out << SP << "int " << opName << "_n = " << fLength << ";\n"; + // out << SP << "float " << opName << "_alpha = 1.;\n"; + // out << SP << "int " << opName << "_inc = 1;\n"; + // out << SP << "BLAS::saxpy_(&" << opName << "_n, &" << opName << "_alpha, " << bias << ", &"; + // out << opName << "_inc, " << "tensor_" << fNY << ", &" << opName << "_inc);\n"; + // } return out.str(); } diff --git a/tmva/sofie/inc/TMVA/ROperator_Range.hxx b/tmva/sofie/inc/TMVA/ROperator_Range.hxx index 9cac15a14fc52..b91e45dd6d84b 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Range.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Range.hxx @@ -37,15 +37,10 @@ public: } static_assert( (std::is_same_v || std::is_same_v), "TMVA::SOFIE - Unsupported type by Range operator"); - } - - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - auto ret = input; //suggest copy to compiler - return ret; + { + fInputTensorNames = { fNStart, fNLimit, fNDelta }; + fOutputTensorNames = { fNOutput }; + } } void Initialize(RModel& model) override { @@ -63,32 +58,89 @@ public: std::runtime_error("TMVA SOFIE Range Op Input Tensor " + fNDelta + "is not found in model"); } ETensorType type = ConvertStringToType(fType); - if (model.IsInitializedTensor(fNStart) && model.IsInitializedTensor(fNDelta) && model.IsInitializedTensor(fNLimit)) { - T * start = static_cast(model.GetInitializedTensorData(fNStart).get()); - T * limit = static_cast(model.GetInitializedTensorData(fNLimit).get()); - T * delta = static_cast(model.GetInitializedTensorData(fNDelta).get()); - if (!start || !delta || !limit) - std::runtime_error("TMVA SOFIE Range Op Input Tensor has invalid input data"); - T a = *start; - T b = *limit; - T d = *delta; - int number_of_elements = std::max( static_cast(std::ceil( (b - a) / d )) , 0. 
); + + + + auto analyzeInput = [&](const std::string & tName, T & value, Dim & dim) { + int ftype = 0; // type of input (0 intermediate, 1 constant , 2 shape) + if (model.IsInitializedTensor(tName)) { + T * data = static_cast(model.GetInitializedTensorData(tName).get()); + if (!data) + throw std::runtime_error("TMVA SOFIE Range Op Input Tensor has invalid input data"); + value = *data; + ftype = 1; + } else if (model.IsShapeTensor(tName)) { + auto data = model.GetShapeTensorValues(tName); + dim = data[0]; + if (!dim.isParam) { + value = static_cast(dim.dim); + ftype = 1; + } else + ftype = 2; + } + return ftype; + }; + + T start_value; + T limit_value; + T delta_value; + Dim start_dim; + Dim limit_dim; + Dim delta_dim; + int res1 = analyzeInput(fNStart, start_value, start_dim); + int res2 = analyzeInput(fNLimit, limit_value, limit_dim); + int res3 = analyzeInput(fNDelta, delta_value, delta_dim); + if (res1 == 0 || res2 == 0 || res3 == 0) { + // cannot know at compile time - need to do it fully at run time + // + fShape = {Dim{"range_size_" + fNStart + "_" + fNLimit}}; + model.AddDynamicTensor(fNOutput, type, fShape); + } else if (res1 == 1 && res2 == 1 && res3 == 1) { + size_t number_of_elements = std::max(static_cast(std::ceil((limit_value - start_value) / delta_value )) , 0 ); + fIsOutputConstant = true; + + // compute output std::vector output(number_of_elements); - for (int i=0; i shape = {static_cast(number_of_elements)}; + std::vector shape = {number_of_elements}; model.AddConstantTensor(fNOutput,shape, output.data()); - fIsOutputConstant = true; - // set the input tensor not writable - model.SetNotWritableInitializedTensor(fNStart); - model.SetNotWritableInitializedTensor(fNDelta); - model.SetNotWritableInitializedTensor(fNLimit); - } - else { - fShape = {Dim{"range_size"}}; - model.AddDynamicTensor(fNOutput, type, fShape); + fShape = ConvertShapeToDim(shape); + + } else { // case of a shape tensor + std::string start = (res1 == 1) ? std::to_string(start_value) : start_dim.GetVal(); + std::string limit = (res2 == 1) ? std::to_string(limit_value) : limit_dim.GetVal(); + std::string delta = (res3 == 1) ?
std::to_string(delta_value) : delta_dim.GetVal(); + std::stringstream s; + if (type == ETensorType::FLOAT ) { + if (res3 == 1 && delta_value == 1) + s << "std::max(std::ceil("<< limit << " - " << start << "),0.0f)"; + else + s << "std::max(std::ceil(("<< limit << " - " << start << ")/" << delta << "),0.0f)"; + } else if (type == ETensorType::INT64 ) { + if (delta == "1") { + if (start == "0") + s << limit; + else + s << "std::max((" << limit << " - " << start << "),0L)"; + } else { + if (start == "0") + s << "((" << limit << ")/" << delta << ")"; + else + s << "std::max(((" << limit << " - " << start << ")/" << delta << "),0L)"; + } + } else { + throw + std::runtime_error("TMVA SOFIE Range Op Input Tensor " + ConvertTypeToString(type) + " is not supported"); + } + + + fShape = { Dim {s.str(), static_cast(-1)} }; + model.AddDynamicTensor(fNOutput,type, fShape); } + + if (model.Verbose()) { std::cout << "Range -> output is " << fNOutput << " : " << ConvertShapeToString(fShape); if (fIsOutputConstant) std::cout << " : " << ConvertValuesToString(model.GetTensorData(fNOutput)); @@ -96,27 +148,31 @@ } } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override { std::stringstream out; - out << "\n//------ Range\n"; + out << "\n//------ Range " << opName << "---> " << ConvertDimShapeToString(fShape) << "\n"; if (fIsOutputConstant) return out.str(); - OpName = "op_" + OpName; + opName = "op_" + opName; if (fShape.empty()) { throw std::runtime_error("TMVA SOFIE Range operator called to Generate without being initialized first"); } - std::string sizeName = fShape[0].param; - out << SP << "size_t " << sizeName << " = static_cast(std::max(std::ceil((static_cast(*tensor_" << fNLimit << ") - static_cast(*tensor_" << fNStart << ")) / static_cast(*tensor_" << fNDelta << ")), 0.0f));\n"; - out << SP << "if (" << sizeName << " > " << "fTensor_" << fNOutput << ".size() ){\n"; - out << SP << SP << "fTensor_" << fNOutput << ".resize(" << sizeName << ");\n"; - // need to re-initialized pointer to tensor data - out << SP << SP << "tensor_" << fNOutput << " = fTensor_" << fNOutput << ".data();\n"; - out << SP << "}\n"; - out << SP << "for (size_t i = 0; i < " << sizeName << "; i++) {\n"; - out << SP << SP << "fTensor_" << fNOutput << "[i] = *tensor_" << fNStart << " + i * (*tensor_" << fNDelta << ");\n"; + std::string outputSizeVar; + std::string outputSize = fShape[0].param; + if (outputSize.find("range_size") != std::string::npos) { + outputSizeVar = outputSize; + outputSize = "static_cast(std::max(std::ceil((static_cast(*tensor_" + fNLimit + + ") - static_cast(*tensor_" + fNStart + ")) / static_cast(*tensor_" + fNDelta + ")), 0.0f))"; + } else { + outputSizeVar = "range_" + opName; + } + out << SP << "size_t " <<
reducedLength << " = (" << inputLength << ") / (" << outputLength << ");\n"; } else { int rLength = std::stoi(inputLength) / std::stoi(outputLength); reducedLength = std::to_string(rLength); diff --git a/tmva/sofie/inc/TMVA/ROperator_Reshape.hxx b/tmva/sofie/inc/TMVA/ROperator_Reshape.hxx index 2634b68dbc875..a3ed28c4860bc 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Reshape.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Reshape.hxx @@ -108,6 +108,9 @@ public: if (IsInteger(tmp_length) && IsInteger(input_length)) output_shape[i] = Dim{static_cast(std::stoi(input_length) / std::stoi(tmp_length))}; + else if (IsInteger(tmp_length) && std::stoi(tmp_length) == 1) { + output_shape[i] = Dim{input_length, static_cast(-1)}; + } else { //we can try simplifying expression if tmp_length is integer and part of input_length // contains tmp_length @@ -243,7 +246,7 @@ public: // check if optional tensor exists defining shape or axes if (!fNInput2.empty()) { if (model.CheckIfTensorAlreadyExist(fNInput2)) { - if (model.IsConstantTensor(fNInput2) || model.IsInitializedTensor(fNInput2)) { + if (model.IsInitializedTensor(fNInput2)) { // assume input shape is an initialized tensor auto dptr = model.GetInitializedTensorData(fNInput2); auto values = static_cast(dptr.get()); @@ -260,6 +263,9 @@ public: fShapeOutput = ShapeInference({fShapeInput})[0]; // set flag to not write tensor in weight file. Its data will be hard-coded in way model is constructed model.SetNotWritableInitializedTensor(fNInput2); + } else if (model.IsShapeTensor(fNInput2)) { + auto shapeData = model.GetShapeTensorValues(fNInput2); + fShapeOutput = shapeData; } else { // we cannot get shape at initialization time but at run-time fDynamicShape = true; diff --git a/tmva/sofie/inc/TMVA/ROperator_ScatterElements.hxx b/tmva/sofie/inc/TMVA/ROperator_ScatterElements.hxx index 626debd13038e..2525ea32629df 100644 --- a/tmva/sofie/inc/TMVA/ROperator_ScatterElements.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_ScatterElements.hxx @@ -136,6 +136,17 @@ public: return strst.str(); }; + auto tensorIndexOpt = [](const std::vector & sdx, const std::vector & idx) { + std::stringstream strst; + int dims = idx.size(); + for (int i = 0; i < dims-1; i++) { + strst << sdx[i]; + strst << " + "; + } + strst << idx[dims-1]; + return strst.str(); + }; + // copy first input in output (maybe can be avoided??) 
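    // Illustrative note, assuming a rank-3 index tensor with loop variables i0, i1, i2 as generated
    // below: tensorIndexOpt(sdx, idx) emits the flattened index "s0 + s1 + i2", where s0 = strideI[0]*i0
    // and s1 = strideI[1]*i1 are hoisted to their respective loop levels, instead of re-multiplying every
    // stride inside the innermost loop as the generic tensorIndex() lambda above does.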
out << SP << "std::copy(tensor_" << fNX << ", tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n"; @@ -143,14 +154,24 @@ public: // loop on tensor rank int dims = fShapeY.size(); std::vector idx(dims); + std::vector sdx(dims); // stride for indices for (int i = 0; i < dims; i++) { idx[i] = std::string("i") + std::to_string(i); + sdx[i] = std::string("s") + std::to_string(i); for (int j = 0; j <= i; j++) out << SP; out << "for (int " << idx[i] << " = 0; " << idx[i] << " < " << fShapeI[i] << "; " << idx[i] << "++) {\n"; + if (i < dims-1) { + for (int j = 0; j <= i+1 ; j++) out << SP; + if (strideI[i].GetVal() != "1") + out << "int "<< sdx[i] << " = " << strideI[i] << " * " << idx[i] << ";\n"; + else + out << "int "<< sdx[i] << " = " << idx[i] << ";\n"; + } } // correct index for specific axis for (int j = 0; j <= dims; j++) out << SP; - out << "int updateIndex = " << tensorIndex(strideI,idx) << ";\n"; + // can use optimised formula for indices since the loop above is on fShapeI + out << "int updateIndex = " << tensorIndexOpt(sdx,idx) << ";\n"; for (int j = 0; j <= dims; j++) out << SP; out << "int iAxis = tensor_" << fNI << "[updateIndex];\n"; for (int j = 0; j <= dims; j++) out << SP; diff --git a/tmva/sofie/inc/TMVA/ROperator_Slice.hxx b/tmva/sofie/inc/TMVA/ROperator_Slice.hxx index b23e3b0a86d21..4e3c1319bd772 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Slice.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Slice.hxx @@ -25,6 +25,7 @@ private: bool fIsStartUndef = false; bool fIsEndUndef = false; bool fIsStepUndef = false; + bool fIdentitySlice = false; std::string fNData; // input data tensor name std::string fNOutput; // output data name std::vector fNames; // tensor names for meta(axis) information @@ -235,6 +236,8 @@ public: if (iend < 0) { std::string send = std::string("(") + fShapeInput[fAxes[i]].param + "-" + std::to_string(-iend) +")"; fEnd[fAxes[i]] = Dim{send,size_t(-1)}; + } else if (iend == std::numeric_limits::max()){ + fEnd[fAxes[i]] = fShapeInput[fAxes[i]]; } else { fEnd[fAxes[i]] = Dim{size_t(iend)}; } @@ -330,27 +333,58 @@ public: } } else { + // check if Slice is just an Identity operator in case start = 0, end = input_shape and step=1 + size_t ndim = fShapeInput.size(); + fIdentitySlice = fShapeOutput.size() == ndim; + for (size_t idim = 0; idim < ndim; idim++) { + if (!fIdentitySlice) break; + fIdentitySlice &= (fStart[idim].GetVal() == "0"); + fIdentitySlice &= (fSteps[idim].GetVal() == "1"); + fIdentitySlice &= (fEnd[idim].GetVal() == fShapeOutput[idim].GetVal()); + } + model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput); + if (fIdentitySlice) model.AddAliasTensor(fNOutput, fNData); + if (model.Verbose()) { - std::cout << "Slice ---> " << fNOutput << " " << ConvertShapeToString(fShapeOutput) << std::endl; + std::cout << "Slice " << fNData << " " << ConvertShapeToString(fShapeInput) + << "---> " << fNOutput << " " << ConvertShapeToString(fShapeOutput); + if (fIdentitySlice) std::cout << " (using alias tensor since slice is an identity) "; + std::cout << std::endl; + } } } - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; //no op for constant tensors + std::string Generate(std::string opName) override { - OpName = "op_" + OpName; if (fShapeInput.empty() || fShapeOutput.empty()){ throw std::runtime_error("TMVA SOFIE Slice Op called to Generate without being initialized first"); } std::stringstream out; - //std::string opName = "Slice"; - out << SP << "///------- Slice operator\n" << std::endl; - 
// loop on the dimensions depending no the orders + out << "///------- Slice operator " << opName << "---> " << fNOutput << " " + << ConvertDimShapeToString(fShapeOutput) << "\n" << std::endl; + if (fIsOutputConstant) return out.str(); //no op for constant tensors + size_t ndim = fShapeInput.size(); + // check if Slice is just an Identity operator in case start = 0, end = input_shape and step=1 + bool identitySlice = fShapeInput.size() == fShapeOutput.size(); + for (size_t idim = 0; idim < ndim; idim++) { + if (!identitySlice) break; + identitySlice &= (fStart[idim].GetVal() == "0"); + identitySlice &= (fSteps[idim].GetVal() == "1"); + identitySlice &= (fEnd[idim].GetVal() == fShapeOutput[idim].GetVal()); + } + + if (identitySlice) { + out << "/// Slice is just an identity (copy pointers) \n"; + out << SP << "tensor_" << fNOutput << " = tensor_" << fNData << ";\n"; + return out.str(); + } + + // loop on the dimensions depending no the orders auto strides = UTILITY::ComputeStrideFromShape(fShapeInput); diff --git a/tmva/sofie/inc/TMVA/ROperator_Tile.hxx b/tmva/sofie/inc/TMVA/ROperator_Tile.hxx index 1086f72eae71c..9b291b40e0854 100644 --- a/tmva/sofie/inc/TMVA/ROperator_Tile.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_Tile.hxx @@ -20,8 +20,8 @@ private: std::string fNRepeats; std::string fNInput; std::string fNY; - std::vectorfShapeInput; - std::vector fShapeY; + std::vectorfShapeInput; + std::vector fShapeY; public: ROperator_Tile(){} @@ -35,13 +35,18 @@ public: return input; } - std::vector> ShapeInference(std::vector> input) override { - std::vector ret = input[0]; - - for(size_t i=0; i < input[1].size(); i++) { - ret[i]=ret[i]*input[1][i]; + std::vector DoShapeInference(const std::vector & input, const std::vector repeat) { + std::vector ret = input; + for(size_t i=0; i < repeat.size(); i++) { + if (repeat[i] != 1) { + if (ret[i].isParam) { + ret[i] = Dim{ std::string(ret[i].GetVal() + "*" + std::to_string(repeat[i])), static_cast(-1) }; + } else { + ret[i]=Dim { ret[i].dim *repeat[i] }; + } + } } - return {ret}; + return ret; } void Initialize(RModel& model) override { @@ -52,7 +57,7 @@ public: if (model.CheckIfTensorAlreadyExist(fNRepeats) == false){ throw std::runtime_error("TMVA SOFIE Tile Op Input Tensor is not found in model"); } - fShapeInput=model.GetTensorShape(fNInput); + fShapeInput=model.GetDimTensorShape(fNInput); // if repeats vector is not initialized we cannot deduce shape of output // not support for time being this case @@ -79,12 +84,12 @@ public: std::copy(repeats_data, repeats_data + num_elements, repeats_vector.begin()); - fShapeY = ShapeInference({fShapeInput,repeats_vector})[0]; + fShapeY = DoShapeInference(fShapeInput,repeats_vector); model.AddIntermediateTensor(fNY, model.GetTensorType(fNInput), fShapeY); if (model.Verbose()) - std::cout << "Tile: " << fNInput << " " << ConvertShapeToString(fShapeInput) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) + std::cout << "Tile: " << fNInput << " " << ConvertDimShapeToString(fShapeInput) << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) << " given repeats " << ConvertShapeToString(repeats_vector) << std::endl; } @@ -103,9 +108,9 @@ public: std::string output = "tensor_" + fNY; out << "///-------- Tile operator\n"; out << "{\n"; // add scope to re-use same names - out << "const int input_shape[" << fShapeInput.size() << "] = " << ConvertShapeToString(fShapeInput) << ";\n"; + out << "const size_t input_shape[" << fShapeInput.size() << "] = " << ConvertDimShapeToString(fShapeInput) << 
";\n"; - out << "int inputLength = " << ConvertShapeToLength(fShapeInput) << ";\n"; + out << "int inputLength = " << ConvertDimShapeToLength(fShapeInput) << ";\n"; out << "int s = 1;\n"; // loop from inverse dim order out << "for (int i = " << fShapeInput.size()-1 << "; i >=0; i--) {\n"; diff --git a/tmva/sofie/inc/TMVA/ROperator_TopK.hxx b/tmva/sofie/inc/TMVA/ROperator_TopK.hxx index 0869437bb6b0c..edee91de8eb57 100644 --- a/tmva/sofie/inc/TMVA/ROperator_TopK.hxx +++ b/tmva/sofie/inc/TMVA/ROperator_TopK.hxx @@ -19,13 +19,13 @@ private: int fAttrLargest; int fAttrSorted; - size_t fK; + Dim fK; std::string fNK; std::string fNX; std::string fNVal; std::string fNInd; - std::vector fShapeX; - std::vector fShapeY; + std::vector fShapeX; + std::vector fShapeY; std::string fType; public: @@ -43,23 +43,10 @@ public: } std::vector TypeInference(std::vector input) override { - ETensorType ret = input[0]; - return {ret, ret}; - } - - std::vector> ShapeInference(std::vector> input) override { - if (input.size() != 2) { - throw std::runtime_error("TMVA SOFIE TopK Op Shape Inference needs exactly 2 input tensors"); - } - - auto shape = input[0]; // Shape format: [ m x n x o x p ... ] - - // set the dimension at the specified axis to k (fAttrAxis is checked before that is in the correct range - shape[fAttrAxis] = fK; // Modified shape: [ m x n x k x p ... ] - return {shape, shape}; + ETensorType ret = input[0]; + return {ret, ret}; } - void Initialize(RModel& model) override { if (model.CheckIfTensorAlreadyExist(fNX) == false) { // input must be a graph input, or already initialized intermediate tensor @@ -70,10 +57,10 @@ public: throw std::runtime_error("TMVA SOFIE TopK Op Input Tensor i.e. K is not found in model"); } - fShapeX = model.GetTensorShape(fNX); + fShapeX = model.GetDimTensorShape(fNX); auto fShapeK = model.GetTensorShape(fNK); auto kptr = static_cast(model.GetInitializedTensorData(fNK).get()); - fK = *kptr; + size_t kval = *kptr; model.SetNotWritableInitializedTensor(fNK); fAttrAxis = fAttrAxis < 0 ? fShapeX.size() + fAttrAxis : fAttrAxis; if(static_cast(fAttrAxis) >= fShapeX.size()){ @@ -81,14 +68,25 @@ public: std::runtime_error("TMVA::SOFIE ONNX TopK op axis = "+ std::to_string(fAttrAxis) +" value exeeds size of tensor " +fNX+" of size "+fShapeX.size()+" ."); } // fK cannot be larger that axis dimension - fK = std::min(fK, fShapeX[fAttrAxis]); + if (fShapeX[fAttrAxis].isParam) + fK = Dim{std::string("std::min(size_t(" + std::to_string(kval) + "), " + fShapeX[fAttrAxis].GetVal() + ")" ), static_cast(-1) }; + else + fK = Dim { std::min(kval, fShapeX[fAttrAxis].dim) }; + + // output shape is equal to input shape apart for value in fAttrAxis + fShapeY = fShapeX; + fShapeY[fAttrAxis] = Dim{fK}; - fShapeY = ShapeInference({fShapeX, fShapeK})[0]; model.AddIntermediateTensor(fNVal, model.GetTensorType(fNX), fShapeY); // output indices should be an int64 tensor model.AddIntermediateTensor(fNInd, ETensorType::INT64, fShapeY); fType = ConvertTypeToString(model.GetTensorType(fNX)); + + if (model.Verbose()) { + std::cout << "TopK " << fNX << " " << ConvertShapeToString(fShapeX) + << "---> " << fNVal << " " << ConvertShapeToString(fShapeY) << std::endl; + } } std::string Generate(std::string OpName) override { @@ -101,19 +99,20 @@ public: size_t axis = fAttrAxis < 0 ? 
size + fAttrAxis : fAttrAxis; out << "\n" << SP << "//------ TopK\n"; - size_t length=ConvertShapeToLength(fShapeX); + auto length=ConvertDimShapeToLength(fShapeX); auto strideX = UTILITY::ComputeStrideFromShape(fShapeX); auto strideY = UTILITY::ComputeStrideFromShape(fShapeY); // we perform loop on dimension before sorted axis and after sorted axis - size_t n_before = (axis>0) ? length/strideX[axis-1] : 1; - size_t n_after = strideX[axis]; - size_t n_elements = fShapeX[axis]; // number of elements to be sorted + std::vector shape_before(fShapeX.begin(), fShapeX.begin() + axis); // input shape before axis + std::string n_before = (axis>0) ? ConvertDimShapeToLength(shape_before) : "1"; + std::string n_after = strideX[axis].GetVal(); + std::string n_elements = fShapeX[axis].GetVal(); // number of elements to be sorted // } out << SP << "{\n"; // to define a separate scope for the operator code out << SP << "std::vector> elements(" << n_elements << ");\n"; // loop on elements before - if (n_before > 1) { + if (n_before != "1") { out << SP << "for (size_t i = 0; i < " << n_before << "; i++) {\n"; out << SP << SP << "size_t xoffset = i*" << strideX[axis-1] << ";\n"; out << SP << SP << "size_t yoffset = i*" << strideY[axis-1] << ";\n"; @@ -122,7 +121,7 @@ public: out << SP << "size_t xoffset = 0;\n"; out << SP << "size_t yoffset = 0;\n"; } - if (n_after > 1) + if (n_after != "1") out << SP << "for (size_t j = 0; j < " << n_after << "; j++) {\n"; else out << SP << "const size_t j = 0;\n"; @@ -149,8 +148,8 @@ public: out << SP << SP << SP << "tensor_" << fNVal << "[yoffset + " << strideY[axis] << "*l + j] = elements[l].first;\n"; out << SP << SP << SP << "tensor_" << fNInd << "[yoffset + " << strideY[axis] << "*l + j] = elements[l].second;\n"; out << SP << SP << "}\n"; - if (n_after > 1) out << SP << SP << "}\n"; - if (n_before> 1) out << SP << "}\n"; + if (n_after != "1") out << SP << SP << "}\n"; + if (n_before != "1") out << SP << "}\n"; out << SP << "}\n"; // end operator scope return out.str(); } diff --git a/tmva/sofie/inc/TMVA/SOFIE_common.hxx b/tmva/sofie/inc/TMVA/SOFIE_common.hxx index 2dae4f7d03ce7..68a74d08fd93a 100644 --- a/tmva/sofie/inc/TMVA/SOFIE_common.hxx +++ b/tmva/sofie/inc/TMVA/SOFIE_common.hxx @@ -252,8 +252,14 @@ public: bool IsConstantTensor() const { return fConstant;} // query if tensor needs to be written in a weight file. Constant tensors are not written in a file bool IsWeightTensor() const { return !fConstant && !fIsNotWritable;} + // check if a Tensor is Writable (need to be written in the file or in the generated code (e.g. as a constant tensor) + // if an initialized tensors is used in a constant operator at compile time does not need to be written and can be omitted in + // the generated code + bool IsNotWritable() const { return fIsNotWritable; } // set not writable initialized tensors - i.e. 
tensor that must not be written in a file void SetNotWritable() { fIsNotWritable = true;} + // set as constant (needed for non-float initialized tensors) + void SetConstant() { fConstant = true;} template T const *data() const @@ -805,6 +811,22 @@ void ReadTensorFromStream(std::istream &is, T &target, std::string const &expect } } + +// code for the memory greeding allocations +struct TensorLifeInfo { + int begin; // start time (op index) lifetime + int end; // end time lifetime + size_t size; // size of tensors in bytes +}; + +struct MemoryResult { + std::size_t total_bytes = 0; // total memory needed + std::vector offsets; // resulted offsets for each tensor +}; + +/// Greedy best-fit planner with coalescing free list. +MemoryResult OrganizeMemory(const std::vector & tensorsInfo ); + } // namespace SOFIE } // namespace Experimental } // namespace TMVA diff --git a/tmva/sofie/src/RFunction.cxx b/tmva/sofie/src/RFunction.cxx index a6df8dcb43e61..505d84187ca9a 100644 --- a/tmva/sofie/src/RFunction.cxx +++ b/tmva/sofie/src/RFunction.cxx @@ -26,7 +26,7 @@ RFunction_Update::RFunction_Update(FunctionTarget target, GraphType gType): fTar throw std::runtime_error("Invalid target for Update function"); } fType = FunctionType::UPDATE; - function_block = std::make_unique(fFuncName); + fFunction_block = std::make_unique(fFuncName); if(fGraphType == GraphType::GNN) { if(fTarget == FunctionTarget::EDGES) { @@ -49,25 +49,23 @@ RFunction_Update::RFunction_Update(FunctionTarget target, GraphType gType): fTar // add input tensors, order of provided shapes must be the same as in fInputTensors void RFunction_Update::AddInputTensors(const std::vector>& inputShapes) { for(long unsigned int i=0; iAddInputTensorInfo(fInputTensors[i],ETensorType::FLOAT, inputShapes[i]); - function_block->AddInputTensorName(fInputTensors[i]); + fFunction_block->AddInputTensorInfo(fInputTensors[i],ETensorType::FLOAT, inputShapes[i]); + fFunction_block->AddInputTensorName(fInputTensors[i]); } } void RFunction_Update::AddInputTensors(const std::vector>& inputShapes) { for(long unsigned int i=0; iAddInputTensorInfo(fInputTensors[i],ETensorType::FLOAT, inputShapes[i]); - function_block->AddInputTensorName(fInputTensors[i]); + fFunction_block->AddInputTensorInfo(fInputTensors[i],ETensorType::FLOAT, inputShapes[i]); + fFunction_block->AddInputTensorName(fInputTensors[i]); } } -std::string RFunction_Update::GenerateModel(const std::string& filename, long read_pos, long block_size) { - function_block->SetFilename(filename); +std::string RFunction_Update::GenerateModel(const std::string& filename, long read_pos, long block_size, bool verbose) { + fFunction_block->SetFilename(filename); // use batch size as block size in RModel::generate - function_block->PrintRequiredInputTensors(); - function_block->PrintDynamicTensors(); - function_block->Generate(Options::kGNNComponent,block_size,read_pos); + fFunction_block->Generate(Options::kGNNComponent,block_size,read_pos, verbose); std::string modelGenerationString; - modelGenerationString = "\n//--------- GNN_Update_Function---"+fFuncName+"\n"+function_block->ReturnGenerated(); + modelGenerationString = "\n//--------- GNN_Update_Function---"+fFuncName+"\n"+fFunction_block->ReturnGenerated(); return modelGenerationString; } diff --git a/tmva/sofie/src/RFunction_MLP.cxx b/tmva/sofie/src/RFunction_MLP.cxx index 32148cae36794..c41135de49902 100644 --- a/tmva/sofie/src/RFunction_MLP.cxx +++ b/tmva/sofie/src/RFunction_MLP.cxx @@ -20,9 +20,9 @@ RFunction_MLP::RFunction_MLP(FunctionTarget target, Int_t 
numLayers, Activation throw std::runtime_error("TMVA SOFIE GNN doesn't currently supports the provided activation function for " + fFuncName + " update."); } - function_block->AddOutputTensorNameList({fFuncName + "Relu" + std::to_string(fNumLayers)}); + fFunction_block->AddOutputTensorNameList({fFuncName + "Relu" + std::to_string(fNumLayers)}); } else { - function_block->AddOutputTensorNameList({fFuncName + "Gemm" + std::to_string(fNumLayers)}); + fFunction_block->AddOutputTensorNameList({fFuncName + "Gemm" + std::to_string(fNumLayers)}); } } @@ -32,7 +32,7 @@ void RFunction_MLP::Initialize() { if(fGraphType == GraphType::GNN) { std::unique_ptr op_concat; op_concat.reset(new ROperator_Concat(fInputTensors,1,0,fFuncName+"InputConcat")); - function_block->AddOperator(std::move(op_concat)); + fFunction_block->AddOperator(std::move(op_concat)); fGemmInput = fFuncName+"InputConcat"; } else if(fGraphType == GraphType::GraphIndependent) { @@ -43,24 +43,24 @@ void RFunction_MLP::Initialize() { for(int i=0; i(1.0,beta,0,0,fGemmInput,UTILITY::Clean_name(fKernelTensors[i]),UTILITY::Clean_name(fBiasTensors[i]),fFuncName+"Gemm"+std::to_string(i))); - function_block->AddOperator(std::move(op_gemm)); + fFunction_block->AddOperator(std::move(op_gemm)); fGemmInput = fFuncName+"Gemm"+i; if (fActivationFunction == Activation::RELU) { std::unique_ptr op_relu; op_relu.reset(new ROperator_Relu(fFuncName+"Gemm"+std::to_string(i), fFuncName+"Relu"+std::to_string(i))); - function_block->AddOperator(std::move(op_relu)); + fFunction_block->AddOperator(std::move(op_relu)); fGemmInput = fFuncName+"Relu"+i; } } double beta = (fBiasTensors.back().empty()) ? 0. : 1.; op_gemm.reset(new ROperator_Gemm(1.0,beta,0,0,fGemmInput,UTILITY::Clean_name(fKernelTensors.back()),UTILITY::Clean_name(fBiasTensors.back()),fFuncName+"Gemm"+std::to_string(fNumLayers))); - function_block->AddOperator(std::move(op_gemm)); + fFunction_block->AddOperator(std::move(op_gemm)); if(fActivateFinal) { if (fActivationFunction == Activation::RELU) { std::unique_ptr op_relu; op_relu.reset(new ROperator_Relu(fFuncName+"Gemm"+std::to_string(fNumLayers), fFuncName+"Relu"+std::to_string(fNumLayers))); - function_block->AddOperator(std::move(op_relu)); + fFunction_block->AddOperator(std::move(op_relu)); } } @@ -68,7 +68,7 @@ void RFunction_MLP::Initialize() { if(fAddlOp.size()) { for(auto &i:fAddlOp) { std::unique_ptr tmp(i); - function_block->AddOperator(std::move(tmp)); + fFunction_block->AddOperator(std::move(tmp)); } } } diff --git a/tmva/sofie/src/RModel.cxx b/tmva/sofie/src/RModel.cxx index 2fa6df3f04f8f..32da75fdc045b 100644 --- a/tmva/sofie/src/RModel.cxx +++ b/tmva/sofie/src/RModel.cxx @@ -9,6 +9,7 @@ #endif #include "TMVA/RModel.hxx" +#include "TMVA/RModelProfiler.hxx" #include "TMVA/SOFIE_common.hxx" namespace TMVA { @@ -164,19 +165,19 @@ void RModel::AddOperator(std::unique_ptr op, int order_execution) { fOperators.insert(fOperators.begin() + order_execution, std::move(op)); } else { fOperators.push_back(std::move(op)); + order_execution = fOperators.size()-1; } - // storing the last usage of tensors which are input to - // operators (but are not inputs to the model, i.e. they are intermediate - // tensors). This information is needed to keep a check on when a - // particular intermediate tensor can be flushed to free up memory for reuse. 
+ // storing the last usage of tensors which are input to the operator + // (excluding tensors which are inputs to the model or the initialized (weights) tensors) + // We call this function during parsing so we don't have yet initialized the operators for(size_t index = 0; index & s fShapeTensors[tensor_name] = std::make_pair(shape_values, scalar); } +void RModel::AddAliasTensor(const std::string & name, const std::string & origin){ + // add an alias tensor to origin + auto tensor_name = UTILITY::Clean_name(name); + auto origin_name = UTILITY::Clean_name(origin); + if (fAliasTensors.count(tensor_name) != 0) { + throw std::runtime_error("TMVA-SOFIE: alias tensor with name " + tensor_name + " already exists \n"); + } + fAliasTensors[tensor_name] = origin_name; +} + bool RModel::IsShapeTensor(const std::string & tensor_name) const { return fShapeTensors.count(tensor_name) != 0; } +bool RModel::IsAliasTensor(const std::string & tensor_name) const { + return fAliasTensors.count(tensor_name) != 0; +} + const std::vector & RModel::GetShapeTensorValues(const std::string & tensor_name) const { //if (!IsShapeTensor(tensor_name) ) return std::vector{}; return fShapeTensors.at(tensor_name).first; @@ -222,6 +237,7 @@ bool RModel::IsInitializedTensor(const std::string& tensorName) const { return fInitializedTensors.find(name) != fInitializedTensors.end(); } bool RModel::IsConstantTensor(const std::string& tensorName) const { + // a constant tensor is an initialized tensor but has the constant flag set std::string name = UTILITY::Clean_name(tensorName); auto itr = fInitializedTensors.find(name); if (itr == fInitializedTensors.end()) return false; @@ -355,6 +371,11 @@ std::string RModel::AllocateIntermediateMemory(std::span fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) continue; + // case of alias tensor + if (IsAliasTensor(name)) { + continue; + } + auto tensor_size = GetTypeSize(GetTensorType(name)) * ConvertShapeToLength(GetTensorShape(name)); // important fill the pair in the ordered output tensors with the string view and not the string TensorMemoryInfo tmi = {it, tensor_size}; @@ -434,9 +455,14 @@ void RModel::CheckAndFlushIntermediateMemory(std::span o chunk != fIntermediateMemoryInfo.available_stack.end(); chunk++) { if (fVerbose) std::cout << "-- free chunk " << chunk->first << " size = " << chunk->second << std::endl; } - for (auto &it : op_input_tensors) { + for (auto &iv : op_input_tensors) { // last occurrence of the tensor is reached => flush it from memory - if (fVerbose) std::cout << ".. input tensors : " << it; + if (fVerbose) std::cout << ".. 
input tensors : " << iv; + + // for alias tensors replace name with its alias + std::string it{iv}; // convert view to string + if (IsAliasTensor(it)) + it = fAliasTensors[it]; if (fIntermediateTensorFrequencyLookup[it] == op_idx) { if (fVerbose) std::cout << " flash condition is met - looping on chunks to find matching one \n"; for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); @@ -522,6 +548,7 @@ void RModel::Initialize(const std::map & inputParams, bool fIntermediateTensorInfos.clear(); fDynamicTensorInfos.clear(); + // loop on inputs and see if shape can be full specified // if the batch size is provided it can be used to specify the full shape // Add the full specified tensors in fReadyInputTensors collection @@ -576,19 +603,6 @@ void RModel::Initialize(const std::map & inputParams, bool PrintDynamicTensors(); } - // check if there are initialized tensors to write in a weight file - // support for the time being only weight of FLOAT type - if (fUseWeightFile) { - bool modelHasWeights = false; - for (auto &i : fInitializedTensors) { - if (i.second.type() == ETensorType::FLOAT) { - modelHasWeights = true; - break; - } - } - if (!modelHasWeights) - fUseWeightFile = false; - } // Go through model and initialize each operator int i = 0; @@ -602,16 +616,49 @@ void RModel::Initialize(const std::map & inputParams, bool fOperators[op_idx]->Initialize(*this); for(auto &it:fOperators[op_idx]->GetOpOutputTensors()){ std::string name = std::string{it}; + // check if tensor is not an initialized or output tensor and it is not already in the list if (fIntermediateTensorFrequencyLookup.find(it) == fIntermediateTensorFrequencyLookup.end() && std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end() && - fInitializedTensors.find(name) == fInitializedTensors.end() && - fDynamicTensorInfos.find(name) == fDynamicTensorInfos.end()){ + fInitializedTensors.find(name) == fInitializedTensors.end()) + { fIntermediateTensorFrequencyLookup[it] = op_idx; } } i++; } + // loop on initialized tensors and make the integers as constant to be + // not written in a weight file + for (auto &it : fInitializedTensors) { + if (it.second.IsWeightTensor() && it.second.type() != ETensorType::FLOAT) + it.second.SetConstant(); + } + + // check if there are initialized tensors to write in a weight file + // support for the time being only weight of FLOAT type + if (fUseWeightFile) { + bool modelHasWeights = false; + for (auto &it : fInitializedTensors) { + if (it.second.IsWeightTensor()) { + modelHasWeights = true; + break; + } + } + if (!modelHasWeights) + fUseWeightFile = false; + } + + // update fIntermediateTensorFrequencyLookup for alias tensors + for (auto & it : fAliasTensors) { + if (fIntermediateTensorFrequencyLookup.find(it.first) == fIntermediateTensorFrequencyLookup.end()) continue; + if (fIntermediateTensorFrequencyLookup.find(it.second) == fIntermediateTensorFrequencyLookup.end() ) + fIntermediateTensorFrequencyLookup[it.second] = fIntermediateTensorFrequencyLookup[it.first]; + else { + // take the largest one + fIntermediateTensorFrequencyLookup[it.second] = std::max(fIntermediateTensorFrequencyLookup[it.second],fIntermediateTensorFrequencyLookup[it.first] ); + } + } + fIsInitialized = true; } @@ -653,7 +700,8 @@ std::string GenerateConstantTensorCode(const std::pair 100) ? false : true; + // also for weights which can be broadcasted do not use stack but allocate as a std::vector + bool allocateOnStack = (length > 100 || t.second.IsWeightTensor()) ? 
false : true; const T *data = t.second.data(); @@ -676,7 +724,7 @@ std::string GenerateConstantTensorCode(const std::pair(i); fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 4; @@ -723,7 +773,8 @@ void RModel::GenerateIntermediateTensorInfo() { if (!fIntermediateTensorInfos.empty()) { std::string tensor_declaration_block = ""; for (auto &i : fIntermediateTensorInfos) { - if (i.second.type == ETensorType::BOOL) { + bool is_alias = (IsAliasTensor(i.first)); + if (i.second.type == ETensorType::BOOL && !is_alias) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; tensor_declaration_block += "std::uint8_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; continue; @@ -734,7 +785,7 @@ void RModel::GenerateIntermediateTensorInfo() { bool not_in_output_names = (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) == fOutputTensorNames.end()); - if ((not_in_freq_map && not_in_output_names) || (!not_in_freq_map && !is_extended && not_in_output_names)) { + if (((not_in_freq_map && not_in_output_names) || (!not_in_freq_map && !is_extended && not_in_output_names) ) && !is_alias) { size_t length = ConvertShapeToLength(i.second.shape); if (i.second.type == ETensorType::FLOAT) { @@ -753,6 +804,10 @@ void RModel::GenerateIntermediateTensorInfo() { fOtherTensorSize += 8 * length; } } + if (is_alias) { + tensor_declaration_block += ConvertTypeToString(i.second.type) + " * tensor_" + i.first + " = nullptr;\n"; + } + } if (tensor_declaration_block.length()) { @@ -763,17 +818,10 @@ void RModel::GenerateIntermediateTensorInfo() { if (!fDynamicTensorInfos.empty()) { fGC += "//--- declare the dynamic tensors\n"; for (auto &i : fDynamicTensorInfos) { - if (i.second.type == ETensorType::FLOAT) { - fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "float * tensor_" + i.first + " = nullptr;\n"; - } else if (i.second.type == ETensorType::DOUBLE) { - fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "double * tensor_" + i.first + " = nullptr;\n"; - } else if (i.second.type == ETensorType::INT64) { - fGC += "std::vector fTensor_" + i.first + ";\n"; - fGC += "int64_t * tensor_" + i.first + " = nullptr;\n"; - } + fGC += ConvertTypeToString(i.second.type) + " * tensor_" + i.first + " = nullptr;\n"; } + fGC += "//--- dynamic tensors pool\n"; + fGC += "std::vector fDynamicMemoryPool;\n"; } } @@ -791,14 +839,87 @@ void RModel::GenerateOperatorDeclarations() { void RModel::GenerateDynamicTensorInfo() { + // generate code for allocating dynamic tensors using the greedy memory allocations + if (fDynamicTensorInfos.empty()) + return; + + if (fVerbose) { + std::cout << "generating code for dynamic tensor management" << std::endl; + PrintDynamicTensors(); + } + std::stringstream out; + out << "// dynamic tensor memory management\n"; + out << SP << "std::vector dynamicTensorInfos;\n"; + out << SP << "dynamicTensorInfos.reserve(" << fDynamicTensorInfos.size() << ");\n"; + + // loop on all the operators to find begin/end life of the tensors + int op_index = 0; + std::vector> tensors; + tensors.reserve(fDynamicTensorInfos.size()); + for (auto & op : fOperators) { + // loop on output tensors - + for (auto &it : op->GetOpOutputTensors()) { + if (fVerbose) { + auto op_ptr = op.get(); + std::cout << "Looping on operator " << op_index << " " << typeid(*op_ptr).name() << std::endl; + } + // check if is a dynamic tensor and not an alias tensor + std::string name = std::string(it); + 
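                // Illustrative summary of the bookkeeping below, based on the declarations in SOFIE_common.hxx:
                // every dynamic tensor is recorded as a TensorLifeInfo {begin, end, size} interval, where begin is
                // the index of the producing operator, end is one past its last recorded use and size is in bytes.
                // OrganizeMemory() turns these intervals into a MemoryResult whose offsets let tensors with
                // non-overlapping lifetimes share the same region of fDynamicMemoryPool.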
if ( fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end() && !IsAliasTensor(name)) { + auto tensor_size = ConvertDimShapeToLength(GetDimTensorShape(name)); + auto type = GetTensorType(name); + size_t type_size = GetTypeSize(type); + int begin = op_index; + int end = fOperators.size(); + // look for end + auto it_lookup = fIntermediateTensorFrequencyLookup.find(name); + if (it_lookup != fIntermediateTensorFrequencyLookup.end()) + end = it_lookup->second + 1; // end is last time used + 1 + // // some tensors (like xcol in convolutions) are just used within the operators + // if (end == 0 && begin > 0) end = begin+1; + + if (begin> end) { + std::cout << "op " << op_index << "tensor_" << name << " begin " << begin << " " << " end " << end << std::endl; + throw std::runtime_error("TMVA-SOFIE: RModel::GenerateDynamicTensorInfo: tensor_" + name + " has end before begin"); + } + + // write in code + out << SP << "dynamicTensorInfos.push_back( {" << begin << ", " << end << ", " << type_size << "* (" << tensor_size << ") });" + << " // tensor_" << name << std::endl; + tensors.push_back({name,type}); + } + } + op_index++; // increment operator index + } + out << "\n" << SP << "auto memory_result = OrganizeMemory(dynamicTensorInfos);\n\n"; + out << "// allocating now the memory\n"; + out << SP << "fDynamicMemoryPool = std::vector(memory_result.total_bytes);\n"; + out << SP << "int idx = 0;\n"; + for (auto & it : tensors) { + out << SP << "tensor_" << it.first << " = reinterpret_cast<" << ConvertTypeToString(it.second) << " *>(fDynamicMemoryPool.data() + memory_result.offsets[idx++]);\n"; + } + // check that all dynamic tensors are covered + bool missingTensor = false; for (auto &i : fDynamicTensorInfos) { - auto length = ConvertDynamicShapeToLength(i.second.shape); - out << SP << "if (" << length << " > 0) {\n"; - out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; - out << SP << SP << "tensor_" << i.first << " = fTensor_" << i.first << ".data();\n"; - out << SP << "}\n"; + if (IsAliasTensor(i.first)) continue; + if (std::find(tensors.begin(), tensors.end(), std::pair{i.first, i.second.type}) == tensors.end()) { + std::cout << "Dynamic tensors " << i.first << " is not in list of operator input/output " << std::endl; + missingTensor = true; + } } + if (missingTensor) + throw std::runtime_error("TMVA-SOFIE: RModel::GenerateDynamicTensorInfo - some tensors are not in input/output list"); + + + + // for (auto &i : fDynamicTensorInfos) { + // auto length = ConvertDynamicShapeToLength(i.second.shape); + // out << SP << "if (" << length << " > 0) {\n"; + // out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; + // out << SP << SP << "tensor_" << i.first << " = fTensor_" << i.first << ".data();\n"; + // out << SP << "}\n"; + // } fGC += out.str(); } @@ -941,7 +1062,7 @@ void RModel::GenerateSessionCode() CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx); } - // to check remaining unused fragments after memory allocation (lesser the better) + // to check remaining unused fragments after memory allocation (lesser the better) // for (const auto &it: fIntermediateMemoryInfo.available_stack){ // std::cout<<"chunk_idx: "<GenerateSessionMembersCode(opName); + fGC += fOperators[id]->GenerateSessionMembersCode(opName); } fGC += "\n"; // here add initialization and reading of weight tensors @@ -996,6 +1117,8 @@ void RModel::GenerateSessionCode() // add initialization of shape parameters // assume all parameters are of type size_t if 
(!fDimShapeNames.empty()) { + // sort first the shape parameters in alphabetical order to avoid a random order + std::sort(fDimShapeNames.begin(), fDimShapeNames.end() ); for (auto &p : fDimShapeNames) { fGC += ",\n"; fGC += " size_t " + p + " = " + fShapeParams[p]; @@ -1021,23 +1144,28 @@ void RModel::GenerateSessionCode() fGC += "}\n\n"; } - fGC += doInferSignature + "{\n"; - fGC += "\n"; + if (fProfile) { + RModelProfiler profiler(*this); + profiler.Generate(); + fGC += fProfilerGC; + } else { + fGC += doInferSignature + "{\n"; + fGC += "\n"; - // generate the inference code - if (fVerbose) - std::cout << "Generating main inference code for " << fName << std::endl; + // generate the inference code + if (fVerbose) + std::cout << "Generating main inference code for " << fName << std::endl; - if (fOutputTensorNames.size() == 0) - throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); + if (fOutputTensorNames.size() == 0) + throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); - for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { - if (fVerbose) + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) std::cout << "Generating code for operator .... " << op_idx << std::endl; - fGC += (fOperators[op_idx]->Generate(std::to_string(op_idx))); - } + fGC += (fOperators[op_idx]->Generate(std::to_string(op_idx))); + } - fGC += SP + "using TMVA::Experimental::SOFIE::UTILITY::FillOutput;\n\n"; + fGC += SP + "using TMVA::Experimental::SOFIE::UTILITY::FillOutput;\n\n"; for (std::string const &name : fOutputTensorNames) { // need to check is size is the same (don't want to return a vector with @@ -1048,7 +1176,8 @@ void RModel::GenerateSessionCode() fGC += SP + "FillOutput(tensor_" + name + ", output_tensor_" + name + ", " + n + ");\n"; } - fGC += "}\n\n"; + fGC += "}\n\n"; + } // generate the inference overload that returns an output struct GenerateOutput(); @@ -1061,9 +1190,11 @@ void RModel::GenerateSessionCode() void RModel::Generate(std::underlying_type_t options, int batchSize, long pos, bool verbose) { + bool profile = (options & static_cast>(Options::kProfile)); fVerbose = verbose; fBatchSize = batchSize; fReadPos = pos; + fProfile = profile; // session flag is used in operator initialize if (static_cast>(Options::kNoSession) & options) { @@ -1083,14 +1214,21 @@ void RModel::Generate(std::underlying_type_t options, int batchSize, lo "TMVA-SOFIE: RModel::Generate: cannot use a separate weight file without generating a Session class"); } - if (static_cast>(Options::kGNN) & options) + if (static_cast>(Options::kGNN) & options) fIsGNN = true; - if (static_cast>(Options::kGNNComponent) & options) + if (static_cast>(Options::kGNNComponent) & options) fIsGNNComponent = true; // initialize the model including all operators and sub-graphs Initialize(batchSize, verbose); + // if having dynamic tensor we need to have a Session + if (!fDynamicTensorInfos.empty()) { + fUseSession = true; + if (verbose) + std::cout << "Warning: Force having a Session since model has dynamic tensors " << std::endl; + } + std::string hgname; if (!fIsGNNComponent && !fIsSubGraph) { fGC.clear(); @@ -1099,13 +1237,13 @@ void RModel::Generate(std::underlying_type_t options, int batchSize, lo // generate first code for the subgraphs for (auto &graph : fSubGraphs) { - if (fVerbose) + if (fVerbose) std::cout << "generate session code for subgraph " << graph->fName << std::endl; graph->GenerateSessionCode(); fGC += graph->fGC; } - if (fVerbose) + if (fVerbose) 
std::cout << "generate Main session code - model " << fName << std::endl; // generate main session code @@ -1120,7 +1258,9 @@ void RModel::Generate(std::underlying_type_t options, int batchSize, lo void RModel::ReadInitializedTensorsFromFile(long pos) { // generate the code to read initialized tensors from a text data file if (fWeightFile == WeightFileType::Text) { - if (fInitializedTensors.empty()) return; + // check if there are tensors to write + + if (!fUseWeightFile) return; fGC += " std::ifstream f;\n"; fGC += " f.open(filename);\n"; @@ -1143,7 +1283,7 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { std::string length = std::to_string(ConvertShapeToLength(i.second.shape())); fGC += " ReadTensorFromStream(f, " + tensor_name + ", \"" + tensor_name + "\", " + length + ");\n"; } else { - std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file"); + throw std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file"); } } fGC += " f.close();\n"; @@ -1288,7 +1428,7 @@ long RModel::WriteInitializedTensorsToFile(std::string filename) { } } else { - std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be written to a file"); + throw std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be written to a file"); } if (f.fail()) std::runtime_error("tmva-sofie failed to write tensor data to file for " + tensor_name); @@ -1301,7 +1441,7 @@ long RModel::WriteInitializedTensorsToFile(std::string filename) { } } -void RModel::PrintRequiredInputTensors() { +void RModel::PrintRequiredInputTensors() const { std::cout << "Model requires following inputs:\n"; for (auto& inputInfo: fInputTensorInfos) { std::cout << "Parametrised Tensor name: " << inputInfo.first << "\t"; @@ -1331,7 +1471,7 @@ void RModel::PrintRequiredInputTensors() { std::cout << "\n"; } -void RModel::PrintInitializedTensors() { +void RModel::PrintInitializedTensors() const { std::cout << "Model initialized the following tensors:\n"; for (auto& it: fInitializedTensors) { std::cout << "Tensor name: \"" << it.first << "\"\t"; @@ -1349,7 +1489,7 @@ void RModel::PrintInitializedTensors() { std::cout << "\n"; } -void RModel::PrintIntermediateTensors() { +void RModel::PrintIntermediateTensors() const { std::cout << "Model specify the following intermediate tensors:\n"; for (auto& it: fIntermediateTensorInfos) { std::cout << "Tensor name: \"" << it.first << "\"\t"; @@ -1364,7 +1504,7 @@ void RModel::PrintIntermediateTensors() { std::cout << "\n"; } -void RModel::PrintDynamicTensors() { +void RModel::PrintDynamicTensors() const { std::cout << "Model specify the following dynamic tensors:\n"; for (auto& it: fDynamicTensorInfos) { std::cout << "Tensor name: \"" << it.first << "\"\t"; @@ -1379,14 +1519,16 @@ void RModel::PrintDynamicTensors() { std::cout << "\n"; } -void RModel::PrintOutputTensors() { +void RModel::PrintOutputTensors() const { std::cout << "Model specify the following output tensors:\n"; for (auto& it: fOutputTensorNames) { std::cout << "Tensor name: \"" << it << "\"\t"; - if (!IsDynamicTensor(it)) - std::cout << "shape: " << ConvertShapeToString(GetTensorShape(it)) << std::endl; - else - std::cout << "shape: " << ConvertShapeToString(GetDynamicTensorShape(it)) << std::endl; + try { + auto shape = GetDimTensorShape(it); + std::cout 
<< "with shape: " << ConvertShapeToString(shape) << std::endl; + } catch (...) { + std::cout << "with shape not yet defined" << std::endl; + } } std::cout << "\n"; } diff --git a/tmva/sofie/src/RModelProfiler.cxx b/tmva/sofie/src/RModelProfiler.cxx new file mode 100644 index 0000000000000..c56d4127e99b7 --- /dev/null +++ b/tmva/sofie/src/RModelProfiler.cxx @@ -0,0 +1,176 @@ +#include "TMVA/RModelProfiler.hxx" +#include "TMVA/SOFIE_common.hxx" + +namespace TMVA { +namespace Experimental { +namespace SOFIE { + +// The constructor now just registers the necessary C++ libraries. +RModelProfiler::RModelProfiler(RModel &model) : fModel(model) +{ + fModel.AddNeededStdLib("chrono"); // for timing operators + fModel.AddNeededStdLib("vector"); // for storing profiling results + fModel.AddNeededStdLib("string"); // for operator names + fModel.AddNeededStdLib("map"); // for the results map + fModel.AddNeededStdLib("iostream"); // for printing results + fModel.AddNeededStdLib("iomanip"); // for printing results +} + +// This function generates the helper functions inside the Session struct. +void RModelProfiler::GenerateUtilityFunctions() +{ + auto &gc = fModel.fProfilerGC; + + // Generate PrintProfilingResults function + gc += " // generate code for printing operator results. By default order according to time (from higher to lower)\n"; + gc += " void PrintProfilingResults(bool order = true) const {\n"; + gc += " if (fProfilingResults.empty()) {\n"; + gc += " std::cout << \"No profiling results to display.\" << std::endl;\n"; + gc += " return;\n"; + gc += " }\n"; + gc += "\n"; + gc += " // compute summary statistics of profiling results and sort them in decreasing time\n"; + gc += " std::vector> averageResults;\n"; + gc += " std::cout << \"\\n\" << std::string(50, '=') << std::endl;\n"; + gc += " std::cout << \" AVERAGE PROFILING RESULTS\" << std::endl;\n"; + gc += " std::cout << std::string(50, '=') << std::endl;\n"; + gc += " for (const auto& op : fProfilingResults) {\n"; + gc += " double sum = 0.0;\n"; + gc += " double sum2 = 0.0;\n"; + gc += " for (double time : op.second) {\n"; + gc += " sum += time;\n"; + gc += " sum2 += time*time;\n"; + gc += " }\n"; + gc += " double average = sum / op.second.size();\n"; + gc += " double stddev = std::sqrt(( sum2 - sum *average)/ (op.second.size()-1));\n"; + gc += " averageResults.push_back({op.first, average, stddev, op.second.size()});\n"; + gc += " }\n"; + gc += "\n"; + gc += " // sort average results in decreasing time\n"; + gc += " std::sort(averageResults.begin(), averageResults.end(),\n"; + gc += " []( std::tuple a, std::tuple b) {return std::get<1>(a) > std::get<1>(b); });\n"; + gc += "\n"; + gc += " for (const auto & r : averageResults) {\n"; + gc += " std::cout << \" \" << std::left << std::setw(20) << std::get<0>(r)\n"; + gc += " << \": \" << std::fixed << std::setprecision(6) << std::get<1>(r) << \" +/- \" \n"; + gc += " << std::get<2>(r)/std::sqrt(std::get<3>(r)) << \" us\"\n"; + gc += " << \" (over \" << std::get<3>(r) << \" runs)\" << std::endl;\n"; + gc += " }\n"; + gc += " std::cout << std::string(50, '=') << \"\\n\" << std::endl;\n"; + gc += " }\n"; + gc += "\n"; + + // Generate ResetProfilingResults function + gc += " void ResetProfilingResults() {\n"; + gc += " fProfilingResults.clear();\n"; + gc += " }\n"; + gc += "\n"; + + // Generate GetOpAvgTime function + gc += " std::map GetOpAvgTime() const {\n"; + gc += " if (fProfilingResults.empty()) {\n"; + gc += " return {};\n"; + gc += " }\n"; + gc += "\n"; + gc += " std::map avg;\n"; + gc += " 
for (const auto& op : fProfilingResults) {\n"; + gc += " double mean = 0.0;\n"; + gc += " for (double time : op.second) {\n"; + gc += " mean += time;\n"; + gc += " }\n"; + gc += " mean /= op.second.size();\n"; + gc += " avg[op.first] = mean;\n"; + gc += " }\n"; + gc += "\n"; + gc += " return avg;\n"; + gc += " }\n"; + gc += "\n"; + + // Generate GetOpVariance function + gc += " std::map GetOpVariance() const {\n"; + gc += " if (fProfilingResults.empty()) {\n"; + gc += " return {};\n"; + gc += " }\n"; + gc += "\n"; + gc += " std::map variance;\n"; + gc += " for (const auto& op : fProfilingResults) {\n"; + gc += " // Var[X] = E[X^2] - E[X]^2\n"; + gc += " double mean = 0.0, mean2 = 0.0;\n"; + gc += " for (double time : op.second) {\n"; + gc += " mean += time;\n"; + gc += " mean2 += time * time;\n"; + gc += " }\n"; + gc += " mean /= op.second.size();\n"; + gc += " mean2 /= op.second.size();\n"; + gc += " variance[op.first] = mean2 - mean * mean;\n"; + gc += " }\n"; + gc += "\n"; + gc += " return variance;\n"; + gc += " }\n"; +} + +// Main generation function for the profiler. +void RModelProfiler::Generate() +{ + // Clear the profiler's code string to start fresh. + fModel.fProfilerGC.clear(); + auto &gc = fModel.fProfilerGC; + + // 1. Add the data member to the Session struct to store results. + gc += "public:\n"; + gc += " // Maps an operator name to a vector of its execution times (in microseconds).\n"; + gc += " std::map> fProfilingResults;\n\n"; + + // 2. Generate and add the utility functions like PrintProfilingResults. + GenerateUtilityFunctions(); + + // 3. Generate the signature for the profiled doInfer method. + std::string doInferSignature = fModel.GenerateInferSignature(); + if (!doInferSignature.empty()) doInferSignature += ", "; + for (auto const &name : fModel.GetOutputTensorNames()) { + doInferSignature += " std::vector<" + ConvertTypeToString(fModel.GetTensorType(name)) + "> &output_tensor_" + name + ","; + } + if (!fModel.GetOutputTensorNames().empty()) { + doInferSignature.back() = ' '; + } + gc += "void doInfer(" + doInferSignature + ") {\n"; + + // 4. Generate the body of the doInfer method with timing instrumentation. + gc += " // Timer variable for profiling\n"; + gc += " std::chrono::steady_clock::time_point tp_start, tp_overall_start;\n\n"; + gc += " tp_overall_start = std::chrono::steady_clock::now();\n\n"; + + for (size_t op_idx = 0; op_idx < fModel.fOperators.size(); ++op_idx) { + const auto& op = fModel.fOperators[op_idx]; + gc += " // -- Profiling for operator " + op->name + " --\n"; + gc += " tp_start = std::chrono::steady_clock::now();\n\n"; + + // Add the actual operator inference code + gc += op->Generate(std::to_string(op_idx)); + + // Add the code to stop the timer and store the result + gc += "\n fProfilingResults[\"" + op->name + "\"].push_back(\n"; + gc += " std::chrono::duration_cast>(\n"; + gc += " std::chrono::steady_clock::now() - tp_start).count());\n\n"; + } + + // 5. Generate the code to fill the output tensors. + gc += " using TMVA::Experimental::SOFIE::UTILITY::FillOutput;\n\n"; + for (std::string const &name : fModel.GetOutputTensorNames()) { + bool isIntermediate = fModel.fIntermediateTensorInfos.count(name) > 0; + std::string n = isIntermediate ? 
std::to_string(ConvertShapeToLength(fModel.GetTensorShape(name))) + : ConvertDynamicShapeToLength(fModel.GetDynamicTensorShape(name)); + gc += " FillOutput(tensor_" + name + ", output_tensor_" + name + ", " + n + ");\n"; + } + + gc += "\n // -- Record overall inference time --\n"; + gc += " fProfilingResults[\"Overall_Time\"].push_back(\n"; + gc += " std::chrono::duration_cast>(\n"; + gc += " std::chrono::steady_clock::now() - tp_overall_start).count());\n"; + + gc += "}\n\n"; // End of doInfer function +} + +} // namespace SOFIE +} // namespace Experimental +} // namespace TMVA diff --git a/tmva/sofie/src/SOFIE_common.cxx b/tmva/sofie/src/SOFIE_common.cxx index c107b489be19e..54fed04ba42b1 100644 --- a/tmva/sofie/src/SOFIE_common.cxx +++ b/tmva/sofie/src/SOFIE_common.cxx @@ -4,6 +4,8 @@ #include #include #include +#include +#include namespace TMVA { namespace Experimental { @@ -89,7 +91,7 @@ std::string ConvertTypeToString(ETensorType type){ return "double"; } case ETensorType::BOOL : { - return "bool"; + return "uint8_t"; } default:{ return "other_" + std::to_string( (int) type); @@ -130,7 +132,7 @@ std::string ConvertDimShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; for (size_t i = 0; i < shape.size(); i++) { - out << shape[i].GetVal(); + out << shape[i]; if (i < shape.size()-1) out << " , "; } out << " }"; @@ -412,14 +414,15 @@ std::pair> UTILITY::MultidirectionalBroadcastShape(std + " to a common shape."); } } -// unidirectional broadcast- only B changes +// unidirectional broadcast- of shape A to target B std::vector UTILITY::UnidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) { - auto ret = UTILITY::MultidirectionalBroadcastShape(shapeA, shapeB); + auto ret = UTILITY::MultidirectionalBroadcastShape(shapeB, shapeA); if (ret.first > 1) { - std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape " - + ConvertShapeToString(shapeA) + " and " + ConvertShapeToString(shapeB) - + " to a common shape."); + throw + std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape " + + ConvertShapeToString(shapeA) + " to " + ConvertShapeToString(shapeB) + + " in a common shape."); } return ret.second; } @@ -547,6 +550,130 @@ std::vector UTILITY::ComputeStrideFromShape(const std::vector & shape) return strides; } +struct FreeBlock { + std::size_t offset; + std::size_t size; + bool operator<(const FreeBlock& other) const { + // order by offset for deterministic coalescing + return offset < other.offset; + } +}; + +struct MemoryEvent { + int t; // time (i.e. operator index) + int type; // 0 = END first, 1 = START + int idx; // tensor index + bool operator<(const MemoryEvent& o) const { + if (t != o.t) return t < o.t; + return type < o.type; // END before START at the same time + } +}; + +/// Greedy best-fit planner with coalescing free list. +MemoryResult OrganizeMemory(const std::vector & tensorsInfo ) +{ + // Basic validation + for (const auto &t : tensorsInfo) { + if (!(t.end > t.begin)) { + throw std::runtime_error("Each tensor must have end > begin."); + } + } + + // Build events: free before allocate at equal times. 
+ std::vector events; + events.reserve(tensorsInfo.size() * 2); + for (int i = 0; i < (int)tensorsInfo.size(); ++i) { + events.push_back({tensorsInfo[i].end, 0, i}); // END + events.push_back({tensorsInfo[i].begin, 1, i}); // START + } + std::sort(events.begin(), events.end()); + + std::vector tensorsOffset(tensorsInfo.size()); + + // Free list ordered by offset (for O(log n) coalescing) + // and faster insert/erase with respect to a vector + std::set free_list; + + // Bookkeeping: size/offset map for frees. + std::unordered_map live_size; + std::unordered_map live_offset; + + std::size_t total_bytes = 0; + + auto allocate_best_fit = [&](std::size_t need) -> std::size_t { + // Find the *smallest* block whose size >= need (best-fit). + // Since free_list is ordered by offset, we scan to find best by size. + // (For very large sets you could maintain a multimap by size as well.) + auto best = free_list.end(); + for (auto it = free_list.begin(); it != free_list.end(); ++it) { + if (it->size >= need) { + if (best == free_list.end() || it->size < best->size) + best = it; + } + } + if (best != free_list.end()) { + std::size_t off = best->offset; + if (best->size == need) { + free_list.erase(best); + } else { + FreeBlock updated{best->offset + need, best->size - need}; + free_list.erase(best); + free_list.insert(updated); + } + return off; + } + // No free block large enough; grow the heap. + std::size_t off = total_bytes; + total_bytes += need; + return off; + }; + + auto try_coalesce = [&](std::set::iterator it) { + // Coalesce with previous + if (it != free_list.begin()) { + auto prev = std::prev(it); + if (prev->offset + prev->size == it->offset) { + FreeBlock merged{prev->offset, prev->size + it->size}; + free_list.erase(prev); + it = free_list.erase(it); + it = free_list.insert(merged).first; + } + } + // Coalesce with next + auto next = std::next(it); + if (next != free_list.end() && it->offset + it->size == next->offset) { + FreeBlock merged{it->offset, it->size + next->size}; + free_list.erase(next); + it = free_list.erase(it); + free_list.insert(merged); + } + }; + + // Sweep through time. 
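+    // Illustration of the sweep below (hypothetical lifetimes, not taken from a real model):
+    //   given A{begin=0, end=2, size=100}, B{begin=1, end=3, size=50}, C{begin=2, end=4, size=100},
+    //   A is placed at offset 0 and B at offset 100 (the heap grows to 150 bytes);
+    //   at t = 2 the END event of A is processed before the START of C, so the best-fit
+    //   search reuses A's freed block and C also gets offset 0.
+    //   Result: total_bytes = 150, offsets = {A: 0, B: 100, C: 0}.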
+ for (const auto &e : events) { + if (e.type == 0) { // END: free + auto it_sz = live_size.find(e.idx); + auto it_off = live_offset.find(e.idx); + if (it_sz != live_size.end() && it_off != live_offset.end()) { + FreeBlock fb{it_off->second, it_sz->second}; + // Insert and coalesce with neighbors + auto it = free_list.insert(fb).first; + try_coalesce(it); + live_size.erase(it_sz); + live_offset.erase(it_off); + } + } else { // START: allocate + auto &t = tensorsInfo[e.idx]; + std::size_t off = allocate_best_fit(t.size); + tensorsOffset[e.idx] = off; + live_size[e.idx] = t.size; + live_offset[e.idx] = off; + } + } + + return MemoryResult{total_bytes, std::move(tensorsOffset)}; +} + } // namespace SOFIE } // namespace Experimental } // namespace TMVA diff --git a/tmva/sofie/test/TestCustomModelsFromONNX.cxx b/tmva/sofie/test/TestCustomModelsFromONNX.cxx index 5b77caf2aed1d..401afb8257e25 100644 --- a/tmva/sofie/test/TestCustomModelsFromONNX.cxx +++ b/tmva/sofie/test/TestCustomModelsFromONNX.cxx @@ -323,6 +323,8 @@ #include "ScatterElements_FromONNX.hxx" +#include "MatMul_Stacked_FromONNX.hxx" + #include "gtest/gtest.h" constexpr float DEFAULT_TOLERANCE = 1e-3f; @@ -2856,7 +2858,7 @@ TEST(ONNX, RangeFloat) { float start = 1.; float limit = 10.; float delta = 2.; - TMVA_SOFIE_RangeFloat::Session s("RangeFloat_FromONNX.dat"); + TMVA_SOFIE_RangeFloat::Session s("RangeFloat_FromONNX.dat",5); std::vector output(s.infer(&start, &limit, &delta)); // Checking the output size @@ -2875,7 +2877,7 @@ TEST(ONNX, RangeInt) { int64_t start = 1; int64_t limit = 10; int64_t delta = 2; - TMVA_SOFIE_RangeInt::Session s("RangeInt_FromONNX.dat"); + TMVA_SOFIE_RangeInt::Session s("RangeInt_FromONNX.dat",5); std::vector output(s.infer(&start, &limit, &delta)); // Checking the output size @@ -2947,7 +2949,7 @@ TEST(ONNX, Where) { // test also the broadcast of boolean tensors std::vector input1 = {1,2}; std::vector input2 = {3,4,5,6}; - bool cond[] = {true, false, true}; // need to pass arrays for booleans + uint8_t cond[] = {true, false, true}; // need to pass arrays for booleans std::vector correct = {1,2,5,6,1,2}; TMVA_SOFIE_Where::Session s("Where_FromONNX.dat"); std::vector output(s.infer(input1.data(), input2.data(), cond)); @@ -3214,3 +3216,24 @@ TEST(ONNX, ScatterElements) EXPECT_LE(std::abs(output[i] - correct_output[i]), DEFAULT_TOLERANCE); } } + +TEST(ONNX, MatMul_Stacked) +{ + // test a stacked (batched) MatMul with a dynamic batch dimension + std::vector input1 = {1,2,3,4,5,6,7,8}; // input tensor shape is (2,2,2) + std::vector input2 = {2,3}; // shape is (2,1) + + std::vector correct_output = {8,18, 28,38}; + + // model is dynamic, use N = 2 + TMVA_SOFIE_MatMul_Stacked::Session s("MatMul_Stacked_FromONNX.dat", 2); + + auto output = s.infer(2, input1.data(), input2.data()); + + // Checking output size + EXPECT_EQ(output.size(), correct_output.size()); + // Checking output + for (size_t i = 0; i < output.size(); ++i) { + EXPECT_LE(std::abs(output[i] - correct_output[i]), DEFAULT_TOLERANCE); + } +} diff --git a/tmva/sofie/test/TestCustomModelsFromROOT.cxx b/tmva/sofie/test/TestCustomModelsFromROOT.cxx index d077aede3e2e6..7e3c8c9c2fc09 100644 --- a/tmva/sofie/test/TestCustomModelsFromROOT.cxx +++ b/tmva/sofie/test/TestCustomModelsFromROOT.cxx @@ -891,7 +891,8 @@ TEST(ROOT, RangeFloat) { float start = 1.; float limit = 10.; float delta = 2.; - std::vector output = TMVA_SOFIE_RangeFloat::infer(&start, &limit, &delta); + TMVA_SOFIE_RangeFloat::Session s("",5); + std::vector output(s.infer(&start, &limit, &delta)); // 
Checking the output size EXPECT_EQ(output.size(), sizeof(RangeFloat_ExpectedOutput::outputs) / sizeof(float)); @@ -909,7 +910,8 @@ TEST(ROOT, RangeInt) { int64_t start = 1; int64_t limit = 10; int64_t delta = 2; - std::vector output = TMVA_SOFIE_RangeInt::infer(&start, &limit, &delta); + TMVA_SOFIE_RangeInt::Session s("",5); + std::vector output(s.infer(&start, &limit, &delta)); // Checking the output size EXPECT_EQ(output.size(), sizeof(RangeInt_ExpectedOutput::outputs) / sizeof(int64_t)); diff --git a/tmva/sofie/test/input_models/MatMul_Stacked.onnx b/tmva/sofie/test/input_models/MatMul_Stacked.onnx new file mode 100644 index 0000000000000..19c39ee2adddd --- /dev/null +++ b/tmva/sofie/test/input_models/MatMul_Stacked.onnx @@ -0,0 +1,19 @@ + + onnx-example:„ + +input1 +input2output"MatMulAddGraphZ +input1 + +N + +Z +input2 +  + +b +output + +N + +B \ No newline at end of file diff --git a/tmva/sofie_parsers/src/RModelParser_ONNX.cxx b/tmva/sofie_parsers/src/RModelParser_ONNX.cxx index 7b4ade2b6bc09..4903c8d1c6511 100644 --- a/tmva/sofie_parsers/src/RModelParser_ONNX.cxx +++ b/tmva/sofie_parsers/src/RModelParser_ONNX.cxx @@ -731,7 +731,8 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & std::cout << "\t" << i << " " << nodesOrder[i] << " parsing operator " << op_type << std::endl; } - std::unique_ptr op = ParseOperator(i, graph, nodesOrder, nodesChildren[i]); + std::unique_ptr op = ParseOperator(i, graph, nodesOrder, nodesChildren[nodesOrder[i]]); + if (!op) { if (verbose) { std::cout << "\t\tskipping operator since it is fused with previous one" << std::endl; @@ -739,6 +740,12 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & // for skipping the fused nodes like Add after MatMul continue; } + const auto &nodeproto = graph.node(nodesOrder[i]); + op->name = nodeproto.name(); + if (op->name.empty()) { + op->name = op_type + "_" + std::to_string(i); + } + rmodel.AddOperator(std::move(op), node_order_exec++); } diff --git a/tutorials/machine_learning/TMVA_SOFIE_ONNX.C b/tutorials/machine_learning/TMVA_SOFIE_ONNX.C index 8c192789e1210..878167db8c791 100644 --- a/tutorials/machine_learning/TMVA_SOFIE_ONNX.C +++ b/tutorials/machine_learning/TMVA_SOFIE_ONNX.C @@ -19,7 +19,7 @@ void TMVA_SOFIE_ONNX(std::string inputFile = ""){ SOFIE::RModel model = parser.Parse(inputFile, true); //Generating inference code - model.Generate(); + model.Generate(SOFIE::Options::kProfile); // write the code in a file (by default Linear_16.hxx and Linear_16.dat model.OutputGenerated();
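// A minimal usage sketch for the profiled Session generated with Options::kProfile above.
// Hypothetical names: the tutorial's default output is Linear_16.hxx / Linear_16.dat, and the
// input size used here is made up; adapt both to the actual model being parsed.
//
//   #include "Linear_16.hxx"
//   TMVA_SOFIE_Linear_16::Session s("Linear_16.dat");
//   std::vector<float> x(100, 1.f);                     // dummy input, sized for the model
//   for (int i = 0; i < 100; ++i) s.infer(x.data());    // each call records per-operator times
//   s.PrintProfilingResults();                          // mean time per operator, slowest first
//   auto avgTimes = s.GetOpAvgTime();                   // operator name -> mean time in microseconds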